/* blob: 535514338729298b80ac58b91e06385b5ede8f3a [file] [log] [blame] (repository-viewer residue, not part of the source) */
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include "../gen/line.h"
#include "../grapheme.h"
#include "util.h"
/*
 * Map a codepoint to its line break property using the two-stage
 * lookup tables: line_break_major is indexed by the codepoint shifted
 * right by 8 and yields an offset into line_break_minor, to which the
 * low byte of the codepoint is added.
 *
 * Any value above 0x10FFFF is reported as LINE_BREAK_PROP_AL.
 */
static inline enum line_break_property
get_break_prop(uint_least32_t cp)
{
	if (likely(cp <= UINT32_C(0x10FFFF))) {
		/* split the codepoint into table page and in-page cell */
		const uint_least32_t page = cp >> 8;
		const uint_least32_t cell = cp & 0xff;

		return (enum line_break_property)
			line_break_minor[line_break_major[page] + cell];
	}

	/* not covered by the tables, fall back to AL */
	return LINE_BREAK_PROP_AL;
}
/*
* Find the next line break opportunity in the text behind the reader r
* by walking over it as a sequence of adjacent codepoint pairs
* (cp0 to the left, cp1 to the right of the candidate break) and
* applying the rules of UAX #14 to each pair in rule order.
*
* r is read from as a side effect. NOTE(review): the boolean passed to
* herodotus_read_codepoint() looks like an "advance" flag (false = peek,
* true = consume) -- confirm against the herodotus API.
*
* Returns herodotus_reader_number_read(r) as observed once the loop
* has ended, either through a rule that breaks or because no further
* codepoint could be read. NOTE(review): presumably counted in the
* reader's own units (codepoints or bytes) -- confirm.
*/
static size_t
next_line_break(HERODOTUS_READER *r)
{
/* scratch copy of r, only used for the look-ahead in LB25 */
HERODOTUS_READER tmp;
/*
* cp0_prop/cp1_prop: properties left and right of the candidate break.
* last_non_*_prop: retention states, i.e. the most recent left-hand
* property that is not one of the classes named in the variable.
*/
enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
uint_least32_t cp;
/* progress within the LB25 number sequence, see the comment below */
uint_least8_t lb25_level = 0;
/*
* lb21a_flag: the non-CM/ZWJ property before the current one was HL.
* ri_even: the run of RI directly to the left has even length.
*/
bool lb21a_flag = false, ri_even = true;
/*
* Apply line breaking algorithm (UAX #14), see
* https://unicode.org/reports/tr14/#Algorithm and tailoring
* https://unicode.org/reports/tr14/#Examples (example 7),
* given the automatic test-cases implement this example for
* better number handling.
*/
/*
* Initialize the different properties such that we have
* a good state after the state-update in the loop.
* NUM_LINE_BREAK_PROPS serves as a "nothing seen yet" marker here;
* none of the rules below compares against it.
*/
last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
/*
* Loop layout: the init clause reads the first codepoint and makes
* it the left-hand side (cp0). The condition fetches the following
* codepoint with the flag set to false and ends the loop when that
* does not succeed. The step clause fetches it again with the flag
* set to true and shifts cp1 into cp0.
*
* Consequently a `continue` below means "no break between cp0 and
* cp1, move on by one codepoint", whereas a `break` leaves the loop
* with the reader still positioned between cp0 and cp1.
*
* NOTE(review): the status of the read in the init clause is not
* checked, so for empty input cp0_prop is derived from whatever
* herodotus_read_codepoint() stores in cp on failure -- confirm that
* it always writes cp.
*/
for (herodotus_read_codepoint(r, true, &cp),
cp0_prop = get_break_prop(cp);
herodotus_read_codepoint(r, false, &cp) ==
HERODOTUS_STATUS_SUCCESS;
herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
/* get property of the right codepoint */
cp1_prop = get_break_prop(cp);
/* update retention-states */
/*
* store the last observed non-CM-or-ZWJ-property for
* LB9 and following.
*/
if (cp0_prop != LINE_BREAK_PROP_CM &&
cp0_prop != LINE_BREAK_PROP_ZWJ) {
/*
* check if the property we are overwriting now is an
* HL. If so, we set the LB21a-flag which depends on
* this knowledge.
*/
lb21a_flag =
(last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
/* check regional indicator state */
if (cp0_prop == LINE_BREAK_PROP_RI) {
/*
* The property we just shifted in is
* a regional indicator, increasing the
* number of consecutive RIs on the left
* side of the breakpoint by one, changing
* the oddness.
*/
ri_even = !ri_even;
} else {
/*
* We saw no regional indicator, so the
* number of consecutive RIs on the left
* side of the breakpoint is zero, which
* is an even number.
*/
ri_even = true;
}
/*
* Here comes a bit of magic. The tailored rule
* LB25 (using example 7) has a very complicated
* left-hand-side-rule of the form
*
* NU (NU | SY | IS)* (CL | CP)?
*
* but instead of backtracking, we keep the state
* as some kind of "power level" in the variable
*
* lb25_level
*
* that goes from 0 to 3
*
* 0: we are not in the sequence
* 1: we have one NU to the left of the middle
* spot
* 2: we have one NU and one or more (NU | SY | IS)
* to the left of the middle spot
* 3: we have one NU, zero or more (NU | SY | IS)
* and one (CL | CP) to the left of the middle
* spot
*
* Note that the first branch also catches NU at
* level 1, so a run of NU stays at level 1 until
* an SY or IS is seen.
*/
if ((lb25_level == 0 || lb25_level == 1) &&
cp0_prop == LINE_BREAK_PROP_NU) {
/* sequence has begun */
lb25_level = 1;
} else if ((lb25_level == 1 || lb25_level == 2) &&
(cp0_prop == LINE_BREAK_PROP_NU ||
cp0_prop == LINE_BREAK_PROP_SY ||
cp0_prop == LINE_BREAK_PROP_IS)) {
/* (NU | SY | IS) sequence begins or continues */
lb25_level = 2;
} else if (
(lb25_level == 1 || lb25_level == 2) &&
(cp0_prop == LINE_BREAK_PROP_CL ||
cp0_prop ==
LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
/* CL or CP at the end of the sequence */
lb25_level = 3;
} else {
/* sequence broke */
lb25_level = 0;
}
/* must come last, the updates above need the old value */
last_non_cm_or_zwj_prop = cp0_prop;
}
/*
* store the last observed non-SP-property for LB8, LB14,
* LB15, LB16 and LB17. LB8 gets its own unskipped property,
* whereas the others build on top of the CM-ZWJ-skipped
* properties as they come after LB9
*/
if (cp0_prop != LINE_BREAK_PROP_SP) {
last_non_sp_prop = cp0_prop;
}
if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
}
/* apply the algorithm */
/* LB4: always break after BK */
if (cp0_prop == LINE_BREAK_PROP_BK) {
break;
}
/* LB5: keep CR LF together ... */
if (cp0_prop == LINE_BREAK_PROP_CR &&
cp1_prop == LINE_BREAK_PROP_LF) {
continue;
}
/* ... but otherwise break after CR, LF and NL */
if (cp0_prop == LINE_BREAK_PROP_CR ||
cp0_prop == LINE_BREAK_PROP_LF ||
cp0_prop == LINE_BREAK_PROP_NL) {
break;
}
/* LB6: no break before BK, CR, LF and NL */
if (cp1_prop == LINE_BREAK_PROP_BK ||
cp1_prop == LINE_BREAK_PROP_CR ||
cp1_prop == LINE_BREAK_PROP_LF ||
cp1_prop == LINE_BREAK_PROP_NL) {
continue;
}
/* LB7: no break before SP and ZW */
if (cp1_prop == LINE_BREAK_PROP_SP ||
cp1_prop == LINE_BREAK_PROP_ZW) {
continue;
}
/* LB8: break after ZW, even with SP in between */
if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
break;
}
/* LB8a: no break directly after ZWJ */
if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
continue;
}
/* LB9: CM and ZWJ attach to the codepoint before them */
if ((cp0_prop != LINE_BREAK_PROP_BK &&
cp0_prop != LINE_BREAK_PROP_CR &&
cp0_prop != LINE_BREAK_PROP_LF &&
cp0_prop != LINE_BREAK_PROP_NL &&
cp0_prop != LINE_BREAK_PROP_SP &&
cp0_prop != LINE_BREAK_PROP_ZW) &&
(cp1_prop == LINE_BREAK_PROP_CM ||
cp1_prop == LINE_BREAK_PROP_ZWJ)) {
/*
* given we skip them, we don't break in such
* a sequence
*/
continue;
}
/*
* LB10 is baked into the following rules: from here on the
* left-hand side is last_non_cm_or_zwj_prop rather than
* cp0_prop, and that state was initialized to AL.
*/
/* LB11: no break before or after WJ */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
cp1_prop == LINE_BREAK_PROP_WJ) {
continue;
}
/* LB12: no break after GL */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
continue;
}
/* LB12a: no break before GL, unless after SP, BA or HY */
if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
cp1_prop == LINE_BREAK_PROP_GL) {
continue;
}
/*
* LB13 (affected by tailoring for LB25, see example 7):
* no break before EX; no break before CL, CP, IS and SY
* either, as long as the left-hand side is not NU
*/
if (cp1_prop == LINE_BREAK_PROP_EX ||
(last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
(cp1_prop == LINE_BREAK_PROP_CL ||
cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_IS ||
cp1_prop == LINE_BREAK_PROP_SY))) {
continue;
}
/* LB14: no break after OP, even with SP in between */
if (last_non_sp_cm_or_zwj_prop ==
LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
last_non_sp_cm_or_zwj_prop ==
LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
continue;
}
/* LB15: no break within QU SP* OP */
if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
(cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
continue;
}
/* LB16: no break within (CL | CP) SP* NS */
if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
last_non_sp_cm_or_zwj_prop ==
LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
last_non_sp_cm_or_zwj_prop ==
LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
cp1_prop == LINE_BREAK_PROP_NS) {
continue;
}
/* LB17: no break within B2 SP* B2 */
if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
cp1_prop == LINE_BREAK_PROP_B2) {
continue;
}
/* LB18: break after SP */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
break;
}
/* LB19: no break before or after QU */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
cp1_prop == LINE_BREAK_PROP_QU) {
continue;
}
/* LB20: break before and after CB */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
cp1_prop == LINE_BREAK_PROP_CB) {
break;
}
/* LB21: no break before BA, HY and NS, nor after BB */
if (cp1_prop == LINE_BREAK_PROP_BA ||
cp1_prop == LINE_BREAK_PROP_HY ||
cp1_prop == LINE_BREAK_PROP_NS ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
continue;
}
/* LB21a: no break after HL (HY | BA), HL is kept in the flag */
if (lb21a_flag &&
(last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
continue;
}
/* LB21b: no break between SY and HL */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
cp1_prop == LINE_BREAK_PROP_HL) {
continue;
}
/* LB22: no break before IN */
if (cp1_prop == LINE_BREAK_PROP_IN) {
continue;
}
/* LB23: no break between (AL | HL) and NU ... */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
cp1_prop == LINE_BREAK_PROP_NU) {
continue;
}
/* ... nor between NU and (AL | HL) */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL)) {
continue;
}
/* LB23a: no break between PR and (ID | EB | EM) ... */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
(cp1_prop == LINE_BREAK_PROP_ID ||
cp1_prop == LINE_BREAK_PROP_EB ||
cp1_prop == LINE_BREAK_PROP_EM)) {
continue;
}
/* ... nor between (ID | EB | EM) and PO */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
cp1_prop == LINE_BREAK_PROP_PO) {
continue;
}
/* LB24: no break between (PR | PO) and (AL | HL) ... */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL)) {
continue;
}
/* ... nor between (AL | HL) and (PR | PO) */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
(cp1_prop == LINE_BREAK_PROP_PR ||
cp1_prop == LINE_BREAK_PROP_PO)) {
continue;
}
/*
* LB25 (tailored with example 7):
* (PR | PO) followed by NU, or by (OP | HY) NU
*/
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
if (cp1_prop == LINE_BREAK_PROP_NU) {
continue;
}
/* this stupid rule is the reason why we cannot
* simply have a stateful break-detection between
* two adjacent codepoints as we have it with
* characters.
*/
/*
* Look one codepoint beyond cp1 using tmp so that r
* itself stays where it is. The first read on tmp
* is discarded and only the second one is examined.
* cp gets overwritten here, but the loop condition
* reads it anew before it is used again.
* NOTE(review): assumes herodotus_reader_copy(r, &tmp)
* copies r into tmp and that the discarded read yields
* cp1 again -- confirm against the herodotus API.
*/
herodotus_reader_copy(r, &tmp);
herodotus_read_codepoint(&tmp, true, &cp);
if (herodotus_read_codepoint(&tmp, true, &cp) ==
HERODOTUS_STATUS_SUCCESS &&
(cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_HY)) {
if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
continue;
}
}
}
/* LB25: no break between (OP | HY) and NU */
if ((last_non_cm_or_zwj_prop ==
LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
last_non_cm_or_zwj_prop ==
LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
cp1_prop == LINE_BREAK_PROP_NU) {
continue;
}
/*
* LB25: no break within the number sequence tracked by
* lb25_level. The first check is a subset of the second one.
*/
if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU ||
cp1_prop == LINE_BREAK_PROP_SY ||
cp1_prop == LINE_BREAK_PROP_IS)) {
continue;
}
if ((lb25_level == 1 || lb25_level == 2) &&
(cp1_prop == LINE_BREAK_PROP_NU ||
cp1_prop == LINE_BREAK_PROP_SY ||
cp1_prop == LINE_BREAK_PROP_IS ||
cp1_prop == LINE_BREAK_PROP_CL ||
cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
continue;
}
/* LB25: no break between the number sequence and (PO | PR) */
if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
(cp1_prop == LINE_BREAK_PROP_PO ||
cp1_prop == LINE_BREAK_PROP_PR)) {
continue;
}
/* LB26: no break within a Korean syllable block */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
(cp1_prop == LINE_BREAK_PROP_JL ||
cp1_prop == LINE_BREAK_PROP_JV ||
cp1_prop == LINE_BREAK_PROP_H2 ||
cp1_prop == LINE_BREAK_PROP_H3)) {
continue;
}
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
(cp1_prop == LINE_BREAK_PROP_JV ||
cp1_prop == LINE_BREAK_PROP_JT)) {
continue;
}
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
cp1_prop == LINE_BREAK_PROP_JT) {
continue;
}
/* LB27: no break between (JL | JV | JT | H2 | H3) and PO ... */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
cp1_prop == LINE_BREAK_PROP_PO) {
continue;
}
/* ... nor between PR and (JL | JV | JT | H2 | H3) */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
(cp1_prop == LINE_BREAK_PROP_JL ||
cp1_prop == LINE_BREAK_PROP_JV ||
cp1_prop == LINE_BREAK_PROP_JT ||
cp1_prop == LINE_BREAK_PROP_H2 ||
cp1_prop == LINE_BREAK_PROP_H3)) {
continue;
}
/* LB28: no break between (AL | HL) and (AL | HL) */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL)) {
continue;
}
/* LB29: no break between IS and (AL | HL) */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL)) {
continue;
}
/*
* LB30: no break between (AL | HL | NU) and OP, nor between
* CP and (AL | HL | NU). Only the *_WITHOUT_EAW_HWF variants
* of OP and CP take part in this rule.
*/
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
continue;
}
if (last_non_cm_or_zwj_prop ==
LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL ||
cp1_prop == LINE_BREAK_PROP_NU)) {
continue;
}
/*
* LB30a: no break between two RI if the run of RI to the
* left (which includes the left-hand RI itself) is odd
*/
if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
cp1_prop == LINE_BREAK_PROP_RI) {
continue;
}
/* LB30b: no break between EB and EM ... */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
cp1_prop == LINE_BREAK_PROP_EM) {
continue;
}
/* ... nor between BOTH_CN_EXTPICT and EM */
if (last_non_cm_or_zwj_prop ==
LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
cp1_prop == LINE_BREAK_PROP_EM) {
continue;
}
/* LB31: no rule prevented it, so break here */
break;
}
/* report how much the reader has read up to the break */
return herodotus_reader_number_read(r);
}
/*
 * Determine the next line break for an array of codepoints.
 *
 * Sets up a reader of type HERODOTUS_TYPE_CODEPOINT over str and len
 * and returns the result of next_line_break() for that reader.
 */
size_t
grapheme_next_line_break(const uint_least32_t *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, str, len);

	return next_line_break(&reader);
}
/*
 * Determine the next line break for a UTF-8 encoded string.
 *
 * Sets up a reader of type HERODOTUS_TYPE_UTF8 over str and len and
 * returns the result of next_line_break() for that reader.
 */
size_t
grapheme_next_line_break_utf8(const char *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, str, len);

	return next_line_break(&reader);
}