| /* See LICENSE file for copyright and license details. */ |
| #include <stdbool.h> |
| #include <stddef.h> |
| |
| #include "../gen/line.h" |
| #include "../grapheme.h" |
| #include "util.h" |
| |
| static inline enum line_break_property |
| get_break_prop(uint_least32_t cp) |
| { |
| if (likely(cp <= UINT32_C(0x10FFFF))) { |
| return (enum line_break_property) |
| line_break_minor[line_break_major[cp >> 8] + |
| (cp & 0xff)]; |
| } else { |
| return LINE_BREAK_PROP_AL; |
| } |
| } |
| |
| static size_t |
| next_line_break(HERODOTUS_READER *r) |
| { |
| HERODOTUS_READER tmp; |
| enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop, |
| last_non_sp_prop, last_non_sp_cm_or_zwj_prop; |
| uint_least32_t cp; |
| uint_least8_t lb25_level = 0; |
| bool lb21a_flag = false, ri_even = true; |
| |
| /* |
| * Apply line breaking algorithm (UAX #14), see |
| * https://unicode.org/reports/tr14/#Algorithm and tailoring |
| * https://unicode.org/reports/tr14/#Examples (example 7), |
| * given the automatic test-cases implement this example for |
| * better number handling. |
| * |
| */ |
| |
| /* |
| * Initialize the different properties such that we have |
| * a good state after the state-update in the loop |
| */ |
| last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */ |
| last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS; |
| |
| for (herodotus_read_codepoint(r, true, &cp), |
| cp0_prop = get_break_prop(cp); |
| herodotus_read_codepoint(r, false, &cp) == |
| HERODOTUS_STATUS_SUCCESS; |
| herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) { |
| /* get property of the right codepoint */ |
| cp1_prop = get_break_prop(cp); |
| |
| /* update retention-states */ |
| |
| /* |
| * store the last observed non-CM-or-ZWJ-property for |
| * LB9 and following. |
| */ |
| if (cp0_prop != LINE_BREAK_PROP_CM && |
| cp0_prop != LINE_BREAK_PROP_ZWJ) { |
| /* |
| * check if the property we are overwriting now is an |
| * HL. If so, we set the LB21a-flag which depends on |
| * this knowledge. |
| */ |
| lb21a_flag = |
| (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL); |
| |
| /* check regional indicator state */ |
| if (cp0_prop == LINE_BREAK_PROP_RI) { |
| /* |
| * The property we just shifted in is |
| * a regional indicator, increasing the |
| * number of consecutive RIs on the left |
| * side of the breakpoint by one, changing |
| * the oddness. |
| * |
| */ |
| ri_even = !ri_even; |
| } else { |
| /* |
| * We saw no regional indicator, so the |
| * number of consecutive RIs on the left |
| * side of the breakpoint is zero, which |
| * is an even number. |
| * |
| */ |
| ri_even = true; |
| } |
| |
| /* |
| * Here comes a bit of magic. The tailored rule |
| * LB25 (using example 7) has a very complicated |
| * left-hand-side-rule of the form |
| * |
| * NU (NU | SY | IS)* (CL | CP)? |
| * |
| * but instead of backtracking, we keep the state |
| * as some kind of "power level" in the variable |
| * |
| * lb25_level |
| * |
| * that goes from 0 to 3 |
| * |
| * 0: we are not in the sequence |
| * 1: we have one NU to the left of the middle |
| * spot |
| * 2: we have one NU and one or more (NU | SY | IS) |
| * to the left of the middle spot |
| * 3: we have one NU, zero or more (NU | SY | IS) |
| * and one (CL | CP) to the left of the middle |
| * spot |
| */ |
| if ((lb25_level == 0 || lb25_level == 1) && |
| cp0_prop == LINE_BREAK_PROP_NU) { |
| /* sequence has begun */ |
| lb25_level = 1; |
| } else if ((lb25_level == 1 || lb25_level == 2) && |
| (cp0_prop == LINE_BREAK_PROP_NU || |
| cp0_prop == LINE_BREAK_PROP_SY || |
| cp0_prop == LINE_BREAK_PROP_IS)) { |
| /* (NU | SY | IS) sequence begins or continued |
| */ |
| lb25_level = 2; |
| } else if ( |
| (lb25_level == 1 || lb25_level == 2) && |
| (cp0_prop == LINE_BREAK_PROP_CL || |
| cp0_prop == |
| LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || |
| cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) { |
| /* CL or CP at the end of the sequence */ |
| lb25_level = 3; |
| } else { |
| /* sequence broke */ |
| lb25_level = 0; |
| } |
| |
| last_non_cm_or_zwj_prop = cp0_prop; |
| } |
| |
| /* |
| * store the last observed non-SP-property for LB8, LB14, |
| * LB15, LB16 and LB17. LB8 gets its own unskipped property, |
| * whereas the others build on top of the CM-ZWJ-skipped |
| * properties as they come after LB9 |
| */ |
| if (cp0_prop != LINE_BREAK_PROP_SP) { |
| last_non_sp_prop = cp0_prop; |
| } |
| if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) { |
| last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop; |
| } |
| |
| /* apply the algorithm */ |
| |
| /* LB4 */ |
| if (cp0_prop == LINE_BREAK_PROP_BK) { |
| break; |
| } |
| |
| /* LB5 */ |
| if (cp0_prop == LINE_BREAK_PROP_CR && |
| cp1_prop == LINE_BREAK_PROP_LF) { |
| continue; |
| } |
| if (cp0_prop == LINE_BREAK_PROP_CR || |
| cp0_prop == LINE_BREAK_PROP_LF || |
| cp0_prop == LINE_BREAK_PROP_NL) { |
| break; |
| } |
| |
| /* LB6 */ |
| if (cp1_prop == LINE_BREAK_PROP_BK || |
| cp1_prop == LINE_BREAK_PROP_CR || |
| cp1_prop == LINE_BREAK_PROP_LF || |
| cp1_prop == LINE_BREAK_PROP_NL) { |
| continue; |
| } |
| |
| /* LB7 */ |
| if (cp1_prop == LINE_BREAK_PROP_SP || |
| cp1_prop == LINE_BREAK_PROP_ZW) { |
| continue; |
| } |
| |
| /* LB8 */ |
| if (last_non_sp_prop == LINE_BREAK_PROP_ZW) { |
| break; |
| } |
| |
| /* LB8a */ |
| if (cp0_prop == LINE_BREAK_PROP_ZWJ) { |
| continue; |
| } |
| |
| /* LB9 */ |
| if ((cp0_prop != LINE_BREAK_PROP_BK && |
| cp0_prop != LINE_BREAK_PROP_CR && |
| cp0_prop != LINE_BREAK_PROP_LF && |
| cp0_prop != LINE_BREAK_PROP_NL && |
| cp0_prop != LINE_BREAK_PROP_SP && |
| cp0_prop != LINE_BREAK_PROP_ZW) && |
| (cp1_prop == LINE_BREAK_PROP_CM || |
| cp1_prop == LINE_BREAK_PROP_ZWJ)) { |
| /* |
| * given we skip them, we don't break in such |
| * a sequence |
| */ |
| continue; |
| } |
| |
| /* LB10 is baked into the following rules */ |
| |
| /* LB11 */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ || |
| cp1_prop == LINE_BREAK_PROP_WJ) { |
| continue; |
| } |
| |
| /* LB12 */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) { |
| continue; |
| } |
| |
| /* LB12a */ |
| if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP && |
| last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA && |
| last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) && |
| cp1_prop == LINE_BREAK_PROP_GL) { |
| continue; |
| } |
| |
| /* LB13 (affected by tailoring for LB25, see example 7) */ |
| if (cp1_prop == LINE_BREAK_PROP_EX || |
| (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU && |
| (cp1_prop == LINE_BREAK_PROP_CL || |
| cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || |
| cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF || |
| cp1_prop == LINE_BREAK_PROP_IS || |
| cp1_prop == LINE_BREAK_PROP_SY))) { |
| continue; |
| } |
| |
| /* LB14 */ |
| if (last_non_sp_cm_or_zwj_prop == |
| LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || |
| last_non_sp_cm_or_zwj_prop == |
| LINE_BREAK_PROP_OP_WITH_EAW_HWF) { |
| continue; |
| } |
| |
| /* LB15 */ |
| if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU && |
| (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || |
| cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) { |
| continue; |
| } |
| |
| /* LB16 */ |
| if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL || |
| last_non_sp_cm_or_zwj_prop == |
| LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || |
| last_non_sp_cm_or_zwj_prop == |
| LINE_BREAK_PROP_CP_WITH_EAW_HWF) && |
| cp1_prop == LINE_BREAK_PROP_NS) { |
| continue; |
| } |
| |
| /* LB17 */ |
| if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 && |
| cp1_prop == LINE_BREAK_PROP_B2) { |
| continue; |
| } |
| |
| /* LB18 */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) { |
| break; |
| } |
| |
| /* LB19 */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU || |
| cp1_prop == LINE_BREAK_PROP_QU) { |
| continue; |
| } |
| |
| /* LB20 */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB || |
| cp1_prop == LINE_BREAK_PROP_CB) { |
| break; |
| } |
| |
| /* LB21 */ |
| if (cp1_prop == LINE_BREAK_PROP_BA || |
| cp1_prop == LINE_BREAK_PROP_HY || |
| cp1_prop == LINE_BREAK_PROP_NS || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) { |
| continue; |
| } |
| |
| /* LB21a */ |
| if (lb21a_flag && |
| (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) { |
| continue; |
| } |
| |
| /* LB21b */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY && |
| cp1_prop == LINE_BREAK_PROP_HL) { |
| continue; |
| } |
| |
| /* LB22 */ |
| if (cp1_prop == LINE_BREAK_PROP_IN) { |
| continue; |
| } |
| |
| /* LB23 */ |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && |
| cp1_prop == LINE_BREAK_PROP_NU) { |
| continue; |
| } |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU && |
| (cp1_prop == LINE_BREAK_PROP_AL || |
| cp1_prop == LINE_BREAK_PROP_HL)) { |
| continue; |
| } |
| |
| /* LB23a */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR && |
| (cp1_prop == LINE_BREAK_PROP_ID || |
| cp1_prop == LINE_BREAK_PROP_EB || |
| cp1_prop == LINE_BREAK_PROP_EM)) { |
| continue; |
| } |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) && |
| cp1_prop == LINE_BREAK_PROP_PO) { |
| continue; |
| } |
| |
| /* LB24 */ |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) && |
| (cp1_prop == LINE_BREAK_PROP_AL || |
| cp1_prop == LINE_BREAK_PROP_HL)) { |
| continue; |
| } |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && |
| (cp1_prop == LINE_BREAK_PROP_PR || |
| cp1_prop == LINE_BREAK_PROP_PO)) { |
| continue; |
| } |
| |
| /* LB25 (tailored with example 7) */ |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) { |
| if (cp1_prop == LINE_BREAK_PROP_NU) { |
| continue; |
| } |
| |
| /* this stupid rule is the reason why we cannot |
| * simply have a stateful break-detection between |
| * two adjacent codepoints as we have it with |
| * characters. |
| */ |
| herodotus_reader_copy(r, &tmp); |
| herodotus_read_codepoint(&tmp, true, &cp); |
| if (herodotus_read_codepoint(&tmp, true, &cp) == |
| HERODOTUS_STATUS_SUCCESS && |
| (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || |
| cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF || |
| cp1_prop == LINE_BREAK_PROP_HY)) { |
| if (get_break_prop(cp) == LINE_BREAK_PROP_NU) { |
| continue; |
| } |
| } |
| } |
| if ((last_non_cm_or_zwj_prop == |
| LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || |
| last_non_cm_or_zwj_prop == |
| LINE_BREAK_PROP_OP_WITH_EAW_HWF || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) && |
| cp1_prop == LINE_BREAK_PROP_NU) { |
| continue; |
| } |
| if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU || |
| cp1_prop == LINE_BREAK_PROP_SY || |
| cp1_prop == LINE_BREAK_PROP_IS)) { |
| continue; |
| } |
| if ((lb25_level == 1 || lb25_level == 2) && |
| (cp1_prop == LINE_BREAK_PROP_NU || |
| cp1_prop == LINE_BREAK_PROP_SY || |
| cp1_prop == LINE_BREAK_PROP_IS || |
| cp1_prop == LINE_BREAK_PROP_CL || |
| cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || |
| cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) { |
| continue; |
| } |
| if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) && |
| (cp1_prop == LINE_BREAK_PROP_PO || |
| cp1_prop == LINE_BREAK_PROP_PR)) { |
| continue; |
| } |
| |
| /* LB26 */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL && |
| (cp1_prop == LINE_BREAK_PROP_JL || |
| cp1_prop == LINE_BREAK_PROP_JV || |
| cp1_prop == LINE_BREAK_PROP_H2 || |
| cp1_prop == LINE_BREAK_PROP_H3)) { |
| continue; |
| } |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) && |
| (cp1_prop == LINE_BREAK_PROP_JV || |
| cp1_prop == LINE_BREAK_PROP_JT)) { |
| continue; |
| } |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) && |
| cp1_prop == LINE_BREAK_PROP_JT) { |
| continue; |
| } |
| |
| /* LB27 */ |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) && |
| cp1_prop == LINE_BREAK_PROP_PO) { |
| continue; |
| } |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR && |
| (cp1_prop == LINE_BREAK_PROP_JL || |
| cp1_prop == LINE_BREAK_PROP_JV || |
| cp1_prop == LINE_BREAK_PROP_JT || |
| cp1_prop == LINE_BREAK_PROP_H2 || |
| cp1_prop == LINE_BREAK_PROP_H3)) { |
| continue; |
| } |
| |
| /* LB28 */ |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && |
| (cp1_prop == LINE_BREAK_PROP_AL || |
| cp1_prop == LINE_BREAK_PROP_HL)) { |
| continue; |
| } |
| |
| /* LB29 */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS && |
| (cp1_prop == LINE_BREAK_PROP_AL || |
| cp1_prop == LINE_BREAK_PROP_HL)) { |
| continue; |
| } |
| |
| /* LB30 */ |
| if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL || |
| last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) && |
| cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) { |
| continue; |
| } |
| if (last_non_cm_or_zwj_prop == |
| LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF && |
| (cp1_prop == LINE_BREAK_PROP_AL || |
| cp1_prop == LINE_BREAK_PROP_HL || |
| cp1_prop == LINE_BREAK_PROP_NU)) { |
| continue; |
| } |
| |
| /* LB30a */ |
| if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI && |
| cp1_prop == LINE_BREAK_PROP_RI) { |
| continue; |
| } |
| |
| /* LB30b */ |
| if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB && |
| cp1_prop == LINE_BREAK_PROP_EM) { |
| continue; |
| } |
| if (last_non_cm_or_zwj_prop == |
| LINE_BREAK_PROP_BOTH_CN_EXTPICT && |
| cp1_prop == LINE_BREAK_PROP_EM) { |
| continue; |
| } |
| |
| /* LB31 */ |
| break; |
| } |
| |
| return herodotus_reader_number_read(r); |
| } |
| |
| size_t |
| grapheme_next_line_break(const uint_least32_t *str, size_t len) |
| { |
| HERODOTUS_READER r; |
| |
| herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); |
| |
| return next_line_break(&r); |
| } |
| |
| size_t |
| grapheme_next_line_break_utf8(const char *str, size_t len) |
| { |
| HERODOTUS_READER r; |
| |
| herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); |
| |
| return next_line_break(&r); |
| } |