| /* See LICENSE file for copyright and license details. */ |
| #include <stdbool.h> |
| #include <stddef.h> |
| |
| #include "../gen/sentence.h" |
| #include "../grapheme.h" |
| #include "util.h" |
| |
/*
 * Bookkeeping for the left-hand sides of the sentence-break rules
 * SB8 to SB11. Each member is a small "level" counter that records
 * how far into its pattern the already-consumed input has progressed;
 * both are maintained by sentence_skip_shift_callback().
 */
struct sentence_break_state {
	/* progress within "ATerm Close* Sp*": 0 (no match) to 3 */
	uint_least8_t aterm_close_sp_level;

	/* progress within "SATerm Close* Sp* ParaSep?": 0 (no match) to 4 */
	uint_least8_t saterm_close_sp_parasep_level;
};
| |
| static inline uint_least8_t |
| get_sentence_break_prop(uint_least32_t cp) |
| { |
| if (likely(cp <= UINT32_C(0x10FFFF))) { |
| return (uint_least8_t) |
| sentence_break_minor[sentence_break_major[cp >> 8] + |
| (cp & 0xff)]; |
| } else { |
| return SENTENCE_BREAK_PROP_OTHER; |
| } |
| } |
| |
| static bool |
| is_skippable_sentence_prop(uint_least8_t prop) |
| { |
| return prop == SENTENCE_BREAK_PROP_EXTEND || |
| prop == SENTENCE_BREAK_PROP_FORMAT; |
| } |
| |
| static void |
| sentence_skip_shift_callback(uint_least8_t prop, void *s) |
| { |
| struct sentence_break_state *state = (struct sentence_break_state *)s; |
| |
| /* |
| * Here comes a bit of magic. The rules |
| * SB8, SB8a, SB9 and SB10 have very complicated |
| * left-hand-side-rules of the form |
| * |
| * ATerm Close* Sp* |
| * SATerm Close* |
| * SATerm Close* Sp* |
| * SATerm Close* Sp* ParaSep? |
| * |
| * but instead of backtracking, we keep the |
| * state as some kind of "power level" in |
| * two state-variables |
| * |
| * aterm_close_sp_level |
| * saterm_close_sp_parasep_level |
| * |
| * that go from 0 to 3/4: |
| * |
| * 0: we are not in the sequence |
| * 1: we have one ATerm/SATerm to the left of |
| * the middle spot |
| * 2: we have one ATerm/SATerm and one or more |
| * Close to the left of the middle spot |
| * 3: we have one ATerm/SATerm, zero or more |
| * Close and one or more Sp to the left of |
| * the middle spot. |
| * 4: we have one SATerm, zero or more Close, |
| * zero or more Sp and one ParaSep to the |
| * left of the middle spot. |
| * |
| */ |
| if ((state->aterm_close_sp_level == 0 || |
| state->aterm_close_sp_level == 1) && |
| prop == SENTENCE_BREAK_PROP_ATERM) { |
| /* sequence has begun */ |
| state->aterm_close_sp_level = 1; |
| } else if ((state->aterm_close_sp_level == 1 || |
| state->aterm_close_sp_level == 2) && |
| prop == SENTENCE_BREAK_PROP_CLOSE) { |
| /* close-sequence begins or continued */ |
| state->aterm_close_sp_level = 2; |
| } else if ((state->aterm_close_sp_level == 1 || |
| state->aterm_close_sp_level == 2 || |
| state->aterm_close_sp_level == 3) && |
| prop == SENTENCE_BREAK_PROP_SP) { |
| /* sp-sequence begins or continued */ |
| state->aterm_close_sp_level = 3; |
| } else { |
| /* sequence broke */ |
| state->aterm_close_sp_level = 0; |
| } |
| |
| if ((state->saterm_close_sp_parasep_level == 0 || |
| state->saterm_close_sp_parasep_level == 1) && |
| (prop == SENTENCE_BREAK_PROP_STERM || |
| prop == SENTENCE_BREAK_PROP_ATERM)) { |
| /* sequence has begun */ |
| state->saterm_close_sp_parasep_level = 1; |
| } else if ((state->saterm_close_sp_parasep_level == 1 || |
| state->saterm_close_sp_parasep_level == 2) && |
| prop == SENTENCE_BREAK_PROP_CLOSE) { |
| /* close-sequence begins or continued */ |
| state->saterm_close_sp_parasep_level = 2; |
| } else if ((state->saterm_close_sp_parasep_level == 1 || |
| state->saterm_close_sp_parasep_level == 2 || |
| state->saterm_close_sp_parasep_level == 3) && |
| prop == SENTENCE_BREAK_PROP_SP) { |
| /* sp-sequence begins or continued */ |
| state->saterm_close_sp_parasep_level = 3; |
| } else if ((state->saterm_close_sp_parasep_level == 1 || |
| state->saterm_close_sp_parasep_level == 2 || |
| state->saterm_close_sp_parasep_level == 3) && |
| (prop == SENTENCE_BREAK_PROP_SEP || |
| prop == SENTENCE_BREAK_PROP_CR || |
| prop == SENTENCE_BREAK_PROP_LF)) { |
| /* ParaSep at the end of the sequence */ |
| state->saterm_close_sp_parasep_level = 4; |
| } else { |
| /* sequence broke */ |
| state->saterm_close_sp_parasep_level = 0; |
| } |
| } |
| |
| static size_t |
| next_sentence_break(HERODOTUS_READER *r) |
| { |
| HERODOTUS_READER tmp; |
| enum sentence_break_property prop; |
| struct proper p; |
| struct sentence_break_state state = { 0 }; |
| uint_least32_t cp; |
| |
| /* |
| * Apply sentence breaking algorithm (UAX #29), see |
| * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules |
| */ |
| proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS, |
| get_sentence_break_prop, is_skippable_sentence_prop, |
| sentence_skip_shift_callback, &p); |
| |
| while (!proper_advance(&p)) { |
| /* SB3 */ |
| if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR && |
| p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) { |
| continue; |
| } |
| |
| /* SB4 */ |
| if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP || |
| p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR || |
| p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) { |
| break; |
| } |
| |
| /* SB5 */ |
| if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND || |
| p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) { |
| continue; |
| } |
| |
| /* SB6 */ |
| if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) { |
| continue; |
| } |
| |
| /* SB7 */ |
| if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER || |
| p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) && |
| p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) { |
| continue; |
| } |
| |
| /* SB8 */ |
| if (state.aterm_close_sp_level == 1 || |
| state.aterm_close_sp_level == 2 || |
| state.aterm_close_sp_level == 3) { |
| /* |
| * This is the most complicated rule, requiring |
| * the right-hand-side to satisfy the regular expression |
| * |
| * ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* |
| * Lower |
| * |
| * which we simply check "manually" given LUT-lookups |
| * are very cheap by starting at the mid_reader. |
| * |
| */ |
| herodotus_reader_copy(&(p.mid_reader), &tmp); |
| |
| prop = NUM_SENTENCE_BREAK_PROPS; |
| while (herodotus_read_codepoint(&tmp, true, &cp) == |
| HERODOTUS_STATUS_SUCCESS) { |
| prop = get_sentence_break_prop(cp); |
| |
| /* |
| * the skippable properties are ignored |
| * automatically here given they do not |
| * match the following condition |
| */ |
| if (prop == SENTENCE_BREAK_PROP_OLETTER || |
| prop == SENTENCE_BREAK_PROP_UPPER || |
| prop == SENTENCE_BREAK_PROP_LOWER || |
| prop == SENTENCE_BREAK_PROP_SEP || |
| prop == SENTENCE_BREAK_PROP_CR || |
| prop == SENTENCE_BREAK_PROP_LF || |
| prop == SENTENCE_BREAK_PROP_STERM || |
| prop == SENTENCE_BREAK_PROP_ATERM) { |
| break; |
| } |
| } |
| |
| if (prop == SENTENCE_BREAK_PROP_LOWER) { |
| continue; |
| } |
| } |
| |
| /* SB8a */ |
| if ((state.saterm_close_sp_parasep_level == 1 || |
| state.saterm_close_sp_parasep_level == 2 || |
| state.saterm_close_sp_parasep_level == 3) && |
| (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) { |
| continue; |
| } |
| |
| /* SB9 */ |
| if ((state.saterm_close_sp_parasep_level == 1 || |
| state.saterm_close_sp_parasep_level == 2) && |
| (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { |
| continue; |
| } |
| |
| /* SB10 */ |
| if ((state.saterm_close_sp_parasep_level == 1 || |
| state.saterm_close_sp_parasep_level == 2 || |
| state.saterm_close_sp_parasep_level == 3) && |
| (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || |
| p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { |
| continue; |
| } |
| |
| /* SB11 */ |
| if (state.saterm_close_sp_parasep_level == 1 || |
| state.saterm_close_sp_parasep_level == 2 || |
| state.saterm_close_sp_parasep_level == 3 || |
| state.saterm_close_sp_parasep_level == 4) { |
| break; |
| } |
| |
| /* SB998 */ |
| continue; |
| } |
| |
| return herodotus_reader_number_read(&(p.mid_reader)); |
| } |
| |
| size_t |
| grapheme_next_sentence_break(const uint_least32_t *str, size_t len) |
| { |
| HERODOTUS_READER r; |
| |
| herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); |
| |
| return next_sentence_break(&r); |
| } |
| |
| size_t |
| grapheme_next_sentence_break_utf8(const char *str, size_t len) |
| { |
| HERODOTUS_READER r; |
| |
| herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); |
| |
| return next_sentence_break(&r); |
| } |