/* blob: 291c7f49d6ebcf8b9b40374f467a45679a412f04 [file] [log] [blame] */
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include "../gen/sentence.h"
#include "../grapheme.h"
#include "util.h"
/*
* Progress counters for the left-hand-side patterns of the rules
* SB8 to SB11. Both are advanced by sentence_skip_shift_callback()
* and read by next_sentence_break(); a value of 0 means "not inside
* the pattern".
*/
struct sentence_break_state {
/* progress within "ATerm Close* Sp*", ranging from 0 to 3 */
uint_least8_t aterm_close_sp_level;
/* progress within "SATerm Close* Sp* ParaSep?", ranging from 0 to 4 */
uint_least8_t saterm_close_sp_parasep_level;
};
/*
 * Look up the sentence-break property of the codepoint cp.
 *
 * Codepoints up to 0x10FFFF are resolved via the two-stage table
 * (sentence_break_major is indexed by the upper bits of cp and yields
 * an offset into sentence_break_minor, to which the low byte of cp is
 * added). Anything larger is reported as SENTENCE_BREAK_PROP_OTHER.
 */
static inline uint_least8_t
get_sentence_break_prop(uint_least32_t cp)
{
	/* fallback for values that have no table entry */
	uint_least8_t result = SENTENCE_BREAK_PROP_OTHER;

	if (likely(cp <= UINT32_C(0x10FFFF))) {
		result = (uint_least8_t)
			sentence_break_minor[sentence_break_major[cp >> 8] +
			                     (cp & 0xff)];
	}

	return result;
}
/*
 * Tell whether a sentence-break property is one of the two properties
 * (Extend and Format) that are treated as skippable.
 *
 * Returns true for SENTENCE_BREAK_PROP_EXTEND and
 * SENTENCE_BREAK_PROP_FORMAT, false for every other value.
 */
static bool
is_skippable_sentence_prop(uint_least8_t prop)
{
	switch (prop) {
	case SENTENCE_BREAK_PROP_EXTEND:
	case SENTENCE_BREAK_PROP_FORMAT:
		return true;
	default:
		return false;
	}
}
/*
 * Advance the SB8-SB11 bookkeeping by one property.
 *
 * prop: the sentence-break property that has just been placed directly
 *       to the left of the middle spot.
 * s:    pointer to the struct sentence_break_state to update.
 *
 * This function is registered via proper_init() in
 * next_sentence_break(); presumably it is invoked once for every
 * non-skippable property that is shifted to the left of the middle
 * spot (confirm against the proper implementation).
 */
static void
sentence_skip_shift_callback(uint_least8_t prop, void *s)
{
	struct sentence_break_state *state = (struct sentence_break_state *)s;

	/*
	 * Here comes a bit of magic. The rules
	 * SB8, SB8a, SB9 and SB10 have very complicated
	 * left-hand-side-rules of the form
	 *
	 *  ATerm Close* Sp*
	 *  SATerm Close*
	 *  SATerm Close* Sp*
	 *  SATerm Close* Sp* ParaSep?
	 *
	 * but instead of backtracking, we keep the
	 * state as some kind of "power level" in
	 * two state-variables
	 *
	 *  aterm_close_sp_level
	 *  saterm_close_sp_parasep_level
	 *
	 * that go from 0 to 3/4:
	 *
	 *  0: we are not in the sequence
	 *  1: we have one ATerm/SATerm to the left of
	 *     the middle spot
	 *  2: we have one ATerm/SATerm and one or more
	 *     Close to the left of the middle spot
	 *  3: we have one ATerm/SATerm, zero or more
	 *     Close and one or more Sp to the left of
	 *     the middle spot.
	 *  4: we have one SATerm, zero or more Close,
	 *     zero or more Sp and one ParaSep to the
	 *     left of the middle spot.
	 *
	 * A terminator always (re)starts its sequence at level 1,
	 * regardless of the level reached before: in, for instance,
	 * "ATerm Sp ATerm" the second ATerm is itself the head of a
	 * fresh "ATerm Close* Sp*" match. Dropping to level 0 in that
	 * situation would hide the terminator from SB8 to SB11.
	 */
	if (prop == SENTENCE_BREAK_PROP_ATERM) {
		/* sequence has begun (or begins anew) */
		state->aterm_close_sp_level = 1;
	} else if ((state->aterm_close_sp_level == 1 ||
	            state->aterm_close_sp_level == 2) &&
	           prop == SENTENCE_BREAK_PROP_CLOSE) {
		/* close-sequence begins or continued */
		state->aterm_close_sp_level = 2;
	} else if ((state->aterm_close_sp_level == 1 ||
	            state->aterm_close_sp_level == 2 ||
	            state->aterm_close_sp_level == 3) &&
	           prop == SENTENCE_BREAK_PROP_SP) {
		/* sp-sequence begins or continued */
		state->aterm_close_sp_level = 3;
	} else {
		/* sequence broke */
		state->aterm_close_sp_level = 0;
	}

	if (prop == SENTENCE_BREAK_PROP_STERM ||
	    prop == SENTENCE_BREAK_PROP_ATERM) {
		/* sequence has begun (or begins anew) */
		state->saterm_close_sp_parasep_level = 1;
	} else if ((state->saterm_close_sp_parasep_level == 1 ||
	            state->saterm_close_sp_parasep_level == 2) &&
	           prop == SENTENCE_BREAK_PROP_CLOSE) {
		/* close-sequence begins or continued */
		state->saterm_close_sp_parasep_level = 2;
	} else if ((state->saterm_close_sp_parasep_level == 1 ||
	            state->saterm_close_sp_parasep_level == 2 ||
	            state->saterm_close_sp_parasep_level == 3) &&
	           prop == SENTENCE_BREAK_PROP_SP) {
		/* sp-sequence begins or continued */
		state->saterm_close_sp_parasep_level = 3;
	} else if ((state->saterm_close_sp_parasep_level == 1 ||
	            state->saterm_close_sp_parasep_level == 2 ||
	            state->saterm_close_sp_parasep_level == 3) &&
	           (prop == SENTENCE_BREAK_PROP_SEP ||
	            prop == SENTENCE_BREAK_PROP_CR ||
	            prop == SENTENCE_BREAK_PROP_LF)) {
		/* ParaSep at the end of the sequence */
		state->saterm_close_sp_parasep_level = 4;
	} else {
		/* sequence broke */
		state->saterm_close_sp_parasep_level = 0;
	}
}
/*
* Find the next sentence break in the input offered by the reader r.
*
* The reader is handed to proper_init() together with a zeroed
* sentence_break_state, the property lookup, the skippable-property
* predicate and the state callback. proper_advance() is then called
* repeatedly; for every position it offers, the rules below are
* checked in ascending order and the first one that matches decides:
* `continue` means "no break here, go on", `break` means "break here".
* The loop also ends as soon as proper_advance() returns non-zero.
*
* Returns the value of herodotus_reader_number_read() for the middle
* reader at that point (presumably the number of input units in front
* of the break -- confirm against the herodotus reader).
*/
static size_t
next_sentence_break(HERODOTUS_READER *r)
{
HERODOTUS_READER tmp;
enum sentence_break_property prop;
struct proper p;
struct sentence_break_state state = { 0 };
uint_least32_t cp;
/*
* Apply sentence breaking algorithm (UAX #29), see
* https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
*/
proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
get_sentence_break_prop, is_skippable_sentence_prop,
sentence_skip_shift_callback, &p);
while (!proper_advance(&p)) {
/* SB3: no break between CR and LF */
if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
continue;
}
/* SB4: break after a paragraph separator (Sep, CR or LF) */
if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
break;
}
/* SB5: no break in front of Extend or Format */
if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
continue;
}
/*
* From here on the rules look at p.skip instead of p.raw
* (presumably the same properties with the skippable ones,
* Extend and Format, left out).
*/
/* SB6: no break between ATerm and Numeric */
if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
continue;
}
/* SB7: no break between "(Upper | Lower) ATerm" and Upper */
if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
continue;
}
/* SB8: only relevant with "ATerm Close* Sp*" on the left */
if (state.aterm_close_sp_level == 1 ||
state.aterm_close_sp_level == 2 ||
state.aterm_close_sp_level == 3) {
/*
* This is the most complicated rule, requiring
* the right-hand-side to satisfy the regular expression
*
* ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )*
* Lower
*
* which we simply check "manually" given LUT-lookups
* are very cheap by starting at the mid_reader.
*
*/
herodotus_reader_copy(&(p.mid_reader), &tmp);
/*
* sentinel that differs from Lower in case not a
* single codepoint can be read from the copy
*/
prop = NUM_SENTENCE_BREAK_PROPS;
while (herodotus_read_codepoint(&tmp, true, &cp) ==
HERODOTUS_STATUS_SUCCESS) {
prop = get_sentence_break_prop(cp);
/*
* the skippable properties are ignored
* automatically here given they do not
* match the following condition
*/
if (prop == SENTENCE_BREAK_PROP_OLETTER ||
prop == SENTENCE_BREAK_PROP_UPPER ||
prop == SENTENCE_BREAK_PROP_LOWER ||
prop == SENTENCE_BREAK_PROP_SEP ||
prop == SENTENCE_BREAK_PROP_CR ||
prop == SENTENCE_BREAK_PROP_LF ||
prop == SENTENCE_BREAK_PROP_STERM ||
prop == SENTENCE_BREAK_PROP_ATERM) {
break;
}
}
/*
* no break if the scan was stopped by a Lower; any
* other stopper or running out of input falls through
* to the following rules
*/
if (prop == SENTENCE_BREAK_PROP_LOWER) {
continue;
}
}
/*
* SB8a: no break between "SATerm Close* Sp*" and
* SContinue, STerm or ATerm
*/
if ((state.saterm_close_sp_parasep_level == 1 ||
state.saterm_close_sp_parasep_level == 2 ||
state.saterm_close_sp_parasep_level == 3) &&
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
continue;
}
/*
* SB9: no break between "SATerm Close*" (levels 1 and 2
* only, i.e. no Sp seen yet) and Close, Sp or a paragraph
* separator
*/
if ((state.saterm_close_sp_parasep_level == 1 ||
state.saterm_close_sp_parasep_level == 2) &&
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/*
* SB10: no break between "SATerm Close* Sp*" and Sp or a
* paragraph separator
*/
if ((state.saterm_close_sp_parasep_level == 1 ||
state.saterm_close_sp_parasep_level == 2 ||
state.saterm_close_sp_parasep_level == 3) &&
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/*
* SB11: break after "SATerm Close* Sp* ParaSep?", i.e.
* whenever the second state variable is at any non-zero level
*/
if (state.saterm_close_sp_parasep_level == 1 ||
state.saterm_close_sp_parasep_level == 2 ||
state.saterm_close_sp_parasep_level == 3 ||
state.saterm_close_sp_parasep_level == 4) {
break;
}
/* SB998: no rule asked for a break, so do not break */
continue;
}
return herodotus_reader_number_read(&(p.mid_reader));
}
/*
 * Find the next sentence break in an array of codepoints.
 *
 * str: the codepoint array to examine.
 * len: passed on to herodotus_reader_init() as the length of str.
 *
 * Sets up a reader of type HERODOTUS_TYPE_CODEPOINT over the input and
 * returns the result of next_sentence_break() for it.
 */
size_t
grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, str, len);

	return next_sentence_break(&reader);
}
/*
 * Find the next sentence break in a UTF-8 encoded string.
 *
 * str: the character array to examine.
 * len: passed on to herodotus_reader_init() as the length of str.
 *
 * Sets up a reader of type HERODOTUS_TYPE_UTF8 over the input and
 * returns the result of next_sentence_break() for it.
 */
size_t
grapheme_next_sentence_break_utf8(const char *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, str, len);

	return next_sentence_break(&reader);
}