blob: 4a0a05efdc81ec67709804da6ca5cff477a8918d [file]
/* See LICENSE file for copyright and license details. */
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include "../gen/character.h"
#include "../grapheme.h"
#include "util.h"
/*
 * Stateless "do not break" table: dont_break[cp0_prop] is a bitmask in
 * which bit number cp1_prop is set when no break may be placed between a
 * codepoint with property cp0_prop and a following codepoint with
 * property cp1_prop. Properties without an entry (e.g. CONTROL and LF)
 * have an all-zero mask.
 */
#define DONT_BREAK_BIT(prop) (UINT16_C(1) << (prop))
/* GB9 (Extend, ZWJ) and GB9a (SpacingMark) apply after most properties */
#define DONT_BREAK_GB9_GB9A \
	(DONT_BREAK_BIT(CHAR_BREAK_PROP_EXTEND) | \
	 DONT_BREAK_BIT(CHAR_BREAK_PROP_ZWJ) | \
	 DONT_BREAK_BIT(CHAR_BREAK_PROP_SPACINGMARK))
static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
	[CHAR_BREAK_PROP_OTHER] =
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_CR] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_LF),         /* GB3 */
	[CHAR_BREAK_PROP_EXTEND] =
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_L] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_L) |   /* GB6 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_V) |   /* GB6 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_LV) |  /* GB6 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_LVT) | /* GB6 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_V] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_V) |   /* GB7 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_T) |   /* GB7 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_T] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_T) |   /* GB8 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_LV] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_V) |   /* GB7 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_T) |   /* GB7 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_LVT] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_T) |   /* GB8 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_PREPEND] =
		DONT_BREAK_GB9_GB9A |
		/* GB9b: everything except CR, LF and Control */
		(UINT16_C(0xFFFF) &
		 ~(DONT_BREAK_BIT(CHAR_BREAK_PROP_CR) |
		   DONT_BREAK_BIT(CHAR_BREAK_PROP_LF) |
		   DONT_BREAK_BIT(CHAR_BREAK_PROP_CONTROL))),
	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_SPACINGMARK] =
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_ZWJ] =
		DONT_BREAK_GB9_GB9A,
};
#undef DONT_BREAK_GB9_GB9A
#undef DONT_BREAK_BIT
/*
 * Transition table for the GB11 flag. It is indexed with
 * cp0_prop + NUM_CHAR_BREAK_PROPS * <current flag value>, i.e. the first
 * half applies while the flag is clear and the second half while it is
 * set. The flag is set after the step iff the bit numbered cp1_prop is
 * set in the selected entry.
 */
static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
	/* flag clear */
	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
		(UINT16_C(1) << CHAR_BREAK_PROP_ZWJ) |
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTEND),

	/* flag set */
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
		(UINT16_C(1) << CHAR_BREAK_PROP_ZWJ) |
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTEND),
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_EXTEND] =
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTEND) |
		(UINT16_C(1) << CHAR_BREAK_PROP_ZWJ),
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_ZWJ] =
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
};
/*
 * GB11 "do not break" table, indexed with
 * cp0_prop + NUM_CHAR_BREAK_PROPS * <GB11 flag>. Only the flag-set half
 * has an entry: a ZWJ followed by an Extended_Pictographic codepoint.
 */
static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_ZWJ] =
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
};
/*
 * Transition table for the GB12/GB13 flag, indexed with
 * cp0_prop + NUM_CHAR_BREAK_PROPS * <current flag value>. The flag is set
 * after the step iff the bit numbered cp1_prop is set in the selected
 * entry; the only entry is in the flag-clear half, for a regional
 * indicator followed by another regional indicator.
 */
static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
	[0 * NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
		(UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
};
/*
 * GB12/GB13 "do not break" table, indexed with
 * cp0_prop + NUM_CHAR_BREAK_PROPS * <GB12/GB13 flag>. Only the flag-set
 * half has an entry: a regional indicator followed by a regional
 * indicator.
 */
static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
		(UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
};
/*
 * Return the character break property of the codepoint cp.
 *
 * The lookup is two-staged: char_break_major, indexed by cp >> 8, yields
 * an offset into char_break_minor to which the low 8 bits of cp are
 * added. Values above 0x10FFFF are reported as CHAR_BREAK_PROP_OTHER.
 */
static inline enum char_break_property
get_break_prop(uint_least32_t cp)
{
	if (!likely(cp <= 0x10FFFF)) {
		/* outside of the table range */
		return CHAR_BREAK_PROP_OTHER;
	}

	return (enum char_break_property)
	       char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)];
}
/*
 * Decide whether a grapheme cluster break lies between the two adjacent
 * codepoints cp0 and cp1, following the rules of UAX #29, see
 * https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
 *
 * state holds the context carried from one call to the next: the break
 * property of the previous right-hand codepoint (which replaces the
 * lookup of cp0 once it has been stored) and the GB11 and GB12/GB13
 * flags. It is updated on every call and its flags are cleared whenever
 * a break is found.
 *
 * When state is NULL a zeroed state local to this call is used, so the
 * result is the same as for a caller passing a freshly reset state
 * (e.g. a lone pair of regional indicators is not broken apart).
 *
 * Returns true if there is a break between cp0 and cp1, false otherwise.
 */
bool
grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
{
	GRAPHEME_STATE fresh = { 0 };
	enum char_break_property cp0_prop, cp1_prop;
	bool notbreak = false;

	if (!likely(state)) {
		/* no context supplied: evaluate against a reset state */
		state = &fresh;
	}

	if (likely(state->prop_set)) {
		/* reuse the property stored by the previous call */
		cp0_prop = state->prop;
	} else {
		cp0_prop = get_break_prop(cp0);
	}
	cp1_prop = get_break_prop(cp1);

	/* preserve prop of right codepoint for next iteration */
	state->prop = (uint_least8_t)cp1_prop;
	state->prop_set = true;

	/*
	 * update flags: each table is indexed by the left property and
	 * the current flag value, the new flag is the bit of the right
	 * property
	 */
	state->gb11_flag =
		flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
		                 state->gb11_flag] &
		(UINT16_C(1) << cp1_prop);
	state->gb12_13_flag =
		flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
		                    state->gb12_13_flag] &
		(UINT16_C(1) << cp1_prop);

	/*
	 * apply the stateless rules first, then the flag-dependent
	 * rules GB11 and GB12/GB13
	 */
	notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
	           (dont_break_gb11[cp0_prop + state->gb11_flag *
	                            NUM_CHAR_BREAK_PROPS] &
	            (UINT16_C(1) << cp1_prop)) ||
	           (dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
	                               NUM_CHAR_BREAK_PROPS] &
	            (UINT16_C(1) << cp1_prop));

	/* reset flags when we have a break */
	if (likely(!notbreak)) {
		state->gb11_flag = state->gb12_13_flag = false;
	}

	return !notbreak;
}
/*
 * Find the first grapheme cluster break in the codepoint array str of
 * length len.
 *
 * Returns the offset (in codepoints) of the first break, len if no break
 * was found between any two adjacent codepoints, or 0 if str is NULL or
 * len is 0.
 */
size_t
grapheme_next_character_break(const uint_least32_t *str, size_t len)
{
	GRAPHEME_STATE state = { 0 };
	size_t pos;

	if (str == NULL || len == 0) {
		return 0;
	}

	/* advance over adjacent pairs until one of them is a break */
	pos = 1;
	while (pos < len &&
	       !grapheme_is_character_break(str[pos - 1], str[pos], &state)) {
		pos++;
	}

	return pos;
}
/*
 * Find the first grapheme cluster break in the UTF-8 encoded string str.
 *
 * len is the length of str in bytes; when len is SIZE_MAX the scan
 * instead stops at the first codepoint that decodes to 0 (NUL).
 *
 * Returns the byte offset of the first break (or of the end of the
 * scanned input if there is none), or 0 if str is NULL or len is 0.
 */
size_t
grapheme_next_character_break_utf8(const char *str, size_t len)
{
	GRAPHEME_STATE state = { 0 };
	const bool nul_terminated = (len == SIZE_MAX);
	uint_least32_t prev = 0, cur = 0;
	size_t pos = 0, step;

	if (str == NULL || len == 0) {
		return 0;
	}

	while (nul_terminated || pos < len) {
		prev = cur;
		step = grapheme_decode_utf8(str + pos, nul_terminated ?
		                            SIZE_MAX : len - pos, &cur);

		if (!nul_terminated && step > (len - pos)) {
			/* string ended abruptly, simply accept cropping */
			step = len - pos;
		}

		if (nul_terminated && cur == 0) {
			/* we hit a NUL-byte and are done */
			break;
		}

		/*
		 * the very first codepoint has no left neighbour yet, so
		 * the break check only starts with the second one
		 */
		if (pos != 0 &&
		    grapheme_is_character_break(prev, cur, &state)) {
			break;
		}

		pos += step;
	}

	return pos;
}