src/word.c - external/github.com/FRIGN/libgrapheme - Git at Google

 /* See LICENSE file for copyright and license details. */
 #include <stdbool.h>
 #include <stddef.h>

 #include "../gen/word.h"
 #include "../grapheme.h"
 #include "util.h"

 static inline enum word_break_property
 get_break_prop(uint_least32_t cp)
 {
 	if (likely(cp <= 0x10FFFF)) {
 		return (enum word_break_property)
 		       word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
 	} else {
 		return WORD_BREAK_PROP_OTHER;
 	}
 }

 /*
  * Determine the offset of the first word boundary in str.
  *
  * str is an opaque buffer of length len that is only ever accessed
  * through get_codepoint(str, len, offset, &cp). The callback stores
  * the codepoint found at offset in cp and its return value is used
  * as the number of units to advance past that codepoint. All offsets
  * (and the return value of this function) are therefore expressed in
  * whatever unit the callback uses for len and offset.
  *
  * Returns 0 if str is NULL or len is 0, len if the first codepoint
  * already covers the whole input, and otherwise the offset at which
  * the rule cascade below first decides to break (or the offset the
  * loop ended on when no rule demanded a break before the end).
  */
 static size_t
 next_word_break(const void *str, size_t len, size_t (*get_codepoint)
                 (const void *, size_t, size_t, uint_least32_t *))
 {
 	struct {
 		enum word_break_property a, b, c, d;
 	} raw, skip;
 	enum word_break_property res;
 	uint_least32_t cp;
 	size_t off, tmp, new_off;
 	bool ri_even = true;

 	/* check degenerate cases */
 	if (str == NULL || len == 0) {
 		return 0;
 	}

 	/*
 	 * Apply word breaking algorithm (UAX #29), see
 	 * https://unicode.org/reports/tr29/#Word_Boundary_Rules
 	 *
 	 * There are 4 slots (a, b, c, d) of "break" properties and
 	 * we check if there is a break in the middle between b and c.
 	 *
 	 * The position of this middle spot is determined by off,
 	 * which gives the offset of the first element on the right
 	 * hand side of said spot, or, in other words, gives the number
 	 * of elements on the left hand side.
 	 *
 	 * It is further complicated by the fact that the algorithm
 	 * expects you to skip certain characters for the second
 	 * half of the rules (after WB4). Thus, we do not only have
 	 * the "raw" properties as described above, but also the "skip"
 	 * properties, where the skip.a and skip.b, for instance,
 	 * give the two preceding character properties behind the
 	 * currently investigated breakpoint.
 	 *
 	 * NUM_WORD_BREAK_PROPS is used throughout as the placeholder
 	 * for "no codepoint in this slot".
 	 */

 	/*
 	 * Initialize the different properties such that we have
 	 * a good state after the state-update in the loop:
 	 * the first codepoint is parked in raw.c and the second one
 	 * in raw.d, so that the first shift inside the loop moves
 	 * them to raw.b and raw.c respectively.
 	 */
 	raw.b = NUM_WORD_BREAK_PROPS;
 	if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
 		/*
 		 * A word is at least one codepoint long, so we can
 		 * safely return here
 		 */
 		return len;
 	}
 	raw.c = get_break_prop(cp);
 	/* off < len holds here; only the codepoint value is needed */
 	(void)get_codepoint(str, len, off, &cp);
 	raw.d = get_break_prop(cp);
 	skip.a = skip.b = NUM_WORD_BREAK_PROPS;

 	/*
 	 * Each iteration examines the breakpoint at off. A "continue"
 	 * means "no break here, advance to new_off", a "break" means
 	 * "boundary found, return off".
 	 */
 	for (; off < len; off = new_off) {
 		/*
 		 * Update left side (a and b) of the skip state by
 		 * "shifting in" the raw.c property as long as it is
 		 * not one of the "ignored" character properties.
 		 * While at it, update the RI-counter.
 		 *
 		 * Note that raw.c has not been shifted yet at this
 		 * point, i.e. it still describes the codepoint directly
 		 * left of the breakpoint.
 		 */
 		if (raw.c != WORD_BREAK_PROP_EXTEND &&
 		    raw.c != WORD_BREAK_PROP_FORMAT &&
 		    raw.c != WORD_BREAK_PROP_ZWJ) {
 		    	skip.a = skip.b;
 			skip.b = raw.c;

 			if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
 				/*
 				 * The property we just shifted in is
 				 * a regional indicator, increasing the
 				 * number of consecutive RIs on the left
 				 * side of the breakpoint by one, changing
 				 * the oddness.
 				 *
 				 */
 				ri_even = !ri_even;
 			} else {
 				/*
 				 * We saw no regional indicator, so the
 				 * number of consecutive RIs on the left
 				 * side of the breakpoint is zero, which
 				 * is an even number.
 				 *
 				 */
 				ri_even = true;
 			}
 		}

 		/*
 		 * Update right side (c and d) of the skip state by
 		 * starting at the breakpoint and detecting the two
 		 * following non-ignored character classes. Slots for
 		 * which no such codepoint exists before the end of the
 		 * input keep the NUM_WORD_BREAK_PROPS placeholder.
 		 *
 		 */
 		skip.c = NUM_WORD_BREAK_PROPS;
 		for (tmp = off; tmp < len; ) {
 			tmp += get_codepoint(str, len, tmp, &cp);
 			res = get_break_prop(cp);

 			if (res != WORD_BREAK_PROP_EXTEND &&
 			    res != WORD_BREAK_PROP_FORMAT &&
 			    res != WORD_BREAK_PROP_ZWJ) {
 				skip.c = res;
 				break;
 			}
 		}
 		/* continue scanning right behind the codepoint used for skip.c */
 		skip.d = NUM_WORD_BREAK_PROPS;
 		for (; tmp < len; ) {
 			tmp += get_codepoint(str, len, tmp, &cp);
 			res = get_break_prop(cp);

 			if (res != WORD_BREAK_PROP_EXTEND &&
 			    res != WORD_BREAK_PROP_FORMAT &&
 			    res != WORD_BREAK_PROP_ZWJ) {
 				skip.d = res;
 				break;
 			}
 		}

 		/*
 		 * Update the raw state by simply shifting everything
 		 * in and, if we still have data left, determining
 		 * the character class of the next codepoint.
 		 *
 		 * raw.a is maintained for symmetry; none of the rules
 		 * below reads it.
 		 */
 		raw.a = raw.b;
 		raw.b = raw.c;
 		raw.c = raw.d;
 		if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
 			/* return value not needed, we only peek at the codepoint */
 			get_codepoint(str, len, new_off, &cp);
 			raw.d = get_break_prop(cp);
 		} else {
 			raw.d = NUM_WORD_BREAK_PROPS;
 		}

 		/* WB3: CR x LF */
 		if (raw.b == WORD_BREAK_PROP_CR &&
 		    raw.c == WORD_BREAK_PROP_LF) {
 			continue;
 		}

 		/* WB3a: always break after Newline, CR and LF */
 		if (raw.b == WORD_BREAK_PROP_NEWLINE ||
 		    raw.b == WORD_BREAK_PROP_CR      ||
 		    raw.b == WORD_BREAK_PROP_LF) {
 			break;
 		}

 		/* WB3b: always break before Newline, CR and LF */
 		if (raw.c == WORD_BREAK_PROP_NEWLINE ||
 		    raw.c == WORD_BREAK_PROP_CR      ||
 		    raw.c == WORD_BREAK_PROP_LF) {
 			break;
 		}

 		/* WB3c: ZWJ x Extended_Pictographic */
 		if (raw.b == WORD_BREAK_PROP_ZWJ &&
 		    (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
 		     raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
 			continue;
 		}

 		/* WB3d: WSegSpace x WSegSpace */
 		if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
 		    raw.c == WORD_BREAK_PROP_WSEGSPACE) {
 			continue;
 		}

 		/*
 		 * WB4: never break before Extend, Format or ZWJ.
 		 * All rules from here on work on the skip state.
 		 */
 		if (raw.c == WORD_BREAK_PROP_EXTEND ||
 		    raw.c == WORD_BREAK_PROP_FORMAT ||
 		    raw.c == WORD_BREAK_PROP_ZWJ) {
 			continue;
 		}

 		/* WB5: AHLetter x AHLetter */
 		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
 		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
 		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
 		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
 			continue;
 		}

 		/* WB6: AHLetter x (MidLetter | MidNumLetQ) AHLetter */
 		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
 		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
 		    (skip.c == WORD_BREAK_PROP_MIDLETTER    ||
 		     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
 		     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
 		    len > 2 &&
 		    (skip.d == WORD_BREAK_PROP_ALETTER              ||
 		     skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
 			continue;
 		}

 		/* WB7: AHLetter (MidLetter | MidNumLetQ) x AHLetter */
 		if ((skip.b == WORD_BREAK_PROP_MIDLETTER    ||
 		     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
 		     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
 		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
 		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
 		    len > 2 &&
 		    (skip.a == WORD_BREAK_PROP_ALETTER              ||
 		     skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
 			continue;
 		}

 		/* WB7a: Hebrew_Letter x Single_Quote */
 		if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
 		    skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
 			continue;
 		}

 		/* WB7b: Hebrew_Letter x Double_Quote Hebrew_Letter */
 		if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
 		    skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
 		    len > 2 &&
 		    skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
 			continue;
 		}

 		/* WB7c: Hebrew_Letter Double_Quote x Hebrew_Letter */
 		if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
 		    skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
 		    off > 1 &&
 		    skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
 			continue;
 		}

 		/* WB8: Numeric x Numeric */
 		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
 		    skip.c == WORD_BREAK_PROP_NUMERIC) {
 			continue;
 		}

 		/* WB9: AHLetter x Numeric */
 		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
 		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
 		    skip.c == WORD_BREAK_PROP_NUMERIC) {
 			continue;
 		}

 		/* WB10: Numeric x AHLetter */
 		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
 		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
 		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
 			continue;
 		}

 		/* WB11: Numeric (MidNum | MidNumLetQ) x Numeric */
 		if ((skip.b == WORD_BREAK_PROP_MIDNUM       ||
 		     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
 		     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
 		    skip.c == WORD_BREAK_PROP_NUMERIC &&
 		    off > 1 &&
 		    skip.a == WORD_BREAK_PROP_NUMERIC) {
 			continue;
 		}

 		/* WB12: Numeric x (MidNum | MidNumLetQ) Numeric */
 		if (skip.b == WORD_BREAK_PROP_NUMERIC &&
 		    (skip.c == WORD_BREAK_PROP_MIDNUM       ||
 		     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
 		     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
 		    len > 2 &&
 		    skip.d == WORD_BREAK_PROP_NUMERIC) {
 			continue;
 		}

 		/* WB13: Katakana x Katakana */
 		if (skip.b == WORD_BREAK_PROP_KATAKANA &&
 		    skip.c == WORD_BREAK_PROP_KATAKANA) {
 			continue;
 		}

 		/* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet */
 		if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
 		     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.b == WORD_BREAK_PROP_HEBREW_LETTER        ||
 		     skip.b == WORD_BREAK_PROP_NUMERIC              ||
 		     skip.b == WORD_BREAK_PROP_KATAKANA             ||
 		     skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
 		    skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
 			continue;
 		}

 		/* WB13b: ExtendNumLet x (AHLetter | Numeric | Katakana) */
 		if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
 		    (skip.c == WORD_BREAK_PROP_ALETTER              ||
 		     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
 		     skip.c == WORD_BREAK_PROP_HEBREW_LETTER        ||
 		     skip.c == WORD_BREAK_PROP_NUMERIC              ||
 		     skip.c == WORD_BREAK_PROP_KATAKANA)) {
 			continue;
 		}

 		/*
 		 * WB15 and WB16: do not break between two regional
 		 * indicators when an odd number of them precedes the
 		 * breakpoint
 		 */
 		if (!ri_even &&
 		    skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
 			continue;
 		}

 		/* WB999: otherwise, break everywhere */
 		break;
 	}

 	return off;
 }

 /*
  * Word boundary search on an array of codepoints: forwards str and
  * len unchanged to next_word_break() with get_codepoint as the
  * accessor and hands back its result.
  */
 size_t
 grapheme_next_word_break(const uint_least32_t *str, size_t len)
 {
 	size_t brk;

 	brk = next_word_break(str, len, get_codepoint);

 	return brk;
 }

 /*
  * Word boundary search on a char buffer: forwards str and len
  * unchanged to next_word_break() with get_codepoint_utf8 as the
  * accessor and hands back its result.
  */
 size_t
 grapheme_next_word_break_utf8(const char *str, size_t len)
 {
 	size_t brk;

 	brk = next_word_break(str, len, get_codepoint_utf8);

 	return brk;
 }
	/* See LICENSE file for copyright and license details. */
	#include <stdbool.h>
	#include <stddef.h>

	#include "../gen/word.h"
	#include "../grapheme.h"
	#include "util.h"

	/*
	 * Return the word break property of cp. Codepoints up to 0x10FFFF
	 * are resolved via word_break_major (indexed by cp >> 8) and
	 * word_break_minor (major entry plus the low 8 bits of cp); larger
	 * values yield WORD_BREAK_PROP_OTHER.
	 */
	static inline enum word_break_property
	get_break_prop(uint_least32_t cp)
	{
		return likely(cp <= 0x10FFFF) ?
		       (enum word_break_property)
		       word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)] :
		       WORD_BREAK_PROP_OTHER;
	}

	static size_t
	next_word_break(const void str, size_t len, size_t (get_codepoint)
	(const void , size_t, size_t, uint_least32_t ))
	{
	struct {
	enum word_break_property a, b, c, d;
	} raw, skip;
	enum word_break_property res;
	uint_least32_t cp;
	size_t off, tmp, new_off;
	bool ri_even = true;

	/* check degenerate cases */
	if (str == NULL \|\| len == 0) {
	return 0;
	}

	/*
	* Apply word breaking algorithm (UAX #29), see
	* https://unicode.org/reports/tr29/#Word_Boundary_Rules
	*
	* There are 4 slots (a, b, c, d) of "break" properties and
	* we check if there is a break in the middle between b and c.
	*
	* The position of this middle spot is determined by off,
	* which gives the offset of the first element on the right
	* hand side of said spot, or, in other words, gives the number
	* of elements on the left hand side.
	*
	* It is further complicated by the fact that the algorithm
	* expects you to skip certain characters for the second
	* half of the rules (after WB4). Thus, we do not only have
	* the "raw" properties as described above, but also the "skip"
	* properties, where the skip.a and skip.b, for instance,
	* give the two preceding character properties behind the
	* currently investigated breakpoint.
	*
	*/

	/*
	* Initialize the different properties such that we have
	* a good state after the state-update in the loop
	*/
	raw.b = NUM_WORD_BREAK_PROPS;
	if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
	/*
	* A line is at least one codepoint long, so we can
	* safely return here
	*/
	return len;
	}
	raw.c = get_break_prop(cp);
	(void)get_codepoint(str, len, off, &cp);
	raw.d = get_break_prop(cp);
	skip.a = skip.b = NUM_WORD_BREAK_PROPS;

	for (; off < len; off = new_off) {
	/*
	* Update left side (a and b) of the skip state by
	* "shifting in" the raw.c property as long as it is
	* not one of the "ignored" character properties.
	* While at it, update the RI-counter.
	*
	*/
	if (raw.c != WORD_BREAK_PROP_EXTEND &&
	raw.c != WORD_BREAK_PROP_FORMAT &&
	raw.c != WORD_BREAK_PROP_ZWJ) {
	skip.a = skip.b;
	skip.b = raw.c;

	if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
	/*
	* The property we just shifted in is
	* a regional indicator, increasing the
	* number of consecutive RIs on the left
	* side of the breakpoint by one, changing
	* the oddness.
	*
	*/
	ri_even = !ri_even;
	} else {
	/*
	* We saw no regional indicator, so the
	* number of consecutive RIs on the left
	* side of the breakpoint is zero, which
	* is an even number.
	*
	*/
	ri_even = true;
	}
	}

	/*
	* Update right side (b and c) of the skip state by
	* starting at the breakpoint and detecting the two
	* following non-ignored character classes
	*
	*/
	skip.c = NUM_WORD_BREAK_PROPS;
	for (tmp = off; tmp < len; ) {
	tmp += get_codepoint(str, len, tmp, &cp);
	res = get_break_prop(cp);

	if (res != WORD_BREAK_PROP_EXTEND &&
	res != WORD_BREAK_PROP_FORMAT &&
	res != WORD_BREAK_PROP_ZWJ) {
	skip.c = res;
	break;
	}
	}
	skip.d = NUM_WORD_BREAK_PROPS;
	for (; tmp < len; ) {
	tmp += get_codepoint(str, len, tmp, &cp);
	res = get_break_prop(cp);

	if (res != WORD_BREAK_PROP_EXTEND &&
	res != WORD_BREAK_PROP_FORMAT &&
	res != WORD_BREAK_PROP_ZWJ) {
	skip.d = res;
	break;
	}
	}

	/*
	* Update the raw state by simply shifting everything
	* in and, if we still have data left, determining
	* the character class of the next codepoint.
	*
	*/
	raw.a = raw.b;
	raw.b = raw.c;
	raw.c = raw.d;
	if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
	get_codepoint(str, len, new_off, &cp);
	raw.d = get_break_prop(cp);
	} else {
	raw.d = NUM_WORD_BREAK_PROPS;
	}

	/* WB3 */
	if (raw.b == WORD_BREAK_PROP_CR &&
	raw.c == WORD_BREAK_PROP_LF) {
	continue;
	}

	/* WB3a */
	if (raw.b == WORD_BREAK_PROP_NEWLINE \|\|
	raw.b == WORD_BREAK_PROP_CR \|\|
	raw.b == WORD_BREAK_PROP_LF) {
	break;
	}

	/* WB3b */
	if (raw.c == WORD_BREAK_PROP_NEWLINE \|\|
	raw.c == WORD_BREAK_PROP_CR \|\|
	raw.c == WORD_BREAK_PROP_LF) {
	break;
	}

	/* WB3c */
	if (raw.b == WORD_BREAK_PROP_ZWJ &&
	(raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC \|\|
	raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
	continue;
	}

	/* WB3d */
	if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
	raw.c == WORD_BREAK_PROP_WSEGSPACE) {
	continue;
	}

	/* WB4 */
	if (raw.c == WORD_BREAK_PROP_EXTEND \|\|
	raw.c == WORD_BREAK_PROP_FORMAT \|\|
	raw.c == WORD_BREAK_PROP_ZWJ) {
	continue;
	}

	/* WB5 */
	if ((skip.b == WORD_BREAK_PROP_ALETTER \|\|
	skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
	(skip.c == WORD_BREAK_PROP_ALETTER \|\|
	skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
	continue;
	}

	/* WB6 */
	if ((skip.b == WORD_BREAK_PROP_ALETTER \|\|
	skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
	(skip.c == WORD_BREAK_PROP_MIDLETTER \|\|
	skip.c == WORD_BREAK_PROP_MIDNUMLET \|\|
	skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
	len > 2 &&
	(skip.d == WORD_BREAK_PROP_ALETTER \|\|
	skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
	continue;
	}

	/* WB7 */
	if ((skip.b == WORD_BREAK_PROP_MIDLETTER \|\|
	skip.b == WORD_BREAK_PROP_MIDNUMLET \|\|
	skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
	(skip.c == WORD_BREAK_PROP_ALETTER \|\|
	skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
	len > 2 &&
	(skip.a == WORD_BREAK_PROP_ALETTER \|\|
	skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
	continue;
	}

	/* WB7a */
	if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
	skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
	continue;
	}

	/* WB7b */
	if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
	skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
	len > 2 &&
	skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
	continue;
	}

	/* WB7c */
	if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
	skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
	off > 1 &&
	skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
	continue;
	}

	/* WB8 */
	if (skip.b == WORD_BREAK_PROP_NUMERIC &&
	skip.c == WORD_BREAK_PROP_NUMERIC) {
	continue;
	}

	/* WB9 */
	if ((skip.b == WORD_BREAK_PROP_ALETTER \|\|
	skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
	skip.c == WORD_BREAK_PROP_NUMERIC) {
	continue;
	}

	/* WB10 */
	if (skip.b == WORD_BREAK_PROP_NUMERIC &&
	(skip.c == WORD_BREAK_PROP_ALETTER \|\|
	skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
	continue;
	}

	/* WB11 */
	if ((skip.b == WORD_BREAK_PROP_MIDNUM \|\|
	skip.b == WORD_BREAK_PROP_MIDNUMLET \|\|
	skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
	skip.c == WORD_BREAK_PROP_NUMERIC &&
	off > 1 &&
	skip.a == WORD_BREAK_PROP_NUMERIC) {
	continue;
	}

	/* WB12 */
	if (skip.b == WORD_BREAK_PROP_NUMERIC &&
	(skip.c == WORD_BREAK_PROP_MIDNUM \|\|
	skip.c == WORD_BREAK_PROP_MIDNUMLET \|\|
	skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
	len > 2 &&
	skip.d == WORD_BREAK_PROP_NUMERIC) {
	continue;
	}

	/* WB13 */
	if (skip.b == WORD_BREAK_PROP_KATAKANA &&
	skip.c == WORD_BREAK_PROP_KATAKANA) {
	continue;
	}

	/* WB13a */
	if ((skip.b == WORD_BREAK_PROP_ALETTER \|\|
	skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.b == WORD_BREAK_PROP_HEBREW_LETTER \|\|
	skip.b == WORD_BREAK_PROP_NUMERIC \|\|
	skip.b == WORD_BREAK_PROP_KATAKANA \|\|
	skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
	skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
	continue;
	}

	/* WB13b */
	if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
	(skip.c == WORD_BREAK_PROP_ALETTER \|\|
	skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT \|\|
	skip.c == WORD_BREAK_PROP_HEBREW_LETTER \|\|
	skip.c == WORD_BREAK_PROP_NUMERIC \|\|
	skip.c == WORD_BREAK_PROP_KATAKANA)) {
	continue;
	}

	/* WB15 and WB16 */
	if (!ri_even &&
	skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
	continue;
	}

	/* WB999 */
	break;
	}

	return off;
	}

	/*
	 * Word boundary search on an array of codepoints: passes str and
	 * len on to next_word_break() together with the get_codepoint
	 * accessor and returns what it reports.
	 */
	size_t
	grapheme_next_word_break(const uint_least32_t *str, size_t len)
	{
		size_t brk;

		brk = next_word_break(str, len, get_codepoint);

		return brk;
	}

	/*
	 * Word boundary search on a char buffer: passes str and len on to
	 * next_word_break() together with the get_codepoint_utf8 accessor
	 * and returns what it reports.
	 */
	size_t
	grapheme_next_word_break_utf8(const char *str, size_t len)
	{
		size_t brk;

		brk = next_word_break(str, len, get_codepoint_utf8);

		return brk;
	}