blob: f7116ef8aeddba195d14ffa4716a153c02418ec1 [file] [log] [blame]
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include "../gen/bidirectional.h"
#include "../grapheme.h"
#include "util.h"
#define MAX_DEPTH 125
enum state_type {
STATE_PROP, /* in 0..23, bidi_property */
STATE_PRESERVED_PROP, /* in 0..23, preserved bidi_prop for L1-rule */
STATE_BRACKET_OFF, /* in 0..255, offset in bidi_bracket */
STATE_LEVEL, /* in 0..MAX_DEPTH+1=126, embedding level */
STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */
STATE_VISITED, /* in 0..1, visited within isolating run */
};
static struct {
uint_least32_t filter_mask;
size_t mask_shift;
int_least16_t value_offset;
} state_lut[] = {
[STATE_PROP] = {
.filter_mask = 0x000001F, /* 00000000 00000000 00000000 00011111 */
.mask_shift = 0,
.value_offset = 0,
},
[STATE_PRESERVED_PROP] = {
.filter_mask = 0x00003E0, /* 00000000 00000000 00000011 11100000 */
.mask_shift = 5,
.value_offset = 0,
},
[STATE_BRACKET_OFF] = {
.filter_mask = 0x003FC00, /* 00000000 00000011 11111100 00000000 */
.mask_shift = 10,
.value_offset = 0,
},
[STATE_LEVEL] = {
.filter_mask = 0x1FC0000, /* 00000001 11111100 00000000 00000000 */
.mask_shift = 18,
.value_offset = -1,
},
[STATE_PARAGRAPH_LEVEL] = {
.filter_mask = 0x2000000, /* 00000010 00000000 00000000 00000000 */
.mask_shift = 25,
.value_offset = 0,
},
[STATE_VISITED] = {
.filter_mask = 0x4000000, /* 00000100 00000000 00000000 00000000 */
.mask_shift = 26,
.value_offset = 0,
},
};
static inline int_least16_t
get_state(enum state_type t, uint_least32_t input)
{
return (int_least16_t)((input & state_lut[t].filter_mask) >>
state_lut[t].mask_shift) +
state_lut[t].value_offset;
}
static inline void
set_state(enum state_type t, int_least16_t value, uint_least32_t *output)
{
*output &= ~state_lut[t].filter_mask;
*output |= ((uint_least32_t)(value - state_lut[t].value_offset)
<< state_lut[t].mask_shift) &
state_lut[t].filter_mask;
}
struct isolate_runner {
uint_least32_t *buf;
size_t buflen;
size_t start;
struct {
size_t off;
} prev, cur, next;
enum bidi_property sos, eos;
uint_least8_t paragraph_level;
int_least8_t isolating_run_level;
};
static inline enum bidi_property
ir_get_previous_prop(const struct isolate_runner *ir)
{
return (ir->prev.off == SIZE_MAX) ?
ir->sos :
(uint_least8_t)get_state(STATE_PROP,
ir->buf[ir->prev.off]);
}
static inline enum bidi_property
ir_get_current_prop(const struct isolate_runner *ir)
{
return (uint_least8_t)get_state(STATE_PROP, ir->buf[ir->cur.off]);
}
static inline enum bidi_property
ir_get_next_prop(const struct isolate_runner *ir)
{
return (ir->next.off == SIZE_MAX) ?
ir->eos :
(uint_least8_t)get_state(STATE_PROP,
ir->buf[ir->next.off]);
}
static inline enum bidi_property
ir_get_current_preserved_prop(const struct isolate_runner *ir)
{
return (uint_least8_t)get_state(STATE_PRESERVED_PROP,
ir->buf[ir->cur.off]);
}
static inline int_least8_t
ir_get_current_level(const struct isolate_runner *ir)
{
return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]);
}
static inline const struct bracket *
ir_get_current_bracket_prop(const struct isolate_runner *ir)
{
return bidi_bracket +
(int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cur.off]);
}
static void
ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property prop)
{
set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off]));
}
static void
ir_init(uint_least32_t *buf, size_t buflen, size_t off,
uint_least8_t paragraph_level, bool within, struct isolate_runner *ir)
{
size_t i;
int_least8_t sos_level;
/* initialize invariants */
ir->buf = buf;
ir->buflen = buflen;
ir->paragraph_level = paragraph_level;
ir->start = off;
/* advance off until we are at a non-removed character */
for (; off < buflen; off++) {
if (get_state(STATE_LEVEL, buf[off]) != -1) {
break;
}
}
if (off == buflen) {
/* we encountered no more non-removed character, terminate */
ir->next.off = SIZE_MAX;
return;
}
/* set the isolating run level to that of the current offset */
ir->isolating_run_level =
(int_least8_t)get_state(STATE_LEVEL, buf[off]);
/* initialize sos and eos to dummy values */
ir->sos = ir->eos = NUM_BIDI_PROPS;
/*
* we write the information of the "current" state into next,
* so that the shift-in at the first advancement moves it in
* cur, as desired.
*/
ir->next.off = off;
/*
* determine the previous state but store its offset in cur.off,
* given it's shifted in on the first advancement
*/
ir->cur.off = SIZE_MAX;
for (i = off, sos_level = -1; i >= 1; i--) {
if (get_state(STATE_LEVEL, buf[i - 1]) != -1) {
/*
* we found a character that has not been
* removed in X9
*/
sos_level = (int_least8_t)get_state(STATE_LEVEL,
buf[i - 1]);
if (within) {
/* we just take it */
ir->cur.off = i;
}
break;
}
}
if (sos_level == -1) {
/*
* there were no preceding non-removed characters, set
* sos-level to paragraph embedding level
*/
sos_level = (int_least8_t)paragraph_level;
}
if (!within || ir->cur.off == SIZE_MAX) {
/*
* we are at the beginning of the sequence; initialize
* it faithfully according to the algorithm by looking
* at the sos-level
*/
if (MAX(sos_level, ir->isolating_run_level) % 2 == 0) {
/* the higher level is even, set sos to L */
ir->sos = BIDI_PROP_L;
} else {
/* the higher level is odd, set sos to R */
ir->sos = BIDI_PROP_R;
}
}
}
static int
ir_advance(struct isolate_runner *ir)
{
enum bidi_property prop;
int_least8_t level, isolate_level, last_isolate_level;
size_t i;
if (ir->next.off == SIZE_MAX) {
/* the sequence is over */
return 1;
}
/* shift in */
ir->prev.off = ir->cur.off;
ir->cur.off = ir->next.off;
/* mark as visited */
set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off]));
/* initialize next state by going to the next character in the sequence
*/
ir->next.off = SIZE_MAX;
last_isolate_level = -1;
for (i = ir->cur.off, isolate_level = 0; i < ir->buflen; i++) {
level = (int_least8_t)get_state(STATE_LEVEL, ir->buf[i]);
prop = (uint_least8_t)get_state(STATE_PROP, ir->buf[i]);
if (level == -1) {
/* this is one of the ignored characters, skip */
continue;
} else if (level == ir->isolating_run_level) {
last_isolate_level = level;
}
/* follow BD8/BD9 and P2 to traverse the current sequence */
if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
prop == BIDI_PROP_FSI) {
/*
* we encountered an isolate initiator, increment
* counter, but go into processing when we
* were not isolated before
*/
if (isolate_level < MAX_DEPTH) {
isolate_level++;
}
if (isolate_level != 1) {
continue;
}
} else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
isolate_level--;
/*
* if the current PDI dropped the isolate-level
* to zero, it is itself part of the isolating
* run sequence; otherwise we simply continue.
*/
if (isolate_level > 0) {
continue;
}
} else if (isolate_level > 0) {
/* we are in an isolating sequence */
continue;
}
/*
* now we either still are in our sequence or we hit
* the eos-case as we left the sequence and hit the
* first non-isolating-sequence character.
*/
if (i == ir->cur.off) {
/* we were in the first initializing round */
continue;
} else if (level == ir->isolating_run_level) {
/* isolate_level-skips have been handled before, we're
* good */
/* still in the sequence */
ir->next.off = i;
} else {
/* out of sequence or isolated, compare levels via eos
*/
ir->next.off = SIZE_MAX;
if (MAX(last_isolate_level, level) % 2 == 0) {
ir->eos = BIDI_PROP_L;
} else {
ir->eos = BIDI_PROP_R;
}
}
break;
}
if (i == ir->buflen) {
/*
* the sequence ended before we could grab an offset.
* we need to determine the eos-prop by comparing the
* level of the last element in the isolating run sequence
* with the paragraph level.
*/
ir->next.off = SIZE_MAX;
if (MAX(last_isolate_level, ir->paragraph_level) % 2 == 0) {
/* the higher level is even, set eos to L */
ir->eos = BIDI_PROP_L;
} else {
/* the higher level is odd, set eos to R */
ir->eos = BIDI_PROP_R;
}
}
return 0;
}
static enum bidi_property
ir_get_last_strong_prop(const struct isolate_runner *ir)
{
struct isolate_runner tmp;
enum bidi_property last_strong_prop = ir->sos, prop;
ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
&tmp);
for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
prop = ir_get_current_prop(&tmp);
if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
prop == BIDI_PROP_AL) {
last_strong_prop = prop;
}
}
return last_strong_prop;
}
static enum bidi_property
ir_get_last_strong_or_number_prop(const struct isolate_runner *ir)
{
struct isolate_runner tmp;
enum bidi_property last_strong_or_number_prop = ir->sos, prop;
ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
&tmp);
for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
prop = ir_get_current_prop(&tmp);
if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
prop == BIDI_PROP_AL || prop == BIDI_PROP_EN ||
prop == BIDI_PROP_AN) {
last_strong_or_number_prop = prop;
}
}
return last_strong_or_number_prop;
}
static void
preprocess_bracket_pair(const struct isolate_runner *start,
const struct isolate_runner *end)
{
enum bidi_property prop, bracket_prop, last_strong_or_number_prop;
struct isolate_runner ir;
size_t strong_type_off;
/*
* check if the bracket contains a strong type (L or R|EN|AN)
*/
for (ir = *start, strong_type_off = SIZE_MAX,
bracket_prop = NUM_BIDI_PROPS;
!ir_advance(&ir) && ir.cur.off < end->cur.off;) {
prop = ir_get_current_prop(&ir);
if (prop == BIDI_PROP_L) {
strong_type_off = ir.cur.off;
if (ir.isolating_run_level % 2 == 0) {
/*
* set the type for both brackets to L (so they
* match the strong type they contain)
*/
bracket_prop = BIDI_PROP_L;
}
} else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN ||
prop == BIDI_PROP_AN) {
strong_type_off = ir.cur.off;
if (ir.isolating_run_level % 2 != 0) {
/*
* set the type for both brackets to R (so they
* match the strong type they contain)
*/
bracket_prop = BIDI_PROP_R;
}
}
}
if (strong_type_off == SIZE_MAX) {
/*
* there are no strong types within the brackets and we just
* leave the brackets as is
*/
return;
}
if (bracket_prop == NUM_BIDI_PROPS) {
/*
* We encountered a strong type, but it was opposite
* to the embedding direction.
* Check the previous strong type before the opening
* bracket
*/
last_strong_or_number_prop =
ir_get_last_strong_or_number_prop(start);
if (last_strong_or_number_prop == BIDI_PROP_L &&
ir.isolating_run_level % 2 != 0) {
/*
* the previous strong type is also opposite
* to the embedding direction, so the context
* was established and we set the brackets
* accordingly.
*/
bracket_prop = BIDI_PROP_L;
} else if ((last_strong_or_number_prop == BIDI_PROP_R ||
last_strong_or_number_prop == BIDI_PROP_EN ||
last_strong_or_number_prop == BIDI_PROP_AN) &&
ir.isolating_run_level % 2 == 0) {
/*
* the previous strong type is also opposite
* to the embedding direction, so the context
* was established and we set the brackets
* accordingly.
*/
bracket_prop = BIDI_PROP_R;
} else {
/* set brackets to the embedding direction */
if (ir.isolating_run_level % 2 == 0) {
bracket_prop = BIDI_PROP_L;
} else {
bracket_prop = BIDI_PROP_R;
}
}
}
ir_set_current_prop(start, bracket_prop);
ir_set_current_prop(end, bracket_prop);
/*
* any sequence of NSMs after opening or closing brackets get
* the same property as the one we set on the brackets
*/
for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_prop(
&ir) == BIDI_PROP_NSM;) {
ir_set_current_prop(&ir, bracket_prop);
}
for (ir = *end; !ir_advance(&ir) &&
ir_get_current_preserved_prop(&ir) == BIDI_PROP_NSM;) {
ir_set_current_prop(&ir, bracket_prop);
}
}
static void
preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off,
uint_least8_t paragraph_level)
{
/*
* The N0-rule deals with bracket pairs that shall be determined
* with the rule BD16. This is specified as an algorithm with a
* stack of 63 bracket openings that are used to resolve into a
* separate list of pairs, which is then to be sorted by opening
* position. Thus, even though the bracketing-depth is limited
* by 63, the algorithm, as is, requires dynamic memory
* management.
*
* A naive approach (used by Fribidi) would be to screw the
* stack-approach and simply directly determine the
* corresponding closing bracket offset for a given opening
* bracket, leading to O(n²) time complexity in the worst case
* with a lot of brackets. While many brackets are not common,
* it is still possible to find a middle ground where you obtain
* strongly linear time complexity in most common cases:
*
* Instead of a stack, we use a FIFO data structure which is
* filled with bracket openings in the order of appearance (thus
* yielding an implicit sorting!) at the top. If the
* corresponding closing bracket is encountered, it is added to
* the respective entry, making it ready to "move out" at the
* bottom (i.e. passed to the bracket processing). Due to the
* nature of the specified pair detection algorithm, which only
* cares about the bracket type and nothing else (bidi class,
* level, etc.), we can mix processing and bracket detection.
*
* Obviously, if you, for instance, have one big bracket pair at
* the bottom that has not been closed yet, it will block the
* bracket processing and the FIFO might hit its capacity limit.
* At this point, the blockage is manually resolved using the
* naive quadratic approach.
*
* To remain within the specified standard behaviour, which
* mandates that processing of brackets should stop when the
* bracketing-depth is at 63, we simply check in an "overflow"
* scenario if all 63 elements in the LIFO are unfinished, which
* corresponds with such a bracketing depth.
*/
enum bidi_property prop;
struct {
bool complete;
size_t bracket_class;
struct isolate_runner start;
struct isolate_runner end;
} fifo[63];
const struct bracket *bracket_prop, *tmp_bracket_prop;
struct isolate_runner ir, tmp_ir;
size_t fifo_len = 0, i, blevel, j, k;
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
prop = ir_get_current_prop(&ir);
bracket_prop = ir_get_current_bracket_prop(&ir);
if (prop == BIDI_PROP_ON &&
bracket_prop->type == BIDI_BRACKET_OPEN) {
if (fifo_len == LEN(fifo)) {
/*
* The FIFO is full, check first if it's
* completely blocked (i.e. no finished
* bracket pairs, triggering the standard
* that mandates to abort in such a case
*/
for (i = 0; i < fifo_len; i++) {
if (fifo[i].complete) {
break;
}
}
if (i == fifo_len) {
/* abort processing */
return;
}
/*
* by construction, the bottom entry
* in the FIFO is guaranteed to be
* unfinished (given we "consume" all
* finished bottom entries after each
* iteration).
*
* iterate, starting after the opening
* bracket, and find the corresponding
* closing bracket.
*
* if we find none, just drop the FIFO
* entry silently
*/
for (tmp_ir = fifo[0].start, blevel = 0;
!ir_advance(&tmp_ir);) {
tmp_bracket_prop =
ir_get_current_bracket_prop(
&tmp_ir);
if (tmp_bracket_prop->type ==
BIDI_BRACKET_OPEN &&
tmp_bracket_prop->class ==
bracket_prop->class) {
/* we encountered another
* opening bracket of the
* same class */
blevel++;
} else if (tmp_bracket_prop->type ==
BIDI_BRACKET_CLOSE &&
tmp_bracket_prop->class ==
bracket_prop
->class) {
/* we encountered a closing
* bracket of the same class
*/
if (blevel == 0) {
/* this is the
* corresponding
* closing bracket
*/
fifo[0].complete = true;
fifo[0].end = ir;
} else {
blevel--;
}
}
}
if (fifo[0].complete) {
/* we found the matching bracket */
preprocess_bracket_pair(
&(fifo[i].start),
&(fifo[i].end));
}
/* shift FIFO one to the left */
for (i = 1; i < fifo_len; i++) {
fifo[i - 1] = fifo[i];
}
fifo_len--;
}
/* add element to the FIFO */
fifo_len++;
fifo[fifo_len - 1].complete = false;
fifo[fifo_len - 1].bracket_class = bracket_prop->class;
fifo[fifo_len - 1].start = ir;
} else if (prop == BIDI_PROP_ON &&
bracket_prop->type == BIDI_BRACKET_CLOSE) {
/*
* go backwards in the FIFO, skip finished entries
* and simply ignore (do nothing) the closing
* bracket if we do not match anything
*/
for (i = fifo_len; i > 0; i--) {
if (bracket_prop->class ==
fifo[i - 1].bracket_class &&
!fifo[i - 1].complete) {
/* we have found a pair */
fifo[i - 1].complete = true;
fifo[i - 1].end = ir;
/* remove all uncompleted FIFO elements
* above i - 1 */
for (j = i; j < fifo_len;) {
if (fifo[j].complete) {
j++;
continue;
}
/* shift FIFO one to the left */
for (k = j + 1; k < fifo_len;
k++) {
fifo[k - 1] = fifo[k];
}
fifo_len--;
}
break;
}
}
}
/* process all ready bracket pairs from the FIFO bottom */
while (fifo_len > 0 && fifo[0].complete) {
preprocess_bracket_pair(&(fifo[0].start),
&(fifo[0].end));
/* shift FIFO one to the left */
for (j = 0; j + 1 < fifo_len; j++) {
fifo[j] = fifo[j + 1];
}
fifo_len--;
}
}
/*
* afterwards, we still might have unfinished bracket pairs
* that will remain as such, but the subsequent finished pairs
* also need to be taken into account, so we go through the
* FIFO once more and process all finished pairs
*/
for (i = 0; i < fifo_len; i++) {
if (fifo[i].complete) {
preprocess_bracket_pair(&(fifo[i].start),
&(fifo[i].end));
}
}
}
static size_t
preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen,
size_t off, uint_least8_t paragraph_level)
{
enum bidi_property sequence_prop, prop;
struct isolate_runner ir, tmp;
size_t runsince, sequence_end;
/* W1 */
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
if (ir_get_current_prop(&ir) == BIDI_PROP_NSM) {
prop = ir_get_previous_prop(&ir);
if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
prop == BIDI_PROP_FSI || prop == BIDI_PROP_PDI) {
ir_set_current_prop(&ir, BIDI_PROP_ON);
} else {
ir_set_current_prop(&ir, prop);
}
}
}
/* W2 */
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) {
ir_set_current_prop(&ir, BIDI_PROP_AN);
}
}
/* W3 */
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
if (ir_get_current_prop(&ir) == BIDI_PROP_AL) {
ir_set_current_prop(&ir, BIDI_PROP_R);
}
}
/* W4 */
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
if (ir_get_previous_prop(&ir) == BIDI_PROP_EN &&
(ir_get_current_prop(&ir) == BIDI_PROP_ES ||
ir_get_current_prop(&ir) == BIDI_PROP_CS) &&
ir_get_next_prop(&ir) == BIDI_PROP_EN) {
ir_set_current_prop(&ir, BIDI_PROP_EN);
}
if (ir_get_previous_prop(&ir) == BIDI_PROP_AN &&
ir_get_current_prop(&ir) == BIDI_PROP_CS &&
ir_get_next_prop(&ir) == BIDI_PROP_AN) {
ir_set_current_prop(&ir, BIDI_PROP_AN);
}
}
/* W5 */
runsince = SIZE_MAX;
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
if (ir_get_current_prop(&ir) == BIDI_PROP_ET) {
if (runsince == SIZE_MAX) {
/* a new run has begun */
runsince = ir.cur.off;
}
} else if (ir_get_current_prop(&ir) == BIDI_PROP_EN) {
/* set the preceding sequence */
if (runsince != SIZE_MAX) {
ir_init(buf, buflen, runsince, paragraph_level,
(runsince > off), &tmp);
while (!ir_advance(&tmp) &&
tmp.cur.off < ir.cur.off) {
ir_set_current_prop(&tmp, BIDI_PROP_EN);
}
runsince = SIZE_MAX;
} else {
ir_init(buf, buflen, ir.cur.off,
paragraph_level, (ir.cur.off > off),
&tmp);
ir_advance(&tmp);
}
/* follow the succeeding sequence */
while (!ir_advance(&tmp)) {
if (ir_get_current_prop(&tmp) != BIDI_PROP_ET) {
break;
}
ir_set_current_prop(&tmp, BIDI_PROP_EN);
}
} else {
/* sequence ended */
runsince = SIZE_MAX;
}
}
/* W6 */
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
prop = ir_get_current_prop(&ir);
if (prop == BIDI_PROP_ES || prop == BIDI_PROP_ET ||
prop == BIDI_PROP_CS) {
ir_set_current_prop(&ir, BIDI_PROP_ON);
}
}
/* W7 */
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
ir_get_last_strong_prop(&ir) == BIDI_PROP_L) {
ir_set_current_prop(&ir, BIDI_PROP_L);
}
}
/* N0 */
preprocess_bracket_pairs(buf, buflen, off, paragraph_level);
/* N1 */
sequence_end = SIZE_MAX;
sequence_prop = NUM_BIDI_PROPS;
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
if (sequence_end == SIZE_MAX) {
prop = ir_get_current_prop(&ir);
if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
/* the current character is an NI (neutral
* or isolate) */
/* scan ahead to the end of the NI-sequence
*/
ir_init(buf, buflen, ir.cur.off,
paragraph_level, (ir.cur.off > off),
&tmp);
while (!ir_advance(&tmp)) {
prop = ir_get_next_prop(&tmp);
if (prop != BIDI_PROP_B &&
prop != BIDI_PROP_S &&
prop != BIDI_PROP_WS &&
prop != BIDI_PROP_ON &&
prop != BIDI_PROP_FSI &&
prop != BIDI_PROP_LRI &&
prop != BIDI_PROP_RLI &&
prop != BIDI_PROP_PDI) {
break;
}
}
/*
* check what follows and see if the text
* has the same direction on both sides
*/
if (ir_get_previous_prop(&ir) == BIDI_PROP_L &&
ir_get_next_prop(&tmp) == BIDI_PROP_L) {
sequence_end = tmp.cur.off;
sequence_prop = BIDI_PROP_L;
} else if ((ir_get_previous_prop(&ir) ==
BIDI_PROP_R ||
ir_get_previous_prop(&ir) ==
BIDI_PROP_EN ||
ir_get_previous_prop(&ir) ==
BIDI_PROP_AN) &&
(ir_get_next_prop(&tmp) ==
BIDI_PROP_R ||
ir_get_next_prop(&tmp) ==
BIDI_PROP_EN ||
ir_get_next_prop(&tmp) ==
BIDI_PROP_AN)) {
sequence_end = tmp.cur.off;
sequence_prop = BIDI_PROP_R;
}
}
}
if (sequence_end != SIZE_MAX) {
if (ir.cur.off <= sequence_end) {
ir_set_current_prop(&ir, sequence_prop);
} else {
/* end of sequence, reset */
sequence_end = SIZE_MAX;
sequence_prop = NUM_BIDI_PROPS;
}
}
}
/* N2 */
ir_init(buf, buflen, off, paragraph_level, false, &ir);
while (!ir_advance(&ir)) {
prop = ir_get_current_prop(&ir);
if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
/* N2 */
if (ir_get_current_level(&ir) % 2 == 0) {
/* even embedding level */
ir_set_current_prop(&ir, BIDI_PROP_L);
} else {
/* odd embedding level */
ir_set_current_prop(&ir, BIDI_PROP_R);
}
}
}
return 0;
}
static uint_least8_t
get_paragraph_level(enum grapheme_bidirectional_override override,
bool terminate_on_pdi, const uint_least32_t *buf,
size_t buflen)
{
enum bidi_property prop;
int_least8_t isolate_level;
size_t bufoff;
/* check overrides first according to rule HL1 */
if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
return 0;
} else if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
return 1;
}
/* determine paragraph level (rules P1-P3) */
for (bufoff = 0, isolate_level = 0; bufoff < buflen; bufoff++) {
prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
if (prop == BIDI_PROP_PDI && isolate_level == 0 &&
terminate_on_pdi) {
/*
* we are in a FSI-subsection of a paragraph and
* matched with the terminating PDI
*/
break;
}
/* BD8/BD9 */
if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
prop == BIDI_PROP_FSI) &&
isolate_level < MAX_DEPTH) {
/* we hit an isolate initiator, increment counter */
isolate_level++;
} else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
isolate_level--;
}
/* P2 */
if (isolate_level > 0) {
continue;
}
/* P3 */
if (prop == BIDI_PROP_L) {
return 0;
} else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
return 1;
}
}
return 0;
}
static void
preprocess_paragraph(enum grapheme_bidirectional_override override,
uint_least32_t *buf, size_t buflen)
{
enum bidi_property prop;
int_least8_t level;
struct {
int_least8_t level;
enum grapheme_bidirectional_override override;
bool directional_isolate;
} directional_status[MAX_DEPTH + 2], *dirstat = directional_status;
size_t overflow_isolate_count, overflow_embedding_count,
valid_isolate_count, bufoff, i, runsince;
uint_least8_t paragraph_level;
paragraph_level = get_paragraph_level(override, false, buf, buflen);
/* X1 */
dirstat->level = (int_least8_t)paragraph_level;
dirstat->override = GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
dirstat->directional_isolate = false;
overflow_isolate_count = overflow_embedding_count =
valid_isolate_count = 0;
for (bufoff = 0; bufoff < buflen; bufoff++) {
prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
/* set paragraph level we need for line-level-processing */
set_state(STATE_PARAGRAPH_LEVEL, paragraph_level,
&(buf[bufoff]));
again:
if (prop == BIDI_PROP_RLE) {
/* X2 */
if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
MAX_DEPTH &&
overflow_isolate_count == 0 &&
overflow_embedding_count == 0) {
/* valid RLE */
dirstat++;
dirstat->level =
(dirstat - 1)->level +
((dirstat - 1)->level % 2 != 0) + 1;
dirstat->override =
GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
dirstat->directional_isolate = false;
} else {
/* overflow RLE */
overflow_embedding_count +=
(overflow_isolate_count == 0);
}
} else if (prop == BIDI_PROP_LRE) {
/* X3 */
if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
MAX_DEPTH &&
overflow_isolate_count == 0 &&
overflow_embedding_count == 0) {
/* valid LRE */
dirstat++;
dirstat->level =
(dirstat - 1)->level +
((dirstat - 1)->level % 2 == 0) + 1;
dirstat->override =
GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
dirstat->directional_isolate = false;
} else {
/* overflow LRE */
overflow_embedding_count +=
(overflow_isolate_count == 0);
}
} else if (prop == BIDI_PROP_RLO) {
/* X4 */
if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
MAX_DEPTH &&
overflow_isolate_count == 0 &&
overflow_embedding_count == 0) {
/* valid RLO */
dirstat++;
dirstat->level =
(dirstat - 1)->level +
((dirstat - 1)->level % 2 != 0) + 1;
dirstat->override =
GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
dirstat->directional_isolate = false;
} else {
/* overflow RLO */
overflow_embedding_count +=
(overflow_isolate_count == 0);
}
} else if (prop == BIDI_PROP_LRO) {
/* X5 */
if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
MAX_DEPTH &&
overflow_isolate_count == 0 &&
overflow_embedding_count == 0) {
/* valid LRE */
dirstat++;
dirstat->level =
(dirstat - 1)->level +
((dirstat - 1)->level % 2 == 0) + 1;
dirstat->override =
GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
dirstat->directional_isolate = false;
} else {
/* overflow LRO */
overflow_embedding_count +=
(overflow_isolate_count == 0);
}
} else if (prop == BIDI_PROP_RLI) {
/* X5a */
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
if (dirstat->override ==
GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
set_state(STATE_PROP, BIDI_PROP_L,
&(buf[bufoff]));
} else if (dirstat->override ==
GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
set_state(STATE_PROP, BIDI_PROP_R,
&(buf[bufoff]));
}
if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
MAX_DEPTH &&
overflow_isolate_count == 0 &&
overflow_embedding_count == 0) {
/* valid RLI */
valid_isolate_count++;
dirstat++;
dirstat->level =
(dirstat - 1)->level +
((dirstat - 1)->level % 2 != 0) + 1;
dirstat->override =
GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
dirstat->directional_isolate = true;
} else {
/* overflow RLI */
overflow_isolate_count++;
}
} else if (prop == BIDI_PROP_LRI) {
/* X5b */
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
if (dirstat->override ==
GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
set_state(STATE_PROP, BIDI_PROP_L,
&(buf[bufoff]));
} else if (dirstat->override ==
GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
set_state(STATE_PROP, BIDI_PROP_R,
&(buf[bufoff]));
}
if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
MAX_DEPTH &&
overflow_isolate_count == 0 &&
overflow_embedding_count == 0) {
/* valid LRI */
valid_isolate_count++;
dirstat++;
dirstat->level =
(dirstat - 1)->level +
((dirstat - 1)->level % 2 == 0) + 1;
dirstat->override =
GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
dirstat->directional_isolate = true;
} else {
/* overflow LRI */
overflow_isolate_count++;
}
} else if (prop == BIDI_PROP_FSI) {
/* X5c */
if (get_paragraph_level(
GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL,
true, buf + (bufoff + 1),
buflen - (bufoff + 1)) == 1) {
prop = BIDI_PROP_RLI;
goto again;
} else { /* ... == 0 */
prop = BIDI_PROP_LRI;
goto again;
}
} else if (prop != BIDI_PROP_B && prop != BIDI_PROP_BN &&
prop != BIDI_PROP_PDF && prop != BIDI_PROP_PDI) {
/* X6 */
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
if (dirstat->override ==
GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
set_state(STATE_PROP, BIDI_PROP_L,
&(buf[bufoff]));
} else if (dirstat->override ==
GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
set_state(STATE_PROP, BIDI_PROP_R,
&(buf[bufoff]));
}
} else if (prop == BIDI_PROP_PDI) {
/* X6a */
if (overflow_isolate_count > 0) {
/* PDI matches an overflow isolate initiator
*/
overflow_isolate_count--;
} else if (valid_isolate_count > 0) {
/* PDI matches a normal isolate initiator */
overflow_embedding_count = 0;
while (dirstat->directional_isolate == false &&
dirstat > directional_status) {
/*
* we are safe here as given the
* valid isolate count is positive
* there must be a stack-entry
* with positive directional
* isolate status, but we take
* no chances and include an
* explicit check
*
* POSSIBLE OPTIMIZATION: Whenever
* we push on the stack, check if it
* has the directional isolate
* status true and store a pointer
* to it so we can jump to it very
* quickly.
*/
dirstat--;
}
/*
* as above, the following check is not
* necessary, given we are guaranteed to
* have at least one stack entry left,
* but it's better to be safe
*/
if (dirstat > directional_status) {
dirstat--;
}
valid_isolate_count--;
}
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
if (dirstat->override ==
GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
set_state(STATE_PROP, BIDI_PROP_L,
&(buf[bufoff]));
} else if (dirstat->override ==
GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
set_state(STATE_PROP, BIDI_PROP_R,
&(buf[bufoff]));
}
} else if (prop == BIDI_PROP_PDF) {
/* X7 */
if (overflow_isolate_count > 0) {
/* do nothing */
} else if (overflow_embedding_count > 0) {
overflow_embedding_count--;
} else if (dirstat->directional_isolate == false &&
dirstat > directional_status) {
dirstat--;
}
} else if (prop == BIDI_PROP_B) {
/* X8 */
set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
}
/* X9 */
if (prop == BIDI_PROP_RLE || prop == BIDI_PROP_LRE ||
prop == BIDI_PROP_RLO || prop == BIDI_PROP_LRO ||
prop == BIDI_PROP_PDF || prop == BIDI_PROP_BN) {
set_state(STATE_LEVEL, -1, &(buf[bufoff]));
}
}
/* X10 (W1-W7, N0-N2) */
for (bufoff = 0; bufoff < buflen; bufoff++) {
if (get_state(STATE_VISITED, buf[bufoff]) == 0 &&
get_state(STATE_LEVEL, buf[bufoff]) != -1) {
bufoff += preprocess_isolating_run_sequence(
buf, buflen, bufoff, paragraph_level);
}
}
/*
* I1-I2 (given our sequential approach to processing the
* isolating run sequences, we apply this rule separately)
*/
for (bufoff = 0; bufoff < buflen; bufoff++) {
level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
if (level % 2 == 0) {
/* even level */
if (prop == BIDI_PROP_R) {
set_state(STATE_LEVEL, level + 1,
&(buf[bufoff]));
} else if (prop == BIDI_PROP_AN ||
prop == BIDI_PROP_EN) {
set_state(STATE_LEVEL, level + 2,
&(buf[bufoff]));
}
} else {
/* odd level */
if (prop == BIDI_PROP_L || prop == BIDI_PROP_EN ||
prop == BIDI_PROP_AN) {
set_state(STATE_LEVEL, level + 1,
&(buf[bufoff]));
}
}
}
/* L1 (rules 1-3) */
runsince = SIZE_MAX;
for (bufoff = 0; bufoff < buflen; bufoff++) {
level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
buf[bufoff]);
if (level == -1) {
/* ignored character */
continue;
}
/* rules 1 and 2 */
if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
}
/* rule 3 */
if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
prop == BIDI_PROP_PDI) {
if (runsince == SIZE_MAX) {
/* a new run has begun */
runsince = bufoff;
}
} else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) &&
runsince != SIZE_MAX) {
/*
* we hit a segment or paragraph separator in a
* sequence, reset sequence-levels
*/
for (i = runsince; i < bufoff; i++) {
if (get_state(STATE_LEVEL, buf[i]) != -1) {
set_state(STATE_LEVEL, paragraph_level,
&(buf[i]));
}
}
runsince = SIZE_MAX;
} else {
/* sequence ended */
runsince = SIZE_MAX;
}
}
if (runsince != SIZE_MAX) {
/*
* this is the end of the paragraph and we
* are in a run
*/
for (i = runsince; i < buflen; i++) {
if (get_state(STATE_LEVEL, buf[i]) != -1) {
set_state(STATE_LEVEL, paragraph_level,
&(buf[i]));
}
}
runsince = SIZE_MAX;
}
}
static inline uint_least8_t
get_bidi_property(uint_least32_t cp)
{
if (likely(cp <= 0x10FFFF)) {
return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
0x1F /* 00011111 */;
} else {
return BIDI_PROP_L;
}
}
static inline uint_least8_t
get_bidi_bracket_off(uint_least32_t cp)
{
if (likely(cp <= 0x10FFFF)) {
return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] +
(cp & 0xff)]) >>
5);
} else {
return 0;
}
}
static size_t
preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override,
uint_least32_t *buf, size_t buflen)
{
size_t bufoff, bufsize, lastparoff;
uint_least32_t cp;
if (buf == NULL) {
for (; herodotus_read_codepoint(r, true, &cp) ==
HERODOTUS_STATUS_SUCCESS;) {
;
}
/* see below for return value reasoning */
return herodotus_reader_number_read(r);
}
/*
* the first step is to determine the bidirectional properties
* and store them in the buffer
*/
for (bufoff = 0;
herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;
bufoff++) {
if (bufoff < buflen) {
/*
* actually only do something when we have
* space in the level-buffer. We continue
* the iteration to be able to give a good
* return value
*/
set_state(STATE_PROP,
(uint_least8_t)get_bidi_property(cp),
&(buf[bufoff]));
set_state(STATE_BRACKET_OFF, get_bidi_bracket_off(cp),
&(buf[bufoff]));
set_state(STATE_LEVEL, 0, &(buf[bufoff]));
set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff]));
set_state(STATE_VISITED, 0, &(buf[bufoff]));
set_state(STATE_PRESERVED_PROP,
(uint_least8_t)get_bidi_property(cp),
&(buf[bufoff]));
}
}
bufsize = herodotus_reader_number_read(r);
for (bufoff = 0, lastparoff = 0; bufoff < bufsize; bufoff++) {
if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B &&
bufoff != bufsize - 1) {
continue;
}
/*
* we either encountered a paragraph terminator or this
* is the last character in the string.
* Call the paragraph handler on the paragraph, including
* the terminating character or last character of the
* string respectively
*/
preprocess_paragraph(override, buf + lastparoff,
bufoff + 1 - lastparoff);
lastparoff = bufoff + 1;
}
/*
* we return the number of total bytes read, as the function
* should indicate if the given level-buffer is too small
*/
return herodotus_reader_number_read(r);
}
size_t
grapheme_bidirectional_preprocess(const uint_least32_t *src, size_t srclen,
enum grapheme_bidirectional_override override,
uint_least32_t *dest, size_t destlen)
{
HERODOTUS_READER r;
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
return preprocess(&r, override, dest, destlen);
}
size_t
grapheme_bidirectional_preprocess_utf8(
const char *src, size_t srclen,
enum grapheme_bidirectional_override override, uint_least32_t *dest,
size_t destlen)
{
HERODOTUS_READER r;
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
return preprocess(&r, override, dest, destlen);
}
size_t
grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
size_t linelen,
int_least8_t *lev,
size_t levlen)
{
enum bidi_property prop;
size_t i, runsince;
int_least8_t level;
/* rule L1.4 */
runsince = SIZE_MAX;
for (i = 0; i < linelen; i++) {
level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]);
prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
linedata[i]);
/* write level into level array if we still have space */
if (i < levlen) {
lev[i] = level;
}
if (level == -1) {
/* ignored character */
continue;
}
if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
prop == BIDI_PROP_PDI) {
if (runsince == SIZE_MAX) {
/* a new run has begun */
runsince = i;
}
} else {
/* sequence ended */
runsince = SIZE_MAX;
}
}
if (runsince != SIZE_MAX) {
/*
* we hit the end of the line but were in a run;
* reset the line levels to the paragraph level
*/
for (i = runsince; i < MIN(linelen, levlen); i++) {
if (lev[i] != -1) {
lev[i] = (int_least8_t)get_state(
STATE_PARAGRAPH_LEVEL, linedata[i]);
}
}
}
return linelen;
}