| /* See LICENSE file for copyright and license details. */ |
| #include <limits.h> |
| #include <stdbool.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include "../gen/types.h" |
| #include "../grapheme.h" |
| #include "util.h" |
| |
| void |
| herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type, |
| const void *src, size_t srclen) |
| { |
| size_t i; |
| |
| r->type = type; |
| r->src = src; |
| r->srclen = srclen; |
| r->off = 0; |
| r->terminated_by_null = false; |
| |
| for (i = 0; i < LEN(r->soft_limit); i++) { |
| r->soft_limit[i] = SIZE_MAX; |
| } |
| } |
| |
| void |
| herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest) |
| { |
| size_t i; |
| |
| /* |
| * we copy such that we have a "fresh" start and build |
| * on the fact that src->soft_limit[i] for any i and src->srclen |
| * are always larger or equal to src->off |
| */ |
| dest->type = src->type; |
| if (src->type == HERODOTUS_TYPE_CODEPOINT) { |
| dest->src = ((const uint_least32_t *)(src->src)) + src->off; |
| } else { /* src->type == HERODOTUS_TYPE_UTF8 */ |
| dest->src = ((const char *)(src->src)) + src->off; |
| } |
| if (src->srclen == SIZE_MAX) { |
| dest->srclen = SIZE_MAX; |
| } else { |
| dest->srclen = src->srclen - src->off; |
| } |
| dest->off = 0; |
| dest->terminated_by_null = src->terminated_by_null; |
| |
| for (i = 0; i < LEN(src->soft_limit); i++) { |
| if (src->soft_limit[i] == SIZE_MAX) { |
| dest->soft_limit[i] = src->soft_limit[i]; |
| } else { |
| dest->soft_limit[i] = src->soft_limit[i] - src->off; |
| } |
| } |
| } |
| |
| void |
| herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count) |
| { |
| size_t i; |
| |
| for (i = LEN(r->soft_limit) - 1; i >= 1; i--) { |
| r->soft_limit[i] = r->soft_limit[i - 1]; |
| } |
| r->soft_limit[0] = r->off + count; |
| } |
| |
| void |
| herodotus_reader_pop_limit(HERODOTUS_READER *r) |
| { |
| size_t i; |
| |
| for (i = 0; i < LEN(r->soft_limit) - 1; i++) { |
| r->soft_limit[i] = r->soft_limit[i + 1]; |
| } |
| r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX; |
| } |
| |
| size_t |
| herodotus_reader_next_word_break(const HERODOTUS_READER *r) |
| { |
| if (r->type == HERODOTUS_TYPE_CODEPOINT) { |
| return grapheme_next_word_break( |
| (const uint_least32_t *)(r->src) + r->off, |
| MIN(r->srclen, r->soft_limit[0]) - r->off); |
| } else { /* r->type == HERODOTUS_TYPE_UTF8 */ |
| return grapheme_next_word_break_utf8( |
| (const char *)(r->src) + r->off, |
| MIN(r->srclen, r->soft_limit[0]) - r->off); |
| } |
| } |
| |
| size_t |
| herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r) |
| { |
| if (r->type == HERODOTUS_TYPE_CODEPOINT) { |
| return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0; |
| } else { /* r->type == HERODOTUS_TYPE_UTF8 */ |
| return grapheme_decode_utf8( |
| (const char *)(r->src) + r->off, |
| MIN(r->srclen, r->soft_limit[0]) - r->off, NULL); |
| } |
| } |
| |
| size_t |
| herodotus_reader_number_read(const HERODOTUS_READER *r) |
| { |
| return r->off; |
| } |
| |
| enum herodotus_status |
| herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp) |
| { |
| size_t ret; |
| |
| if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) { |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| return HERODOTUS_STATUS_END_OF_BUFFER; |
| } |
| |
| if (r->off >= r->soft_limit[0]) { |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| return HERODOTUS_STATUS_SOFT_LIMIT_REACHED; |
| } |
| |
| if (r->type == HERODOTUS_TYPE_CODEPOINT) { |
| *cp = ((const uint_least32_t *)(r->src))[r->off]; |
| |
| if (advance) { |
| r->off++; |
| } |
| } else { /* r->type == HERODOTUS_TYPE_UTF8 */ |
| ret = grapheme_decode_utf8((const char *)r->src + r->off, |
| MIN(r->srclen, r->soft_limit[0]) - |
| r->off, cp); |
| |
| if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) { |
| /* |
| * We encountered a NUL-byte. Don't increment |
| * offset and return as if the buffer had ended |
| * here all along |
| */ |
| r->terminated_by_null = true; |
| return HERODOTUS_STATUS_END_OF_BUFFER; |
| } |
| |
| if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) { |
| /* |
| * we want more than we have; instead of |
| * returning garbage we terminate here. |
| */ |
| return HERODOTUS_STATUS_END_OF_BUFFER; |
| } |
| |
| /* |
| * Increase offset which we now know won't surpass |
| * the limits, unless we got told otherwise |
| */ |
| if (advance) { |
| r->off += ret; |
| } |
| } |
| |
| return HERODOTUS_STATUS_SUCCESS; |
| } |
| |
| void |
| herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, |
| void *dest, size_t destlen) |
| { |
| w->type = type; |
| w->dest = dest; |
| w->destlen = destlen; |
| w->off = 0; |
| w->first_unwritable_offset = SIZE_MAX; |
| } |
| |
| void |
| herodotus_writer_nul_terminate(HERODOTUS_WRITER *w) |
| { |
| if (w->dest == NULL) { |
| return; |
| } |
| |
| if (w->off < w->destlen) { |
| /* We still have space in the buffer. Simply use it */ |
| if (w->type == HERODOTUS_TYPE_CODEPOINT) { |
| ((uint_least32_t *)(w->dest))[w->off] = 0; |
| } else { /* w->type == HERODOTUS_TYPE_UTF8 */ |
| ((char *)(w->dest))[w->off] = '\0'; |
| } |
| } else if (w->first_unwritable_offset < w->destlen) { |
| /* |
| * There is no more space in the buffer. However, |
| * we have noted down the first offset we couldn't |
| * use to write into the buffer and it's smaller than |
| * destlen. Thus we bailed writing into the |
| * destination when a multibyte-codepoint couldn't be |
| * written. So the last "real" byte might be at |
| * destlen-4, destlen-3, destlen-2 or destlen-1 |
| * (the last case meaning truncation). |
| */ |
| if (w->type == HERODOTUS_TYPE_CODEPOINT) { |
| ((uint_least32_t *)(w->dest)) |
| [w->first_unwritable_offset] = 0; |
| } else { /* w->type == HERODOTUS_TYPE_UTF8 */ |
| ((char *)(w->dest))[w->first_unwritable_offset] = '\0'; |
| } |
| } else { |
| /* |
| * In this case, there is no more space in the buffer and |
| * the last unwritable offset is larger than |
| * or equal to the destination buffer length. This means |
| * that we are forced to simply write into the last |
| * byte. |
| */ |
| if (w->type == HERODOTUS_TYPE_CODEPOINT) { |
| ((uint_least32_t *)(w->dest)) |
| [w->destlen - 1] = 0; |
| } else { /* w->type == HERODOTUS_TYPE_UTF8 */ |
| ((char *)(w->dest))[w->destlen - 1] = '\0'; |
| } |
| } |
| |
| /* w->off is not incremented in any case */ |
| } |
| |
| size_t |
| herodotus_writer_number_written(const HERODOTUS_WRITER *w) |
| { |
| return w->off; |
| } |
| |
| void |
| herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp) |
| { |
| size_t ret; |
| |
| /* |
| * This function will always faithfully say how many codepoints |
| * were written, even if the buffer ends. This is used to enable |
| * truncation detection. |
| */ |
| if (w->type == HERODOTUS_TYPE_CODEPOINT) { |
| if (w->dest != NULL && w->off < w->destlen) { |
| ((uint_least32_t *)(w->dest))[w->off] = cp; |
| } |
| |
| w->off += 1; |
| } else { /* w->type == HERODOTUS_TYPE_UTF8 */ |
| /* |
| * First determine how many bytes we need to encode the |
| * codepoint |
| */ |
| ret = grapheme_encode_utf8(cp, NULL, 0); |
| |
| if (w->dest != NULL && w->off + ret < w->destlen) { |
| /* we still have enough room in the buffer */ |
| grapheme_encode_utf8(cp, (char *)(w->dest) + |
| w->off, w->destlen - w->off); |
| } else if (w->first_unwritable_offset == SIZE_MAX) { |
| /* |
| * the first unwritable offset has not been |
| * noted down, so this is the first time we can't |
| * write (completely) to an offset |
| */ |
| w->first_unwritable_offset = w->off; |
| } |
| |
| w->off += ret; |
| } |
| } |
| |
| void |
| proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop, |
| uint_least8_t (*get_break_prop)(uint_least32_t), |
| bool (*is_skippable_prop)(uint_least8_t), |
| void (*skip_shift_callback)(uint_least8_t, void *), |
| struct proper *p) |
| { |
| uint_least8_t prop; |
| uint_least32_t cp; |
| size_t i; |
| |
| /* set internal variables */ |
| p->state = state; |
| p->no_prop = no_prop; |
| p->get_break_prop = get_break_prop; |
| p->is_skippable_prop = is_skippable_prop; |
| p->skip_shift_callback = skip_shift_callback; |
| |
| /* |
| * Initialize mid-reader, which is basically just there |
| * to reflect the current position of the viewing-line |
| */ |
| herodotus_reader_copy(r, &(p->mid_reader)); |
| |
| /* |
| * In the initialization, we simply (try to) fill in next_prop. |
| * If we cannot read in more (due to the buffer ending), we |
| * fill in the prop as invalid |
| */ |
| |
| /* |
| * initialize the previous properties to have no property |
| * (given we are at the start of the buffer) |
| */ |
| p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop; |
| p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop; |
| |
| /* |
| * initialize the next properties |
| */ |
| |
| /* initialize the raw reader */ |
| herodotus_reader_copy(r, &(p->raw_reader)); |
| |
| /* fill in the two next raw properties (after no-initialization) */ |
| p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop; |
| for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) == |
| HERODOTUS_STATUS_SUCCESS; ) { |
| p->raw.next_prop[i++] = p->get_break_prop(cp); |
| } |
| |
| /* initialize the skip reader */ |
| herodotus_reader_copy(r, &(p->skip_reader)); |
| |
| /* fill in the two next skip properties (after no-initialization) */ |
| p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop; |
| for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) == |
| HERODOTUS_STATUS_SUCCESS; ) { |
| prop = p->get_break_prop(cp); |
| if (!p->is_skippable_prop(prop)) { |
| p->skip.next_prop[i++] = prop; |
| } |
| } |
| } |
| |
| int |
| proper_advance(struct proper *p) |
| { |
| uint_least8_t prop; |
| uint_least32_t cp; |
| |
| /* read in next "raw" property */ |
| if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) == |
| HERODOTUS_STATUS_SUCCESS) { |
| prop = p->get_break_prop(cp); |
| } else { |
| prop = p->no_prop; |
| } |
| |
| /* |
| * do a shift-in, unless we find that the property that is to |
| * be moved past the "raw-viewing-line" (this property is stored |
| * in p->raw.next_prop[0]) is a no_prop, indicating that |
| * we are at the end of the buffer. |
| */ |
| if (p->raw.next_prop[0] == p->no_prop) { |
| return 1; |
| } |
| |
| /* shift in the properties */ |
| p->raw.prev_prop[1] = p->raw.prev_prop[0]; |
| p->raw.prev_prop[0] = p->raw.next_prop[0]; |
| p->raw.next_prop[0] = p->raw.next_prop[1]; |
| p->raw.next_prop[1] = prop; |
| |
| /* advance the middle reader viewing-line */ |
| (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp); |
| |
| /* check skippability-property */ |
| if (!p->is_skippable_prop(p->raw.prev_prop[0])) { |
| /* |
| * the property that has moved past the "raw-viewing-line" |
| * (this property is now (after the raw-shift) stored in |
| * p->raw.prev_prop[0] and guaranteed not to be a no-prop, |
| * guaranteeing that we won't shift a no-prop past the |
| * "viewing-line" in the skip-properties) is not a skippable |
| * property, thus we need to shift the skip property as well. |
| */ |
| p->skip.prev_prop[1] = p->skip.prev_prop[0]; |
| p->skip.prev_prop[0] = p->skip.next_prop[0]; |
| p->skip.next_prop[0] = p->skip.next_prop[1]; |
| |
| /* |
| * call the skip-shift-callback on the property that |
| * passed the skip-viewing-line (this property is now |
| * stored in p->skip.prev_prop[0]). |
| */ |
| p->skip_shift_callback(p->skip.prev_prop[0], p->state); |
| |
| /* determine the next shift property */ |
| p->skip.next_prop[1] = p->no_prop; |
| while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) == |
| HERODOTUS_STATUS_SUCCESS) { |
| prop = p->get_break_prop(cp); |
| if (!p->is_skippable_prop(prop)) { |
| p->skip.next_prop[1] = prop; |
| break; |
| } |
| } |
| } |
| |
| return 0; |
| } |
| |
| inline size_t |
| get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) |
| { |
| if (offset < len) { |
| *cp = ((const uint_least32_t *)str)[offset]; |
| return 1; |
| } else { |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| return 0; |
| } |
| } |
| |
| inline size_t |
| get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp) |
| { |
| size_t ret; |
| |
| if (offset < len) { |
| ret = grapheme_decode_utf8((const char *)str + offset, |
| len - offset, cp); |
| |
| if (unlikely(len == SIZE_MAX && cp == 0)) { |
| return 0; |
| } else { |
| return ret; |
| } |
| } else { |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| return 0; |
| } |
| } |
| |
| inline size_t |
| set_codepoint(uint_least32_t cp, void *str, size_t len, size_t offset) |
| { |
| if (str == NULL || len == 0) { |
| return 1; |
| } |
| |
| if (offset < len) { |
| ((uint_least32_t *)str)[offset] = cp; |
| return 1; |
| } else { |
| return 0; |
| } |
| } |
| |
/*
 * Encode cp as UTF-8 at byte offset of str (len bytes long) and
 * return the value reported by grapheme_encode_utf8().
 *
 * If there is nowhere to write (str is NULL, len is 0 or offset is
 * not below len), the encoder is called with a NULL buffer of length
 * 0 and its result is returned.
 */
inline size_t
set_codepoint_utf8(uint_least32_t cp, void *str, size_t len, size_t offset)
{
	const bool writable = (str != NULL && len != 0 && offset < len);

	if (!writable) {
		return grapheme_encode_utf8(cp, NULL, 0);
	}

	return grapheme_encode_utf8(cp, (char *)str + offset, len - offset);
}