| /* See LICENSE file for copyright and license details. */ |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include "../grapheme.h" |
| #include "util.h" |
| |
| #define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u)) |
| |
| /* lookup-table for the types of sequence first bytes */ |
| static const struct { |
| uint_least8_t lower; /* lower bound of sequence first byte */ |
| uint_least8_t upper; /* upper bound of sequence first byte */ |
| uint_least32_t mincp; /* smallest non-overlong encoded codepoint */ |
| uint_least32_t maxcp; /* largest encodable codepoint */ |
| /* |
| * implicit: table-offset represents the number of following |
| * bytes of the form 10xxxxxx (6 bits capacity each) |
| */ |
| } lut[] = { |
| [0] = { |
| /* 0xxxxxxx */ |
| .lower = 0x00, /* 00000000 */ |
| .upper = 0x7F, /* 01111111 */ |
| .mincp = (uint_least32_t)0, |
| .maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */ |
| }, |
| [1] = { |
| /* 110xxxxx */ |
| .lower = 0xC0, /* 11000000 */ |
| .upper = 0xDF, /* 11011111 */ |
| .mincp = (uint_least32_t)1 << 7, |
| .maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */ |
| }, |
| [2] = { |
| /* 1110xxxx */ |
| .lower = 0xE0, /* 11100000 */ |
| .upper = 0xEF, /* 11101111 */ |
| .mincp = (uint_least32_t)1 << 11, |
| .maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */ |
| }, |
| [3] = { |
| /* 11110xxx */ |
| .lower = 0xF0, /* 11110000 */ |
| .upper = 0xF7, /* 11110111 */ |
| .mincp = (uint_least32_t)1 << 16, |
| .maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */ |
| }, |
| }; |
| |
| size_t |
| grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp) |
| { |
| size_t off, i; |
| uint_least32_t tmp; |
| |
| if (cp == NULL) { |
| /* |
| * instead of checking every time if cp is NULL within |
| * the decoder, simply point it at a dummy variable here. |
| */ |
| cp = &tmp; |
| } |
| |
| if (str == NULL || len == 0) { |
| /* a sequence must be at least 1 byte long */ |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| return 0; |
| } |
| |
| /* identify sequence type with the first byte */ |
| for (off = 0; off < LEN(lut); off++) { |
| if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower, |
| lut[off].upper)) { |
| /* |
| * first byte is within the bounds; fill |
| * p with the the first bits contained in |
| * the first byte (by subtracting the high bits) |
| */ |
| *cp = ((const unsigned char *)str)[0] - lut[off].lower; |
| break; |
| } |
| } |
| if (off == LEN(lut)) { |
| /* |
| * first byte does not match a sequence type; |
| * set cp as invalid and return 1 byte processed |
| * |
| * this also includes the cases where bits higher than |
| * the 8th are set on systems with CHAR_BIT > 8 |
| */ |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| return 1; |
| } |
| if (1 + off > len) { |
| /* |
| * input is not long enough, set cp as invalid |
| */ |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| |
| /* |
| * count the following continuation bytes, but nothing |
| * else in case we have a "rogue" case where e.g. such a |
| * sequence starter occurs right before a NUL-byte. |
| */ |
| for (i = 0; 1 + i < len; i++) { |
| if (!BETWEEN(((const unsigned char *)str)[1 + i], 0x80, |
| 0xBF)) { |
| break; |
| } |
| } |
| |
| /* |
| * if the continuation bytes do not continue until |
| * the end, return the incomplete sequence length. |
| * Otherwise return the number of bytes we actually |
| * expected, which is larger than n. |
| */ |
| return ((1 + i) < len) ? (1 + i) : (1 + off); |
| } |
| |
| /* |
| * process 'off' following bytes, each of the form 10xxxxxx |
| * (i.e. between 0x80 (10000000) and 0xBF (10111111)) |
| */ |
| for (i = 1; i <= off; i++) { |
| if (!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) { |
| /* |
| * byte does not match format; return |
| * number of bytes processed excluding the |
| * unexpected character as recommended since |
| * Unicode 6 (chapter 3) |
| * |
| * this also includes the cases where bits |
| * higher than the 8th are set on systems |
| * with CHAR_BIT > 8 |
| */ |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| return 1 + (i - 1); |
| } |
| /* |
| * shift codepoint by 6 bits and add the 6 stored bits |
| * in s[i] to it using the bitmask 0x3F (00111111) |
| */ |
| *cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F); |
| } |
| |
| if (*cp < lut[off].mincp || |
| BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || |
| *cp > UINT32_C(0x10FFFF)) { |
| /* |
| * codepoint is overlong encoded in the sequence, is a |
| * high or low UTF-16 surrogate half (0xD800..0xDFFF) or |
| * not representable in UTF-16 (>0x10FFFF) (RFC-3629 |
| * specifies the latter two conditions) |
| */ |
| *cp = GRAPHEME_INVALID_CODEPOINT; |
| } |
| |
| return 1 + off; |
| } |
| |
| size_t |
| grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len) |
| { |
| size_t off, i; |
| |
| if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || |
| cp > UINT32_C(0x10FFFF)) { |
| /* |
| * codepoint is a high or low UTF-16 surrogate half |
| * (0xD800..0xDFFF) or not representable in UTF-16 |
| * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8. |
| */ |
| cp = GRAPHEME_INVALID_CODEPOINT; |
| } |
| |
| /* determine necessary sequence type */ |
| for (off = 0; off < LEN(lut); off++) { |
| if (cp <= lut[off].maxcp) { |
| break; |
| } |
| } |
| if (1 + off > len || str == NULL || len == 0) { |
| /* |
| * specified buffer is too small to store sequence or |
| * the caller just wanted to know how many bytes the |
| * codepoint needs by passing a NULL-buffer. |
| */ |
| return 1 + off; |
| } |
| |
| /* build sequence by filling cp-bits into each byte */ |
| |
| /* |
| * lut[off].lower is the bit-format for the first byte and |
| * the bits to fill into it are determined by shifting the |
| * cp 6 times the number of following bytes, as each |
| * following byte stores 6 bits, yielding the wanted bits. |
| * |
| * We do not overwrite the mask because we guaranteed earlier |
| * that there are no bits higher than the mask allows. |
| */ |
| ((unsigned char *)str)[0] = |
| lut[off].lower | (uint_least8_t)(cp >> (6 * off)); |
| |
| for (i = 1; i <= off; i++) { |
| /* |
| * the bit-format for following bytes is 10000000 (0x80) |
| * and it each stores 6 bits in the 6 low bits that we |
| * extract from the properly-shifted value using the |
| * mask 00111111 (0x3F) |
| */ |
| ((unsigned char *)str)[i] = |
| 0x80 | ((cp >> (6 * (off - i))) & 0x3F); |
| } |
| |
| return 1 + off; |
| } |