Add wuffs_base__utf_8__next_from_end
diff --git a/internal/cgen/base/strconv-public.h b/internal/cgen/base/strconv-public.h
index 734d60c..a957871 100644
--- a/internal/cgen/base/strconv-public.h
+++ b/internal/cgen/base/strconv-public.h
@@ -371,6 +371,11 @@
WUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //
wuffs_base__utf_8__next(wuffs_base__slice_u8 s);
+// wuffs_base__utf_8__next_from_end is like wuffs_base__utf_8__next except that
+// it looks at the end of s instead of the start.
+WUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //
+wuffs_base__utf_8__next_from_end(wuffs_base__slice_u8 s);
+
// wuffs_base__utf_8__longest_valid_prefix returns the largest n such that the
// sub-slice s[..n] is valid UTF-8.
//
diff --git a/internal/cgen/base/utf8-submodule.c b/internal/cgen/base/utf8-submodule.c
index 8fa3131..40a70cb 100644
--- a/internal/cgen/base/utf8-submodule.c
+++ b/internal/cgen/base/utf8-submodule.c
@@ -167,6 +167,39 @@
WUFFS_BASE__UNICODE_REPLACEMENT_CHARACTER, 1);
}
+WUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //
+wuffs_base__utf_8__next_from_end(wuffs_base__slice_u8 s) {
+ if (s.len == 0) {  // Empty input: code_point 0, byte_length 0.
+ return wuffs_base__make_utf_8__next__output(0, 0);
+ }
+ uint8_t* ptr = &s.ptr[s.len - 1];  // Start at the last byte of s.
+ if (*ptr < 0x80) {  // High bit clear: the last byte is a 1-byte result.
+ return wuffs_base__make_utf_8__next__output(*ptr, 1);
+
+ } else if (*ptr < 0xC0) {  // 0x80 ..= 0xBF: scan backwards for a byte >= 0xC0.
+ uint8_t* too_far = &s.ptr[(s.len > 4) ? (s.len - 4) : 0];  // Lowest byte the scan may read; at most 4 bytes are examined.
+ uint32_t n = 1;  // Count of bytes from ptr to the end of s.
+ while (ptr != too_far) {
+ ptr--;
+ n++;
+ if (*ptr < 0x80) {  // Hit a byte < 0x80 before any byte >= 0xC0: give up.
+ break;
+ } else if (*ptr < 0xC0) {  // Another 0x80 ..= 0xBF byte: keep scanning.
+ continue;
+ }
+ wuffs_base__utf_8__next__output o =
+ wuffs_base__utf_8__next(wuffs_base__make_slice_u8(ptr, n));  // Decode forwards over the last n bytes.
+ if (o.byte_length != n) {  // Accept only if the decode used all n trailing bytes.
+ break;
+ }
+ return o;
+ }
+ }
+
+ return wuffs_base__make_utf_8__next__output(
+ WUFFS_BASE__UNICODE_REPLACEMENT_CHARACTER, 1);  // Every other case: replacement character, length 1.
+}
+
WUFFS_BASE__MAYBE_STATIC size_t //
wuffs_base__utf_8__longest_valid_prefix(wuffs_base__slice_u8 s) {
// TODO: possibly optimize the all-ASCII case (4 or 8 bytes at a time).
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index ae40721..514ad39 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -209,8 +209,9 @@
" 6 7\n // 8 9 A B C D E F\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x00 ..= 0x07.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x08 ..= 0x0F.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x10 ..= 0x17.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x18 ..= 0x1F.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x20 ..= 0x27.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x28 ..= 0x2F.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x30 ..= 0x37.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x38 ..= 0x3F.\n\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x40 ..= 0x47.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x48 ..= 0x4F.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x50 ..= 0x57.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x58 ..= 0x5F.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x60 ..= 0x67.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00" +
", 0x00, 0x00, // 0x68 ..= 0x6F.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x70 ..= 0x77.\n 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x78 ..= 0x7F.\n\n 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x80 ..= 0x87.\n 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x88 ..= 0x8F.\n 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x90 ..= 0x97.\n 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x98 ..= 0x9F.\n 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xA0 ..= 0xA7.\n 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xA8 ..= 0xAF.\n 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xB0 ..= 0xB7.\n 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0xB8 ..= 0xBF.\n\n 0x80, 0x80, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xC0 ..= 0xC7.\n 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xC8 ..= 0xCF.\n 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xD0 ..= 0xD7.\n 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xD8 ..= 0" +
"xDF.\n 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xE0 ..= 0xE7.\n 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xE8 ..= 0xEF.\n 0x03, 0x03, 0x03, 0x03, 0x03, 0x80, 0x80, 0x80, // 0xF0 ..= 0xF7.\n 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // 0xF8 ..= 0xFF.\n // 0 1 2 3 4 5 6 7\n // 8 9 A B C D E F\n};\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //\nwuffs_base__utf_8__next(wuffs_base__slice_u8 s) {\n if (s.len == 0) {\n return wuffs_base__make_utf_8__next__output(0, 0);\n }\n uint32_t c = s.ptr[0];\n switch (wuffs_base__utf_8__byte_length_minus_1[c & 0xFF]) {\n case 0:\n return wuffs_base__make_utf_8__next__output(c, 1);\n\n case 1:\n if (s.len < 2) {\n break;\n }\n c = wuffs_base__load_u16le__no_bounds_check(s.ptr);\n if ((c & 0xC000) != 0x8000) {\n break;\n }\n c = (0x0007C0 & (c << 6)) | (0x00003F & (c >> 8));\n return wuffs_base__make_utf_8__next__output(" +
- "c, 2);\n\n case 2:\n if (s.len < 3) {\n break;\n }\n c = wuffs_base__load_u24le__no_bounds_check(s.ptr);\n if ((c & 0xC0C000) != 0x808000) {\n break;\n }\n c = (0x00F000 & (c << 12)) | (0x000FC0 & (c >> 2)) |\n (0x00003F & (c >> 16));\n if ((c <= 0x07FF) || ((0xD800 <= c) && (c <= 0xDFFF))) {\n break;\n }\n return wuffs_base__make_utf_8__next__output(c, 3);\n\n case 3:\n if (s.len < 4) {\n break;\n }\n c = wuffs_base__load_u32le__no_bounds_check(s.ptr);\n if ((c & 0xC0C0C000) != 0x80808000) {\n break;\n }\n c = (0x1C0000 & (c << 18)) | (0x03F000 & (c << 4)) |\n (0x000FC0 & (c >> 10)) | (0x00003F & (c >> 24));\n if ((c <= 0xFFFF) || (0x110000 <= c)) {\n break;\n }\n return wuffs_base__make_utf_8__next__output(c, 4);\n }\n\n return wuffs_base__make_utf_8__next__output(\n WUFFS_BASE__UNICODE_REPLACEMENT_CHARACTER, 1);\n}\n\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__utf_8__longest_va" +
- "lid_prefix(wuffs_base__slice_u8 s) {\n // TODO: possibly optimize the all-ASCII case (4 or 8 bytes at a time).\n //\n // TODO: possibly optimize this by manually inlining the\n // wuffs_base__utf_8__next calls.\n size_t original_len = s.len;\n while (s.len > 0) {\n wuffs_base__utf_8__next__output o = wuffs_base__utf_8__next(s);\n if ((o.code_point > 0x7F) && (o.byte_length == 1)) {\n break;\n }\n s.ptr += o.byte_length;\n s.len -= o.byte_length;\n }\n return original_len - s.len;\n}\n\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__ascii__longest_valid_prefix(wuffs_base__slice_u8 s) {\n // TODO: possibly optimize this by checking 4 or 8 bytes at a time.\n uint8_t* original_ptr = s.ptr;\n uint8_t* p = s.ptr;\n uint8_t* q = s.ptr + s.len;\n for (; (p != q) && ((*p & 0x80) == 0); p++) {\n }\n return (size_t)(p - original_ptr);\n}\n" +
+ "c, 2);\n\n case 2:\n if (s.len < 3) {\n break;\n }\n c = wuffs_base__load_u24le__no_bounds_check(s.ptr);\n if ((c & 0xC0C000) != 0x808000) {\n break;\n }\n c = (0x00F000 & (c << 12)) | (0x000FC0 & (c >> 2)) |\n (0x00003F & (c >> 16));\n if ((c <= 0x07FF) || ((0xD800 <= c) && (c <= 0xDFFF))) {\n break;\n }\n return wuffs_base__make_utf_8__next__output(c, 3);\n\n case 3:\n if (s.len < 4) {\n break;\n }\n c = wuffs_base__load_u32le__no_bounds_check(s.ptr);\n if ((c & 0xC0C0C000) != 0x80808000) {\n break;\n }\n c = (0x1C0000 & (c << 18)) | (0x03F000 & (c << 4)) |\n (0x000FC0 & (c >> 10)) | (0x00003F & (c >> 24));\n if ((c <= 0xFFFF) || (0x110000 <= c)) {\n break;\n }\n return wuffs_base__make_utf_8__next__output(c, 4);\n }\n\n return wuffs_base__make_utf_8__next__output(\n WUFFS_BASE__UNICODE_REPLACEMENT_CHARACTER, 1);\n}\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //\nwuff" +
+ "s_base__utf_8__next_from_end(wuffs_base__slice_u8 s) {\n if (s.len == 0) {\n return wuffs_base__make_utf_8__next__output(0, 0);\n }\n uint8_t* ptr = &s.ptr[s.len - 1];\n if (*ptr < 0x80) {\n return wuffs_base__make_utf_8__next__output(*ptr, 1);\n\n } else if (*ptr < 0xC0) {\n uint8_t* too_far = &s.ptr[(s.len > 4) ? (s.len - 4) : 0];\n uint32_t n = 1;\n while (ptr != too_far) {\n ptr--;\n n++;\n if (*ptr < 0x80) {\n break;\n } else if (*ptr < 0xC0) {\n continue;\n }\n wuffs_base__utf_8__next__output o =\n wuffs_base__utf_8__next(wuffs_base__make_slice_u8(ptr, n));\n if (o.byte_length != n) {\n break;\n }\n return o;\n }\n }\n\n return wuffs_base__make_utf_8__next__output(\n WUFFS_BASE__UNICODE_REPLACEMENT_CHARACTER, 1);\n}\n\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__utf_8__longest_valid_prefix(wuffs_base__slice_u8 s) {\n // TODO: possibly optimize the all-ASCII case (4 or 8 bytes at a time).\n //\n // TODO: possibly optimize this b" +
+ "y manually inlining the\n // wuffs_base__utf_8__next calls.\n size_t original_len = s.len;\n while (s.len > 0) {\n wuffs_base__utf_8__next__output o = wuffs_base__utf_8__next(s);\n if ((o.code_point > 0x7F) && (o.byte_length == 1)) {\n break;\n }\n s.ptr += o.byte_length;\n s.len -= o.byte_length;\n }\n return original_len - s.len;\n}\n\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__ascii__longest_valid_prefix(wuffs_base__slice_u8 s) {\n // TODO: possibly optimize this by checking 4 or 8 bytes at a time.\n uint8_t* original_ptr = s.ptr;\n uint8_t* p = s.ptr;\n uint8_t* q = s.ptr + s.len;\n for (; (p != q) && ((*p & 0x80) == 0); p++) {\n }\n return (size_t)(p - original_ptr);\n}\n" +
""
const BaseFundamentalPrivateH = "" +
@@ -468,8 +469,8 @@
"FFF));\n case 4:\n return (0x00010000 <= cp) && (cp <= 0x0010FFFF);\n }\n }\n return false;\n}\n\n#ifdef __cplusplus\n\ninline bool //\nwuffs_base__utf_8__next__output::is_valid() const {\n return wuffs_base__utf_8__next__output__is_valid(this);\n}\n\n#endif // __cplusplus\n\n" +
"" +
"// --------\n\n// wuffs_base__utf_8__encode writes the UTF-8 encoding of code_point to s and\n// returns the number of bytes written. If code_point is invalid, or if s is\n// shorter than the entire encoding, it returns 0 (and no bytes are written).\n//\n// s will never be too short if its length is at least 4, also known as\n// WUFFS_BASE__UTF_8__BYTE_LENGTH__MAX_INCL.\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__utf_8__encode(wuffs_base__slice_u8 dst, uint32_t code_point);\n\n// wuffs_base__utf_8__next returns the next UTF-8 code point (and that code\n// point's byte length) at the start of s.\n//\n// There are exactly two cases in which this function returns something where\n// wuffs_base__utf_8__next__output__is_valid is false:\n// - If s is empty then it returns {.code_point=0, .byte_length=0}.\n// - If s is non-empty and starts with invalid UTF-8 then it returns\n// {.code_point=WUFFS_BASE__UNICODE_REPLACEMENT_CHARACTER, .byte_length=1}.\n//\n// Otherwise, it returns something where\n// wuffs_base__utf_8__next__ou" +
- "tput__is_valid is true.\n//\n// In any case, it always returns an output that satisfies both of:\n// - (output.code_point <= WUFFS_BASE__UNICODE_CODE_POINT__MAX_INCL).\n// - (output.byte_length <= s.len).\n//\n// If s is a sub-slice of a larger slice of valid UTF-8, but that sub-slice\n// boundary occurs in the middle of a multi-byte UTF-8 encoding of a single\n// code point, then this function may return something invalid. It is the\n// caller's responsibility to split on or otherwise manage UTF-8 boundaries.\nWUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //\nwuffs_base__utf_8__next(wuffs_base__slice_u8 s);\n\n// wuffs_base__utf_8__longest_valid_prefix returns the largest n such that the\n// sub-slice s[..n] is valid UTF-8.\n//\n// In particular, it returns s.len if and only if all of s is valid UTF-8.\n//\n// If s is a sub-slice of a larger slice of valid UTF-8, but that sub-slice\n// boundary occurs in the middle of a multi-byte UTF-8 encoding of a single\n// code point, then this function will return less than" +
- " s.len. It is the\n// caller's responsibility to split on or otherwise manage UTF-8 boundaries.\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__utf_8__longest_valid_prefix(wuffs_base__slice_u8 s);\n\n// wuffs_base__ascii__longest_valid_prefix returns the largest n such that the\n// sub-slice s[..n] is valid ASCII.\n//\n// In particular, it returns s.len if and only if all of s is valid ASCII.\n// Equivalently, when none of the bytes in s have the 0x80 high bit set.\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__ascii__longest_valid_prefix(wuffs_base__slice_u8 s);\n" +
+ "tput__is_valid is true.\n//\n// In any case, it always returns an output that satisfies both of:\n// - (output.code_point <= WUFFS_BASE__UNICODE_CODE_POINT__MAX_INCL).\n// - (output.byte_length <= s.len).\n//\n// If s is a sub-slice of a larger slice of valid UTF-8, but that sub-slice\n// boundary occurs in the middle of a multi-byte UTF-8 encoding of a single\n// code point, then this function may return something invalid. It is the\n// caller's responsibility to split on or otherwise manage UTF-8 boundaries.\nWUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //\nwuffs_base__utf_8__next(wuffs_base__slice_u8 s);\n\n// wuffs_base__utf_8__next_from_end is like wuffs_base__utf_8__next except that\n// it looks at the end of s instead of the start.\nWUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //\nwuffs_base__utf_8__next_from_end(wuffs_base__slice_u8 s);\n\n// wuffs_base__utf_8__longest_valid_prefix returns the largest n such that the\n// sub-slice s[..n] is valid UTF-8.\n//\n// In particular, it returns s.len " +
+ "if and only if all of s is valid UTF-8.\n//\n// If s is a sub-slice of a larger slice of valid UTF-8, but that sub-slice\n// boundary occurs in the middle of a multi-byte UTF-8 encoding of a single\n// code point, then this function will return less than s.len. It is the\n// caller's responsibility to split on or otherwise manage UTF-8 boundaries.\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__utf_8__longest_valid_prefix(wuffs_base__slice_u8 s);\n\n// wuffs_base__ascii__longest_valid_prefix returns the largest n such that the\n// sub-slice s[..n] is valid ASCII.\n//\n// In particular, it returns s.len if and only if all of s is valid ASCII.\n// Equivalently, when none of the bytes in s have the 0x80 high bit set.\nWUFFS_BASE__MAYBE_STATIC size_t //\nwuffs_base__ascii__longest_valid_prefix(wuffs_base__slice_u8 s);\n" +
""
const BaseTokenPrivateH = "" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 6722b55..7098044 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -4261,6 +4261,11 @@
WUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //
wuffs_base__utf_8__next(wuffs_base__slice_u8 s);
+// wuffs_base__utf_8__next_from_end is like wuffs_base__utf_8__next except that
+// it looks at the end of s instead of the start.
+WUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //
+wuffs_base__utf_8__next_from_end(wuffs_base__slice_u8 s);
+
// wuffs_base__utf_8__longest_valid_prefix returns the largest n such that the
// sub-slice s[..n] is valid UTF-8.
//
@@ -12881,6 +12886,39 @@
WUFFS_BASE__UNICODE_REPLACEMENT_CHARACTER, 1);
}
+WUFFS_BASE__MAYBE_STATIC wuffs_base__utf_8__next__output //
+wuffs_base__utf_8__next_from_end(wuffs_base__slice_u8 s) {
+ if (s.len == 0) {  // Empty input: code_point 0, byte_length 0.
+ return wuffs_base__make_utf_8__next__output(0, 0);
+ }
+ uint8_t* ptr = &s.ptr[s.len - 1];  // Start at the last byte of s.
+ if (*ptr < 0x80) {  // High bit clear: the last byte is a 1-byte result.
+ return wuffs_base__make_utf_8__next__output(*ptr, 1);
+
+ } else if (*ptr < 0xC0) {  // 0x80 ..= 0xBF: scan backwards for a byte >= 0xC0.
+ uint8_t* too_far = &s.ptr[(s.len > 4) ? (s.len - 4) : 0];  // Lowest byte the scan may read; at most 4 bytes are examined.
+ uint32_t n = 1;  // Count of bytes from ptr to the end of s.
+ while (ptr != too_far) {
+ ptr--;
+ n++;
+ if (*ptr < 0x80) {  // Hit a byte < 0x80 before any byte >= 0xC0: give up.
+ break;
+ } else if (*ptr < 0xC0) {  // Another 0x80 ..= 0xBF byte: keep scanning.
+ continue;
+ }
+ wuffs_base__utf_8__next__output o =
+ wuffs_base__utf_8__next(wuffs_base__make_slice_u8(ptr, n));  // Decode forwards over the last n bytes.
+ if (o.byte_length != n) {  // Accept only if the decode used all n trailing bytes.
+ break;
+ }
+ return o;
+ }
+ }
+
+ return wuffs_base__make_utf_8__next__output(
+ WUFFS_BASE__UNICODE_REPLACEMENT_CHARACTER, 1);  // Every other case: replacement character, length 1.
+}
+
WUFFS_BASE__MAYBE_STATIC size_t //
wuffs_base__utf_8__longest_valid_prefix(wuffs_base__slice_u8 s) {
// TODO: possibly optimize the all-ASCII case (4 or 8 bytes at a time).
diff --git a/test/c/std/json.c b/test/c/std/json.c
index 4bd4834..3f3784e 100644
--- a/test/c/std/json.c
+++ b/test/c/std/json.c
@@ -1725,99 +1725,125 @@
the_nul_byte[0] = '\x00';
struct {
- uint32_t want_cp;
- uint32_t want_bl;
+ // The uint32_t want is packed as:
+ // - the high 8 bits are the byte length.
+ // - the low 24 bits are the code point.
+ uint32_t want0; // For wuffs_base__utf_8__next.
+ uint32_t want1; // For wuffs_base__utf_8__next_from_end.
const char* str;
} test_cases[] = {
- {.want_cp = 0x00000000, .want_bl = 0, .str = ""},
- {.want_cp = 0x00000000, .want_bl = 1, .str = "The <NUL> byte"},
- {.want_cp = 0x00000009, .want_bl = 1, .str = "\t"},
- {.want_cp = 0x00000041, .want_bl = 1, .str = "A"},
- {.want_cp = 0x00000061, .want_bl = 1, .str = "abdefghij"},
- {.want_cp = 0x0000007F, .want_bl = 1, .str = "\x7F"},
- {.want_cp = 0x00000080, .want_bl = 2, .str = "\xC2\x80"},
- {.want_cp = 0x000007FF, .want_bl = 2, .str = "\xDF\xBF"},
- {.want_cp = 0x00000800, .want_bl = 3, .str = "\xE0\xA0\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 3, .str = "\xEF\xBF\xBD"},
- {.want_cp = 0x0000FFFF, .want_bl = 3, .str = "\xEF\xBF\xBF"},
- {.want_cp = 0x00010000, .want_bl = 4, .str = "\xF0\x90\x80\x80"},
- {.want_cp = 0x0010FFFF, .want_bl = 4, .str = "\xF4\x8F\xBF\xBF"},
+ {.want0 = 0x00000000, .want1 = 0x00000000, .str = ""},
+ {.want0 = 0x01000000, .want1 = 0x01000000, .str = "The <NUL> byte"},
+ {.want0 = 0x01000009, .want1 = 0x01000009, .str = "\t"},
+ {.want0 = 0x01000041, .want1 = 0x01000041, .str = "A"},
+ {.want0 = 0x01000061, .want1 = 0x0100006A, .str = "abdefghij"},
+ {.want0 = 0x0100007F, .want1 = 0x0100007F, .str = "\x7F"},
+ {.want0 = 0x02000080, .want1 = 0x02000080, .str = "\xC2\x80"},
+ {.want0 = 0x020007FF, .want1 = 0x020007FF, .str = "\xDF\xBF"},
+ {.want0 = 0x03000800, .want1 = 0x03000800, .str = "\xE0\xA0\x80"},
+ {.want0 = 0x0300FFFD, .want1 = 0x0300FFFD, .str = "\xEF\xBF\xBD"},
+ {.want0 = 0x0300FFFF, .want1 = 0x0300FFFF, .str = "\xEF\xBF\xBF"},
+ {.want0 = 0x04010000, .want1 = 0x04010000, .str = "\xF0\x90\x80\x80"},
+ {.want0 = 0x0410FFFF, .want1 = 0x0410FFFF, .str = "\xF4\x8F\xBF\xBF"},
// U+00000394 GREEK CAPITAL LETTER DELTA.
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94"},
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94+"},
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94++"},
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94+++"},
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94++++"},
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94\x80"},
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94\x80\x80"},
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94\x80\x80\x80"},
- {.want_cp = 0x00000394, .want_bl = 2, .str = "\xCE\x94\x80\x80\x80\x80"},
+ {.want0 = 0x02000394, .want1 = 0x02000394, .str = "\xCE\x94"},
+ {.want0 = 0x02000394, .want1 = 0x01000070, .str = "\xCE\x94p"},
+ {.want0 = 0x02000394, .want1 = 0x01000071, .str = "\xCE\x94pq"},
+ {.want0 = 0x02000394, .want1 = 0x01000072, .str = "\xCE\x94pqr"},
+ {.want0 = 0x02000394, .want1 = 0x01000073, .str = "\xCE\x94pqrs"},
+ {.want0 = 0x02000394, .want1 = 0x0100FFFD, .str = "\xCE\x94\x80"},
+ {.want0 = 0x02000394, .want1 = 0x0100FFFD, .str = "\xCE\x94\x80\x81"},
+ {.want0 = 0x02000394, .want1 = 0x0100FFFD, .str = "\xCE\x94\x80\x81\x82"},
+ {.want0 = 0x02000394,
+ .want1 = 0x0100FFFD,
+ .str = "\xCE\x94\x80\x81\x82\x83"},
+ {.want0 = 0x01000070, .want1 = 0x02000394, .str = "p\xCE\x94"},
// U+00002603 SNOWMAN.
- {.want_cp = 0x00002603, .want_bl = 3, .str = "\xE2\x98\x83"},
- {.want_cp = 0x00002603, .want_bl = 3, .str = "\xE2\x98\x83+"},
- {.want_cp = 0x00002603, .want_bl = 3, .str = "\xE2\x98\x83++"},
- {.want_cp = 0x00002603, .want_bl = 3, .str = "\xE2\x98\x83+++"},
- {.want_cp = 0x00002603, .want_bl = 3, .str = "\xE2\x98\x83++++"},
- {.want_cp = 0x00002603, .want_bl = 3, .str = "\xE2\x98\x83\xFF"},
+ {.want0 = 0x03002603, .want1 = 0x03002603, .str = "\xE2\x98\x83"},
+ {.want0 = 0x03002603, .want1 = 0x01000070, .str = "\xE2\x98\x83p"},
+ {.want0 = 0x03002603, .want1 = 0x01000071, .str = "\xE2\x98\x83pq"},
+ {.want0 = 0x03002603, .want1 = 0x01000072, .str = "\xE2\x98\x83pqr"},
+ {.want0 = 0x03002603, .want1 = 0x01000073, .str = "\xE2\x98\x83pqrs"},
+ {.want0 = 0x03002603, .want1 = 0x0100FFFD, .str = "\xE2\x98\x83\xFF"},
+ {.want0 = 0x01000070, .want1 = 0x03002603, .str = "p\xE2\x98\x83"},
// U+0001F4A9 PILE OF POO.
- {.want_cp = 0x0001F4A9, .want_bl = 4, .str = "\xF0\x9F\x92\xA9"},
- {.want_cp = 0x0001F4A9, .want_bl = 4, .str = "\xF0\x9F\x92\xA9+"},
- {.want_cp = 0x0001F4A9, .want_bl = 4, .str = "\xF0\x9F\x92\xA9++"},
- {.want_cp = 0x0001F4A9, .want_bl = 4, .str = "\xF0\x9F\x92\xA9+++"},
- {.want_cp = 0x0001F4A9, .want_bl = 4, .str = "\xF0\x9F\x92\xA9++++"},
- {.want_cp = 0x0001F4A9, .want_bl = 4, .str = "\xF0\x9F\x92\xA9\xFF"},
+ {.want0 = 0x0401F4A9, .want1 = 0x0401F4A9, .str = "\xF0\x9F\x92\xA9"},
+ {.want0 = 0x0401F4A9, .want1 = 0x01000070, .str = "\xF0\x9F\x92\xA9p"},
+ {.want0 = 0x0401F4A9, .want1 = 0x01000071, .str = "\xF0\x9F\x92\xA9pq"},
+ {.want0 = 0x0401F4A9, .want1 = 0x01000072, .str = "\xF0\x9F\x92\xA9pqr"},
+ {.want0 = 0x0401F4A9, .want1 = 0x01000073, .str = "\xF0\x9F\x92\xA9pqrs"},
+ {.want0 = 0x0401F4A9, .want1 = 0x0100FFFD, .str = "\xF0\x9F\x92\xA9\xFF"},
+ {.want0 = 0x01000070, .want1 = 0x0401F4A9, .str = "p\xF0\x9F\x92\xA9"},
// Invalid.
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xBF"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xC0\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xC1\xBF"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xC2"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xC2\x7F"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xC2\xC0"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xC2\xFF"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xCE"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xDF\xC0"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xDF\xFF"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xE0\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xE0\x80\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xE0\x9F\xBF"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xE2"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xF0"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xF0\x80\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xF0\x80\x80\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xF0\x8F\xBF\xBF"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xF4\x90\x80\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xF5"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xF6\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xF7\x80\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xFF\xFF\xFF\xFF"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\x80"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xBF"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xC0\x80"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xC1\xBF"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xC2"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100007F, .str = "\xC2\x7F"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xC2\xC0"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xC2\xFF"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xCE"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xDF\xC0"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xDF\xFF"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xE0\x80"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xE0\x80\x81"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xE0\x9F\xBF"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xE2"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF0"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF0\x80\x81"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF0\x80\x81\x82"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF0\x8F\xBF\xBF"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF4\x90\x81\x82"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF5"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF6\x80"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF7\x80\x81"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xF8\x90\x91\x92\x93"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xFF\xFF\xFF\xFF"},
// Invalid. UTF-8 cannot contain the surrogates U+D800 ..= U+DFFF.
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xED\xA0\x80"},
- {.want_cp = 0x0000FFFD, .want_bl = 1, .str = "\xED\xBF\xBF"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xED\xA0\x80"},
+ {.want0 = 0x0100FFFD, .want1 = 0x0100FFFD, .str = "\xED\xBF\xBF"},
};
int tc;
for (tc = 0; tc < WUFFS_TESTLIB_ARRAY_SIZE(test_cases); tc++) {
wuffs_base__slice_u8 s = wuffs_base__make_slice_u8(
(void*)test_cases[tc].str, strlen(test_cases[tc].str));
-
// Override "The <NUL> byte" with "\x00".
- if ((test_cases[tc].want_cp == 0) && (test_cases[tc].want_bl == 1)) {
+ if (test_cases[tc].want0 == 0x01000000) {
s = wuffs_base__make_slice_u8(&the_nul_byte[0], 1);
}
- wuffs_base__utf_8__next__output have = wuffs_base__utf_8__next(s);
- if ((have.code_point != test_cases[tc].want_cp) ||
- (have.byte_length != test_cases[tc].want_bl)) {
- RETURN_FAIL("\"%s\": have cp=0x%" PRIX32 " bl=%" PRIu32
- ", want cp=0x%" PRIX32 " bl=%" PRIu32,
- test_cases[tc].str, have.code_point, have.byte_length,
- test_cases[tc].want_cp, test_cases[tc].want_bl);
+ // Test wuffs_base__utf_8__next.
+ {
+ uint32_t want_bl = test_cases[tc].want0 >> 24;
+ uint32_t want_cp = test_cases[tc].want0 & 0xFFFFFF;
+ wuffs_base__utf_8__next__output have = wuffs_base__utf_8__next(s);
+ if ((have.code_point != want_cp) || (have.byte_length != want_bl)) {
+ RETURN_FAIL("next(\"%s\"): have cp=0x%" PRIX32 " bl=%" PRIu32
+ ", want cp=0x%" PRIX32 " bl=%" PRIu32,
+ test_cases[tc].str, have.code_point, have.byte_length,
+ want_cp, want_bl);
+ }
+ }
+
+ // Test wuffs_base__utf_8__next_from_end.
+ {
+ uint32_t want_bl = test_cases[tc].want1 >> 24;
+ uint32_t want_cp = test_cases[tc].want1 & 0xFFFFFF;
+ wuffs_base__utf_8__next__output have =
+ wuffs_base__utf_8__next_from_end(s);
+ if ((have.code_point != want_cp) || (have.byte_length != want_bl)) {
+ RETURN_FAIL("next_from_end(\"%s\"): have cp=0x%" PRIX32 " bl=%" PRIu32
+ ", want cp=0x%" PRIX32 " bl=%" PRIu32,
+ test_cases[tc].str, have.code_point, have.byte_length,
+ want_cp, want_bl);
+ }
}
}