blob: b4ab58def7db230a996498fd2a563fa4d3bc0af3 [file] [log] [blame] [edit]
/* See LICENSE file for copyright and license details. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "../grapheme.h"
#include "util.h"
/*
 * Test vectors for grapheme_decode_utf8().
 *
 * Each entry describes one input byte sequence together with the values
 * the test driver expects back from the decoder. In the per-entry
 * comments the bracketed bit patterns are the input bytes, followed by
 * the expected decoded bits or INVALID.
 *
 * As encoded in the vectors below:
 *  - for truncated sequences exp_len is the full sequence length
 *    announced by the lead byte and is therefore larger than len,
 *  - for malformed continuation bytes exp_len counts the bytes up to,
 *    but excluding, the malformed byte,
 *  - for overlong encodings, surrogates and out-of-range values exp_len
 *    is the full sequence length.
 */
static const struct {
char *arr; /* UTF-8 byte sequence (not NUL-terminated, may be NULL) */
size_t len; /* length of UTF-8 byte sequence */
size_t exp_len; /* expected length returned */
uint_least32_t exp_cp; /* expected codepoint returned */
} dec_test[] = {
{
/* empty sequence
* [ ] ->
* INVALID
*/
.arr = NULL,
.len = 0,
.exp_len = 0,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid lead byte
* [ 11111101 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xFD },
.len = 1,
.exp_len = 1,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* valid 1-byte sequence
* [ 00000001 ] ->
* 0000001
*/
.arr = (char *)(unsigned char[]) { 0x01 },
.len = 1,
.exp_len = 1,
.exp_cp = 0x1,
},
{
/* valid 2-byte sequence
* [ 11000011 10111111 ] ->
* 00011111111
*/
.arr = (char *)(unsigned char[]) { 0xC3, 0xBF },
.len = 2,
.exp_len = 2,
.exp_cp = 0xFF,
},
{
/* invalid 2-byte sequence (second byte missing)
* [ 11000011 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xC3 },
.len = 1,
.exp_len = 2,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 2-byte sequence (second byte malformed)
* [ 11000011 11111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xC3, 0xFF },
.len = 2,
.exp_len = 1,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 2-byte sequence (overlong encoded)
* [ 11000001 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xC1, 0xBF },
.len = 2,
.exp_len = 2,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* valid 3-byte sequence
* [ 11100000 10111111 10111111 ] ->
* 0000111111111111
*/
.arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0xBF },
.len = 3,
.exp_len = 3,
.exp_cp = 0xFFF,
},
{
/* invalid 3-byte sequence (second byte missing)
* [ 11100000 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xE0 },
.len = 1,
.exp_len = 3,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 3-byte sequence (second byte malformed)
* [ 11100000 01111111 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xE0, 0x7F, 0xBF },
.len = 3,
.exp_len = 1,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 3-byte sequence (short string, second byte malformed)
* [ 11100000 01111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xE0, 0x7F },
.len = 2,
.exp_len = 1,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 3-byte sequence (third byte missing)
* [ 11100000 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xE0, 0xBF },
.len = 2,
.exp_len = 3,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 3-byte sequence (third byte malformed)
* [ 11100000 10111111 01111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0x7F },
.len = 3,
.exp_len = 2,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 3-byte sequence (overlong encoded)
* [ 11100000 10011111 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xE0, 0x9F, 0xBF },
.len = 3,
.exp_len = 3,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 3-byte sequence (UTF-16 surrogate half)
* [ 11101101 10100000 10000000 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xED, 0xA0, 0x80 },
.len = 3,
.exp_len = 3,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* valid 4-byte sequence
* [ 11110011 10111111 10111111 10111111 ] ->
* 011111111111111111111
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0xBF },
.len = 4,
.exp_len = 4,
.exp_cp = UINT32_C(0xFFFFF),
},
{
/* invalid 4-byte sequence (second byte missing)
* [ 11110011 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3 },
.len = 1,
.exp_len = 4,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (second byte malformed)
* [ 11110011 01111111 10111111 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF, 0xBF },
.len = 4,
.exp_len = 1,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (short string 1, second byte malformed)
* [ 11110011 01111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0x7F },
.len = 2,
.exp_len = 1,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (short string 2, second byte malformed)
* [ 11110011 01111111 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF },
.len = 3,
.exp_len = 1,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (third byte missing)
* [ 11110011 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0xBF },
.len = 2,
.exp_len = 4,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (third byte malformed)
* [ 11110011 10111111 01111111 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F, 0xBF },
.len = 4,
.exp_len = 2,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (short string, third byte malformed)
* [ 11110011 10111111 01111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F },
.len = 3,
.exp_len = 2,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (fourth byte missing)
* [ 11110011 10111111 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF },
.len = 3,
.exp_len = 4,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (fourth byte malformed)
* [ 11110011 10111111 10111111 01111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0x7F },
.len = 4,
.exp_len = 3,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (overlong encoded)
* [ 11110000 10000000 10000001 10111111 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF0, 0x80, 0x81, 0xBF },
.len = 4,
.exp_len = 4,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
{
/* invalid 4-byte sequence (UTF-16-unrepresentable)
* [ 11110100 10010000 10000000 10000000 ] ->
* INVALID
*/
.arr = (char *)(unsigned char[]) { 0xF4, 0x90, 0x80, 0x80 },
.len = 4,
.exp_len = 4,
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
},
};
/*
 * Test driver: feeds every entry of dec_test to grapheme_decode_utf8()
 * and compares both the returned length and the decoded codepoint with
 * the expected values stored in the entry.
 *
 * Each mismatch is reported on stderr; a pass/total summary is printed
 * on stdout. argv[0] is used as the prefix of every message.
 *
 * Returns 0 if all tests passed and 1 if at least one failed.
 */
int
main(int argc, char *argv[])
{
	size_t i, failed;

	(void)argc;

	/* UTF-8 decoder test */
	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
		size_t len;
		uint_least32_t cp;

		len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].len,
		                           &cp);

		if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) {
			/*
			 * uint_least32_t is not guaranteed to be unsigned
			 * int, so passing it to %u would be a format/argument
			 * mismatch on some platforms. unsigned long is at
			 * least 32 bits wide, so the conversion is lossless
			 * for every value a codepoint can take here.
			 */
			fprintf(stderr,
			        "%s: Failed test %zu: "
			        "Expected (%zx,%lu), but got (%zx,%lu).\n",
			        argv[0], i, dec_test[i].exp_len,
			        (unsigned long)dec_test[i].exp_cp, len,
			        (unsigned long)cp);
			failed++;
		}
	}

	printf("%s: %zu/%zu unit tests passed.\n", argv[0],
	       LEN(dec_test) - failed, LEN(dec_test));

	return (failed > 0) ? 1 : 0;
}