UTF-8: Reject surrogates and out-of-range code points.
diff --git a/ChangeLog b/ChangeLog
index 2818f2d..69c7f7f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2016-11-17 Bruno Haible <bruno@clisp.org>
+
+ UTF-8: Reject surrogates and out-of-range code points.
+ * lib/utf8.h (utf8_mbtowc, utf8_wctomb): Reject code points in the
+ range 0xD800..0xDFFF and >= 0x110000.
+ * tests/genutf8.c (main): Don't emit mappings for 0xD800..0xDFFF.
+
2016-10-22 Bruno Haible <bruno@clisp.org>
Switch to libtool 2.4.6.
diff --git a/NEWS b/NEWS
index aebc36c..cb2a5fb 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,5 @@
New in 1.15:
+* The UTF-8 converter now rejects surrogates and out-of-range code points.
* Added ISO-2022-JP-MS converter.
* Updated the CP1255 converter to map one more character.
* The functions now support strings longer than 2 GB.
diff --git a/lib/utf8.h b/lib/utf8.h
index 8fab264..016ac52 100644
--- a/lib/utf8.h
+++ b/lib/utf8.h
@@ -45,7 +45,8 @@
if (n < 3)
return RET_TOOFEW(0);
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (c >= 0xe1 || s[1] >= 0xa0)))
+ && (c >= 0xe1 || s[1] >= 0xa0)
+ && (c != 0xed || s[1] < 0xa0)))
return RET_ILSEQ;
*pwc = ((ucs4_t) (c & 0x0f) << 12)
| ((ucs4_t) (s[1] ^ 0x80) << 6)
@@ -56,41 +57,14 @@
return RET_TOOFEW(0);
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
&& (s[3] ^ 0x80) < 0x40
- && (c >= 0xf1 || s[1] >= 0x90)))
+ && (c >= 0xf1 || s[1] >= 0x90)
+ && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90))))
return RET_ILSEQ;
*pwc = ((ucs4_t) (c & 0x07) << 18)
| ((ucs4_t) (s[1] ^ 0x80) << 12)
| ((ucs4_t) (s[2] ^ 0x80) << 6)
| (ucs4_t) (s[3] ^ 0x80);
return 4;
- } else if (c < 0xfc && sizeof(ucs4_t)*8 >= 32) {
- if (n < 5)
- return RET_TOOFEW(0);
- if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (c >= 0xf9 || s[1] >= 0x88)))
- return RET_ILSEQ;
- *pwc = ((ucs4_t) (c & 0x03) << 24)
- | ((ucs4_t) (s[1] ^ 0x80) << 18)
- | ((ucs4_t) (s[2] ^ 0x80) << 12)
- | ((ucs4_t) (s[3] ^ 0x80) << 6)
- | (ucs4_t) (s[4] ^ 0x80);
- return 5;
- } else if (c < 0xfe && sizeof(ucs4_t)*8 >= 32) {
- if (n < 6)
- return RET_TOOFEW(0);
- if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
- && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
- && (s[5] ^ 0x80) < 0x40
- && (c >= 0xfd || s[1] >= 0x84)))
- return RET_ILSEQ;
- *pwc = ((ucs4_t) (c & 0x01) << 30)
- | ((ucs4_t) (s[1] ^ 0x80) << 24)
- | ((ucs4_t) (s[2] ^ 0x80) << 18)
- | ((ucs4_t) (s[3] ^ 0x80) << 12)
- | ((ucs4_t) (s[4] ^ 0x80) << 6)
- | (ucs4_t) (s[5] ^ 0x80);
- return 6;
} else
return RET_ILSEQ;
}
@@ -103,21 +77,18 @@
count = 1;
else if (wc < 0x800)
count = 2;
- else if (wc < 0x10000)
- count = 3;
- else if (wc < 0x200000)
+ else if (wc < 0x10000) {
+ if (wc < 0xd800 || wc >= 0xe000)
+ count = 3;
+ else
+ return RET_ILUNI;
+ } else if (wc < 0x110000)
count = 4;
- else if (wc < 0x4000000)
- count = 5;
- else if (wc <= 0x7fffffff)
- count = 6;
else
return RET_ILUNI;
if (n < count)
return RET_TOOSMALL;
switch (count) { /* note: code falls through cases! */
- case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
- case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
diff --git a/tests/genutf8.c b/tests/genutf8.c
index 85086fb..e20477a 100644
--- a/tests/genutf8.c
+++ b/tests/genutf8.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000, 2004-2005, 2012 Free Software Foundation, Inc.
+/* Copyright (C) 2000, 2004-2005, 2012, 2016 Free Software Foundation, Inc.
This file is part of the GNU LIBICONV Library.
The GNU LIBICONV Library is free software; you can redistribute it
@@ -39,11 +39,14 @@
for (i1 = 2; i1 < 32; i1++)
for (i2 = 0; i2 < 64; i2++)
printf("0x%02X%02X\t0x%04X\n", 0xc0+i1,0x80+i2, (i1<<6)+i2);
- /* Range 0x0800..0xffff */
+ /* Range 0x0800..0xffff, except 0xd800..0xdfff */
for (i1 = 0; i1 < 16; i1++)
for (i2 = (i1==0 ? 32 : 0); i2 < 64; i2++)
- for (i3 = 0; i3 < 64; i3++)
- printf("0x%02X%02X%02X\t0x%04X\n", 0xe0+i1,0x80+i2,0x80+i3, (i1<<12)+(i2<<6)+i3);
+ for (i3 = 0; i3 < 64; i3++) {
+ int u = (i1<<12)+(i2<<6)+i3;
+ if (!(u >= 0xd800 && u < 0xe000))
+ printf("0x%02X%02X%02X\t0x%04X\n", 0xe0+i1,0x80+i2,0x80+i3, u);
+ }
if (ferror(stdout) || fclose(stdout))
exit(1);