GB18030: Help transitioning away from PUA code points.
* lib/gb18030ext.h (gb18030_2005_ext_wctomb): Remove function.
(gb18030ext_wctomb): Renamed from gb18030_2022_ext_wctomb.
* lib/gb18030uni.h (gb18030_2005_uni_wctomb): Map 6 Ext-B code points to
4-byte sequences.
(gb18030_2022_uni_wctomb): Small refactoring.
* lib/gb18030_2005.h (gb18030_2005_pua2charset): Map 6 PUA code points
to 4-byte sequences instead of 2-byte sequences.
(gb18030_2005_wctomb): Update accordingly. Invoke gb18030ext_wctomb
instead of gb18030_2005_ext_wctomb.
* lib/gb18030_2022.h (gb18030_2022_wctomb): Invoke gb18030ext_wctomb
instead of gb18030_2022_ext_wctomb.
* tests/GB18030-2005.IRREVERSIBLE.TXT: Update the inverse mappings of 6
Ext-B code points and 6 PUA code points.
* NEWS: Mention it.
diff --git a/ChangeLog b/ChangeLog
index e95e3b9..d077b4d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2023-05-29 Bruno Haible <bruno@clisp.org>
+
+ GB18030: Help transitioning away from PUA code points.
+ * lib/gb18030ext.h (gb18030_2005_ext_wctomb): Remove function.
+ (gb18030ext_wctomb): Renamed from gb18030_2022_ext_wctomb.
+ * lib/gb18030uni.h (gb18030_2005_uni_wctomb): Map 6 Ext-B code points to
+ 4-byte sequences.
+ (gb18030_2022_uni_wctomb): Small refactoring.
+ * lib/gb18030_2005.h (gb18030_2005_pua2charset): Map 6 PUA code points
+ to 4-byte sequences instead of 2-byte sequences.
+ (gb18030_2005_wctomb): Update accordingly. Invoke gb18030ext_wctomb
+ instead of gb18030_2005_ext_wctomb.
+ * lib/gb18030_2022.h (gb18030_2022_wctomb): Invoke gb18030ext_wctomb
+ instead of gb18030_2022_ext_wctomb.
+ * tests/GB18030-2005.IRREVERSIBLE.TXT: Update the inverse mappings of 6
+ Ext-B code points and 6 PUA code points.
+ * NEWS: Mention it.
+
2023-05-24 Bruno Haible <bruno@clisp.org>
man pages: List a fifth condition when iconv(3) may stop.
diff --git a/NEWS b/NEWS
index 5321981..364b741 100644
--- a/NEWS
+++ b/NEWS
@@ -3,7 +3,8 @@
* GB18030 is now an alias for GB18030:2005. A new converter for GB18030:2022
is added. Since this encoding merely cleans up a few private-use-area
mappings, you can continue to use the GB18030 converter, for backward
- compatibility.
+ compatibility. Its Unicode to GB18030 conversion direction has been
+ enhanced, to help with the transition away from PUA code points.
* When converting from/to an EBCDIC encoding, a non-standard way of
converting newlines can be requested
- at the C level, by calling iconvctl with argument ICONV_SET_FROM_SURFACE
diff --git a/lib/gb18030_2005.h b/lib/gb18030_2005.h
index 43f952f..4e7fca5 100644
--- a/lib/gb18030_2005.h
+++ b/lib/gb18030_2005.h
@@ -266,39 +266,43 @@
}
}
-static const unsigned short gb18030_2005_pua2charset[31*3] = {
-/* Unicode range GB18030 range */
- 0xe766, 0xe76b, 0xa2ab, /*.. 0xa2b0, */
- 0xe76d, 0xe76d, 0xa2e4,
- 0xe76e, 0xe76f, 0xa2ef, /*.. 0xa2f0, */
- 0xe770, 0xe771, 0xa2fd, /*.. 0xa2fe, */
- 0xe772, 0xe77c, 0xa4f4, /*.. 0xa4fe, */
- 0xe77d, 0xe784, 0xa5f7, /*.. 0xa5fe, */
- 0xe785, 0xe78c, 0xa6b9, /*.. 0xa6c0, */
- 0xe78d, 0xe793, 0xa6d9, /*.. 0xa6df, */
- 0xe794, 0xe795, 0xa6ec, /*.. 0xa6ed, */
- 0xe796, 0xe796, 0xa6f3,
- 0xe797, 0xe79f, 0xa6f6, /*.. 0xa6fe, */
- 0xe7a0, 0xe7ae, 0xa7c2, /*.. 0xa7d0, */
- 0xe7af, 0xe7bb, 0xa7f2, /*.. 0xa7fe, */
- 0xe7bc, 0xe7c6, 0xa896, /*.. 0xa8a0, */
- 0xe7c9, 0xe7cc, 0xa8c1, /*.. 0xa8c4, */
- 0xe7cd, 0xe7e1, 0xa8ea, /*.. 0xa8fe, */
- 0xe7e2, 0xe7e2, 0xa958,
- 0xe7e3, 0xe7e3, 0xa95b,
- 0xe7e4, 0xe7e6, 0xa95d, /*.. 0xa95f, */
- 0xe7f4, 0xe800, 0xa997, /*.. 0xa9a3, */
- 0xe801, 0xe80f, 0xa9f0, /*.. 0xa9fe, */
- 0xe810, 0xe814, 0xd7fa, /*.. 0xd7fe, */
- 0xe816, 0xe818, 0xfe51, /*.. 0xfe53, */
- 0xe81e, 0xe81e, 0xfe59,
- 0xe826, 0xe826, 0xfe61,
- 0xe82b, 0xe82c, 0xfe66, /*.. 0xfe67, */
- 0xe831, 0xe832, 0xfe6c, /*.. 0xfe6d, */
- 0xe83b, 0xe83b, 0xfe76,
- 0xe843, 0xe843, 0xfe7e,
- 0xe854, 0xe855, 0xfe90, /*.. 0xfe91, */
- 0xe864, 0xe864, 0xfea0,
+static const struct { unsigned short uni[2]; unsigned int charset; } gb18030_2005_pua2charset[35] = {
+/* Unicode range GB18030 range */
+ { { 0xe766, 0xe76b }, 0xa2ab /*.. 0xa2b0, */ },
+ { { 0xe76d, 0xe76d }, 0xa2e4 },
+ { { 0xe76e, 0xe76f }, 0xa2ef /*.. 0xa2f0, */ },
+ { { 0xe770, 0xe771 }, 0xa2fd /*.. 0xa2fe, */ },
+ { { 0xe772, 0xe77c }, 0xa4f4 /*.. 0xa4fe, */ },
+ { { 0xe77d, 0xe784 }, 0xa5f7 /*.. 0xa5fe, */ },
+ { { 0xe785, 0xe78c }, 0xa6b9 /*.. 0xa6c0, */ },
+ { { 0xe78d, 0xe793 }, 0xa6d9 /*.. 0xa6df, */ },
+ { { 0xe794, 0xe795 }, 0xa6ec /*.. 0xa6ed, */ },
+ { { 0xe796, 0xe796 }, 0xa6f3 },
+ { { 0xe797, 0xe79f }, 0xa6f6 /*.. 0xa6fe, */ },
+ { { 0xe7a0, 0xe7ae }, 0xa7c2 /*.. 0xa7d0, */ },
+ { { 0xe7af, 0xe7bb }, 0xa7f2 /*.. 0xa7fe, */ },
+ { { 0xe7bc, 0xe7c6 }, 0xa896 /*.. 0xa8a0, */ },
+ { { 0xe7c9, 0xe7cc }, 0xa8c1 /*.. 0xa8c4, */ },
+ { { 0xe7cd, 0xe7e1 }, 0xa8ea /*.. 0xa8fe, */ },
+ { { 0xe7e2, 0xe7e2 }, 0xa958 },
+ { { 0xe7e3, 0xe7e3 }, 0xa95b },
+ { { 0xe7e4, 0xe7e6 }, 0xa95d /*.. 0xa95f, */ },
+ { { 0xe7f4, 0xe800 }, 0xa997 /*.. 0xa9a3, */ },
+ { { 0xe801, 0xe80f }, 0xa9f0 /*.. 0xa9fe, */ },
+ { { 0xe810, 0xe814 }, 0xd7fa /*.. 0xd7fe, */ },
+ { { 0xe816, 0xe816 }, 0x95329031 },
+ { { 0xe817, 0xe817 }, 0x95329033 },
+ { { 0xe818, 0xe818 }, 0x95329730 },
+ { { 0xe81e, 0xe81e }, 0xfe59 },
+ { { 0xe826, 0xe826 }, 0xfe61 },
+ { { 0xe82b, 0xe82c }, 0xfe66 /*.. 0xfe67, */ },
+ { { 0xe831, 0xe831 }, 0x9536b937 },
+ { { 0xe832, 0xe832 }, 0xfe6d },
+ { { 0xe83b, 0xe83b }, 0x9630ba35 },
+ { { 0xe843, 0xe843 }, 0xfe7e },
+ { { 0xe854, 0xe854 }, 0xfe90 },
+ { { 0xe855, 0xe855 }, 0x9635b630 },
+ { { 0xe864, 0xe864 }, 0xfea0 },
};
static int
@@ -316,7 +320,7 @@
if (ret != RET_ILUNI)
return ret;
- ret = gb18030_2005_ext_wctomb(conv,r,wc,n);
+ ret = gb18030ext_wctomb(conv,r,wc,n);
if (ret != RET_ILUNI)
return ret;
@@ -337,23 +341,32 @@
return 2;
}
} else {
- /* User-defined characters, two-byte part of range U+E766..U+E864 */
+ /* User-defined characters, two-byte part and 6 four-byte mappings in
+ range U+E766..U+E864 */
unsigned int k1 = 0;
- unsigned int k2 = 31;
+ unsigned int k2 = 35;
/* Invariant: We know that if wc occurs in Unicode interval in
gb18030_2005_pua2charset, it does so at a k with k1 <= k < k2. */
while (k1 < k2) {
unsigned int k = (k1 + k2) / 2;
- if (wc < gb18030_2005_pua2charset[k*3+0])
+ if (wc < gb18030_2005_pua2charset[k].uni[0])
k2 = k;
- else if (wc > gb18030_2005_pua2charset[k*3+1])
+ else if (wc > gb18030_2005_pua2charset[k].uni[1])
k1 = k + 1;
else {
- unsigned short c =
- gb18030_2005_pua2charset[k*3+2] + (wc - gb18030_2005_pua2charset[k*3+0]);
- r[0] = (c >> 8);
- r[1] = (c & 0xff);
- return 2;
+ unsigned int c =
+ gb18030_2005_pua2charset[k].charset + (wc - gb18030_2005_pua2charset[k].uni[0]);
+ if (c < 0x10000) {
+ r[0] = (c >> 8);
+ r[1] = c & 0xff;
+ return 2;
+ } else {
+ r[0] = (c >> 24);
+ r[1] = (c >> 16) & 0xff;
+ r[2] = (c >> 8) & 0xff;
+ r[3] = c & 0xff;
+ return 4;
+ }
}
}
}
diff --git a/lib/gb18030_2022.h b/lib/gb18030_2022.h
index f3e762c..4d70a92 100644
--- a/lib/gb18030_2022.h
+++ b/lib/gb18030_2022.h
@@ -151,7 +151,7 @@
if (ret != RET_ILUNI)
return ret;
- ret = gb18030_2022_ext_wctomb(conv,r,wc,n);
+ ret = gb18030ext_wctomb(conv,r,wc,n);
if (ret != RET_ILUNI)
return ret;
diff --git a/lib/gb18030ext.h b/lib/gb18030ext.h
index 894efeb..97f01da 100644
--- a/lib/gb18030ext.h
+++ b/lib/gb18030ext.h
@@ -357,93 +357,7 @@
};
static int
-gb18030_2005_ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
-{
- if (n >= 2) {
- unsigned short c = 0;
- if (wc == 0x01f9)
- c = 0xa8bf;
- else if (wc == 0x1e3f)
- c = 0xa8bc;
- else if (wc == 0x20ac)
- c = 0xa2e3;
- else if (wc >= 0x2e80 && wc < 0x2ed0)
- c = gb18030ext_page2e[wc-0x2e80];
- else if (wc >= 0x2ff0 && wc < 0x3000)
- c = gb18030ext_page2f[wc-0x2ff0];
- else if (wc == 0x303e)
- c = 0xa989;
- else if (wc >= 0x3440 && wc < 0x3478)
- c = gb18030ext_page34[wc-0x3440];
- else if (wc == 0x359e)
- c = 0xfe5a;
- else if (wc >= 0x3608 && wc < 0x3620)
- c = gb18030ext_page36[wc-0x3608];
- else if (wc == 0x3918)
- c = 0xfe60;
- else if (wc == 0x396e)
- c = 0xfe5f;
- else if (wc >= 0x39c8 && wc < 0x39e0)
- c = gb18030ext_page39[wc-0x39c8];
- else if (wc == 0x3a73)
- c = 0xfe64;
- else if (wc == 0x3b4e)
- c = 0xfe68;
- else if (wc == 0x3c6e)
- c = 0xfe69;
- else if (wc == 0x3ce0)
- c = 0xfe6a;
- else if (wc == 0x4056)
- c = 0xfe6f;
- else if (wc == 0x415f)
- c = 0xfe70;
- else if (wc == 0x4337)
- c = 0xfe72;
- else if (wc >= 0x43a8 && wc < 0x43e0)
- c = gb18030ext_page43[wc-0x43a8];
- else if (wc == 0x44d6)
- c = 0xfe7b;
- else if (wc >= 0x4648 && wc < 0x4668)
- c = gb18030ext_page46[wc-0x4648];
- else if (wc >= 0x4720 && wc < 0x4730)
- c = gb18030ext_page47_1[wc-0x4720];
- else if (wc >= 0x4778 && wc < 0x4790)
- c = gb18030ext_page47_2[wc-0x4778];
- else if (wc >= 0x4940 && wc < 0x49b8)
- c = gb18030ext_page49[wc-0x4940];
- else if (wc >= 0x4c70 && wc < 0x4ca8)
- c = gb18030ext_page4c[wc-0x4c70];
- else if (wc >= 0x4d10 && wc < 0x4d20)
- c = gb18030ext_page4d[wc-0x4d10];
- else if (wc == 0x4dae)
- c = 0xfe9f;
- else if (wc >= 0x9fb4 && wc < 0x9fbc)
- c = gb18030ext_page9f[wc-0x9fb0];
- else if (wc >= 0xfe10 && wc < 0xfe1a)
- c = gb18030ext_pagefe[wc-0xfe10];
- else if (wc == 0x20087)
- c = 0xfe51;
- else if (wc == 0x20089)
- c = 0xfe52;
- else if (wc == 0x200cc)
- c = 0xfe53;
- else if (wc == 0x215d7)
- c = 0xfe6c;
- else if (wc == 0x2298f)
- c = 0xfe76;
- else if (wc == 0x241fe)
- c = 0xfe91;
- if (c != 0) {
- r[0] = (c >> 8); r[1] = (c & 0xff);
- return 2;
- }
- return RET_ILUNI;
- }
- return RET_TOOSMALL;
-}
-
-static int
-gb18030_2022_ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
+gb18030ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
{
if (n >= 2) {
unsigned short c = 0;
diff --git a/lib/gb18030uni.h b/lib/gb18030uni.h
index f2f2838..1f5a2cc 100644
--- a/lib/gb18030uni.h
+++ b/lib/gb18030uni.h
@@ -301,13 +301,14 @@
gb18030_2005_uni_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
{
if (n >= 4) {
- unsigned int i = wc;
- if (i >= 0x0080 && i <= 0xffff) {
- if (i == 0xe7c7) {
+ unsigned int i;
+ if (wc >= 0x0080 && wc <= 0xffff) {
+ if (wc == 0xe7c7) {
i = 7457;
} else {
unsigned int k1 = 0;
unsigned int k2 = 205;
+ i = wc;
while (k1 < k2) {
unsigned int k = (k1 + k2) / 2;
if (i <= gb18030uni_uni2charset_ranges[2*k+1])
@@ -322,13 +323,28 @@
i -= diff;
}
}
- r[3] = (i % 10) + 0x30; i = i / 10;
- r[2] = (i % 126) + 0x81; i = i / 126;
- r[1] = (i % 10) + 0x30; i = i / 10;
- r[0] = i + 0x81;
- return 4;
- }
- return RET_ILUNI;
+ } else if (wc >= 0x20087 && wc <= 0x241fe) {
+ if (wc == 0x20087)
+ i = 0x3e2cf;
+ else if (wc == 0x20089)
+ i = 0x3e2d1;
+ else if (wc == 0x200cc)
+ i = 0x3e314;
+ else if (wc == 0x215d7)
+ i = 0x3f81f;
+ else if (wc == 0x2298f)
+ i = 0x40bd7;
+ else if (wc == 0x241fe)
+ i = 0x42446;
+ else
+ return RET_ILUNI;
+ } else
+ return RET_ILUNI;
+ r[3] = (i % 10) + 0x30; i = i / 10;
+ r[2] = (i % 126) + 0x81; i = i / 126;
+ r[1] = (i % 10) + 0x30; i = i / 10;
+ r[0] = i + 0x81;
+ return 4;
}
return RET_TOOSMALL;
}
@@ -337,17 +353,18 @@
gb18030_2022_uni_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
{
if (n >= 4) {
- unsigned int i = wc;
- if (i >= 0x0080 && i <= 0xffff) {
- if (i == 0xe7c7) {
+ if (wc >= 0x0080 && wc <= 0xffff) {
+ unsigned int i;
+ if (wc == 0xe7c7) {
i = 7457;
- } else if (i >= 0xe78d && i <= 0xe796) {
- i = 39076 + gb18030_2022_uni2charset_pua2[i-0xe78d];
- } else if (i >= 0xe81e && i <= 0xe864 && gb18030_2022_uni2charset_pua1[i-0xe81e]) {
- i = 19056 + gb18030_2022_uni2charset_pua1[i-0xe81e];
+ } else if (wc >= 0xe78d && wc <= 0xe796) {
+ i = 39076 + gb18030_2022_uni2charset_pua2[wc-0xe78d];
+ } else if (wc >= 0xe81e && wc <= 0xe864 && gb18030_2022_uni2charset_pua1[wc-0xe81e]) {
+ i = 19056 + gb18030_2022_uni2charset_pua1[wc-0xe81e];
} else {
unsigned int k1 = 0;
unsigned int k2 = 205;
+ i = wc;
while (k1 < k2) {
unsigned int k = (k1 + k2) / 2;
if (i <= gb18030uni_uni2charset_ranges[2*k+1])
diff --git a/tests/GB18030-2005.IRREVERSIBLE.TXT b/tests/GB18030-2005.IRREVERSIBLE.TXT
index 5e84bc3..48692ea 100644
--- a/tests/GB18030-2005.IRREVERSIBLE.TXT
+++ b/tests/GB18030-2005.IRREVERSIBLE.TXT
@@ -16,12 +16,12 @@
0x84318333 0xFE17
0x84318334 0xFE18
0x84318335 0xFE19
-0x95329031 0x20087
-0x95329033 0x20089
-0x95329730 0x200CC
-0x9536B937 0x215D7
-0x9630BA35 0x2298F
-0x9635B630 0x241FE
+0x95329031 0xE816
+0x95329033 0xE817
+0x95329730 0xE818
+0x9536B937 0xE831
+0x9630BA35 0xE83B
+0x9635B630 0xE855
0xA6D9 0xE78D
0xA6DA 0xE78E
0xA6DB 0xE78F
@@ -32,17 +32,17 @@
0xA6EC 0xE794
0xA6ED 0xE795
0xA6F3 0xE796
-0xFE51 0xE816
-0xFE52 0xE817
-0xFE53 0xE818
+0xFE51 0x20087
+0xFE52 0x20089
+0xFE53 0x200CC
0xFE59 0xE81E
0xFE61 0xE826
0xFE66 0xE82B
0xFE67 0xE82C
-0xFE6C 0xE831
+0xFE6C 0x215D7
0xFE6D 0xE832
-0xFE76 0xE83B
+0xFE76 0x2298F
0xFE7E 0xE843
0xFE90 0xE854
-0xFE91 0xE855
+0xFE91 0x241FE
0xFEA0 0xE864