ICU-21524 UnicodeSet.hasStrings(): no complement in toPattern()
diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp
index 967ea2e..92a81a1 100644
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@@ -2095,7 +2095,10 @@
// getRangeEnd(last) == MAX_VALUE)
// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
// If limit == len then len is even and the last range ends with MAX_VALUE.
- if (len >= 4 && list[0] == 0 && limit == len) {
+ //
+ // *But* do not write the inverse (complement) if there are strings.
+ // Since ICU 70, the '^' performs a code point complement which removes all strings.
+ if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
// Emit the inverse
result.append(u'^');
// Offsetting the inversion list index by one lets us
diff --git a/icu4c/source/test/cintltst/usettest.c b/icu4c/source/test/cintltst/usettest.c
index a444ce5..f5528d0 100644
--- a/icu4c/source/test/cintltst/usettest.c
+++ b/icu4c/source/test/cintltst/usettest.c
@@ -137,7 +137,7 @@
uset_removeString(set, STR_ab, STR_ab_LEN);
expect(set, "acd{bc}", "bfg{ab}", NULL);
- /* [^acd{bc}] */
+ /* [[^acd]{bc}] */
uset_complement(set);
expect(set, "bef{bc}", "acd{ac}", NULL);
@@ -436,8 +436,8 @@
strlen(items)==0 ? "TRUE" : "FALSE");
}
- /* Don't test patterns starting with "[^" */
- if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
+ /* Don't test patterns starting with "[^" or "[\\u0000". */
+ if ((u_strlen(ustr) > 2 && ustr[1] == u'^') || uset_contains(set, 0)) {
return;
}
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 7a0641f..b4bee76 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4147,6 +4147,24 @@
notBasic.contains(U'🚲'));
}
+ // When there are strings, we must not use the complement for a more compact toPattern().
+ {
+ UnicodeSet set;
+ set.add(0, u'Y').add(u'b', u'q').add(u'x', 0x10ffff);
+ UnicodeString pattern;
+ set.toPattern(pattern, true);
+ UnicodeSet set2(pattern, errorCode);
+ checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
+ assertEquals("set(with 0 & max, only code points).toPattern()", u"[^Z-ar-w]", pattern);
+
+ set.add("ch").add("ss");
+ set.toPattern(pattern, true);
+ set2 = UnicodeSet(pattern, errorCode);
+ checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
+ assertEquals("set(with 0 & max, with strings).toPattern()",
+ u"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
+ }
+
// The complement() API behavior does not change under this ticket.
{
UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
index e677fca..d799b03 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -818,7 +818,10 @@
// getRangeEnd(last) == MAX_VALUE)
// Invariant: list[len-1] == HIGH == MAX_VALUE + 1
// If limit == len then len is even and the last range ends with MAX_VALUE.
- if (len >= 4 && list[0] == 0 && limit == len) {
+ //
+ // *But* do not write the inverse (complement) if there are strings.
+ // Since ICU 70, the '^' performs a code point complement which removes all strings.
+ if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
// Emit the inverse
result.append('^');
// Offsetting the inversion list index by one lets us
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
index a94b9fe..e31d92b 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@@ -2971,6 +2971,23 @@
notBasic.contains("🚲"));
}
+ // When there are strings, we must not use the complement for a more compact toPattern().
+ {
+ UnicodeSet set = new UnicodeSet();
+ set.add(0, 'Y').add('b', 'q').add('x', 0x10ffff);
+ String pattern = set.toPattern(true);
+ UnicodeSet set2 = new UnicodeSet(pattern);
+ checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
+ assertEquals("set(with 0 & max, only code points).toPattern()", "[^Z-ar-w]", pattern);
+
+ set.add("ch").add("ss");
+ pattern = set.toPattern(true);
+ set2 = new UnicodeSet(pattern);
+ checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
+ assertEquals("set(with 0 & max, with strings).toPattern()",
+ "[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
+ }
+
// The complement() API behavior does not change under this ticket.
{
UnicodeSet notBasic = new UnicodeSet("[:Basic_Emoji:]").complement();