ICU-13702 add missing API functions
and fix a bug in Java UnicodeSet.retain(String) which added the string even if the set did not contain it before,
and some drive-by API doc fixes/clarifications
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index 71fa1eb..8403c40 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -599,7 +599,7 @@
/**
* Make this object represent the range `start - end`.
- * If `end > start` then this object is set to an empty range.
+ * If `start > end` then this object is set to an empty range.
* A frozen set will not be modified.
*
* @param start first character in the set, inclusive
@@ -1075,7 +1075,7 @@
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,
- * the call leaves this set unchanged. If <code>end > start</code>
+ * the call leaves this set unchanged. If <code>start > end</code>
* then an empty range is added, leaving the set unchanged.
* This is equivalent to a boolean logic OR, or a set UNION.
* A frozen set will not be modified.
@@ -1093,6 +1093,9 @@
* present. If this set already contains the specified character,
* the call leaves this set unchanged.
* A frozen set will not be modified.
+ *
+ * @param c the character (code point)
+ * @return this object, for chaining
* @stable ICU 2.0
*/
UnicodeSet& add(UChar32 c);
@@ -1122,8 +1125,8 @@
public:
/**
- * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
- * If this set already any particular character, it has no effect on that character.
+ * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
+ * If this set already contains any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
@@ -1133,7 +1136,6 @@
/**
* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
- * If this set already any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
@@ -1143,7 +1145,6 @@
/**
* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
- * If this set already any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
@@ -1153,7 +1154,6 @@
/**
* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
- * If this set already any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
@@ -1183,15 +1183,13 @@
/**
* Retain only the elements in this set that are contained in the
- * specified range. If <code>end > start</code> then an empty range is
+ * specified range. If <code>start > end</code> then an empty range is
* retained, leaving the set empty. This is equivalent to
* a boolean logic AND, or a set INTERSECTION.
* A frozen set will not be modified.
*
- * @param start first character, inclusive, of range to be retained
- * to this set.
- * @param end last character, inclusive, of range to be retained
- * to this set.
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
* @stable ICU 2.0
*/
virtual UnicodeSet& retain(UChar32 start, UChar32 end);
@@ -1200,14 +1198,31 @@
/**
* Retain the specified character from this set if it is present.
* A frozen set will not be modified.
+ *
+ * @param c the character (code point)
+ * @return this object, for chaining
* @stable ICU 2.0
*/
UnicodeSet& retain(UChar32 c);
+#ifndef U_HIDE_DRAFT_API
+ /**
+ * Retains only the specified string from this set if it is present.
+ * Upon return this set will be empty if it did not contain s, or
+ * will only contain s if it did contain s.
+ * A frozen set will not be modified.
+ *
+ * @param s the source string
+ * @return this object, for chaining
+ * @draft ICU 69
+ */
+ UnicodeSet& retain(const UnicodeString &s);
+#endif // U_HIDE_DRAFT_API
+
/**
* Removes the specified range from this set if it is present.
* The set will not contain the specified range once the call
- * returns. If <code>end > start</code> then an empty range is
+ * returns. If <code>start > end</code> then an empty range is
* removed, leaving the set unchanged.
* A frozen set will not be modified.
*
@@ -1224,6 +1239,9 @@
* The set will not contain the specified range once the call
* returns.
* A frozen set will not be modified.
+ *
+ * @param c the character (code point)
+ * @return this object, for chaining
* @stable ICU 2.0
*/
UnicodeSet& remove(UChar32 c);
@@ -1251,15 +1269,13 @@
/**
* Complements the specified range in this set. Any character in
* the range will be removed if it is in this set, or will be
- * added if it is not in this set. If <code>end > start</code>
+ * added if it is not in this set. If <code>start > end</code>
* then an empty range is complemented, leaving the set unchanged.
* This is equivalent to a boolean logic XOR.
* A frozen set will not be modified.
*
- * @param start first character, inclusive, of range to be removed
- * from this set.
- * @param end last character, inclusive, of range to be removed
- * from this set.
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
* @stable ICU 2.0
*/
virtual UnicodeSet& complement(UChar32 start, UChar32 end);
@@ -1269,14 +1285,16 @@
* will be removed if it is in this set, or will be added if it is
* not in this set.
* A frozen set will not be modified.
+ *
+ * @param c the character (code point)
+ * @return this object, for chaining
* @stable ICU 2.0
*/
UnicodeSet& complement(UChar32 c);
/**
* Complement the specified string in this set.
- * The set will not contain the specified string once the call
- * returns.
+ * The string will be removed if it is in this set, or will be added if it is not in this set.
* A frozen set will not be modified.
*
* @param s the string to complement
diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h
index 502ea8d..1d0daf9 100644
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h
@@ -582,8 +582,8 @@
uset_addString(USet* set, const UChar* str, int32_t strLen);
/**
- * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
- * If this set already any particular character, it has no effect on that character.
+ * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
+ * If this set already contains any particular character, it has no effect on that character.
* A frozen set will not be modified.
* @param set the object to which to add the character
* @param str the source string
@@ -628,6 +628,20 @@
U_CAPI void U_EXPORT2
uset_removeString(USet* set, const UChar* str, int32_t strLen);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif // U_HIDE_DRAFT_API
+
/**
* Removes from this set all of its elements that are contained in the
* specified set. This operation effectively modifies this
@@ -650,15 +664,41 @@
* A frozen set will not be modified.
*
* @param set the object for which to retain only the specified range
- * @param start first character, inclusive, of range to be retained
- * to this set.
- * @param end last character, inclusive, of range to be retained
- * to this set.
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
* @stable ICU 3.2
*/
U_CAPI void U_EXPORT2
uset_retain(USet* set, UChar32 start, UChar32 end);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Retains only the specified string from this set if it is present.
+ * Upon return this set will be empty if it did not contain str, or
+ * will only contain str if it did contain str.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_retainString(USet *set, const UChar *str, int32_t length);
+
+/**
+ * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif // U_HIDE_DRAFT_API
+
/**
* Retains only the elements in this set that are contained in the
* specified set. In other words, removes from this set all of
@@ -696,6 +736,49 @@
U_CAPI void U_EXPORT2
uset_complement(USet* set);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Complements the specified range in this set. Any character in
+ * the range will be removed if it is in this set, or will be
+ * added if it is not in this set. If <code>start > end</code>
+ * then an empty range is complemented, leaving the set unchanged.
+ * This is equivalent to a boolean logic XOR.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementRange(USet *set, UChar32 start, UChar32 end);
+
+/**
+ * Complements the specified string in this set.
+ * The string will be removed if it is in this set, or will be added if it is not in this set.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementString(USet *set, const UChar *str, int32_t length);
+
+/**
+ * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif // U_HIDE_DRAFT_API
+
/**
* Complements in this set all elements contained in the specified
* set. Any character in the other set will be removed if it is
diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp
index be6ffb7..461e5a7 100644
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@@ -1120,6 +1120,26 @@
return retain(c, c);
}
+UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
+    if (isFrozen() || isBogus()) { return *this; }  // a frozen or bogus set is left untouched
+    const UChar32 single = getSingleCP(s);
+    if (single >= 0) {
+        retain(single, single);  // s is one code point: delegate to the range overload
+        return *this;
+    }
+    const bool hadString = stringsContains(s);  // s is handled as a string element
+    // getRangeCount() is tested before size() because size() is
+    // somewhat expensive when there are single code points.
+    if (hadString && getRangeCount() == 0 && size() == 1) {
+        return *this;  // the set already consists of exactly s
+    }
+    // NOTE(review): presumably s never aliases a string owned by this set;
+    // otherwise clear() could invalidate it before _add(s) -- confirm.
+    clear();
+    if (hadString) { _add(s); }
+    return *this;
+}
+
/**
* Removes the specified range from this set if it is present.
* The set will not contain the specified range once the call
diff --git a/icu4c/source/common/uset.cpp b/icu4c/source/common/uset.cpp
index eae7981..a7e3046 100644
--- a/icu4c/source/common/uset.cpp
+++ b/icu4c/source/common/uset.cpp
@@ -117,6 +117,12 @@
}
U_CAPI void U_EXPORT2
+uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->UnicodeSet::removeAll(UnicodeString(length == -1, str, length));  // length -1: str is NUL-terminated
+}
+
+U_CAPI void U_EXPORT2
uset_removeAll(USet* set, const USet* remove) {
((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove);
}
@@ -127,6 +133,18 @@
}
U_CAPI void U_EXPORT2
+uset_retainString(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->UnicodeSet::retain(UnicodeString(length == -1, str, length));  // length -1: str is NUL-terminated
+}
+
+U_CAPI void U_EXPORT2
+uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->UnicodeSet::retainAll(UnicodeString(length == -1, str, length));  // length -1: str is NUL-terminated
+}
+
+U_CAPI void U_EXPORT2
uset_retainAll(USet* set, const USet* retain) {
((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain);
}
@@ -142,6 +160,23 @@
}
U_CAPI void U_EXPORT2
+uset_complementRange(USet *set, UChar32 start, UChar32 end) {
+    reinterpret_cast<UnicodeSet *>(set)->UnicodeSet::complement(start, end);  // start > end is documented as a no-op
+}
+
+U_CAPI void U_EXPORT2
+uset_complementString(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->UnicodeSet::complement(UnicodeString(length == -1, str, length));  // length -1: str is NUL-terminated
+}
+
+U_CAPI void U_EXPORT2
+uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->UnicodeSet::complementAll(UnicodeString(length == -1, str, length));  // length -1: str is NUL-terminated
+}
+
+U_CAPI void U_EXPORT2
uset_complementAll(USet* set, const USet* complement) {
((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement);
}
diff --git a/icu4c/source/test/cintltst/usettest.c b/icu4c/source/test/cintltst/usettest.c
index d656964..9fe2362 100644
--- a/icu4c/source/test/cintltst/usettest.c
+++ b/icu4c/source/test/cintltst/usettest.c
@@ -211,6 +211,45 @@
uset_retainAll(set2, set);
expect(set2, "ghi", "abcdef{bc}", NULL);
+    // ICU 69 added some missing functions for parity with C++ and Java.
+    uset_applyPattern(set, u"[abcdef{ch}{sch}]", -1, 0, &ec);
+    if(U_FAILURE(ec)) {
+        log_err("uset_applyPattern([abcdef{ch}{sch}]) failed - %s\n", u_errorName(ec));
+        return;
+    }
+    expect(set, "abcdef{ch}{sch}", "", NULL);
+
+    uset_removeAllCodePoints(set, u"ce", 2);  // removes 'c' and 'e'; strings are untouched
+    expect(set, "abdf{ch}{sch}", "ce", NULL);
+
+    uset_complementRange(set, u'b', u'f');  // toggles each of b..f
+    expect(set, "ace{ch}{sch}", "bdf", NULL);
+
+    uset_complementString(set, u"ch", -1);  // {ch} is present, so it is removed
+    expect(set, "ace{sch}", "bdf{ch}", NULL);
+
+    uset_complementString(set, u"xy", -1);  // {xy} is absent, so it is added
+    expect(set, "ace{sch}{xy}", "bdf{ch}", NULL);
+
+    uset_complementAllCodePoints(set, u"abef", 4);  // toggles a, b, e, f individually
+    expect(set, "bcf{sch}{xy}", "ade{ch}", NULL);
+
+    uset_retainAllCodePoints(set, u"abef", -1);  // keeps only code points among a, b, e, f
+    expect(set, "bf", "acde{ch}{sch}{xy}", NULL);
+
+    uset_applyPattern(set, u"[abcdef{ch}{sch}]", -1, 0, &ec);
+    if(U_FAILURE(ec)) {
+        log_err("uset_applyPattern([abcdef{ch}{sch}]) failed - %s\n", u_errorName(ec));
+        return;
+    }
+    expect(set, "abcdef{ch}{sch}", "", NULL);
+
+    uset_retainString(set, u"sch", 3);  // {sch} is present: the set becomes exactly {sch}
+    expect(set, "{sch}", "abcdef{ch}", NULL);
+
+    uset_retainString(set, u"ch", 2);  // {ch} is no longer present: the set becomes empty
+    expect(set, "", "abcdef{ch}{sch}", NULL);
+
uset_close(set);
uset_close(set2);
}
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 97386cf..700ff05 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -696,6 +696,37 @@
if (U_FAILURE(status)) { errln("FAIL"); return; }
if (set != exp) { errln("FAIL: retain('s')"); return; }
+    // Coverage tests mirroring ICU4J UnicodeSetTest:
+    //   retain(const UnicodeString &s), with and without s in the set
+    //   remove(UChar32 c)
+    //   remove(const UnicodeString &s)
+    set.applyPattern(u"[a-z{ab}{cd}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    set.retain(u"cd");
+    exp.applyPattern(u"[{cd}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    if (set != exp) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
+
+    set.applyPattern(u"[a-z{ab}{yz}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    set.retain(u"cd");
+    exp.clear();
+    if (set != exp) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
+
+    set.applyPattern(u"[a-z{ab}{cd}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    set.remove(u'c');
+    exp.applyPattern(u"[abd-z{ab}{cd}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    if (set != exp) { errln("FAIL: remove('c')"); return; }
+
+    set.remove(u"cd");
+    exp.applyPattern(u"[abd-z{ab}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    if (set != exp) { errln("FAIL: remove(\"cd\")"); return; }
+
+    set.applyPattern("[s]", status);  // reset the set for the serialize() checks that follow
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
uint16_t buf[32];
int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
index 5aaf044..f62e682 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -514,7 +514,7 @@
/**
* Make this object represent the range <code>start - end</code>.
- * If <code>end > start</code> then this object is set to an empty range.
+ * If <code>start > end</code> then this object is set to an empty range.
*
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
@@ -1159,7 +1159,7 @@
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,
- * the call leaves this set unchanged. If <code>end > start</code>
+ * the call leaves this set unchanged. If <code>start > end</code>
* then an empty range is added, leaving the set unchanged.
*
* @param start first character, inclusive, of range to be added
@@ -1490,13 +1490,11 @@
/**
* Retain only the elements in this set that are contained in the
- * specified range. If <code>end > start</code> then an empty range is
+ * specified range. If <code>start > end</code> then an empty range is
* retained, leaving the set empty.
*
- * @param start first character, inclusive, of range to be retained
- * to this set.
- * @param end last character, inclusive, of range to be retained
- * to this set.
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
* @stable ICU 2.0
*/
public UnicodeSet retain(int start, int end) {
@@ -1541,11 +1539,15 @@
checkFrozen();
String s = cs.toString();
boolean isIn = strings.contains(s);
- if (isIn && size() == 1) {
+ // Check for getRangeCount() first to avoid somewhat-expensive size()
+ // when there are single code points.
+ if (isIn && getRangeCount() == 0 && size() == 1) {
return this;
}
clear();
- addString(s);
+ if (isIn) {
+ addString(s);
+ }
pat = null;
} else {
retain(cp, cp);
@@ -1556,7 +1558,7 @@
/**
* Removes the specified range from this set if it is present.
* The set will not contain the specified range once the call
- * returns. If <code>end > start</code> then an empty range is
+ * returns. If <code>start > end</code> then an empty range is
* removed, leaving the set unchanged.
*
* @param start first character, inclusive, of range to be removed
@@ -1617,13 +1619,11 @@
/**
* Complements the specified range in this set. Any character in
* the range will be removed if it is in this set, or will be
- * added if it is not in this set. If <code>end > start</code>
+ * added if it is not in this set. If <code>start > end</code>
* then an empty range is complemented, leaving the set unchanged.
*
- * @param start first character, inclusive, of range to be removed
- * from this set.
- * @param end last character, inclusive, of range to be removed
- * from this set.
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
* @stable ICU 2.0
*/
public UnicodeSet complement(int start, int end) {
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
index 176c1e7..a079286 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@@ -727,7 +727,12 @@
set.applyPattern("[a-z{ab}{cd}]");
set.retain("cd");
exp.applyPattern("[{cd}]");
- if (!set.equals(exp)) { errln("FAIL: retain(\"cd\")"); return; }
+ if (!set.equals(exp)) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
+
+ set.applyPattern("[a-z{ab}{yz}]");
+ set.retain("cd");
+ exp.clear();
+ if (!set.equals(exp)) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
set.applyPattern("[a-z{ab}{cd}]");
set.remove((char)0x63);