ICU-13702 add missing API functions and fix a bug in Java UnicodeSet.retain(String) which added the string even if the set did not contain it before, and some drive-by API doc fixes/clarifications

commit: 66460b9fad43be83a5846d95f0023a304fb84c47 [log] [tgz]
author: Markus Scherer <markus.icu@gmail.com> Tue Feb 16 16:09:18 2021 -0800
committer: Markus Scherer <markus.icu@gmail.com> Wed Feb 17 10:12:10 2021 -0800
tree: 6469388a9350b03fd19e98cb4464a06bb354807b
parent: 7159e334ff12893b857de9a92e1c2eae4f3399fc [diff]
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index 71fa1eb..8403c40 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h

@@ -599,7 +599,7 @@
 
     /**
      * Make this object represent the range `start - end`.
-     * If `end > start` then this object is set to an empty range.
+     * If `start > end` then this object is set to an empty range.
      * A frozen set will not be modified.
      *
      * @param start first character in the set, inclusive
@@ -1075,7 +1075,7 @@
     /**
      * Adds the specified range to this set if it is not already
      * present.  If this set already contains the specified range,
-     * the call leaves this set unchanged.  If <code>end > start</code>
+     * the call leaves this set unchanged.  If <code>start > end</code>
      * then an empty range is added, leaving the set unchanged.
      * This is equivalent to a boolean logic OR, or a set UNION.
      * A frozen set will not be modified.
@@ -1093,6 +1093,9 @@
      * present.  If this set already contains the specified character,
      * the call leaves this set unchanged.
      * A frozen set will not be modified.
+     *
+     * @param c the character (code point)
+     * @return this object, for chaining
      * @stable ICU 2.0
      */
     UnicodeSet& add(UChar32 c);
@@ -1122,8 +1125,8 @@
 
  public:
     /**
-     * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
-     * If this set already any particular character, it has no effect on that character.
+     * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
+     * If this set already contains any particular character, it has no effect on that character.
      * A frozen set will not be modified.
      * @param s the source string
      * @return this object, for chaining
@@ -1133,7 +1136,6 @@
 
     /**
      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
-     * If this set already any particular character, it has no effect on that character.
      * A frozen set will not be modified.
      * @param s the source string
      * @return this object, for chaining
@@ -1143,7 +1145,6 @@
 
     /**
      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
-     * If this set already any particular character, it has no effect on that character.
      * A frozen set will not be modified.
      * @param s the source string
      * @return this object, for chaining
@@ -1153,7 +1154,6 @@
 
     /**
      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
-     * If this set already any particular character, it has no effect on that character.
      * A frozen set will not be modified.
      * @param s the source string
      * @return this object, for chaining
@@ -1183,15 +1183,13 @@
 
     /**
      * Retain only the elements in this set that are contained in the
-     * specified range.  If <code>end > start</code> then an empty range is
+     * specified range.  If <code>start > end</code> then an empty range is
      * retained, leaving the set empty.  This is equivalent to
      * a boolean logic AND, or a set INTERSECTION.
      * A frozen set will not be modified.
      *
-     * @param start first character, inclusive, of range to be retained
-     * to this set.
-     * @param end last character, inclusive, of range to be retained
-     * to this set.
+     * @param start first character, inclusive, of range
+     * @param end last character, inclusive, of range
      * @stable ICU 2.0
      */
     virtual UnicodeSet& retain(UChar32 start, UChar32 end);
@@ -1200,14 +1198,31 @@
     /**
      * Retain the specified character from this set if it is present.
      * A frozen set will not be modified.
+     *
+     * @param c the character (code point)
+     * @return this object, for chaining
      * @stable ICU 2.0
      */
     UnicodeSet& retain(UChar32 c);
 
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * Retains only the specified string from this set if it is present.
+     * Upon return this set will be empty if it did not contain s, or
+     * will only contain s if it did contain s.
+     * A frozen set will not be modified.
+     *
+     * @param s the source string
+     * @return this object, for chaining
+     * @draft ICU 69
+     */
+    UnicodeSet& retain(const UnicodeString &s);
+#endif  // U_HIDE_DRAFT_API
+
     /**
      * Removes the specified range from this set if it is present.
      * The set will not contain the specified range once the call
-     * returns.  If <code>end > start</code> then an empty range is
+     * returns.  If <code>start > end</code> then an empty range is
      * removed, leaving the set unchanged.
      * A frozen set will not be modified.
      *
@@ -1224,6 +1239,9 @@
      * The set will not contain the specified range once the call
      * returns.
      * A frozen set will not be modified.
+     *
+     * @param c the character (code point)
+     * @return this object, for chaining
      * @stable ICU 2.0
      */
     UnicodeSet& remove(UChar32 c);
@@ -1251,15 +1269,13 @@
     /**
      * Complements the specified range in this set.  Any character in
      * the range will be removed if it is in this set, or will be
-     * added if it is not in this set.  If <code>end > start</code>
+     * added if it is not in this set.  If <code>start > end</code>
      * then an empty range is complemented, leaving the set unchanged.
      * This is equivalent to a boolean logic XOR.
      * A frozen set will not be modified.
      *
-     * @param start first character, inclusive, of range to be removed
-     * from this set.
-     * @param end last character, inclusive, of range to be removed
-     * from this set.
+     * @param start first character, inclusive, of range
+     * @param end last character, inclusive, of range
      * @stable ICU 2.0
      */
     virtual UnicodeSet& complement(UChar32 start, UChar32 end);
@@ -1269,14 +1285,16 @@
      * will be removed if it is in this set, or will be added if it is
      * not in this set.
      * A frozen set will not be modified.
+     *
+     * @param c the character (code point)
+     * @return this object, for chaining
      * @stable ICU 2.0
      */
     UnicodeSet& complement(UChar32 c);
 
     /**
      * Complement the specified string in this set.
-     * The set will not contain the specified string once the call
-     * returns.
+     * The string will be removed if it is in this set, or will be added if it is not in this set.
      * A frozen set will not be modified.
      *
      * @param s the string to complement

diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h
index 502ea8d..1d0daf9 100644
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h

@@ -582,8 +582,8 @@
 uset_addString(USet* set, const UChar* str, int32_t strLen);
 
 /**
- * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
- * If this set already any particular character, it has no effect on that character.
+ * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
+ * If this set already contains any particular character, it has no effect on that character.
  * A frozen set will not be modified.
  * @param set the object to which to add the character
  * @param str the source string
@@ -628,6 +628,20 @@
 U_CAPI void U_EXPORT2
 uset_removeString(USet* set, const UChar* str, int32_t strLen);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Removes from this set all of its elements that are contained in the
  * specified set.  This operation effectively modifies this
@@ -650,15 +664,41 @@
  * A frozen set will not be modified.
  *
  * @param set the object for which to retain only the specified range
- * @param start first character, inclusive, of range to be retained
- * to this set.
- * @param end last character, inclusive, of range to be retained
- * to this set.
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
  * @stable ICU 3.2
  */
 U_CAPI void U_EXPORT2
 uset_retain(USet* set, UChar32 start, UChar32 end);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Retains only the specified string from this set if it is present.
+ * Upon return this set will be empty if it did not contain str, or
+ * will only contain str if it did contain str.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_retainString(USet *set, const UChar *str, int32_t length);
+
+/**
+ * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Retains only the elements in this set that are contained in the
  * specified set.  In other words, removes from this set all of
@@ -696,6 +736,49 @@
 U_CAPI void U_EXPORT2
 uset_complement(USet* set);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Complements the specified range in this set.  Any character in
+ * the range will be removed if it is in this set, or will be
+ * added if it is not in this set.  If <code>start > end</code>
+ * then an empty range is complemented, leaving the set unchanged.
+ * This is equivalent to a boolean logic XOR.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementRange(USet *set, UChar32 start, UChar32 end);
+
+/**
+ * Complements the specified string in this set.
+ * The string will be removed if it is in this set, or will be added if it is not in this set.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementString(USet *set, const UChar *str, int32_t length);
+
+/**
+ * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Complements in this set all elements contained in the specified
  * set.  Any character in the other set will be removed if it is

diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp
index be6ffb7..461e5a7 100644
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp

@@ -1120,6 +1120,26 @@
     return retain(c, c);
 }
 
+UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
+    if (isFrozen() || isBogus()) { return *this; }  // frozen or bogus sets are left untouched
+    UChar32 cp = getSingleCP(s);  // NOTE(review): presumably < 0 unless s is exactly one code point - confirm
+    if (cp < 0) {  // string case: s is retained as a whole string element
+        bool isIn = stringsContains(s);
+        // Test the cheap getRangeCount() before the somewhat-expensive size():
+        // if the set holds any code points it cannot consist of s alone.
+        if (isIn && getRangeCount() == 0 && size() == 1) {
+            return *this;  // the set is already exactly {s}; nothing to change
+        }
+        clear();  // drop every element, then restore s only if it was present
+        if (isIn) {
+            _add(s);
+        }
+    } else {
+        retain(cp, cp);  // code point case: delegate to the range overload
+    }
+    return *this;
+}
+
 /**
  * Removes the specified range from this set if it is present.
  * The set will not contain the specified range once the call

diff --git a/icu4c/source/common/uset.cpp b/icu4c/source/common/uset.cpp
index eae7981..a7e3046 100644
--- a/icu4c/source/common/uset.cpp
+++ b/icu4c/source/common/uset.cpp

@@ -117,6 +117,12 @@
 }
 
 U_CAPI void U_EXPORT2
+uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeString s(length==-1, str, length);  // length == -1: str is NUL-terminated
+    ((UnicodeSet*) set)->UnicodeSet::removeAll(s);  // removes EACH code point of s, not s as a string
+}
+
+U_CAPI void U_EXPORT2
 uset_removeAll(USet* set, const USet* remove) {
     ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove);
 }
@@ -127,6 +133,18 @@
 }
 
 U_CAPI void U_EXPORT2
+uset_retainString(USet *set, const UChar *str, int32_t length) {
+    UnicodeString s(length==-1, str, length);  // length == -1: str is NUL-terminated
+    ((UnicodeSet*) set)->UnicodeSet::retain(s);  // keeps s as a whole string element, if it was present
+}
+
+U_CAPI void U_EXPORT2
+uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeString s(length==-1, str, length);  // length == -1: str is NUL-terminated
+    ((UnicodeSet*) set)->UnicodeSet::retainAll(s);  // retains EACH code point of s, not s as a string
+}
+
+U_CAPI void U_EXPORT2
 uset_retainAll(USet* set, const USet* retain) {
     ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain);
 }
@@ -142,6 +160,23 @@
 }
 
 U_CAPI void U_EXPORT2
+uset_complementRange(USet *set, UChar32 start, UChar32 end) {
+    ((UnicodeSet*) set)->UnicodeSet::complement(start, end);  // toggles every code point in start..end (inclusive)
+}
+
+U_CAPI void U_EXPORT2
+uset_complementString(USet *set, const UChar *str, int32_t length) {
+    UnicodeString s(length==-1, str, length);  // length == -1: str is NUL-terminated
+    ((UnicodeSet*) set)->UnicodeSet::complement(s);  // toggles s as a whole string element
+}
+
+U_CAPI void U_EXPORT2
+uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeString s(length==-1, str, length);  // length == -1: str is NUL-terminated
+    ((UnicodeSet*) set)->UnicodeSet::complementAll(s);  // toggles EACH code point of s, not s as a string
+}
+
+U_CAPI void U_EXPORT2
 uset_complementAll(USet* set, const USet* complement) {
     ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement);
 }

diff --git a/icu4c/source/test/cintltst/usettest.c b/icu4c/source/test/cintltst/usettest.c
index d656964..9fe2362 100644
--- a/icu4c/source/test/cintltst/usettest.c
+++ b/icu4c/source/test/cintltst/usettest.c

@@ -211,6 +211,45 @@
     uset_retainAll(set2, set);
     expect(set2, "ghi", "abcdef{bc}", NULL);
 
+    // ICU 69 added some missing functions for parity with C++ and Java.
+    uset_applyPattern(set, u"[abcdef{ch}{sch}]", -1, 0, &ec);
+    if(U_FAILURE(ec)) {
+        log_err("uset_applyPattern([abcdef{ch}{sch}]) failed - %s\n", u_errorName(ec));
+        return;
+    }
+    expect(set, "abcdef{ch}{sch}", "", NULL);
+
+    uset_removeAllCodePoints(set, u"ce", 2);  // per code point: drops 'c' and 'e'
+    expect(set, "abdf{ch}{sch}", "ce", NULL);
+
+    uset_complementRange(set, u'b', u'f');  // toggles b..f: b,d,f leave; c,e come back
+    expect(set, "ace{ch}{sch}", "bdf", NULL);
+
+    uset_complementString(set, u"ch", -1);  // {ch} was present, so it is removed
+    expect(set, "ace{sch}", "bdf{ch}", NULL);
+
+    uset_complementString(set, u"xy", -1);  // {xy} was absent, so it is added
+    expect(set, "ace{sch}{xy}", "bdf{ch}", NULL);
+
+    uset_complementAllCodePoints(set, u"abef", 4);  // per code point: a,e leave; b,f enter
+    expect(set, "bcf{sch}{xy}", "ade{ch}", NULL);
+
+    uset_retainAllCodePoints(set, u"abef", -1);  // per code point: strings are dropped too
+    expect(set, "bf", "acde{ch}{sch}{xy}", NULL);
+
+    uset_applyPattern(set, u"[abcdef{ch}{sch}]", -1, 0, &ec);
+    if(U_FAILURE(ec)) {
+        log_err("uset_applyPattern([abcdef{ch}{sch}]) failed - %s\n", u_errorName(ec));
+        return;
+    }
+    expect(set, "abcdef{ch}{sch}", "", NULL);
+
+    uset_retainString(set, u"sch", 3);  // {sch} is present: it becomes the only element
+    expect(set, "{sch}", "abcdef{ch}", NULL);
+
+    uset_retainString(set, u"ch", 2);  // "ch" is 2 code units; {ch} is absent, so the set empties
+    expect(set, "", "abcdef{ch}{sch}", NULL);
+
     uset_close(set);
     uset_close(set2);
 }

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 97386cf..700ff05 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp

@@ -696,6 +696,37 @@
     if (U_FAILURE(status)) { errln("FAIL"); return; }
     if (set != exp) { errln("FAIL: retain('s')"); return; }
 
+    // ICU 2.6 coverage tests (the signatures below are in Java notation):
+    // public final UnicodeSet retain(String s);
+    // public final UnicodeSet remove(int c);
+    // public final UnicodeSet remove(String s);
+    // public int hashCode();  -- NOTE(review): no hash call is visible in the lines below; confirm
+    set.applyPattern(u"[a-z{ab}{cd}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    set.retain(u"cd");  // {cd} is in the set: only {cd} should remain
+    exp.applyPattern(u"[{cd}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    if (set != exp) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
+
+    set.applyPattern(u"[a-z{ab}{yz}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    set.retain(u"cd");  // {cd} is NOT in the set: the result must be empty
+    exp.clear();
+    if (set != exp) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
+
+    set.applyPattern(u"[a-z{ab}{cd}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    set.remove(u'c');  // removes the code point only; the string {cd} stays
+    exp.applyPattern(u"[abd-z{ab}{cd}]", status);  // NOTE(review): status is not checked here, unlike the neighbouring calls
+    if (set != exp) { errln("FAIL: remove('c')"); return; }
+
+    set.remove(u"cd");  // removes the string element {cd}
+    exp.applyPattern(u"[abd-z{ab}]", status);
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    if (set != exp) { errln("FAIL: remove(\"cd\")"); return; }
+
+    set.applyPattern("[s]", status);  // reset to [s] before the serialize() call that follows
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
     uint16_t buf[32];
     int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
index 5aaf044..f62e682 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java

@@ -514,7 +514,7 @@
 
     /**
      * Make this object represent the range <code>start - end</code>.
-     * If <code>end &gt; start</code> then this object is set to an empty range.
+     * If <code>start &gt; end</code> then this object is set to an empty range.
      *
      * @param start first character in the set, inclusive
      * @param end last character in the set, inclusive
@@ -1159,7 +1159,7 @@
     /**
      * Adds the specified range to this set if it is not already
      * present.  If this set already contains the specified range,
-     * the call leaves this set unchanged.  If <code>end &gt; start</code>
+     * the call leaves this set unchanged.  If <code>start &gt; end</code>
      * then an empty range is added, leaving the set unchanged.
      *
      * @param start first character, inclusive, of range to be added
@@ -1490,13 +1490,11 @@
 
     /**
      * Retain only the elements in this set that are contained in the
-     * specified range.  If <code>end &gt; start</code> then an empty range is
+     * specified range.  If <code>start &gt; end</code> then an empty range is
      * retained, leaving the set empty.
      *
-     * @param start first character, inclusive, of range to be retained
-     * to this set.
-     * @param end last character, inclusive, of range to be retained
-     * to this set.
+     * @param start first character, inclusive, of range
+     * @param end last character, inclusive, of range
      * @stable ICU 2.0
      */
     public UnicodeSet retain(int start, int end) {
@@ -1541,11 +1539,15 @@
             checkFrozen();
             String s = cs.toString();
             boolean isIn = strings.contains(s);
-            if (isIn && size() == 1) {
+            // Check for getRangeCount() first to avoid somewhat-expensive size()
+            // when there are single code points.
+            if (isIn && getRangeCount() == 0 && size() == 1) {
                 return this;
             }
             clear();
-            addString(s);
+            if (isIn) {
+                addString(s);
+            }
             pat = null;
         } else {
             retain(cp, cp);
@@ -1556,7 +1558,7 @@
     /**
      * Removes the specified range from this set if it is present.
      * The set will not contain the specified range once the call
-     * returns.  If <code>end &gt; start</code> then an empty range is
+     * returns.  If <code>start &gt; end</code> then an empty range is
      * removed, leaving the set unchanged.
      *
      * @param start first character, inclusive, of range to be removed
@@ -1617,13 +1619,11 @@
     /**
      * Complements the specified range in this set.  Any character in
      * the range will be removed if it is in this set, or will be
-     * added if it is not in this set.  If <code>end &gt; start</code>
+     * added if it is not in this set.  If <code>start &gt; end</code>
      * then an empty range is complemented, leaving the set unchanged.
      *
-     * @param start first character, inclusive, of range to be removed
-     * from this set.
-     * @param end last character, inclusive, of range to be removed
-     * from this set.
+     * @param start first character, inclusive, of range
+     * @param end last character, inclusive, of range
      * @stable ICU 2.0
      */
     public UnicodeSet complement(int start, int end) {

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
index 176c1e7..a079286 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java

@@ -727,7 +727,12 @@
         set.applyPattern("[a-z{ab}{cd}]");
         set.retain("cd");
         exp.applyPattern("[{cd}]");
-        if (!set.equals(exp)) { errln("FAIL: retain(\"cd\")"); return; }
+        if (!set.equals(exp)) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
+
+        set.applyPattern("[a-z{ab}{yz}]");
+        set.retain("cd");
+        exp.clear();
+        if (!set.equals(exp)) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
 
         set.applyPattern("[a-z{ab}{cd}]");
         set.remove((char)0x63);
commit	66460b9fad43be83a5846d95f0023a304fb84c47	[log] [tgz]
author	Markus Scherer <markus.icu@gmail.com>	Tue Feb 16 16:09:18 2021 -0800
committer	Markus Scherer <markus.icu@gmail.com>	Wed Feb 17 10:12:10 2021 -0800
tree	6469388a9350b03fd19e98cb4464a06bb354807b
parent	7159e334ff12893b857de9a92e1c2eae4f3399fc [diff]