icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetStringSpanTest.java - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html#License
 /*
  *******************************************************************************
  * Copyright (C) 2009-2014, International Business Machines Corporation and
  * others. All Rights Reserved.
  *******************************************************************************
  */
 package com.ibm.icu.dev.test.lang;

 import java.util.Collection;

 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;

 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSet.SpanCondition;
 import com.ibm.icu.util.OutputInt;

 /**
  * @test
  * @summary General test of UnicodeSet string span.
  */
 @RunWith(JUnit4.class)
 public class UnicodeSetStringSpanTest extends TestFmwk {
     // Simple test first, easier to debug.
     @Test
     public void TestSimpleStringSpan() {
         String pattern = "[a{ab}{bc}]";
         String string = "abc";
         UnicodeSet set = new UnicodeSet(pattern);
         set.complement();
         int pos = set.spanBack(string, 3, SpanCondition.SIMPLE);
         if (pos != 1) {
             errln(String.format("FAIL: UnicodeSet(%s).spanBack(%s) returns the wrong value pos %d (!= 1)",
                     set.toString(), string, pos));
         }
         pos = set.span(string, SpanCondition.SIMPLE);
         if (pos != 3) {
             errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)",
                     set.toString(), string, pos));
         }
         pos = set.span(string, 1, SpanCondition.SIMPLE);
         if (pos != 3) {
             errln(String.format("FAIL: UnicodeSet(%s).span(%s, 1) returns the wrong value pos %d (!= 3)",
                     set.toString(), string, pos));
         }
     }

     // test our slow implementation
     @Test
     public void TestSimpleStringSpanSlow() {
         String pattern = "[a{ab}{bc}]";
         String string = "abc";
         UnicodeSet uset = new UnicodeSet(pattern);
         uset.complement();
         UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);

         int length = containsSpanBackUTF16(set, string, 3, SpanCondition.SIMPLE);
         if (length != 1) {
             errln(String.format("FAIL: UnicodeSet(%s) containsSpanBackUTF16(%s) returns the wrong value length %d (!= 1)",
                     set.toString(), string, length));
         }
         length = containsSpanUTF16(set, string, SpanCondition.SIMPLE);
         if (length != 3) {
             errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 3)",
                     set.toString(), string, length));
         }
         length = containsSpanUTF16(set, string.substring(1), SpanCondition.SIMPLE);
         if (length != 2) {
             errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 2)",
                     set.toString(), string, length));
         }
     }

     // Test select patterns and strings, and test SIMPLE.
     @Test
     public void TestSimpleStringSpanAndFreeze() {
         String pattern = "[x{xy}{xya}{axy}{ax}]";
         final String string = "xx"
                 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx"
                 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx"
                 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" + "aaaa";

         UnicodeSet set = new UnicodeSet(pattern);
         if (set.containsAll(string)) {
             errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + ") should be FALSE");
         }

         // Remove trailing "aaaa".
         String string16 = string.substring(0, string.length() - 4);
         if (!set.containsAll(string16)) {
             errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + "[:-4]) should be TRUE");
         }

         String s16 = "byayaxya";
         if (   set.span(s16.substring(0, 8), SpanCondition.NOT_CONTAINED) != 4
             || set.span(s16.substring(0, 7), SpanCondition.NOT_CONTAINED) != 4
             || set.span(s16.substring(0, 6), SpanCondition.NOT_CONTAINED) != 4
             || set.span(s16.substring(0, 5), SpanCondition.NOT_CONTAINED) != 5
             || set.span(s16.substring(0, 4), SpanCondition.NOT_CONTAINED) != 4
             || set.span(s16.substring(0, 3), SpanCondition.NOT_CONTAINED) != 3) {
             errln("FAIL: UnicodeSet(" + pattern + ").span(while not) returns the wrong value");
         }

         pattern = "[a{ab}{abc}{cd}]";
         set.applyPattern(pattern);
         s16 = "acdabcdabccd";
         if (   set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12
             || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6
             || set.span(s16.substring(7),     SpanCondition.SIMPLE) != 5) {
             errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value");
         }
         set.freeze();
         if (   set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12
             || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6
             || set.span(s16.substring(7),     SpanCondition.SIMPLE) != 5) {
             errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value");
         }

         pattern = "[d{cd}{bcd}{ab}]";
         set = set.cloneAsThawed();
         set.applyPattern(pattern).freeze();
         s16 = "abbcdabcdabd";
         if (   set.spanBack(s16, 12, SpanCondition.CONTAINED) != 0
             || set.spanBack(s16, 12, SpanCondition.SIMPLE) != 6
             || set.spanBack(s16,  5, SpanCondition.SIMPLE) != 0) {
             errln("FAIL: UnicodeSet(" + pattern + ").spanBack(while longest match) returns the wrong value");
         }
     }

     // more complex test. --------------------------------------------------------

     // Make the strings in a UnicodeSet easily accessible.
     private static class UnicodeSetWithStrings {
         private UnicodeSet set;
         private Collection<String> setStrings;
         private int stringsLength;

         public UnicodeSetWithStrings(final UnicodeSet normalSet) {
             set = normalSet;
             setStrings = normalSet.strings();
             stringsLength = setStrings.size();
         }

         public final UnicodeSet getSet() {
             return set;
         }

         public boolean hasStrings() {
             return (stringsLength > 0);
         }

         public Iterable<String> strings() {
             return setStrings;
         }
     }

     // Compare 16-bit Unicode strings (which may be malformed UTF-16)
     // at code point boundaries.
     // That is, each edge of a match must not be in the middle of a surrogate pair.
     static boolean matches16CPB(final String s, int start, int limit, final String t) {
         limit -= start;
         int length = t.length();
         return t.equals(s.substring(start, start + length))
                 && !(0 < start && UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
                                   UTF16.isTrailSurrogate(s.charAt(start)))
                 && !(length < limit && UTF16.isLeadSurrogate (s.charAt(start + length - 1)) &&
                                        UTF16.isTrailSurrogate(s.charAt(start + length)));
     }

     // Implement span() with contains() for comparison.
     static int containsSpanUTF16(final UnicodeSetWithStrings set, final String s,
             SpanCondition spanCondition) {
         final UnicodeSet realSet = set.getSet();
         int length = s.length();
         if (!set.hasStrings()) {
             boolean spanContained = false;
             if (spanCondition != SpanCondition.NOT_CONTAINED) {
                 spanContained = true; // Pin to 0/1 values.
             }

             int c;
             int start = 0, prev;
             while ((prev = start) < length) {
                 c = s.codePointAt(start);
                 start = s.offsetByCodePoints(start, 1);
                 if (realSet.contains(c) != spanContained) {
                     break;
                 }
             }
             return prev;
         } else if (spanCondition == SpanCondition.NOT_CONTAINED) {
             int c;
             int start, next;
             for (start = next = 0; start < length;) {
                 c = s.codePointAt(next);
                 next = s.offsetByCodePoints(next, 1);
                 if (realSet.contains(c)) {
                     break;
                 }
                 for (String str : set.strings()) {
                     if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
                         // spanNeedsStrings=true;
                         return start;
                     }
                 }
                 start = next;
             }
             return start;
         } else /* CONTAINED or SIMPLE */{
             int c;
             int start, next, maxSpanLimit = 0;
             for (start = next = 0; start < length;) {
                 c = s.codePointAt(next);
                 next = s.offsetByCodePoints(next, 1);
                 if (!realSet.contains(c)) {
                     next = start; // Do not span this single, not-contained code point.
                 }
                 for (String str : set.strings()) {
                     if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
                         // spanNeedsStrings=true;
                         int matchLimit = start + str.length();
                         if (matchLimit == length) {
                             return length;
                         }
                         if (spanCondition == SpanCondition.CONTAINED) {
                             // Iterate for the shortest match at each position.
                             // Recurse for each but the shortest match.
                             if (next == start) {
                                 next = matchLimit; // First match from start.
                             } else {
                                 if (matchLimit < next) {
                                     // Remember shortest match from start for iteration.
                                     int temp = next;
                                     next = matchLimit;
                                     matchLimit = temp;
                                 }
                                 // Recurse for non-shortest match from start.
                                 int spanLength = containsSpanUTF16(set, s.substring(matchLimit),
                                         SpanCondition.CONTAINED);
                                 if ((matchLimit + spanLength) > maxSpanLimit) {
                                     maxSpanLimit = matchLimit + spanLength;
                                     if (maxSpanLimit == length) {
                                         return length;
                                     }
                                 }
                             }
                         } else /* spanCondition==SIMPLE */{
                             if (matchLimit > next) {
                                 // Remember longest match from start.
                                 next = matchLimit;
                             }
                         }
                     }
                 }
                 if (next == start) {
                     break; // No match from start.
                 }
                 start = next;
             }
             if (start > maxSpanLimit) {
                 return start;
             } else {
                 return maxSpanLimit;
             }
         }
     }

     static int containsSpanBackUTF16(final UnicodeSetWithStrings set, final String s, int length,
             SpanCondition spanCondition) {
         if (length == 0) {
             return 0;
         }
         final UnicodeSet realSet = set.getSet();
         if (!set.hasStrings()) {
             boolean spanContained = false;
             if (spanCondition != SpanCondition.NOT_CONTAINED) {
                 spanContained = true; // Pin to 0/1 values.
             }

             int c;
             int prev = length;
             do {
                 c = s.codePointBefore(prev);
                 if (realSet.contains(c) != spanContained) {
                     break;
                 }
                 prev = s.offsetByCodePoints(prev, -1);
             } while (prev > 0);
             return prev;
         } else if (spanCondition == SpanCondition.NOT_CONTAINED) {
             int c;
             int prev = length, length0 = length;
             do {
                 c = s.codePointBefore(prev);
                 if (realSet.contains(c)) {
                     break;
                 }
                 for (String str : set.strings()) {
                     if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
                         // spanNeedsStrings=true;
                         return prev;
                     }
                 }
                 prev = s.offsetByCodePoints(prev, -1);
             } while (prev > 0);
             return prev;
         } else /* SpanCondition.CONTAINED or SIMPLE */{
             int c;
             int prev = length, minSpanStart = length, length0 = length;
             do {
                 c = s.codePointBefore(length);
                 length = s.offsetByCodePoints(length, -1);
                 if (!realSet.contains(c)) {
                     length = prev; // Do not span this single, not-contained code point.
                 }
                 for (String str : set.strings()) {
                     if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
                         // spanNeedsStrings=true;
                         int matchStart = prev - str.length();
                         if (matchStart == 0) {
                             return 0;
                         }
                         if (spanCondition == SpanCondition.CONTAINED) {
                             // Iterate for the shortest match at each position.
                             // Recurse for each but the shortest match.
                             if (length == prev) {
                                 length = matchStart; // First match from prev.
                             } else {
                                 if (matchStart > length) {
                                     // Remember shortest match from prev for iteration.
                                     int temp = length;
                                     length = matchStart;
                                     matchStart = temp;
                                 }
                                 // Recurse for non-shortest match from prev.
                                 int spanStart = containsSpanBackUTF16(set, s, matchStart,
                                         SpanCondition.CONTAINED);
                                 if (spanStart < minSpanStart) {
                                     minSpanStart = spanStart;
                                     if (minSpanStart == 0) {
                                         return 0;
                                     }
                                 }
                             }
                         } else /* spanCondition==SIMPLE */{
                             if (matchStart < length) {
                                 // Remember longest match from prev.
                                 length = matchStart;
                             }
                         }
                     }
                 }
                 if (length == prev) {
                     break; // No match from prev.
                 }
             } while ((prev = length) > 0);
             if (prev < minSpanStart) {
                 return prev;
             } else {
                 return minSpanStart;
             }
         }
     }

     // spans to be performed and compared
     static final int SPAN_UTF16 = 1;
     static final int SPAN_UTF8 = 2;
     static final int SPAN_UTFS = 3;

     static final int SPAN_SET = 4;
     static final int SPAN_COMPLEMENT = 8;
     static final int SPAN_POLARITY = 0xc;

     static final int SPAN_FWD = 0x10;
     static final int SPAN_BACK = 0x20;
     static final int SPAN_DIRS = 0x30;

     static final int SPAN_CONTAINED = 0x100;
     static final int SPAN_SIMPLE = 0x200;
     static final int SPAN_CONDITION = 0x300;

     static final int SPAN_ALL = 0x33f;

     static SpanCondition invertSpanCondition(SpanCondition spanCondition, SpanCondition contained) {
         return spanCondition == SpanCondition.NOT_CONTAINED ? contained
                 : SpanCondition.NOT_CONTAINED;
     }

     /*
      * Count spans on a string with the method according to type and set the span limits. The set may be the complement
      * of the original. When using spanBack() and comparing with span(), use a span condition for the first spanBack()
      * according to the expected number of spans. Sets typeName to an empty string if there is no such type. Returns -1
      * if the span option is filtered out.
      */
     static int getSpans(final UnicodeSetWithStrings set, boolean isComplement, final String s,
             int whichSpans, int type, String[] typeName, int limits[], int limitsCapacity,
             int expectCount) {
         final UnicodeSet realSet = set.getSet();
         int start, count, i;
         SpanCondition spanCondition, firstSpanCondition, contained;
         boolean isForward;

         int length = s.length();
         if (type < 0 || 7 < type) {
             typeName[0] = null;
             return 0;
         }

         final String typeNames16[] = {
                 "contains",
                 "contains(LM)",
                 "span",
                 "span(LM)",
                 "containsBack",
                 "containsBack(LM)",
                 "spanBack",
                 "spanBack(LM)" };

         typeName[0] = typeNames16[type];

         // filter span options
         if (type <= 3) {
             // span forward
             if ((whichSpans & SPAN_FWD) == 0) {
                 return -1;
             }
             isForward = true;
         } else {
             // span backward
             if ((whichSpans & SPAN_BACK) == 0) {
                 return -1;
             }
             isForward = false;
         }
         if ((type & 1) == 0) {
             // use SpanCondition.CONTAINED
             if ((whichSpans & SPAN_CONTAINED) == 0) {
                 return -1;
             }
             contained = SpanCondition.CONTAINED;
         } else {
             // use SIMPLE
             if ((whichSpans & SPAN_SIMPLE) == 0) {
                 return -1;
             }
             contained = SpanCondition.SIMPLE;
         }

         // Default first span condition for going forward with an uncomplemented set.
         spanCondition = SpanCondition.NOT_CONTAINED;
         if (isComplement) {
             spanCondition = invertSpanCondition(spanCondition, contained);
         }

         // First span condition for span(), used to terminate the spanBack() iteration.
         firstSpanCondition = spanCondition;

         // spanBack(): Its initial span condition is span()'s last span condition,
         // which is the opposite of span()'s first span condition
         // if we expect an even number of spans.
         // (The loop inverts spanCondition (expectCount-1) times
         // before the expectCount'th span() call.)
         // If we do not compare forward and backward directions, then we do not have an
         // expectCount and just start with firstSpanCondition.
         if (!isForward && (whichSpans & SPAN_FWD) != 0 && (expectCount & 1) == 0) {
             spanCondition = invertSpanCondition(spanCondition, contained);
         }

         count = 0;
         switch (type) {
         case 0:
         case 1:
             start = 0;
             for (;;) {
                 start += containsSpanUTF16(set, s.substring(start), spanCondition);
                 if (count < limitsCapacity) {
                     limits[count] = start;
                 }
                 ++count;
                 if (start >= length) {
                     break;
                 }
                 spanCondition = invertSpanCondition(spanCondition, contained);
             }
             break;
         case 2:
         case 3:
             start = 0;
             for (;;) {
                 start = realSet.span(s, start, spanCondition);
                 if (count < limitsCapacity) {
                     limits[count] = start;
                 }
                 ++count;
                 if (start >= length) {
                     break;
                 }
                 spanCondition = invertSpanCondition(spanCondition, contained);
             }
             break;
         case 4:
         case 5:
             for (;;) {
                 ++count;
                 if (count <= limitsCapacity) {
                     limits[limitsCapacity - count] = length;
                 }
                 length = containsSpanBackUTF16(set, s, length, spanCondition);
                 if (length == 0 && spanCondition == firstSpanCondition) {
                     break;
                 }
                 spanCondition = invertSpanCondition(spanCondition, contained);
             }
             if (count < limitsCapacity) {
                 for (i = count; i-- > 0;) {
                     limits[i] = limits[limitsCapacity - count + i];
                 }
             }
             break;
         case 6:
         case 7:
             for (;;) {
                 ++count;
                 if (count <= limitsCapacity) {
                     limits[limitsCapacity - count] = length >= 0 ? length : s.length();
                 }
                 length = realSet.spanBack(s, length, spanCondition);
                 if (length == 0 && spanCondition == firstSpanCondition) {
                     break;
                 }
                 spanCondition = invertSpanCondition(spanCondition, contained);
             }
             if (count < limitsCapacity) {
                 for (i = count; i-- > 0;) {
                     limits[i] = limits[limitsCapacity - count + i];
                 }
             }
             break;
         default:
             typeName = null;
             return -1;
         }

         return count;
     }

     // sets to be tested; odd index=isComplement
     static final int SLOW = 0;
     static final int SLOW_NOT = 1;
     static final int FAST = 2;
     static final int FAST_NOT = 3;
     static final int SET_COUNT = 4;

     static final String setNames[] = { "slow", "slow.not", "fast", "fast.not" };

     /*
      * Verify that we get the same results whether we look at text with contains(), span() or spanBack(), using unfrozen
      * or frozen versions of the set, and using the set or its complement (switching the spanConditions accordingly).
      * The latter verifies that set.span(spanCondition) == set.complement().span(!spanCondition).
      *
      * The expectLimits[] are either provided by the caller (with expectCount>=0) or returned to the caller (with an
      * input expectCount<0).
      */
     void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
             int expectLimits[], int expectCount,
             final String testName, int index) {
         int[] limits = new int[500];
         int limitsCount;
         int i, j;
         String[] typeName = new String[1];
         int type;

         for (i = 0; i < SET_COUNT; ++i) {
             if ((i & 1) == 0) {
                 // Even-numbered sets are original, uncomplemented sets.
                 if ((whichSpans & SPAN_SET) == 0) {
                     continue;
                 }
             } else {
                 // Odd-numbered sets are complemented.
                 if ((whichSpans & SPAN_COMPLEMENT) == 0) {
                     continue;
                 }
             }
             for (type = 0;; ++type) {
                 limitsCount = getSpans(sets[i], (0 != (i & 1)), s, whichSpans, type, typeName, limits,
                         limits.length, expectCount);
                 if (typeName[0] == null) {
                     break; // All types tried.
                 }
                 if (limitsCount < 0) {
                     continue; // Span option filtered out.
                 }
                 if (expectCount < 0) {
                     expectCount = limitsCount;
                     if (limitsCount > limits.length) {
                         errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d > %d capacity - too many spans",
                                 testName, index, setNames[i], typeName[0], limitsCount, limits.length));
                         return;
                     }
                     for (j = limitsCount; j-- > 0;) {
                         expectLimits[j] = limits[j];
                     }
                 } else if (limitsCount != expectCount) {
                     errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d != %d", testName, index, setNames[i],
                             typeName[0], limitsCount, expectCount));
                 } else {
                     for (j = 0; j < limitsCount; ++j) {
                         if (limits[j] != expectLimits[j]) {
                             errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d limits[%d]=%d != %d", testName,
                                     index, setNames[i], typeName[0], limitsCount, j, limits[j], expectLimits[j]));
                             break;
                         }
                     }
                 }
             }
         }

         // Compare span() with containsAll()/containsNone(),
         // but only if we have expectLimits[] from the uncomplemented set.
         if ((whichSpans & SPAN_SET) != 0) {
             final String s16 = s;
             String string;
             int prev = 0, limit, len;
             for (i = 0; i < expectCount; ++i) {
                 limit = expectLimits[i];
                 len = limit - prev;
                 if (len > 0) {
                     string = s16.substring(prev, prev + len); // read-only alias
                     if (0 != (i & 1)) {
                         if (!sets[SLOW].getSet().containsAll(string)) {
                             errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()",
                                     testName, index, setNames[SLOW], prev, limit));
                             return;
                         }
                         if (!sets[FAST].getSet().containsAll(string)) {
                             errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()",
                                     testName, index, setNames[FAST], prev, limit));
                             return;
                         }
                     } else {
                         if (!sets[SLOW].getSet().containsNone(string)) {
                             errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()",
                                     testName, index, setNames[SLOW], prev, limit));
                             return;
                         }
                         if (!sets[FAST].getSet().containsNone(string)) {
                             errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()",
                                     testName, index, setNames[FAST], prev, limit));
                             return;
                         }
                     }
                 }
                 prev = limit;
             }
         }
     }

     // Specifically test either UTF-16 or UTF-8.
     void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
             final String testName, int index) {
         int[] expectLimits = new int[500];
         int expectCount = -1;
         verifySpan(sets, s, whichSpans, expectLimits, expectCount, testName, index);
     }

     // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
     // unless either UTF is turned off in whichSpans.
     // Testing UTF-16 and UTF-8 together requires that surrogate code points
     // have the same contains(c) value as U+FFFD.
     void verifySpanBothUTFs(final UnicodeSetWithStrings sets[], final String s16, int whichSpans,
             final String testName, int index) {
         int[] expectLimits = new int[500];
         int expectCount;

         expectCount = -1; // Get expectLimits[] from verifySpan().

         if ((whichSpans & SPAN_UTF16) != 0) {
             verifySpan(sets, s16, whichSpans, expectLimits, expectCount, testName, index);
         }
     }

     static int nextCodePoint(int c) {
         // Skip some large and boring ranges.
         switch (c) {
         case 0x3441:
             return 0x4d7f;
         case 0x5100:
             return 0x9f00;
         case 0xb040:
             return 0xd780;
         case 0xe041:
             return 0xf8fe;
         case 0x10100:
             return 0x20000;
         case 0x20041:
             return 0xe0000;
         case 0xe0101:
             return 0x10fffd;
         default:
             return c + 1;
         }
     }

     // Verify that all implementations represent the same set.
     void verifySpanContents(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) {
         StringBuffer s = new StringBuffer();
         int localWhichSpans;
         int c, first;
         for (first = c = 0;; c = nextCodePoint(c)) {
             if (c > 0x10ffff || s.length() > 1024) {
                 localWhichSpans = whichSpans;
                 verifySpanBothUTFs(sets, s.toString(), localWhichSpans, testName, first);
                 if (c > 0x10ffff) {
                     break;
                 }
                 s.delete(0, s.length());
                 first = c;
             }
             UTF16.append(s, c);
         }
     }

     // Test with a particular, interesting string.
     // Specify length and try NUL-termination.
     static final char interestingStringChars[] = { 0x61, 0x62, 0x20, // Latin, space
             0x3b1, 0x3b2, 0x3b3, // Greek
             0xd900, // lead surrogate
             0x3000, 0x30ab, 0x30ad, // wide space, Katakana
             0xdc05, // trail surrogate
             0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
             0xd900, 0xdc05, // unassigned supplementary
             0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
             0xd7a4, 0xdc05, 0xd900, 0x2028  // unassigned, surrogates in wrong order, LS
     };
     static String interestingString = new String(interestingStringChars);
     static final String unicodeSet1 = "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]";

     @Test
     public void TestInterestingStringSpan() {
         UnicodeSet uset = new UnicodeSet(Utility.unescape(unicodeSet1));
         SpanCondition spanCondition = SpanCondition.NOT_CONTAINED;
         int expect = 2;
         int start = 14;

         int c = 0xd840;
         boolean contains = uset.contains(c);
         if (false != contains) {
             errln(String.format("FAIL: UnicodeSet(unicodeSet1).contains(%d) = true (expect false)",
                   c));
         }

         UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
         int len = containsSpanUTF16(set, interestingString.substring(start), spanCondition);
         if (expect != len) {
             errln(String.format("FAIL: containsSpanUTF16(unicodeSet1, \"%s(%d)\") = %d (expect %d)",
                   interestingString, start, len, expect));
         }

         len = uset.span(interestingString, start, spanCondition) - start;
         if (expect != len) {
             errln(String.format("FAIL: UnicodeSet(unicodeSet1).span(\"%s\", %d) = %d (expect %d)",
                   interestingString, start, len, expect));
         }
     }

     void verifySpanUTF16String(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) {
         if ((whichSpans & SPAN_UTF16) == 0) {
             return;
         }
         verifySpan(sets, interestingString, (whichSpans & ~SPAN_UTF8), testName, 1);
     }

     // Take a set of span options and multiply them so that
     // each portion only has one of the options a, b and c.
     // If b==0, then the set of options is just modified with mask and a.
     // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
     static int addAlternative(int whichSpans[], int whichSpansCount, int mask, int a, int b, int c) {
         int s;
         int i;

         for (i = 0; i < whichSpansCount; ++i) {
             s = whichSpans[i] & mask;
             whichSpans[i] = s | a;
             if (b != 0) {
                 whichSpans[whichSpansCount + i] = s | b;
                 if (c != 0) {
                     whichSpans[2 * whichSpansCount + i] = s | c;
                 }
             }
         }
         return b == 0 ? whichSpansCount : c == 0 ? 2 * whichSpansCount : 3 * whichSpansCount;
     }

     // They are not representable in UTF-8, and a leading trail surrogate
     // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
     // U+20001 == \\uD840\\uDC01
     // U+20400 == \\uD841\\uDC00
     static final String patternWithUnpairedSurrogate =
         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]";
     static final String stringWithUnpairedSurrogate =
         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb";

     static final String _63_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
     static final String _64_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
     static final String _63_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
     static final String _64_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
     static final String longPattern =
         "[a{" + _64_a + _64_a + _64_a + _64_a + "b}" + "{a" + _64_b + _64_b + _64_b + _64_b + "}]";

     @Test
     public void TestStringWithUnpairedSurrogateSpan() {
         String string = Utility.unescape(stringWithUnpairedSurrogate);
         UnicodeSet uset = new UnicodeSet(Utility.unescape(patternWithUnpairedSurrogate));
         SpanCondition spanCondition = SpanCondition.NOT_CONTAINED;
         int start = 17;
         int expect = 5;

         UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
         int len = containsSpanUTF16(set, string.substring(start), spanCondition);
         if (expect != len) {
             errln(String.format("FAIL: containsSpanUTF16(patternWithUnpairedSurrogate, \"%s(%d)\") = %d (expect %d)",
                   string, start, len, expect));
         }

         len = uset.span(string, start, spanCondition) - start;
         if (expect != len) {
             errln(String.format("FAIL: UnicodeSet(patternWithUnpairedSurrogate).span(\"%s\", %d) = %d (expect %d)",
                   string, start, len, expect));
         }
     }

     @Test
     public void TestSpan() {
         // "[...]" is a UnicodeSet pattern.
         // "*" performs tests on all Unicode code points and on a selection of
         // malformed UTF-8/16 strings.
         // "-options" limits the scope of testing for the current set.
         // By default, the test verifies that equivalent boundaries are found
         // for UTF-16 and UTF-8, going forward and backward,
         // alternating NOT_CONTAINED with
         // either CONTAINED or SIMPLE.
         // Single-character options:
         // 8 -- UTF-16 and UTF-8 boundaries may differ.
         // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
         // or the set contains strings with unpaired surrogates
         // which do not translate to valid UTF-8.
         // c -- set.span() and set.complement().span() boundaries may differ.
         // Cause: Set strings are not complemented.
         // b -- span() and spanBack() boundaries may differ.
         // Cause: Strings in the set overlap, and spanBack(CONTAINED)
         // and spanBack(SIMPLE) are defined to
         // match with non-overlapping substrings.
         // For example, with a set containing "ab" and "ba",
         // span() of "aba" yields boundaries { 0, 2, 3 }
         // because the initial "ab" matches from 0 to 2,
         // while spanBack() yields boundaries { 0, 1, 3 }
         // because the final "ba" matches from 1 to 3.
         // l -- CONTAINED and SIMPLE boundaries may differ.
         // Cause: Strings in the set overlap, and a longer match may
         // require a sequence including non-longest substrings.
         // For example, with a set containing "ab", "abc" and "cd",
         // span(contained) of "abcd" spans the entire string
         // but span(longest match) only spans the first 3 characters.
         // Each "-options" first resets all options and then applies the specified options.
         // A "-" without options resets the options.
         // The options are also reset for each new set.
         // Other strings will be spanned.
         final String testdata[] = {
                 "[:ID_Continue:]",
                 "*",
                 "[:White_Space:]",
                 "*",
                 "[]",
                 "*",
                 "[\\u0000-\\U0010FFFF]",
                 "*",
                 "[\\u0000\\u0080\\u0800\\U00010000]",
                 "*",
                 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
                 "*",
                 unicodeSet1,
                 "-c",
                 "*",
                 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
                 "-c",
                 "*",

                 // Overlapping strings cause overlapping attempts to match.
                 "[x{xy}{xya}{axy}{ax}]",
                 "-cl",

                 // More repetitions of "xya" would take too long with the recursive
                 // reference implementation.
                 // containsAll()=false
                 // test_string 0x14
                 "xx" + "xyaxyaxyaxya" + // set.complement().span(longest match) will stop here.
                         "xx" + // set.complement().span(contained) will stop between the two 'x'es.
                         "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + // span() ends here.
                         "aaa",

                 // containsAll()=true
                 // test_string 0x15
                 "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxy",

                 "-bc",
                 // test_string 0x17
                 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
                 "-c",
                 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
                 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
                 "-",
                 "byaya", // span() -> { 5 }
                 "byay", // span() -> { 4 }
                 "bya", // span() -> { 3 }

                 // span(longest match) will not span the whole string.
                 "[a{ab}{bc}]",
                 "-cl",
                 // test_string 0x21
                 "abc",

                 "[a{ab}{abc}{cd}]",
                 "-cl",
                 "acdabcdabccd",

                 // spanBack(longest match) will not span the whole string.
                 "[c{ab}{bc}]",
                 "-cl",
                 "abc",

                 "[d{cd}{bcd}{ab}]",
                 "-cl",
                 "abbcdabcdabd",

                 // Test with non-ASCII set strings - test proper handling of surrogate pairs
                 // and UTF-8 trail bytes.
                 // Copies of above test sets and strings, but transliterated to have
                 // different code points with similar trail units.
                 // Previous: a b c d
                 // Unicode: 042B 30AB 200AB 204AB
                 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
                 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
                 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
                 "-cl",
                 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",

                 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
                 "-cl",
                 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",

                 // Stress bookkeeping and recursion.
                 // The following strings are barely doable with the recursive
                 // reference implementation.
                 // The not-contained character at the end prevents an early exit from the span().
                 "[b{bb}]",
                 "-c",
                 // test_string 0x33
                 "bbbbbbbbbbbbbbbbbbbbbbbb-",
                 // On complement sets, span() and spanBack() get different results
                 // because b is not in the complement set and there is an odd number of b's
                 // in the test string.
                 "-bc",
                 "bbbbbbbbbbbbbbbbbbbbbbbbb-",

                 // Test with set strings with an initial or final code point span
                 // longer than 254.
                 longPattern,
                 "-c",
                 _64_a + _64_a + _64_a + _63_a + "b",
                 _64_a + _64_a + _64_a + _64_a + "b",
                 _64_a + _64_a + _64_a + _64_a + "aaaabbbb",
                 "a" + _64_b + _64_b + _64_b + _63_b,
                 "a" + _64_b + _64_b + _64_b + _64_b,
                 "aaaabbbb" + _64_b + _64_b + _64_b + _64_b,

                 // Test with strings containing unpaired surrogates.
                 patternWithUnpairedSurrogate, "-8cl",
                 stringWithUnpairedSurrogate };
         int i, j;
         int whichSpansCount = 1;
         int[] whichSpans = new int[96];
         for (i = whichSpans.length; i-- > 0;) {
             whichSpans[i] = SPAN_ALL;
         }

         UnicodeSet[] sets = new UnicodeSet[SET_COUNT];
         UnicodeSetWithStrings[] sets_with_str = new UnicodeSetWithStrings[SET_COUNT];

         String testName = null;
         @SuppressWarnings("unused")
         String testNameLimit;

         for (i = 0; i < testdata.length; ++i) {
             final String s = testdata[i];
             if (s.charAt(0) == '[') {
                 // Create new test sets from this pattern.
                 for (j = 0; j < SET_COUNT; ++j) {
                     sets_with_str[j] = null;
                     sets[j] = null;
                 }
                 sets[SLOW] = new UnicodeSet(Utility.unescape(s));
                 sets[SLOW_NOT] = new UnicodeSet(sets[SLOW]);
                 sets[SLOW_NOT].complement();
                 // Intermediate set: Test cloning of a frozen set.
                 UnicodeSet fast = new UnicodeSet(sets[SLOW]);
                 fast.freeze();
                 sets[FAST] = (UnicodeSet) fast.clone();
                 fast = null;
                 UnicodeSet fastNot = new UnicodeSet(sets[SLOW_NOT]);
                 fastNot.freeze();
                 sets[FAST_NOT] = (UnicodeSet) fastNot.clone();
                 fastNot = null;

                 for (j = 0; j < SET_COUNT; ++j) {
                     sets_with_str[j] = new UnicodeSetWithStrings(sets[j]);
                 }

                 testName = s + ':';
                 whichSpans[0] = SPAN_ALL;
                 whichSpansCount = 1;
             } else if (s.charAt(0) == '-') {
                 whichSpans[0] = SPAN_ALL;
                 whichSpansCount = 1;

                 for (j = 1; j < s.length(); j++) {
                     switch (s.charAt(j)) {
                     case 'c':
                         whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_POLARITY, SPAN_SET,
                                 SPAN_COMPLEMENT, 0);
                         break;
                     case 'b':
                         whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_DIRS, SPAN_FWD, SPAN_BACK,
                                 0);
                         break;
                     case 'l':
                         // test CONTAINED FWD & BACK, and separately
                         // SIMPLE only FWD, and separately
                         // SIMPLE only BACK
                         whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~(SPAN_DIRS | SPAN_CONDITION),
                                 SPAN_DIRS | SPAN_CONTAINED, SPAN_FWD | SPAN_SIMPLE, SPAN_BACK | SPAN_SIMPLE);
                         break;
                     case '8':
                         whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_UTFS, SPAN_UTF16,
                                 SPAN_UTF8, 0);
                         break;
                     default:
                         errln(String.format("FAIL: unrecognized span set option in \"%s\"", testdata[i]));
                         break;
                     }
                 }
             } else if (s.equals("*")) {
                 testNameLimit = "bad_string";
                 for (j = 0; j < whichSpansCount; ++j) {
                     if (whichSpansCount > 1) {
                         testNameLimit += String.format("%%0x%3x", whichSpans[j]);
                     }
                     verifySpanUTF16String(sets_with_str, whichSpans[j], testName);
                 }

                 testNameLimit = "contents";
                 for (j = 0; j < whichSpansCount; ++j) {
                     if (whichSpansCount > 1) {
                         testNameLimit += String.format("%%0x%3x", whichSpans[j]);
                     }
                     verifySpanContents(sets_with_str, whichSpans[j], testName);
                 }
             } else {
                 String string = Utility.unescape(s);
                 testNameLimit = "test_string";
                 for (j = 0; j < whichSpansCount; ++j) {
                     if (whichSpansCount > 1) {
                         testNameLimit += String.format("%%0x%3x", whichSpans[j]);
                     }
                     verifySpanBothUTFs(sets_with_str, string, whichSpans[j], testName, i);
                 }
             }
         }
     }

     @Test
     public void TestSpanAndCount() {
         // a set with no strings
         UnicodeSet abc = new UnicodeSet('a', 'c');
         // a set with an "irrelevant" string (fully contained in the code point set)
         UnicodeSet crlf = new UnicodeSet().add('\n').add('\r').add("\r\n");
         // a set with no "irrelevant" string but some interesting overlaps
         UnicodeSet ab_cd = new UnicodeSet().add('a').add("ab").add("abc").add("cd");
         String s = "ab\n\r\r\n" + UTF16.valueOf(0x50000) + "abcde";
         OutputInt count = new OutputInt();
         assertEquals("abc span[8, 11[", 11,
                 abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
         assertEquals("abc count=3", 3, count.value);
         assertEquals("no abc span[2, 8[", 8,
                 abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
         assertEquals("no abc count=5", 5, count.value);
         assertEquals("line endings span[2, 6[", 6,
                 crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count));
         assertEquals("line endings count=3", 3, count.value);
         assertEquals("no ab+cd span[2, 8[", 8,
                 ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
         assertEquals("no ab+cd count=5", 5, count.value);
         assertEquals("ab+cd span[8, 12[", 12,
                 ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count));
         assertEquals("ab+cd count=2", 2, count.value);
         assertEquals("1x abc span[8, 11[", 11,
                 ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
         assertEquals("1x abc count=1", 1, count.value);

         abc.freeze();
         crlf.freeze();
         ab_cd.freeze();
         assertEquals("abc span[8, 11[ (frozen)", 11,
                 abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
         assertEquals("abc count=3 (frozen)", 3, count.value);
         assertEquals("no abc span[2, 8[ (frozen)", 8,
                 abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
         assertEquals("no abc count=5 (frozen)", 5, count.value);
         assertEquals("line endings span[2, 6[ (frozen)", 6,
                 crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count));
         assertEquals("line endings count=3 (frozen)", 3, count.value);
         assertEquals("no ab+cd span[2, 8[ (frozen)", 8,
                 ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
         assertEquals("no ab+cd count=5 (frozen)", 5, count.value);
         assertEquals("ab+cd span[8, 12[ (frozen)", 12,
                 ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count));
         assertEquals("ab+cd count=2 (frozen)", 2, count.value);
         assertEquals("1x abc span[8, 11[ (frozen)", 11,
                 ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
         assertEquals("1x abc count=1 (frozen)", 1, count.value);
     }
 }