icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeSetStringSpan.java - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
  ******************************************************************************
  *
  *   Copyright (C) 2009-2015, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  ******************************************************************************
  */

 package com.ibm.icu.impl;

 import java.util.ArrayList;

 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSet.SpanCondition;
 import com.ibm.icu.util.OutputInt;

 /*
  * Implement span() etc. for a set with strings.
  * Avoid recursion because of its exponential complexity.
  * Instead, try multiple paths at once and track them with an IndexList.
  */
 public class UnicodeSetStringSpan {

     /*
      * Which span() variant will be used? The object is either built for one variant and used once,
      * or built for all and may be used many times.
      */
     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
     public static final int FWD           = 0x20;
     public static final int BACK          = 0x10;
     // public static final int UTF16      = 8;
     public static final int CONTAINED     = 2;
     public static final int NOT_CONTAINED = 1;

     public static final int ALL = 0x7f;

     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;

     /**
      * Special spanLength short values. (since Java has not unsigned byte type)
      * All code points in the string are contained in the parent set.
      */
     static final short ALL_CP_CONTAINED = 0xff;
     /** The spanLength is >=0xfe. */
     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;

     /** Set for span(). Same as parent but without strings. */
     private UnicodeSet spanSet;

     /**
      * Set for span(not contained).
      * Same as spanSet, plus characters that start or end strings.
      */
     private UnicodeSet spanNotSet;

     /** The strings of the parent set. */
     private ArrayList<String> strings;

     /** The lengths of span(), spanBack() etc. for each string. */
     private short[] spanLengths;

     /** Maximum lengths of relevant strings. */
     private final int maxLength16;

     /** Are there strings that are not fully contained in the code point set? */
     private boolean someRelevant;

     /** Set up for all variants of span()? */
     private boolean all;

     /** Span helper */
     private OffsetList offsets;

     /**
      * Constructs for all variants of span(), or only for any one variant.
      * Initializes as little as possible, for single use.
      */
     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
         spanSet = new UnicodeSet(0, 0x10ffff);
         // TODO: With Java 6, just take the parent set's strings as is,
         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
         // Then iterate via the first() and higher() methods.
         // (We do not want to create multiple Iterator objects in each span().)
         // See ICU ticket #7454.
         strings = setStrings;
         all = (which == ALL);
         spanSet.retainAll(set);
         if (0 != (which & NOT_CONTAINED)) {
             // Default to the same sets.
             // addToSpanNotSet() will create a separate set if necessary.
             spanNotSet = spanSet;
         }
         offsets = new OffsetList();

         // Determine if the strings even need to be taken into account at all for span() etc.
         // If any string is relevant, then all strings need to be used for
         // span(longest match) but only the relevant ones for span(while contained).
         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
         int stringsLength = strings.size();

         int i, spanLength;
         int maxLength16 = 0;
         someRelevant = false;
         for (i = 0; i < stringsLength; ++i) {
             String string = strings.get(i);
             int length16 = string.length();
             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
             if (spanLength < length16) { // Relevant string.
                 someRelevant = true;
             }
             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
                 maxLength16 = length16;
             }
         }
         this.maxLength16 = maxLength16;
         if (!someRelevant && (which & WITH_COUNT) == 0) {
             return;
         }

         // Freeze after checking for the need to use strings at all because freezing
         // a set takes some time and memory which are wasted if there are no relevant strings.
         if (all) {
             spanSet.freeze();
         }

         int spanBackLengthsOffset;

         // Allocate a block of meta data.
         int allocSize;
         if (all) {
             // 2 sets of span lengths
             allocSize = stringsLength * (2);
         } else {
             allocSize = stringsLength; // One set of span lengths.
         }
         spanLengths = new short[allocSize];

         if (all) {
             // Store span lengths for all span() variants.
             spanBackLengthsOffset = stringsLength;
         } else {
             // Store span lengths for only one span() variant.
             spanBackLengthsOffset = 0;
         }

         // Set the meta data and spanNotSet and write the UTF-8 strings.

         for (i = 0; i < stringsLength; ++i) {
             String string = strings.get(i);
             int length16 = string.length();
             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
             if (spanLength < length16) { // Relevant string.
                 if (true /* 0 != (which & UTF16) */) {
                     if (0 != (which & CONTAINED)) {
                         if (0 != (which & FWD)) {
                             spanLengths[i] = makeSpanLengthByte(spanLength);
                         }
                         if (0 != (which & BACK)) {
                             spanLength = length16
                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
                         }
                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
                                                                                      // flag.
                     }
                 }
                 if (0 != (which & NOT_CONTAINED)) {
                     // Add string start and end code points to the spanNotSet so that
                     // a span(while not contained) stops before any string.
                     int c;
                     if (0 != (which & FWD)) {
                         c = string.codePointAt(0);
                         addToSpanNotSet(c);
                     }
                     if (0 != (which & BACK)) {
                         c = string.codePointBefore(length16);
                         addToSpanNotSet(c);
                     }
                 }
             } else { // Irrelevant string.
                 if (all) {
                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
                 } else {
                     // All spanXYZLengths pointers contain the same address.
                     spanLengths[i] = ALL_CP_CONTAINED;
                 }
             }
         }

         // Finish.
         if (all) {
             spanNotSet.freeze();
         }
     }

     /**
      * Constructs a copy of an existing UnicodeSetStringSpan.
      * Assumes which==ALL for a frozen set.
      */
     public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan,
             final ArrayList<String> newParentSetStrings) {
         spanSet = otherStringSpan.spanSet;
         strings = newParentSetStrings;
         maxLength16 = otherStringSpan.maxLength16;
         someRelevant = otherStringSpan.someRelevant;
         all = true;
         if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) {
             spanNotSet = spanSet;
         } else {
             spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
         }
         offsets = new OffsetList();

         spanLengths = otherStringSpan.spanLengths.clone();
     }

     /**
      * Do the strings need to be checked in span() etc.?
      *
      * @return true if strings need to be checked (call span() here),
      *         false if not (use a BMPSet for best performance).
      */
     public boolean needsStringSpanUTF16() {
         return someRelevant;
     }

     /** For fast UnicodeSet::contains(c). */
     public boolean contains(int c) {
         return spanSet.contains(c);
     }

     /**
      * Adds a starting or ending string character to the spanNotSet
      * so that a character span ends before any string.
      */
     private void addToSpanNotSet(int c) {
         if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) {
             if (spanSet.contains(c)) {
                 return; // Nothing to do.
             }
             spanNotSet = spanSet.cloneAsThawed();
         }
         spanNotSet.add(c);
     }

     /*
      * Note: In span() when spanLength==0
      * (after a string match, or at the beginning after an empty code point span)
      * and in spanNot() and spanNotUTF8(),
      * string matching could use a binary search because all string matches are done
      * from the same start index.
      *
      * For UTF-8, this would require a comparison function that returns UTF-16 order.
      *
      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
      * API and implementation with a DFA (state machine).
      */

     /*
      * Algorithm for span(SpanCondition.CONTAINED)
      *
      * Theoretical algorithm:
      * - Iterate through the string, and at each code point boundary:
      *   + If the code point there is in the set, then remember to continue after it.
      *   + If a set string matches at the current position, then remember to continue after it.
      *   + Either recursively span for each code point or string match, or recursively span
      *     for all but the shortest one and iteratively continue the span with the shortest local match.
      *   + Remember the longest recursive span (the farthest end point).
      *   + If there is no match at the current position,
      *     neither for the code point there nor for any set string,
      *     then stop and return the longest recursive span length.
      *
      * Optimized implementation:
      *
      * (We assume that most sets will have very few very short strings.
      * A span using a string-less set is extremely fast.)
      *
      * Create and cache a spanSet which contains all of the single code points of the original set
      * but none of its strings.
      *
      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
      * - Loop:
      *   + Try to match each set string at the end of the spanLength.
      *     ~ Set strings that start with set-contained code points
      *       must be matched with a partial overlap
      *       because the recursive algorithm would have tried to match them at every position.
      *     ~ Set strings that entirely consist of set-contained code points
      *       are irrelevant for span(SpanCondition.CONTAINED)
      *       because the recursive algorithm would continue after them anyway and
      *       find the longest recursive match from their end.
      *     ~ Rather than recursing, note each end point of a set string match.
      *   + If no set string matched after spanSet.span(),
      *     then return with where the spanSet.span() ended.
      *   + If at least one set string matched after spanSet.span(),
      *     then pop the shortest string match end point and continue the loop,
      *     trying to match all set strings from there.
      *   + If at least one more set string matched after a previous string match, then test if the
      *     code point after the previous string match is also contained in the set.
      *     Continue the loop with the shortest end point of
      *     either this code point or a matching set string.
      *   + If no more set string matched after a previous string match,
      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
      *     Stop if spanLength==0, otherwise continue the loop.
      *
      * By noting each end point of a set string match, the function visits each string position at most once and
      * finishes in linear time.
      *
      * The recursive algorithm may visit the same string position many times
      * if multiple paths lead to it and finishes in exponential time.
      */

     /*
      * Algorithm for span(SIMPLE)
      *
      * Theoretical algorithm:
      * - Iterate through the string, and at each code point boundary:
      *   + If the code point there is in the set, then remember to continue after it.
      *   + If a set string matches at the current position, then remember to continue after it.
      *   + Continue from the farthest match position and ignore all others.
      *   + If there is no match at the current position, then stop and return the current position.
      *
      * Optimized implementation:
      *
      * (Same assumption and spanSet as above.)
      *
      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
      * - Loop:
      *   + Try to match each set string at the end of the spanLength.
      *     ~ Set strings that start with set-contained code points
      *       must be matched with a partial overlap
      *       because the standard algorithm would have tried to match them earlier.
      *     ~ Set strings that entirely consist of set-contained code points
      *       must be matched with a full overlap because the longest-match algorithm
      *       would hide set string matches that end earlier.
      *       Such set strings need not be matched earlier inside the code point span
      *       because the standard algorithm would then have
      *       continued after the set string match anyway.
      *     ~ Remember the longest set string match (farthest end point)
      *       from the earliest starting point.
      *   + If no set string matched after spanSet.span(),
      *     then return with where the spanSet.span() ended.
      *   + If at least one set string matched,
      *     then continue the loop after the longest match from the earliest position.
      *   + If no more set string matched after a previous string match,
      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
      *     Stop if spanLength==0, otherwise continue the loop.
      */
     /**
      * Spans a string.
      *
      * @param s The string to be spanned
      * @param start The start index that the span begins
      * @param spanCondition The span condition
      * @return the limit (exclusive end) of the span
      */
     public int span(CharSequence s, int start, SpanCondition spanCondition) {
         if (spanCondition == SpanCondition.NOT_CONTAINED) {
             return spanNot(s, start, null);
         }
         int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
         if (spanLimit == s.length()) {
             return spanLimit;
         }
         return spanWithStrings(s, start, spanLimit, spanCondition);
     }

     /**
      * Synchronized method for complicated spans using the offsets.
      * Avoids synchronization for simple cases.
      *
      * @param spanLimit = spanSet.span(s, start, CONTAINED)
      */
     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
             SpanCondition spanCondition) {
         // Consider strings; they may overlap with the span.
         int initSize = 0;
         if (spanCondition == SpanCondition.CONTAINED) {
             // Use offset list to try all possibilities.
             initSize = maxLength16;
         }
         offsets.setMaxLength(initSize);
         int length = s.length();
         int pos = spanLimit, rest = length - spanLimit;
         int spanLength = spanLimit - start;
         int i, stringsLength = strings.size();
         for (;;) {
             if (spanCondition == SpanCondition.CONTAINED) {
                 for (i = 0; i < stringsLength; ++i) {
                     int overlap = spanLengths[i];
                     if (overlap == ALL_CP_CONTAINED) {
                         continue; // Irrelevant string.
                     }
                     String string = strings.get(i);

                     int length16 = string.length();

                     // Try to match this string at pos-overlap..pos.
                     if (overlap >= LONG_SPAN) {
                         overlap = length16;
                         // While contained: No point matching fully inside the code point span.
                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
                                                                           // point.
                     }
                     if (overlap > spanLength) {
                         overlap = spanLength;
                     }
                     int inc = length16 - overlap; // Keep overlap+inc==length16.
                     for (;;) {
                         if (inc > rest) {
                             break;
                         }
                         // Try to match if the increment is not listed already.
                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
                             if (inc == rest) {
                                 return length; // Reached the end of the string.
                             }
                             offsets.addOffset(inc);
                         }
                         if (overlap == 0) {
                             break;
                         }
                         --overlap;
                         ++inc;
                     }
                 }
             } else /* SIMPLE */{
                 int maxInc = 0, maxOverlap = 0;
                 for (i = 0; i < stringsLength; ++i) {
                     int overlap = spanLengths[i];
                     // For longest match, we do need to try to match even an all-contained string
                     // to find the match from the earliest start.

                     String string = strings.get(i);

                     int length16 = string.length();

                     // Try to match this string at pos-overlap..pos.
                     if (overlap >= LONG_SPAN) {
                         overlap = length16;
                         // Longest match: Need to match fully inside the code point span
                         // to find the match from the earliest start.
                     }
                     if (overlap > spanLength) {
                         overlap = spanLength;
                     }
                     int inc = length16 - overlap; // Keep overlap+inc==length16.
                     for (;;) {
                         if (inc > rest || overlap < maxOverlap) {
                             break;
                         }
                         // Try to match if the string is longer or starts earlier.
                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
                             maxInc = inc; // Longest match from earliest start.
                             maxOverlap = overlap;
                             break;
                         }
                         --overlap;
                         ++inc;
                     }
                 }

                 if (maxInc != 0 || maxOverlap != 0) {
                     // Longest-match algorithm, and there was a string match.
                     // Simply continue after it.
                     pos += maxInc;
                     rest -= maxInc;
                     if (rest == 0) {
                         return length; // Reached the end of the string.
                     }
                     spanLength = 0; // Match strings from after a string match.
                     continue;
                 }
             }
             // Finished trying to match all strings at pos.

             if (spanLength != 0 || pos == 0) {
                 // The position is after an unlimited code point span (spanLength!=0),
                 // not after a string match.
                 // The only position where spanLength==0 after a span is pos==0.
                 // Otherwise, an unlimited code point span is only tried again when no
                 // strings match, and if such a non-initial span fails we stop.
                 if (offsets.isEmpty()) {
                     return pos; // No strings matched after a span.
                 }
                 // Match strings from after the next string match.
             } else {
                 // The position is after a string match (or a single code point).
                 if (offsets.isEmpty()) {
                     // No more strings matched after a previous string match.
                     // Try another code point span from after the last string match.
                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
                     spanLength = spanLimit - pos;
                     if (spanLength == rest || // Reached the end of the string, or
                             spanLength == 0 // neither strings nor span progressed.
                     ) {
                         return spanLimit;
                     }
                     pos += spanLength;
                     rest -= spanLength;
                     continue; // spanLength>0: Match strings from after a span.
                 } else {
                     // Try to match only one code point from after a string match if some
                     // string matched beyond it, so that we try all possible positions
                     // and don't overshoot.
                     spanLength = spanOne(spanSet, s, pos, rest);
                     if (spanLength > 0) {
                         if (spanLength == rest) {
                             return length; // Reached the end of the string.
                         }
                         // Match strings after this code point.
                         // There cannot be any increments below it because UnicodeSet strings
                         // contain multiple code points.
                         pos += spanLength;
                         rest -= spanLength;
                         offsets.shift(spanLength);
                         spanLength = 0;
                         continue; // Match strings from after a single code point.
                     }
                     // Match strings from after the next string match.
                 }
             }
             int minOffset = offsets.popMinimum(null);
             pos += minOffset;
             rest -= minOffset;
             spanLength = 0; // Match strings from after a string match.
         }
     }

     /**
      * Spans a string and counts the smallest number of set elements on any path across the span.
      *
      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
      *
      * <p>If the set does not have any fully-contained strings, then we could optimize this
      * like span(), but such sets are likely rare, and this is at least still linear.
      *
      * @param s The string to be spanned
      * @param start The start index that the span begins
      * @param spanCondition The span condition
      * @param outCount The count
      * @return the limit (exclusive end) of the span
      */
     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
             OutputInt outCount) {
         if (spanCondition == SpanCondition.NOT_CONTAINED) {
             return spanNot(s, start, outCount);
         }
         // Consider strings; they may overlap with the span,
         // and they may result in a smaller count that with just code points.
         if (spanCondition == SpanCondition.CONTAINED) {
             return spanContainedAndCount(s, start, outCount);
         }
         // SIMPLE (not synchronized, does not use offsets)
         int stringsLength = strings.size();
         int length = s.length();
         int pos = start;
         int rest = length - start;
         int count = 0;
         while (rest != 0) {
             // Try to match the next code point.
             int cpLength = spanOne(spanSet, s, pos, rest);
             int maxInc = (cpLength > 0) ? cpLength : 0;
             // Try to match all of the strings.
             for (int i = 0; i < stringsLength; ++i) {
                 String string = strings.get(i);
                 int length16 = string.length();
                 if (maxInc < length16 && length16 <= rest &&
                         matches16CPB(s, pos, length, string, length16)) {
                     maxInc = length16;
                 }
             }
             // We are done if there is no match beyond pos.
             if (maxInc == 0) {
                 outCount.value = count;
                 return pos;
             }
             // Continue from the longest match.
             ++count;
             pos += maxInc;
             rest -= maxInc;
         }
         outCount.value = count;
         return pos;
     }

     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
         // Use offset list to try all possibilities.
         offsets.setMaxLength(maxLength16);
         int stringsLength = strings.size();
         int length = s.length();
         int pos = start;
         int rest = length - start;
         int count = 0;
         while (rest != 0) {
             // Try to match the next code point.
             int cpLength = spanOne(spanSet, s, pos, rest);
             if (cpLength > 0) {
                 offsets.addOffsetAndCount(cpLength, count + 1);
             }
             // Try to match all of the strings.
             for (int i = 0; i < stringsLength; ++i) {
                 String string = strings.get(i);
                 int length16 = string.length();
                 // Note: If the strings were sorted by length, then we could also
                 // avoid trying to match if there is already a match of the same length.
                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
                         matches16CPB(s, pos, length, string, length16)) {
                     offsets.addOffsetAndCount(length16, count + 1);
                 }
             }
             // We are done if there is no match beyond pos.
             if (offsets.isEmpty()) {
                 outCount.value = count;
                 return pos;
             }
             // Continue from the nearest match.
             int minOffset = offsets.popMinimum(outCount);
             count = outCount.value;
             pos += minOffset;
             rest -= minOffset;
         }
         outCount.value = count;
         return pos;
     }

     /**
      * Span a string backwards.
      *
      * @param s The string to be spanned
      * @param spanCondition The span condition
      * @return The string index which starts the span (i.e. inclusive).
      */
     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
         if (spanCondition == SpanCondition.NOT_CONTAINED) {
             return spanNotBack(s, length);
         }
         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
         if (pos == 0) {
             return 0;
         }
         int spanLength = length - pos;

         // Consider strings; they may overlap with the span.
         int initSize = 0;
         if (spanCondition == SpanCondition.CONTAINED) {
             // Use offset list to try all possibilities.
             initSize = maxLength16;
         }
         offsets.setMaxLength(initSize);
         int i, stringsLength = strings.size();
         int spanBackLengthsOffset = 0;
         if (all) {
             spanBackLengthsOffset = stringsLength;
         }
         for (;;) {
             if (spanCondition == SpanCondition.CONTAINED) {
                 for (i = 0; i < stringsLength; ++i) {
                     int overlap = spanLengths[spanBackLengthsOffset + i];
                     if (overlap == ALL_CP_CONTAINED) {
                         continue; // Irrelevant string.
                     }
                     String string = strings.get(i);

                     int length16 = string.length();

                     // Try to match this string at pos-(length16-overlap)..pos-length16.
                     if (overlap >= LONG_SPAN) {
                         overlap = length16;
                         // While contained: No point matching fully inside the code point span.
                         int len1 = 0;
                         len1 = string.offsetByCodePoints(0, 1);
                         overlap -= len1; // Length of the string minus the first code point.
                     }
                     if (overlap > spanLength) {
                         overlap = spanLength;
                     }
                     int dec = length16 - overlap; // Keep dec+overlap==length16.
                     for (;;) {
                         if (dec > pos) {
                             break;
                         }
                         // Try to match if the decrement is not listed already.
                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
                             if (dec == pos) {
                                 return 0; // Reached the start of the string.
                             }
                             offsets.addOffset(dec);
                         }
                         if (overlap == 0) {
                             break;
                         }
                         --overlap;
                         ++dec;
                     }
                 }
             } else /* SIMPLE */{
                 int maxDec = 0, maxOverlap = 0;
                 for (i = 0; i < stringsLength; ++i) {
                     int overlap = spanLengths[spanBackLengthsOffset + i];
                     // For longest match, we do need to try to match even an all-contained string
                     // to find the match from the latest end.

                     String string = strings.get(i);

                     int length16 = string.length();

                     // Try to match this string at pos-(length16-overlap)..pos-length16.
                     if (overlap >= LONG_SPAN) {
                         overlap = length16;
                         // Longest match: Need to match fully inside the code point span
                         // to find the match from the latest end.
                     }
                     if (overlap > spanLength) {
                         overlap = spanLength;
                     }
                     int dec = length16 - overlap; // Keep dec+overlap==length16.
                     for (;;) {
                         if (dec > pos || overlap < maxOverlap) {
                             break;
                         }
                         // Try to match if the string is longer or ends later.
                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
                                 && matches16CPB(s, pos - dec, length, string, length16)) {
                             maxDec = dec; // Longest match from latest end.
                             maxOverlap = overlap;
                             break;
                         }
                         --overlap;
                         ++dec;
                     }
                 }

                 if (maxDec != 0 || maxOverlap != 0) {
                     // Longest-match algorithm, and there was a string match.
                     // Simply continue before it.
                     pos -= maxDec;
                     if (pos == 0) {
                         return 0; // Reached the start of the string.
                     }
                     spanLength = 0; // Match strings from before a string match.
                     continue;
                 }
             }
             // Finished trying to match all strings at pos.

             if (spanLength != 0 || pos == length) {
                 // The position is before an unlimited code point span (spanLength!=0),
                 // not before a string match.
                 // The only position where spanLength==0 before a span is pos==length.
                 // Otherwise, an unlimited code point span is only tried again when no
                 // strings match, and if such a non-initial span fails we stop.
                 if (offsets.isEmpty()) {
                     return pos; // No strings matched before a span.
                 }
                 // Match strings from before the next string match.
             } else {
                 // The position is before a string match (or a single code point).
                 if (offsets.isEmpty()) {
                     // No more strings matched before a previous string match.
                     // Try another code point span from before the last string match.
                     int oldPos = pos;
                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
                     spanLength = oldPos - pos;
                     if (pos == 0 || // Reached the start of the string, or
                             spanLength == 0 // neither strings nor span progressed.
                     ) {
                         return pos;
                     }
                     continue; // spanLength>0: Match strings from before a span.
                 } else {
                     // Try to match only one code point from before a string match if some
                     // string matched beyond it, so that we try all possible positions
                     // and don't overshoot.
                     spanLength = spanOneBack(spanSet, s, pos);
                     if (spanLength > 0) {
                         if (spanLength == pos) {
                             return 0; // Reached the start of the string.
                         }
                         // Match strings before this code point.
                         // There cannot be any decrements below it because UnicodeSet strings
                         // contain multiple code points.
                         pos -= spanLength;
                         offsets.shift(spanLength);
                         spanLength = 0;
                         continue; // Match strings from before a single code point.
                     }
                     // Match strings from before the next string match.
                 }
             }
             pos -= offsets.popMinimum(null);
             spanLength = 0; // Match strings from before a string match.
         }
     }

     /**
      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
      *
      * Theoretical algorithm:
      * - Iterate through the string, and at each code point boundary:
      *   + If the code point there is in the set, then return with the current position.
      *   + If a set string matches at the current position, then return with the current position.
      *
      * Optimized implementation:
      *
      * (Same assumption as for span() above.)
      *
      * Create and cache a spanNotSet which contains
      * all of the single code points of the original set but none of its strings.
      * For each set string add its initial code point to the spanNotSet.
      * (Also add its final code point for spanNotBack().)
      *
      * - Loop:
      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
      *   + If the current code point is in the original set, then return the current position.
      *   + If any set string matches at the current position, then return the current position.
      *   + If there is no match at the current position, neither for the code point
      *     there nor for any set string, then skip this code point and continue the loop.
      *     This happens for set-string-initial code points that were added to spanNotSet
      *     when there is not actually a match for such a set string.
      *
      * @param s The string to be spanned
      * @param start The start index that the span begins
      * @param outCount If not null: Receives the number of code points across the span.
      * @return the limit (exclusive end) of the span
      */
     private int spanNot(CharSequence s, int start, OutputInt outCount) {
         int length = s.length();
         int pos = start, rest = length - start;
         int stringsLength = strings.size();
         int count = 0;
         do {
             // Span until we find a code point from the set,
             // or a code point that starts or ends some string.
             int spanLimit;
             if (outCount == null) {
                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
             } else {
                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
                 outCount.value = count = count + outCount.value;
             }
             if (spanLimit == length) {
                 return length; // Reached the end of the string.
             }
             pos = spanLimit;
             rest = length - spanLimit;

             // Check whether the current code point is in the original set,
             // without the string starts and ends.
             int cpLength = spanOne(spanSet, s, pos, rest);
             if (cpLength > 0) {
                 return pos; // There is a set element at pos.
             }

             // Try to match the strings at pos.
             for (int i = 0; i < stringsLength; ++i) {
                 if (spanLengths[i] == ALL_CP_CONTAINED) {
                     continue; // Irrelevant string.
                 }
                 String string = strings.get(i);

                 int length16 = string.length();
                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
                     return pos; // There is a set element at pos.
                 }
             }

             // The span(while not contained) ended on a string start/end which is
             // not in the original set. Skip this code point and continue.
             // cpLength<0
             pos -= cpLength;
             rest += cpLength;
             ++count;
         } while (rest != 0);
         if (outCount != null) {
             outCount.value = count;
         }
         return length; // Reached the end of the string.
     }

     private int spanNotBack(CharSequence s, int length) {
         int pos = length;
         int i, stringsLength = strings.size();
         do {
             // Span until we find a code point from the set,
             // or a code point that starts or ends some string.
             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
             if (pos == 0) {
                 return 0; // Reached the start of the string.
             }

             // Check whether the current code point is in the original set,
             // without the string starts and ends.
             int cpLength = spanOneBack(spanSet, s, pos);
             if (cpLength > 0) {
                 return pos; // There is a set element at pos.
             }

             // Try to match the strings at pos.
             for (i = 0; i < stringsLength; ++i) {
                 // Use spanLengths rather than a spanLengths pointer because
                 // it is easier and we only need to know whether the string is irrelevant
                 // which is the same in either array.
                 if (spanLengths[i] == ALL_CP_CONTAINED) {
                     continue; // Irrelevant string.
                 }
                 String string = strings.get(i);

                 int length16 = string.length();
                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
                     return pos; // There is a set element at pos.
                 }
             }

             // The span(while not contained) ended on a string start/end which is
             // not in the original set. Skip this code point and continue.
             // cpLength<0
             pos += cpLength;
         } while (pos != 0);
         return 0; // Reached the start of the string.
     }

     static short makeSpanLengthByte(int spanLength) {
         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
     }

     // Compare strings without any argument checks. Requires length>0.
     private static boolean matches16(CharSequence s, int start, final String t, int length) {
         int end = start + length;
         while (length-- > 0) {
             if (s.charAt(--end) != t.charAt(length)) {
                 return false;
             }
         }
         return true;
     }

     /**
      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
      * at code point boundaries.
      * That is, each edge of a match must not be in the middle of a surrogate pair.
      * @param s       The string to match in.
      * @param start   The start index of s.
      * @param limit   The limit of the subsequence of s being spanned.
      * @param t       The substring to be matched in s.
      * @param tlength The length of t.
      */
     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
         return matches16(s, start, t, tlength)
                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
                         Character.isLowSurrogate(s.charAt(start)))
                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
                         Character.isLowSurrogate(s.charAt(start + tlength)));
     }

     /**
      * Does the set contain the next code point?
      * If so, return its length; otherwise return its negative length.
      */
     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
         char c = s.charAt(start);
         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
             char c2 = s.charAt(start + 1);
             if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) {
                 int supplementary = Character.toCodePoint(c, c2);
                 return set.contains(supplementary) ? 2 : -2;
             }
         }
         return set.contains(c) ? 1 : -1;
     }

     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
         char c = s.charAt(length - 1);
         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
             char c2 = s.charAt(length - 2);
             if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) {
                 int supplementary = Character.toCodePoint(c2, c);
                 return set.contains(supplementary) ? 2 : -2;
             }
         }
         return set.contains(c) ? 1 : -1;
     }

     /**
      * Helper class for UnicodeSetStringSpan.
      *
      * <p>List of offsets from the current position from where to try matching
      * a code point or a string.
      * Stores offsets rather than indexes to simplify the code and use the same list
      * for both increments (in span()) and decrements (in spanBack()).
      *
      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
      * are relatively dense, that is,
      * there are normally no gaps of hundreds or thousands of offset values.
      *
      * <p>This class optionally also tracks the minimum non-negative count for each position,
      * intended to count the smallest number of elements of any path leading to that position.
      *
      * <p>The implementation uses a circular buffer of count integers,
      * each indicating whether the corresponding offset is in the list,
      * and its path element count.
      * This avoids inserting into a sorted list of offsets (or absolute indexes)
      * and physically moving part of the list.
      *
      * <p>Note: In principle, the caller should setMaxLength() to
      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
      * to account for "long" single code points.
      *
      * <p>Note: An earlier version did not track counts and stored only byte flags.
      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
      * the list could be stored as bit flags in a single integer.
      * Rather than handling a circular buffer with a start list index,
      * the integer would simply be shifted when lower offsets are removed.
      * UnicodeSet does not have a limit on the lengths of strings.
      */
     private static final class OffsetList {
         private int[] list;
         private int length;
         private int start;

         public OffsetList() {
             list = new int[16];  // default size
         }

         public void setMaxLength(int maxLength) {
             if (maxLength > list.length) {
                 list = new int[maxLength];
             }
             clear();
         }

         public void clear() {
             for (int i = list.length; i-- > 0;) {
                 list[i] = 0;
             }
             start = length = 0;
         }

         public boolean isEmpty() {
             return (length == 0);
         }

         /**
          * Reduces all stored offsets by delta, used when the current position moves by delta.
          * There must not be any offsets lower than delta.
          * If there is an offset equal to delta, it is removed.
          *
          * @param delta [1..maxLength]
          */
         public void shift(int delta) {
             int i = start + delta;
             if (i >= list.length) {
                 i -= list.length;
             }
             if (list[i] != 0) {
                 list[i] = 0;
                 --length;
             }
             start = i;
         }

         /**
          * Adds an offset. The list must not contain it yet.
          * @param offset [1..maxLength]
          */
         public void addOffset(int offset) {
             int i = start + offset;
             if (i >= list.length) {
                 i -= list.length;
             }
             assert list[i] == 0;
             list[i] = 1;
             ++length;
         }

         /**
          * Adds an offset and updates its count.
          * The list may already contain the offset.
          * @param offset [1..maxLength]
          */
         public void addOffsetAndCount(int offset, int count) {
             assert count > 0;
             int i = start + offset;
             if (i >= list.length) {
                 i -= list.length;
             }
             if (list[i] == 0) {
                 list[i] = count;
                 ++length;
             } else if (count < list[i]) {
                 list[i] = count;
             }
         }

         /**
          * @param offset [1..maxLength]
          */
         public boolean containsOffset(int offset) {
             int i = start + offset;
             if (i >= list.length) {
                 i -= list.length;
             }
             return list[i] != 0;
         }

         /**
          * @param offset [1..maxLength]
          */
         public boolean hasCountAtOffset(int offset, int count) {
             int i = start + offset;
             if (i >= list.length) {
                 i -= list.length;
             }
             int oldCount = list[i];
             return oldCount != 0 && oldCount <= count;
         }

         /**
          * Finds the lowest stored offset from a non-empty list, removes it,
          * and reduces all other offsets by this minimum.
          * @return min=[1..maxLength]
          */
         public int popMinimum(OutputInt outCount) {
             // Look for the next offset in list[start+1..list.length-1].
             int i = start, result;
             while (++i < list.length) {
                 int count = list[i];
                 if (count != 0) {
                     list[i] = 0;
                     --length;
                     result = i - start;
                     start = i;
                     if (outCount != null) { outCount.value = count; }
                     return result;
                 }
             }
             // i==list.length

             // Wrap around and look for the next offset in list[0..start].
             // Since the list is not empty, there will be one.
             result = list.length - start;
             i = 0;
             int count;
             while ((count = list[i]) == 0) {
                 ++i;
             }
             list[i] = 0;
             --length;
             start = i;
             if (outCount != null) { outCount.value = count; }
             return result + i;
         }
     }
 }