| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ****************************************************************************** |
| * |
| * Copyright (C) 2009-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ****************************************************************************** |
| */ |
| |
| package com.ibm.icu.impl; |
| |
| import java.util.ArrayList; |
| |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSet.SpanCondition; |
| import com.ibm.icu.util.OutputInt; |
| |
| /* |
| * Implement span() etc. for a set with strings. |
| * Avoid recursion because of its exponential complexity. |
| * Instead, try multiple paths at once and track them with an IndexList. |
| */ |
| public class UnicodeSetStringSpan { |
| |
| /* |
| * Which span() variant will be used? The object is either built for one variant and used once, |
| * or built for all and may be used many times. |
| */ |
| public static final int WITH_COUNT = 0x40; // spanAndCount() may be called |
| public static final int FWD = 0x20; |
| public static final int BACK = 0x10; |
| // public static final int UTF16 = 8; |
| public static final int CONTAINED = 2; |
| public static final int NOT_CONTAINED = 1; |
| |
| public static final int ALL = 0x7f; |
| |
| public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; |
| public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; |
| public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; |
| public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; |
| |
| /** |
| * Special spanLength short values. (since Java has not unsigned byte type) |
| * All code points in the string are contained in the parent set. |
| */ |
| static final short ALL_CP_CONTAINED = 0xff; |
| /** The spanLength is >=0xfe. */ |
| static final short LONG_SPAN = ALL_CP_CONTAINED - 1; |
| |
| /** Set for span(). Same as parent but without strings. */ |
| private UnicodeSet spanSet; |
| |
| /** |
| * Set for span(not contained). |
| * Same as spanSet, plus characters that start or end strings. |
| */ |
| private UnicodeSet spanNotSet; |
| |
| /** The strings of the parent set. */ |
| private ArrayList<String> strings; |
| |
| /** The lengths of span(), spanBack() etc. for each string. */ |
| private short[] spanLengths; |
| |
| /** Maximum lengths of relevant strings. */ |
| private final int maxLength16; |
| |
| /** Are there strings that are not fully contained in the code point set? */ |
| private boolean someRelevant; |
| |
| /** Set up for all variants of span()? */ |
| private boolean all; |
| |
| /** Span helper */ |
| private OffsetList offsets; |
| |
| /** |
| * Constructs for all variants of span(), or only for any one variant. |
| * Initializes as little as possible, for single use. |
| */ |
| public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { |
| spanSet = new UnicodeSet(0, 0x10ffff); |
| // TODO: With Java 6, just take the parent set's strings as is, |
| // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. |
| // Then iterate via the first() and higher() methods. |
| // (We do not want to create multiple Iterator objects in each span().) |
| // See ICU ticket #7454. |
| strings = setStrings; |
| all = (which == ALL); |
| spanSet.retainAll(set); |
| if (0 != (which & NOT_CONTAINED)) { |
| // Default to the same sets. |
| // addToSpanNotSet() will create a separate set if necessary. |
| spanNotSet = spanSet; |
| } |
| offsets = new OffsetList(); |
| |
| // Determine if the strings even need to be taken into account at all for span() etc. |
| // If any string is relevant, then all strings need to be used for |
| // span(longest match) but only the relevant ones for span(while contained). |
| // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH |
| // and do not store UTF-8 strings if !thisRelevant and CONTAINED. |
| // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) |
| // Also count the lengths of the UTF-8 versions of the strings for memory allocation. |
| int stringsLength = strings.size(); |
| |
| int i, spanLength; |
| int maxLength16 = 0; |
| someRelevant = false; |
| for (i = 0; i < stringsLength; ++i) { |
| String string = strings.get(i); |
| int length16 = string.length(); |
| spanLength = spanSet.span(string, SpanCondition.CONTAINED); |
| if (spanLength < length16) { // Relevant string. |
| someRelevant = true; |
| } |
| if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { |
| maxLength16 = length16; |
| } |
| } |
| this.maxLength16 = maxLength16; |
| if (!someRelevant && (which & WITH_COUNT) == 0) { |
| return; |
| } |
| |
| // Freeze after checking for the need to use strings at all because freezing |
| // a set takes some time and memory which are wasted if there are no relevant strings. |
| if (all) { |
| spanSet.freeze(); |
| } |
| |
| int spanBackLengthsOffset; |
| |
| // Allocate a block of meta data. |
| int allocSize; |
| if (all) { |
| // 2 sets of span lengths |
| allocSize = stringsLength * (2); |
| } else { |
| allocSize = stringsLength; // One set of span lengths. |
| } |
| spanLengths = new short[allocSize]; |
| |
| if (all) { |
| // Store span lengths for all span() variants. |
| spanBackLengthsOffset = stringsLength; |
| } else { |
| // Store span lengths for only one span() variant. |
| spanBackLengthsOffset = 0; |
| } |
| |
| // Set the meta data and spanNotSet and write the UTF-8 strings. |
| |
| for (i = 0; i < stringsLength; ++i) { |
| String string = strings.get(i); |
| int length16 = string.length(); |
| spanLength = spanSet.span(string, SpanCondition.CONTAINED); |
| if (spanLength < length16) { // Relevant string. |
| if (true /* 0 != (which & UTF16) */) { |
| if (0 != (which & CONTAINED)) { |
| if (0 != (which & FWD)) { |
| spanLengths[i] = makeSpanLengthByte(spanLength); |
| } |
| if (0 != (which & BACK)) { |
| spanLength = length16 |
| - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); |
| spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); |
| } |
| } else /* not CONTAINED, not all, but NOT_CONTAINED */{ |
| spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant |
| // flag. |
| } |
| } |
| if (0 != (which & NOT_CONTAINED)) { |
| // Add string start and end code points to the spanNotSet so that |
| // a span(while not contained) stops before any string. |
| int c; |
| if (0 != (which & FWD)) { |
| c = string.codePointAt(0); |
| addToSpanNotSet(c); |
| } |
| if (0 != (which & BACK)) { |
| c = string.codePointBefore(length16); |
| addToSpanNotSet(c); |
| } |
| } |
| } else { // Irrelevant string. |
| if (all) { |
| spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; |
| } else { |
| // All spanXYZLengths pointers contain the same address. |
| spanLengths[i] = ALL_CP_CONTAINED; |
| } |
| } |
| } |
| |
| // Finish. |
| if (all) { |
| spanNotSet.freeze(); |
| } |
| } |
| |
| /** |
| * Constructs a copy of an existing UnicodeSetStringSpan. |
| * Assumes which==ALL for a frozen set. |
| */ |
| public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, |
| final ArrayList<String> newParentSetStrings) { |
| spanSet = otherStringSpan.spanSet; |
| strings = newParentSetStrings; |
| maxLength16 = otherStringSpan.maxLength16; |
| someRelevant = otherStringSpan.someRelevant; |
| all = true; |
| if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) { |
| spanNotSet = spanSet; |
| } else { |
| spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone(); |
| } |
| offsets = new OffsetList(); |
| |
| spanLengths = otherStringSpan.spanLengths.clone(); |
| } |
| |
| /** |
| * Do the strings need to be checked in span() etc.? |
| * |
| * @return true if strings need to be checked (call span() here), |
| * false if not (use a BMPSet for best performance). |
| */ |
| public boolean needsStringSpanUTF16() { |
| return someRelevant; |
| } |
| |
| /** For fast UnicodeSet::contains(c). */ |
| public boolean contains(int c) { |
| return spanSet.contains(c); |
| } |
| |
| /** |
| * Adds a starting or ending string character to the spanNotSet |
| * so that a character span ends before any string. |
| */ |
| private void addToSpanNotSet(int c) { |
| if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) { |
| if (spanSet.contains(c)) { |
| return; // Nothing to do. |
| } |
| spanNotSet = spanSet.cloneAsThawed(); |
| } |
| spanNotSet.add(c); |
| } |
| |
| /* |
| * Note: In span() when spanLength==0 |
| * (after a string match, or at the beginning after an empty code point span) |
| * and in spanNot() and spanNotUTF8(), |
| * string matching could use a binary search because all string matches are done |
| * from the same start index. |
| * |
| * For UTF-8, this would require a comparison function that returns UTF-16 order. |
| * |
| * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets |
| * with strings have very few very short strings. For cases with many strings, it might be better to use a different |
| * API and implementation with a DFA (state machine). |
| */ |
| |
| /* |
| * Algorithm for span(SpanCondition.CONTAINED) |
| * |
| * Theoretical algorithm: |
| * - Iterate through the string, and at each code point boundary: |
| * + If the code point there is in the set, then remember to continue after it. |
| * + If a set string matches at the current position, then remember to continue after it. |
| * + Either recursively span for each code point or string match, or recursively span |
| * for all but the shortest one and iteratively continue the span with the shortest local match. |
| * + Remember the longest recursive span (the farthest end point). |
| * + If there is no match at the current position, |
| * neither for the code point there nor for any set string, |
| * then stop and return the longest recursive span length. |
| * |
| * Optimized implementation: |
| * |
| * (We assume that most sets will have very few very short strings. |
| * A span using a string-less set is extremely fast.) |
| * |
| * Create and cache a spanSet which contains all of the single code points of the original set |
| * but none of its strings. |
| * |
| * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). |
| * - Loop: |
| * + Try to match each set string at the end of the spanLength. |
| * ~ Set strings that start with set-contained code points |
| * must be matched with a partial overlap |
| * because the recursive algorithm would have tried to match them at every position. |
| * ~ Set strings that entirely consist of set-contained code points |
| * are irrelevant for span(SpanCondition.CONTAINED) |
| * because the recursive algorithm would continue after them anyway and |
| * find the longest recursive match from their end. |
| * ~ Rather than recursing, note each end point of a set string match. |
| * + If no set string matched after spanSet.span(), |
| * then return with where the spanSet.span() ended. |
| * + If at least one set string matched after spanSet.span(), |
| * then pop the shortest string match end point and continue the loop, |
| * trying to match all set strings from there. |
| * + If at least one more set string matched after a previous string match, then test if the |
| * code point after the previous string match is also contained in the set. |
| * Continue the loop with the shortest end point of |
| * either this code point or a matching set string. |
| * + If no more set string matched after a previous string match, |
| * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). |
| * Stop if spanLength==0, otherwise continue the loop. |
| * |
| * By noting each end point of a set string match, the function visits each string position at most once and |
| * finishes in linear time. |
| * |
| * The recursive algorithm may visit the same string position many times |
| * if multiple paths lead to it and finishes in exponential time. |
| */ |
| |
| /* |
| * Algorithm for span(SIMPLE) |
| * |
| * Theoretical algorithm: |
| * - Iterate through the string, and at each code point boundary: |
| * + If the code point there is in the set, then remember to continue after it. |
| * + If a set string matches at the current position, then remember to continue after it. |
| * + Continue from the farthest match position and ignore all others. |
| * + If there is no match at the current position, then stop and return the current position. |
| * |
| * Optimized implementation: |
| * |
| * (Same assumption and spanSet as above.) |
| * |
| * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). |
| * - Loop: |
| * + Try to match each set string at the end of the spanLength. |
| * ~ Set strings that start with set-contained code points |
| * must be matched with a partial overlap |
| * because the standard algorithm would have tried to match them earlier. |
| * ~ Set strings that entirely consist of set-contained code points |
| * must be matched with a full overlap because the longest-match algorithm |
| * would hide set string matches that end earlier. |
| * Such set strings need not be matched earlier inside the code point span |
| * because the standard algorithm would then have |
| * continued after the set string match anyway. |
| * ~ Remember the longest set string match (farthest end point) |
| * from the earliest starting point. |
| * + If no set string matched after spanSet.span(), |
| * then return with where the spanSet.span() ended. |
| * + If at least one set string matched, |
| * then continue the loop after the longest match from the earliest position. |
| * + If no more set string matched after a previous string match, |
| * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). |
| * Stop if spanLength==0, otherwise continue the loop. |
| */ |
| /** |
| * Spans a string. |
| * |
| * @param s The string to be spanned |
| * @param start The start index that the span begins |
| * @param spanCondition The span condition |
| * @return the limit (exclusive end) of the span |
| */ |
| public int span(CharSequence s, int start, SpanCondition spanCondition) { |
| if (spanCondition == SpanCondition.NOT_CONTAINED) { |
| return spanNot(s, start, null); |
| } |
| int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); |
| if (spanLimit == s.length()) { |
| return spanLimit; |
| } |
| return spanWithStrings(s, start, spanLimit, spanCondition); |
| } |
| |
| /** |
| * Synchronized method for complicated spans using the offsets. |
| * Avoids synchronization for simple cases. |
| * |
| * @param spanLimit = spanSet.span(s, start, CONTAINED) |
| */ |
| private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, |
| SpanCondition spanCondition) { |
| // Consider strings; they may overlap with the span. |
| int initSize = 0; |
| if (spanCondition == SpanCondition.CONTAINED) { |
| // Use offset list to try all possibilities. |
| initSize = maxLength16; |
| } |
| offsets.setMaxLength(initSize); |
| int length = s.length(); |
| int pos = spanLimit, rest = length - spanLimit; |
| int spanLength = spanLimit - start; |
| int i, stringsLength = strings.size(); |
| for (;;) { |
| if (spanCondition == SpanCondition.CONTAINED) { |
| for (i = 0; i < stringsLength; ++i) { |
| int overlap = spanLengths[i]; |
| if (overlap == ALL_CP_CONTAINED) { |
| continue; // Irrelevant string. |
| } |
| String string = strings.get(i); |
| |
| int length16 = string.length(); |
| |
| // Try to match this string at pos-overlap..pos. |
| if (overlap >= LONG_SPAN) { |
| overlap = length16; |
| // While contained: No point matching fully inside the code point span. |
| overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code |
| // point. |
| } |
| if (overlap > spanLength) { |
| overlap = spanLength; |
| } |
| int inc = length16 - overlap; // Keep overlap+inc==length16. |
| for (;;) { |
| if (inc > rest) { |
| break; |
| } |
| // Try to match if the increment is not listed already. |
| if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { |
| if (inc == rest) { |
| return length; // Reached the end of the string. |
| } |
| offsets.addOffset(inc); |
| } |
| if (overlap == 0) { |
| break; |
| } |
| --overlap; |
| ++inc; |
| } |
| } |
| } else /* SIMPLE */{ |
| int maxInc = 0, maxOverlap = 0; |
| for (i = 0; i < stringsLength; ++i) { |
| int overlap = spanLengths[i]; |
| // For longest match, we do need to try to match even an all-contained string |
| // to find the match from the earliest start. |
| |
| String string = strings.get(i); |
| |
| int length16 = string.length(); |
| |
| // Try to match this string at pos-overlap..pos. |
| if (overlap >= LONG_SPAN) { |
| overlap = length16; |
| // Longest match: Need to match fully inside the code point span |
| // to find the match from the earliest start. |
| } |
| if (overlap > spanLength) { |
| overlap = spanLength; |
| } |
| int inc = length16 - overlap; // Keep overlap+inc==length16. |
| for (;;) { |
| if (inc > rest || overlap < maxOverlap) { |
| break; |
| } |
| // Try to match if the string is longer or starts earlier. |
| if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) |
| && matches16CPB(s, pos - overlap, length, string, length16)) { |
| maxInc = inc; // Longest match from earliest start. |
| maxOverlap = overlap; |
| break; |
| } |
| --overlap; |
| ++inc; |
| } |
| } |
| |
| if (maxInc != 0 || maxOverlap != 0) { |
| // Longest-match algorithm, and there was a string match. |
| // Simply continue after it. |
| pos += maxInc; |
| rest -= maxInc; |
| if (rest == 0) { |
| return length; // Reached the end of the string. |
| } |
| spanLength = 0; // Match strings from after a string match. |
| continue; |
| } |
| } |
| // Finished trying to match all strings at pos. |
| |
| if (spanLength != 0 || pos == 0) { |
| // The position is after an unlimited code point span (spanLength!=0), |
| // not after a string match. |
| // The only position where spanLength==0 after a span is pos==0. |
| // Otherwise, an unlimited code point span is only tried again when no |
| // strings match, and if such a non-initial span fails we stop. |
| if (offsets.isEmpty()) { |
| return pos; // No strings matched after a span. |
| } |
| // Match strings from after the next string match. |
| } else { |
| // The position is after a string match (or a single code point). |
| if (offsets.isEmpty()) { |
| // No more strings matched after a previous string match. |
| // Try another code point span from after the last string match. |
| spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); |
| spanLength = spanLimit - pos; |
| if (spanLength == rest || // Reached the end of the string, or |
| spanLength == 0 // neither strings nor span progressed. |
| ) { |
| return spanLimit; |
| } |
| pos += spanLength; |
| rest -= spanLength; |
| continue; // spanLength>0: Match strings from after a span. |
| } else { |
| // Try to match only one code point from after a string match if some |
| // string matched beyond it, so that we try all possible positions |
| // and don't overshoot. |
| spanLength = spanOne(spanSet, s, pos, rest); |
| if (spanLength > 0) { |
| if (spanLength == rest) { |
| return length; // Reached the end of the string. |
| } |
| // Match strings after this code point. |
| // There cannot be any increments below it because UnicodeSet strings |
| // contain multiple code points. |
| pos += spanLength; |
| rest -= spanLength; |
| offsets.shift(spanLength); |
| spanLength = 0; |
| continue; // Match strings from after a single code point. |
| } |
| // Match strings from after the next string match. |
| } |
| } |
| int minOffset = offsets.popMinimum(null); |
| pos += minOffset; |
| rest -= minOffset; |
| spanLength = 0; // Match strings from after a string match. |
| } |
| } |
| |
| /** |
| * Spans a string and counts the smallest number of set elements on any path across the span. |
| * |
| * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans. |
| * |
| * <p>If the set does not have any fully-contained strings, then we could optimize this |
| * like span(), but such sets are likely rare, and this is at least still linear. |
| * |
| * @param s The string to be spanned |
| * @param start The start index that the span begins |
| * @param spanCondition The span condition |
| * @param outCount The count |
| * @return the limit (exclusive end) of the span |
| */ |
| public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, |
| OutputInt outCount) { |
| if (spanCondition == SpanCondition.NOT_CONTAINED) { |
| return spanNot(s, start, outCount); |
| } |
| // Consider strings; they may overlap with the span, |
| // and they may result in a smaller count that with just code points. |
| if (spanCondition == SpanCondition.CONTAINED) { |
| return spanContainedAndCount(s, start, outCount); |
| } |
| // SIMPLE (not synchronized, does not use offsets) |
| int stringsLength = strings.size(); |
| int length = s.length(); |
| int pos = start; |
| int rest = length - start; |
| int count = 0; |
| while (rest != 0) { |
| // Try to match the next code point. |
| int cpLength = spanOne(spanSet, s, pos, rest); |
| int maxInc = (cpLength > 0) ? cpLength : 0; |
| // Try to match all of the strings. |
| for (int i = 0; i < stringsLength; ++i) { |
| String string = strings.get(i); |
| int length16 = string.length(); |
| if (maxInc < length16 && length16 <= rest && |
| matches16CPB(s, pos, length, string, length16)) { |
| maxInc = length16; |
| } |
| } |
| // We are done if there is no match beyond pos. |
| if (maxInc == 0) { |
| outCount.value = count; |
| return pos; |
| } |
| // Continue from the longest match. |
| ++count; |
| pos += maxInc; |
| rest -= maxInc; |
| } |
| outCount.value = count; |
| return pos; |
| } |
| |
| private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { |
| // Use offset list to try all possibilities. |
| offsets.setMaxLength(maxLength16); |
| int stringsLength = strings.size(); |
| int length = s.length(); |
| int pos = start; |
| int rest = length - start; |
| int count = 0; |
| while (rest != 0) { |
| // Try to match the next code point. |
| int cpLength = spanOne(spanSet, s, pos, rest); |
| if (cpLength > 0) { |
| offsets.addOffsetAndCount(cpLength, count + 1); |
| } |
| // Try to match all of the strings. |
| for (int i = 0; i < stringsLength; ++i) { |
| String string = strings.get(i); |
| int length16 = string.length(); |
| // Note: If the strings were sorted by length, then we could also |
| // avoid trying to match if there is already a match of the same length. |
| if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && |
| matches16CPB(s, pos, length, string, length16)) { |
| offsets.addOffsetAndCount(length16, count + 1); |
| } |
| } |
| // We are done if there is no match beyond pos. |
| if (offsets.isEmpty()) { |
| outCount.value = count; |
| return pos; |
| } |
| // Continue from the nearest match. |
| int minOffset = offsets.popMinimum(outCount); |
| count = outCount.value; |
| pos += minOffset; |
| rest -= minOffset; |
| } |
| outCount.value = count; |
| return pos; |
| } |
| |
| /** |
| * Span a string backwards. |
| * |
| * @param s The string to be spanned |
| * @param spanCondition The span condition |
| * @return The string index which starts the span (i.e. inclusive). |
| */ |
| public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { |
| if (spanCondition == SpanCondition.NOT_CONTAINED) { |
| return spanNotBack(s, length); |
| } |
| int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); |
| if (pos == 0) { |
| return 0; |
| } |
| int spanLength = length - pos; |
| |
| // Consider strings; they may overlap with the span. |
| int initSize = 0; |
| if (spanCondition == SpanCondition.CONTAINED) { |
| // Use offset list to try all possibilities. |
| initSize = maxLength16; |
| } |
| offsets.setMaxLength(initSize); |
| int i, stringsLength = strings.size(); |
| int spanBackLengthsOffset = 0; |
| if (all) { |
| spanBackLengthsOffset = stringsLength; |
| } |
| for (;;) { |
| if (spanCondition == SpanCondition.CONTAINED) { |
| for (i = 0; i < stringsLength; ++i) { |
| int overlap = spanLengths[spanBackLengthsOffset + i]; |
| if (overlap == ALL_CP_CONTAINED) { |
| continue; // Irrelevant string. |
| } |
| String string = strings.get(i); |
| |
| int length16 = string.length(); |
| |
| // Try to match this string at pos-(length16-overlap)..pos-length16. |
| if (overlap >= LONG_SPAN) { |
| overlap = length16; |
| // While contained: No point matching fully inside the code point span. |
| int len1 = 0; |
| len1 = string.offsetByCodePoints(0, 1); |
| overlap -= len1; // Length of the string minus the first code point. |
| } |
| if (overlap > spanLength) { |
| overlap = spanLength; |
| } |
| int dec = length16 - overlap; // Keep dec+overlap==length16. |
| for (;;) { |
| if (dec > pos) { |
| break; |
| } |
| // Try to match if the decrement is not listed already. |
| if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { |
| if (dec == pos) { |
| return 0; // Reached the start of the string. |
| } |
| offsets.addOffset(dec); |
| } |
| if (overlap == 0) { |
| break; |
| } |
| --overlap; |
| ++dec; |
| } |
| } |
| } else /* SIMPLE */{ |
| int maxDec = 0, maxOverlap = 0; |
| for (i = 0; i < stringsLength; ++i) { |
| int overlap = spanLengths[spanBackLengthsOffset + i]; |
| // For longest match, we do need to try to match even an all-contained string |
| // to find the match from the latest end. |
| |
| String string = strings.get(i); |
| |
| int length16 = string.length(); |
| |
| // Try to match this string at pos-(length16-overlap)..pos-length16. |
| if (overlap >= LONG_SPAN) { |
| overlap = length16; |
| // Longest match: Need to match fully inside the code point span |
| // to find the match from the latest end. |
| } |
| if (overlap > spanLength) { |
| overlap = spanLength; |
| } |
| int dec = length16 - overlap; // Keep dec+overlap==length16. |
| for (;;) { |
| if (dec > pos || overlap < maxOverlap) { |
| break; |
| } |
| // Try to match if the string is longer or ends later. |
| if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) |
| && matches16CPB(s, pos - dec, length, string, length16)) { |
| maxDec = dec; // Longest match from latest end. |
| maxOverlap = overlap; |
| break; |
| } |
| --overlap; |
| ++dec; |
| } |
| } |
| |
| if (maxDec != 0 || maxOverlap != 0) { |
| // Longest-match algorithm, and there was a string match. |
| // Simply continue before it. |
| pos -= maxDec; |
| if (pos == 0) { |
| return 0; // Reached the start of the string. |
| } |
| spanLength = 0; // Match strings from before a string match. |
| continue; |
| } |
| } |
| // Finished trying to match all strings at pos. |
| |
| if (spanLength != 0 || pos == length) { |
| // The position is before an unlimited code point span (spanLength!=0), |
| // not before a string match. |
| // The only position where spanLength==0 before a span is pos==length. |
| // Otherwise, an unlimited code point span is only tried again when no |
| // strings match, and if such a non-initial span fails we stop. |
| if (offsets.isEmpty()) { |
| return pos; // No strings matched before a span. |
| } |
| // Match strings from before the next string match. |
| } else { |
| // The position is before a string match (or a single code point). |
| if (offsets.isEmpty()) { |
| // No more strings matched before a previous string match. |
| // Try another code point span from before the last string match. |
| int oldPos = pos; |
| pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); |
| spanLength = oldPos - pos; |
| if (pos == 0 || // Reached the start of the string, or |
| spanLength == 0 // neither strings nor span progressed. |
| ) { |
| return pos; |
| } |
| continue; // spanLength>0: Match strings from before a span. |
| } else { |
| // Try to match only one code point from before a string match if some |
| // string matched beyond it, so that we try all possible positions |
| // and don't overshoot. |
| spanLength = spanOneBack(spanSet, s, pos); |
| if (spanLength > 0) { |
| if (spanLength == pos) { |
| return 0; // Reached the start of the string. |
| } |
| // Match strings before this code point. |
| // There cannot be any decrements below it because UnicodeSet strings |
| // contain multiple code points. |
| pos -= spanLength; |
| offsets.shift(spanLength); |
| spanLength = 0; |
| continue; // Match strings from before a single code point. |
| } |
| // Match strings from before the next string match. |
| } |
| } |
| pos -= offsets.popMinimum(null); |
| spanLength = 0; // Match strings from before a string match. |
| } |
| } |
| |
| /** |
| * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) |
| * |
| * Theoretical algorithm: |
| * - Iterate through the string, and at each code point boundary: |
| * + If the code point there is in the set, then return with the current position. |
| * + If a set string matches at the current position, then return with the current position. |
| * |
| * Optimized implementation: |
| * |
| * (Same assumption as for span() above.) |
| * |
| * Create and cache a spanNotSet which contains |
| * all of the single code points of the original set but none of its strings. |
| * For each set string add its initial code point to the spanNotSet. |
| * (Also add its final code point for spanNotBack().) |
| * |
| * - Loop: |
| * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). |
| * + If the current code point is in the original set, then return the current position. |
| * + If any set string matches at the current position, then return the current position. |
| * + If there is no match at the current position, neither for the code point |
| * there nor for any set string, then skip this code point and continue the loop. |
| * This happens for set-string-initial code points that were added to spanNotSet |
| * when there is not actually a match for such a set string. |
| * |
| * @param s The string to be spanned |
| * @param start The start index that the span begins |
| * @param outCount If not null: Receives the number of code points across the span. |
| * @return the limit (exclusive end) of the span |
| */ |
| private int spanNot(CharSequence s, int start, OutputInt outCount) { |
| int length = s.length(); |
| int pos = start, rest = length - start; |
| int stringsLength = strings.size(); |
| int count = 0; |
| do { |
| // Span until we find a code point from the set, |
| // or a code point that starts or ends some string. |
| int spanLimit; |
| if (outCount == null) { |
| spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); |
| } else { |
| spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); |
| outCount.value = count = count + outCount.value; |
| } |
| if (spanLimit == length) { |
| return length; // Reached the end of the string. |
| } |
| pos = spanLimit; |
| rest = length - spanLimit; |
| |
| // Check whether the current code point is in the original set, |
| // without the string starts and ends. |
| int cpLength = spanOne(spanSet, s, pos, rest); |
| if (cpLength > 0) { |
| return pos; // There is a set element at pos. |
| } |
| |
| // Try to match the strings at pos. |
| for (int i = 0; i < stringsLength; ++i) { |
| if (spanLengths[i] == ALL_CP_CONTAINED) { |
| continue; // Irrelevant string. |
| } |
| String string = strings.get(i); |
| |
| int length16 = string.length(); |
| if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { |
| return pos; // There is a set element at pos. |
| } |
| } |
| |
| // The span(while not contained) ended on a string start/end which is |
| // not in the original set. Skip this code point and continue. |
| // cpLength<0 |
| pos -= cpLength; |
| rest += cpLength; |
| ++count; |
| } while (rest != 0); |
| if (outCount != null) { |
| outCount.value = count; |
| } |
| return length; // Reached the end of the string. |
| } |
| |
| private int spanNotBack(CharSequence s, int length) { |
| int pos = length; |
| int i, stringsLength = strings.size(); |
| do { |
| // Span until we find a code point from the set, |
| // or a code point that starts or ends some string. |
| pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); |
| if (pos == 0) { |
| return 0; // Reached the start of the string. |
| } |
| |
| // Check whether the current code point is in the original set, |
| // without the string starts and ends. |
| int cpLength = spanOneBack(spanSet, s, pos); |
| if (cpLength > 0) { |
| return pos; // There is a set element at pos. |
| } |
| |
| // Try to match the strings at pos. |
| for (i = 0; i < stringsLength; ++i) { |
| // Use spanLengths rather than a spanLengths pointer because |
| // it is easier and we only need to know whether the string is irrelevant |
| // which is the same in either array. |
| if (spanLengths[i] == ALL_CP_CONTAINED) { |
| continue; // Irrelevant string. |
| } |
| String string = strings.get(i); |
| |
| int length16 = string.length(); |
| if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { |
| return pos; // There is a set element at pos. |
| } |
| } |
| |
| // The span(while not contained) ended on a string start/end which is |
| // not in the original set. Skip this code point and continue. |
| // cpLength<0 |
| pos += cpLength; |
| } while (pos != 0); |
| return 0; // Reached the start of the string. |
| } |
| |
| static short makeSpanLengthByte(int spanLength) { |
| // 0xfe==UnicodeSetStringSpan::LONG_SPAN |
| return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; |
| } |
| |
| // Compare strings without any argument checks. Requires length>0. |
| private static boolean matches16(CharSequence s, int start, final String t, int length) { |
| int end = start + length; |
| while (length-- > 0) { |
| if (s.charAt(--end) != t.charAt(length)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Compare 16-bit Unicode strings (which may be malformed UTF-16) |
| * at code point boundaries. |
| * That is, each edge of a match must not be in the middle of a surrogate pair. |
| * @param s The string to match in. |
| * @param start The start index of s. |
| * @param limit The limit of the subsequence of s being spanned. |
| * @param t The substring to be matched in s. |
| * @param tlength The length of t. |
| */ |
| static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { |
| return matches16(s, start, t, tlength) |
| && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && |
| Character.isLowSurrogate(s.charAt(start))) |
| && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && |
| Character.isLowSurrogate(s.charAt(start + tlength))); |
| } |
| |
| /** |
| * Does the set contain the next code point? |
| * If so, return its length; otherwise return its negative length. |
| */ |
| static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { |
| char c = s.charAt(start); |
| if (c >= 0xd800 && c <= 0xdbff && length >= 2) { |
| char c2 = s.charAt(start + 1); |
| if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) { |
| int supplementary = Character.toCodePoint(c, c2); |
| return set.contains(supplementary) ? 2 : -2; |
| } |
| } |
| return set.contains(c) ? 1 : -1; |
| } |
| |
| static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { |
| char c = s.charAt(length - 1); |
| if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { |
| char c2 = s.charAt(length - 2); |
| if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) { |
| int supplementary = Character.toCodePoint(c2, c); |
| return set.contains(supplementary) ? 2 : -2; |
| } |
| } |
| return set.contains(c) ? 1 : -1; |
| } |
| |
| /** |
| * Helper class for UnicodeSetStringSpan. |
| * |
| * <p>List of offsets from the current position from where to try matching |
| * a code point or a string. |
| * Stores offsets rather than indexes to simplify the code and use the same list |
| * for both increments (in span()) and decrements (in spanBack()). |
| * |
| * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time |
| * are relatively dense, that is, |
| * there are normally no gaps of hundreds or thousands of offset values. |
| * |
| * <p>This class optionally also tracks the minimum non-negative count for each position, |
| * intended to count the smallest number of elements of any path leading to that position. |
| * |
| * <p>The implementation uses a circular buffer of count integers, |
| * each indicating whether the corresponding offset is in the list, |
| * and its path element count. |
| * This avoids inserting into a sorted list of offsets (or absolute indexes) |
| * and physically moving part of the list. |
| * |
| * <p>Note: In principle, the caller should setMaxLength() to |
| * the maximum of the max string length and U16_LENGTH/U8_LENGTH |
| * to account for "long" single code points. |
| * |
| * <p>Note: An earlier version did not track counts and stored only byte flags. |
| * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, |
| * the list could be stored as bit flags in a single integer. |
| * Rather than handling a circular buffer with a start list index, |
| * the integer would simply be shifted when lower offsets are removed. |
| * UnicodeSet does not have a limit on the lengths of strings. |
| */ |
| private static final class OffsetList { |
| private int[] list; |
| private int length; |
| private int start; |
| |
| public OffsetList() { |
| list = new int[16]; // default size |
| } |
| |
| public void setMaxLength(int maxLength) { |
| if (maxLength > list.length) { |
| list = new int[maxLength]; |
| } |
| clear(); |
| } |
| |
| public void clear() { |
| for (int i = list.length; i-- > 0;) { |
| list[i] = 0; |
| } |
| start = length = 0; |
| } |
| |
| public boolean isEmpty() { |
| return (length == 0); |
| } |
| |
| /** |
| * Reduces all stored offsets by delta, used when the current position moves by delta. |
| * There must not be any offsets lower than delta. |
| * If there is an offset equal to delta, it is removed. |
| * |
| * @param delta [1..maxLength] |
| */ |
| public void shift(int delta) { |
| int i = start + delta; |
| if (i >= list.length) { |
| i -= list.length; |
| } |
| if (list[i] != 0) { |
| list[i] = 0; |
| --length; |
| } |
| start = i; |
| } |
| |
| /** |
| * Adds an offset. The list must not contain it yet. |
| * @param offset [1..maxLength] |
| */ |
| public void addOffset(int offset) { |
| int i = start + offset; |
| if (i >= list.length) { |
| i -= list.length; |
| } |
| assert list[i] == 0; |
| list[i] = 1; |
| ++length; |
| } |
| |
| /** |
| * Adds an offset and updates its count. |
| * The list may already contain the offset. |
| * @param offset [1..maxLength] |
| */ |
| public void addOffsetAndCount(int offset, int count) { |
| assert count > 0; |
| int i = start + offset; |
| if (i >= list.length) { |
| i -= list.length; |
| } |
| if (list[i] == 0) { |
| list[i] = count; |
| ++length; |
| } else if (count < list[i]) { |
| list[i] = count; |
| } |
| } |
| |
| /** |
| * @param offset [1..maxLength] |
| */ |
| public boolean containsOffset(int offset) { |
| int i = start + offset; |
| if (i >= list.length) { |
| i -= list.length; |
| } |
| return list[i] != 0; |
| } |
| |
| /** |
| * @param offset [1..maxLength] |
| */ |
| public boolean hasCountAtOffset(int offset, int count) { |
| int i = start + offset; |
| if (i >= list.length) { |
| i -= list.length; |
| } |
| int oldCount = list[i]; |
| return oldCount != 0 && oldCount <= count; |
| } |
| |
| /** |
| * Finds the lowest stored offset from a non-empty list, removes it, |
| * and reduces all other offsets by this minimum. |
| * @return min=[1..maxLength] |
| */ |
| public int popMinimum(OutputInt outCount) { |
| // Look for the next offset in list[start+1..list.length-1]. |
| int i = start, result; |
| while (++i < list.length) { |
| int count = list[i]; |
| if (count != 0) { |
| list[i] = 0; |
| --length; |
| result = i - start; |
| start = i; |
| if (outCount != null) { outCount.value = count; } |
| return result; |
| } |
| } |
| // i==list.length |
| |
| // Wrap around and look for the next offset in list[0..start]. |
| // Since the list is not empty, there will be one. |
| result = list.length - start; |
| i = 0; |
| int count; |
| while ((count = list[i]) == 0) { |
| ++i; |
| } |
| list[i] = 0; |
| --length; |
| start = i; |
| if (outCount != null) { outCount.value = count; } |
| return result + i; |
| } |
| } |
| } |