| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ****************************************************************************** |
| * |
| * Copyright (C) 2007-2012, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ****************************************************************************** |
| * file name: unisetspan.cpp |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2007mar01 |
| * created by: Markus W. Scherer |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/uniset.h" |
| #include "unicode/ustring.h" |
| #include "unicode/utf8.h" |
| #include "unicode/utf16.h" |
| #include "cmemory.h" |
| #include "uvector.h" |
| #include "unisetspan.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| /* |
| * List of offsets from the current position from where to try matching |
| * a code point or a string. |
| * Store offsets rather than indexes to simplify the code and use the same list |
| * for both increments (in span()) and decrements (in spanBack()). |
| * |
| * Assumption: The maximum offset is limited, and the offsets that are stored |
| * at any one time are relatively dense, that is, there are normally no gaps of |
| * hundreds or thousands of offset values. |
| * |
| * The implementation uses a circular buffer of byte flags, |
| * each indicating whether the corresponding offset is in the list. |
| * This avoids inserting into a sorted list of offsets (or absolute indexes) and |
| * physically moving part of the list. |
| * |
| * Note: In principle, the caller should setMaxLength() to the maximum of the |
| * max string length and U16_LENGTH/U8_LENGTH to account for |
| * "long" single code points. |
| * However, this implementation uses at least a staticList with more than |
| * U8_LENGTH entries anyway. |
| * |
| * Note: If maxLength were guaranteed to be no more than 32 or 64, |
| * the list could be stored as bit flags in a single integer. |
| * Rather than handling a circular buffer with a start list index, |
| * the integer would simply be shifted when lower offsets are removed. |
| * UnicodeSet does not have a limit on the lengths of strings. |
| */ |
| class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. |
| public: |
| OffsetList() : list(staticList), capacity(0), length(0), start(0) {} |
| |
| ~OffsetList() { |
| if(list!=staticList) { |
| uprv_free(list); |
| } |
| } |
| |
| // Call exactly once if the list is to be used. |
| void setMaxLength(int32_t maxLength) { |
| if(maxLength<=(int32_t)sizeof(staticList)) { |
| capacity=(int32_t)sizeof(staticList); |
| } else { |
| UBool *l=(UBool *)uprv_malloc(maxLength); |
| if(l!=NULL) { |
| list=l; |
| capacity=maxLength; |
| } |
| } |
| uprv_memset(list, 0, capacity); |
| } |
| |
| void clear() { |
| uprv_memset(list, 0, capacity); |
| start=length=0; |
| } |
| |
| UBool isEmpty() const { |
| return (UBool)(length==0); |
| } |
| |
| // Reduce all stored offsets by delta, used when the current position |
| // moves by delta. |
| // There must not be any offsets lower than delta. |
| // If there is an offset equal to delta, it is removed. |
| // delta=[1..maxLength] |
| void shift(int32_t delta) { |
| int32_t i=start+delta; |
| if(i>=capacity) { |
| i-=capacity; |
| } |
| if(list[i]) { |
| list[i]=FALSE; |
| --length; |
| } |
| start=i; |
| } |
| |
| // Add an offset. The list must not contain it yet. |
| // offset=[1..maxLength] |
| void addOffset(int32_t offset) { |
| int32_t i=start+offset; |
| if(i>=capacity) { |
| i-=capacity; |
| } |
| list[i]=TRUE; |
| ++length; |
| } |
| |
| // offset=[1..maxLength] |
| UBool containsOffset(int32_t offset) const { |
| int32_t i=start+offset; |
| if(i>=capacity) { |
| i-=capacity; |
| } |
| return list[i]; |
| } |
| |
| // Find the lowest stored offset from a non-empty list, remove it, |
| // and reduce all other offsets by this minimum. |
| // Returns [1..maxLength]. |
| int32_t popMinimum() { |
| // Look for the next offset in list[start+1..capacity-1]. |
| int32_t i=start, result; |
| while(++i<capacity) { |
| if(list[i]) { |
| list[i]=FALSE; |
| --length; |
| result=i-start; |
| start=i; |
| return result; |
| } |
| } |
| // i==capacity |
| |
| // Wrap around and look for the next offset in list[0..start]. |
| // Since the list is not empty, there will be one. |
| result=capacity-start; |
| i=0; |
| while(!list[i]) { |
| ++i; |
| } |
| list[i]=FALSE; |
| --length; |
| start=i; |
| return result+=i; |
| } |
| |
| private: |
| UBool *list; |
| int32_t capacity; |
| int32_t length; |
| int32_t start; |
| |
| UBool staticList[16]; |
| }; |
| |
| // Get the number of UTF-8 bytes for a UTF-16 (sub)string. |
| static int32_t |
| getUTF8Length(const UChar *s, int32_t length) { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| int32_t length8=0; |
| u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); |
| if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| return length8; |
| } else { |
| // The string contains an unpaired surrogate. |
| // Ignore this string. |
| return 0; |
| } |
| } |
| |
| // Append the UTF-8 version of the string to t and return the appended UTF-8 length. |
| static int32_t |
| appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| int32_t length8=0; |
| u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); |
| if(U_SUCCESS(errorCode)) { |
| return length8; |
| } else { |
| // The string contains an unpaired surrogate. |
| // Ignore this string. |
| return 0; |
| } |
| } |
| |
| static inline uint8_t |
| makeSpanLengthByte(int32_t spanLength) { |
| // 0xfe==UnicodeSetStringSpan::LONG_SPAN |
| return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; |
| } |
| |
| // Construct for all variants of span(), or only for any one variant. |
| // Initialize as little as possible, for single use. |
| UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, |
| const UVector &setStrings, |
| uint32_t which) |
| : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), |
| utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), |
| utf8Length(0), |
| maxLength16(0), maxLength8(0), |
| all((UBool)(which==ALL)) { |
| spanSet.retainAll(set); |
| if(which&NOT_CONTAINED) { |
| // Default to the same sets. |
| // addToSpanNotSet() will create a separate set if necessary. |
| pSpanNotSet=&spanSet; |
| } |
| |
| // Determine if the strings even need to be taken into account at all for span() etc. |
| // If any string is relevant, then all strings need to be used for |
| // span(longest match) but only the relevant ones for span(while contained). |
| // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH |
| // and do not store UTF-8 strings if !thisRelevant and CONTAINED. |
| // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) |
| // Also count the lengths of the UTF-8 versions of the strings for memory allocation. |
| int32_t stringsLength=strings.size(); |
| |
| int32_t i, spanLength; |
| UBool someRelevant=FALSE; |
| for(i=0; i<stringsLength; ++i) { |
| const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); |
| const UChar *s16=string.getBuffer(); |
| int32_t length16=string.length(); |
| if (length16==0) { |
| continue; // skip the empty string |
| } |
| UBool thisRelevant; |
| spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); |
| if(spanLength<length16) { // Relevant string. |
| someRelevant=thisRelevant=TRUE; |
| } else { |
| thisRelevant=FALSE; |
| } |
| if((which&UTF16) && length16>maxLength16) { |
| maxLength16=length16; |
| } |
| if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { |
| int32_t length8=getUTF8Length(s16, length16); |
| utf8Length+=length8; |
| if(length8>maxLength8) { |
| maxLength8=length8; |
| } |
| } |
| } |
| if(!someRelevant) { |
| maxLength16=maxLength8=0; |
| return; |
| } |
| |
| // Freeze after checking for the need to use strings at all because freezing |
| // a set takes some time and memory which are wasted if there are no relevant strings. |
| if(all) { |
| spanSet.freeze(); |
| } |
| |
| uint8_t *spanBackLengths; |
| uint8_t *spanUTF8Lengths; |
| uint8_t *spanBackUTF8Lengths; |
| |
| // Allocate a block of meta data. |
| int32_t allocSize; |
| if(all) { |
| // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. |
| allocSize=stringsLength*(4+1+1+1+1)+utf8Length; |
| } else { |
| allocSize=stringsLength; // One set of span lengths. |
| if(which&UTF8) { |
| // UTF-8 lengths and UTF-8 strings. |
| allocSize+=stringsLength*4+utf8Length; |
| } |
| } |
| if(allocSize<=(int32_t)sizeof(staticLengths)) { |
| utf8Lengths=staticLengths; |
| } else { |
| utf8Lengths=(int32_t *)uprv_malloc(allocSize); |
| if(utf8Lengths==NULL) { |
| maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. |
| return; // Out of memory. |
| } |
| } |
| |
| if(all) { |
| // Store span lengths for all span() variants. |
| spanLengths=(uint8_t *)(utf8Lengths+stringsLength); |
| spanBackLengths=spanLengths+stringsLength; |
| spanUTF8Lengths=spanBackLengths+stringsLength; |
| spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; |
| utf8=spanBackUTF8Lengths+stringsLength; |
| } else { |
| // Store span lengths for only one span() variant. |
| if(which&UTF8) { |
| spanLengths=(uint8_t *)(utf8Lengths+stringsLength); |
| utf8=spanLengths+stringsLength; |
| } else { |
| spanLengths=(uint8_t *)utf8Lengths; |
| } |
| spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; |
| } |
| |
| // Set the meta data and pSpanNotSet and write the UTF-8 strings. |
| int32_t utf8Count=0; // Count UTF-8 bytes written so far. |
| |
| for(i=0; i<stringsLength; ++i) { |
| const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); |
| const UChar *s16=string.getBuffer(); |
| int32_t length16=string.length(); |
| spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); |
| if(spanLength<length16 && length16>0) { // Relevant string. |
| if(which&UTF16) { |
| if(which&CONTAINED) { |
| if(which&FWD) { |
| spanLengths[i]=makeSpanLengthByte(spanLength); |
| } |
| if(which&BACK) { |
| spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); |
| spanBackLengths[i]=makeSpanLengthByte(spanLength); |
| } |
| } else /* not CONTAINED, not all, but NOT_CONTAINED */ { |
| spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. |
| } |
| } |
| if(which&UTF8) { |
| uint8_t *s8=utf8+utf8Count; |
| int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); |
| utf8Count+=utf8Lengths[i]=length8; |
| if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. |
| spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; |
| } else { // Relevant for UTF-8. |
| if(which&CONTAINED) { |
| if(which&FWD) { |
| spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); |
| spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); |
| } |
| if(which&BACK) { |
| spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); |
| spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); |
| } |
| } else /* not CONTAINED, not all, but NOT_CONTAINED */ { |
| spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. |
| } |
| } |
| } |
| if(which&NOT_CONTAINED) { |
| // Add string start and end code points to the spanNotSet so that |
| // a span(while not contained) stops before any string. |
| UChar32 c; |
| if(which&FWD) { |
| int32_t len=0; |
| U16_NEXT(s16, len, length16, c); |
| addToSpanNotSet(c); |
| } |
| if(which&BACK) { |
| int32_t len=length16; |
| U16_PREV(s16, 0, len, c); |
| addToSpanNotSet(c); |
| } |
| } |
| } else { // Irrelevant string. (Also the empty string.) |
| if(which&UTF8) { |
| if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. |
| uint8_t *s8=utf8+utf8Count; |
| int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); |
| utf8Count+=utf8Lengths[i]=length8; |
| } else { |
| utf8Lengths[i]=0; |
| } |
| } |
| if(all) { |
| spanLengths[i]=spanBackLengths[i]= |
| spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= |
| (uint8_t)ALL_CP_CONTAINED; |
| } else { |
| // All spanXYZLengths pointers contain the same address. |
| spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; |
| } |
| } |
| } |
| |
| // Finish. |
| if(all) { |
| pSpanNotSet->freeze(); |
| } |
| } |
| |
| // Copy constructor. Assumes which==ALL for a frozen set. |
| UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, |
| const UVector &newParentSetStrings) |
| : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), |
| utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), |
| utf8Length(otherStringSpan.utf8Length), |
| maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), |
| all(TRUE) { |
| if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { |
| pSpanNotSet=&spanSet; |
| } else { |
| pSpanNotSet=otherStringSpan.pSpanNotSet->clone(); |
| } |
| |
| // Allocate a block of meta data. |
| // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. |
| int32_t stringsLength=strings.size(); |
| int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; |
| if(allocSize<=(int32_t)sizeof(staticLengths)) { |
| utf8Lengths=staticLengths; |
| } else { |
| utf8Lengths=(int32_t *)uprv_malloc(allocSize); |
| if(utf8Lengths==NULL) { |
| maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. |
| return; // Out of memory. |
| } |
| } |
| |
| spanLengths=(uint8_t *)(utf8Lengths+stringsLength); |
| utf8=spanLengths+stringsLength*4; |
| uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); |
| } |
| |
| UnicodeSetStringSpan::~UnicodeSetStringSpan() { |
| if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { |
| delete pSpanNotSet; |
| } |
| if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { |
| uprv_free(utf8Lengths); |
| } |
| } |
| |
| void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { |
| if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { |
| if(spanSet.contains(c)) { |
| return; // Nothing to do. |
| } |
| UnicodeSet *newSet=spanSet.cloneAsThawed(); |
| if(newSet==NULL) { |
| return; // Out of memory. |
| } else { |
| pSpanNotSet=newSet; |
| } |
| } |
| pSpanNotSet->add(c); |
| } |
| |
| // Compare strings without any argument checks. Requires length>0. |
| static inline UBool |
| matches16(const UChar *s, const UChar *t, int32_t length) { |
| do { |
| if(*s++!=*t++) { |
| return FALSE; |
| } |
| } while(--length>0); |
| return TRUE; |
| } |
| |
| static inline UBool |
| matches8(const uint8_t *s, const uint8_t *t, int32_t length) { |
| do { |
| if(*s++!=*t++) { |
| return FALSE; |
| } |
| } while(--length>0); |
| return TRUE; |
| } |
| |
| // Compare 16-bit Unicode strings (which may be malformed UTF-16) |
| // at code point boundaries. |
| // That is, each edge of a match must not be in the middle of a surrogate pair. |
| static inline UBool |
| matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { |
| s+=start; |
| limit-=start; |
| return matches16(s, t, length) && |
| !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && |
| !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); |
| } |
| |
| // Does the set contain the next code point? |
| // If so, return its length; otherwise return its negative length. |
| static inline int32_t |
| spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { |
| UChar c=*s, c2; |
| if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { |
| return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; |
| } |
| return set.contains(c) ? 1 : -1; |
| } |
| |
| static inline int32_t |
| spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { |
| UChar c=s[length-1], c2; |
| if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { |
| return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; |
| } |
| return set.contains(c) ? 1 : -1; |
| } |
| |
| static inline int32_t |
| spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { |
| UChar32 c=*s; |
| if(U8_IS_SINGLE(c)) { |
| return set.contains(c) ? 1 : -1; |
| } |
| // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). |
| int32_t i=0; |
| U8_NEXT_OR_FFFD(s, i, length, c); |
| return set.contains(c) ? i : -i; |
| } |
| |
| static inline int32_t |
| spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { |
| UChar32 c=s[length-1]; |
| if(U8_IS_SINGLE(c)) { |
| return set.contains(c) ? 1 : -1; |
| } |
| int32_t i=length-1; |
| c=utf8_prevCharSafeBody(s, 0, &i, c, -3); |
| length-=i; |
| return set.contains(c) ? length : -length; |
| } |
| |
| /* |
| * Note: In span() when spanLength==0 (after a string match, or at the beginning |
| * after an empty code point span) and in spanNot() and spanNotUTF8(), |
| * string matching could use a binary search |
| * because all string matches are done from the same start index. |
| * |
| * For UTF-8, this would require a comparison function that returns UTF-16 order. |
| * |
| * This optimization should not be necessary for normal UnicodeSets because |
| * most sets have no strings, and most sets with strings have |
| * very few very short strings. |
| * For cases with many strings, it might be better to use a different API |
| * and implementation with a DFA (state machine). |
| */ |
| |
| /* |
| * Algorithm for span(USET_SPAN_CONTAINED) |
| * |
| * Theoretical algorithm: |
| * - Iterate through the string, and at each code point boundary: |
| * + If the code point there is in the set, then remember to continue after it. |
| * + If a set string matches at the current position, then remember to continue after it. |
| * + Either recursively span for each code point or string match, |
| * or recursively span for all but the shortest one and |
| * iteratively continue the span with the shortest local match. |
| * + Remember the longest recursive span (the farthest end point). |
| * + If there is no match at the current position, neither for the code point there |
| * nor for any set string, then stop and return the longest recursive span length. |
| * |
| * Optimized implementation: |
| * |
| * (We assume that most sets will have very few very short strings. |
| * A span using a string-less set is extremely fast.) |
| * |
| * Create and cache a spanSet which contains all of the single code points |
| * of the original set but none of its strings. |
| * |
| * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). |
| * - Loop: |
| * + Try to match each set string at the end of the spanLength. |
| * ~ Set strings that start with set-contained code points must be matched |
| * with a partial overlap because the recursive algorithm would have tried |
| * to match them at every position. |
| * ~ Set strings that entirely consist of set-contained code points |
| * are irrelevant for span(USET_SPAN_CONTAINED) because the |
| * recursive algorithm would continue after them anyway |
| * and find the longest recursive match from their end. |
| * ~ Rather than recursing, note each end point of a set string match. |
| * + If no set string matched after spanSet.span(), then return |
| * with where the spanSet.span() ended. |
| * + If at least one set string matched after spanSet.span(), then |
| * pop the shortest string match end point and continue |
| * the loop, trying to match all set strings from there. |
| * + If at least one more set string matched after a previous string match, |
| * then test if the code point after the previous string match is also |
| * contained in the set. |
| * Continue the loop with the shortest end point of either this code point |
| * or a matching set string. |
| * + If no more set string matched after a previous string match, |
| * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). |
| * Stop if spanLength==0, otherwise continue the loop. |
| * |
| * By noting each end point of a set string match, |
| * the function visits each string position at most once and finishes |
| * in linear time. |
| * |
| * The recursive algorithm may visit the same string position many times |
| * if multiple paths lead to it and finishes in exponential time. |
| */ |
| |
| /* |
| * Algorithm for span(USET_SPAN_SIMPLE) |
| * |
| * Theoretical algorithm: |
| * - Iterate through the string, and at each code point boundary: |
| * + If the code point there is in the set, then remember to continue after it. |
| * + If a set string matches at the current position, then remember to continue after it. |
| * + Continue from the farthest match position and ignore all others. |
| * + If there is no match at the current position, |
| * then stop and return the current position. |
| * |
| * Optimized implementation: |
| * |
| * (Same assumption and spanSet as above.) |
| * |
| * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). |
| * - Loop: |
| * + Try to match each set string at the end of the spanLength. |
| * ~ Set strings that start with set-contained code points must be matched |
| * with a partial overlap because the standard algorithm would have tried |
| * to match them earlier. |
| * ~ Set strings that entirely consist of set-contained code points |
| * must be matched with a full overlap because the longest-match algorithm |
| * would hide set string matches that end earlier. |
| * Such set strings need not be matched earlier inside the code point span |
| * because the standard algorithm would then have continued after |
| * the set string match anyway. |
| * ~ Remember the longest set string match (farthest end point) from the earliest |
| * starting point. |
| * + If no set string matched after spanSet.span(), then return |
| * with where the spanSet.span() ended. |
| * + If at least one set string matched, then continue the loop after the |
| * longest match from the earliest position. |
| * + If no more set string matched after a previous string match, |
| * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). |
| * Stop if spanLength==0, otherwise continue the loop. |
| */ |
| |
| int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { |
| if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| return spanNot(s, length); |
| } |
| int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); |
| if(spanLength==length) { |
| return length; |
| } |
| |
| // Consider strings; they may overlap with the span. |
| OffsetList offsets; |
| if(spanCondition==USET_SPAN_CONTAINED) { |
| // Use offset list to try all possibilities. |
| offsets.setMaxLength(maxLength16); |
| } |
| int32_t pos=spanLength, rest=length-pos; |
| int32_t i, stringsLength=strings.size(); |
| for(;;) { |
| if(spanCondition==USET_SPAN_CONTAINED) { |
| for(i=0; i<stringsLength; ++i) { |
| int32_t overlap=spanLengths[i]; |
| if(overlap==ALL_CP_CONTAINED) { |
| continue; // Irrelevant string. (Also the empty string.) |
| } |
| const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); |
| const UChar *s16=string.getBuffer(); |
| int32_t length16=string.length(); |
| U_ASSERT(length>0); |
| |
| // Try to match this string at pos-overlap..pos. |
| if(overlap>=LONG_SPAN) { |
| overlap=length16; |
| // While contained: No point matching fully inside the code point span. |
| U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. |
| } |
| if(overlap>spanLength) { |
| overlap=spanLength; |
| } |
| int32_t inc=length16-overlap; // Keep overlap+inc==length16. |
| for(;;) { |
| if(inc>rest) { |
| break; |
| } |
| // Try to match if the increment is not listed already. |
| if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { |
| if(inc==rest) { |
| return length; // Reached the end of the string. |
| } |
| offsets.addOffset(inc); |
| } |
| if(overlap==0) { |
| break; |
| } |
| --overlap; |
| ++inc; |
| } |
| } |
| } else /* USET_SPAN_SIMPLE */ { |
| int32_t maxInc=0, maxOverlap=0; |
| for(i=0; i<stringsLength; ++i) { |
| int32_t overlap=spanLengths[i]; |
| // For longest match, we do need to try to match even an all-contained string |
| // to find the match from the earliest start. |
| |
| const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); |
| const UChar *s16=string.getBuffer(); |
| int32_t length16=string.length(); |
| if (length16==0) { |
| continue; // skip the empty string |
| } |
| |
| // Try to match this string at pos-overlap..pos. |
| if(overlap>=LONG_SPAN) { |
| overlap=length16; |
| // Longest match: Need to match fully inside the code point span |
| // to find the match from the earliest start. |
| } |
| if(overlap>spanLength) { |
| overlap=spanLength; |
| } |
| int32_t inc=length16-overlap; // Keep overlap+inc==length16. |
| for(;;) { |
| if(inc>rest || overlap<maxOverlap) { |
| break; |
| } |
| // Try to match if the string is longer or starts earlier. |
| if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && |
| matches16CPB(s, pos-overlap, length, s16, length16) |
| ) { |
| maxInc=inc; // Longest match from earliest start. |
| maxOverlap=overlap; |
| break; |
| } |
| --overlap; |
| ++inc; |
| } |
| } |
| |
| if(maxInc!=0 || maxOverlap!=0) { |
| // Longest-match algorithm, and there was a string match. |
| // Simply continue after it. |
| pos+=maxInc; |
| rest-=maxInc; |
| if(rest==0) { |
| return length; // Reached the end of the string. |
| } |
| spanLength=0; // Match strings from after a string match. |
| continue; |
| } |
| } |
| // Finished trying to match all strings at pos. |
| |
| if(spanLength!=0 || pos==0) { |
| // The position is after an unlimited code point span (spanLength!=0), |
| // not after a string match. |
| // The only position where spanLength==0 after a span is pos==0. |
| // Otherwise, an unlimited code point span is only tried again when no |
| // strings match, and if such a non-initial span fails we stop. |
| if(offsets.isEmpty()) { |
| return pos; // No strings matched after a span. |
| } |
| // Match strings from after the next string match. |
| } else { |
| // The position is after a string match (or a single code point). |
| if(offsets.isEmpty()) { |
| // No more strings matched after a previous string match. |
| // Try another code point span from after the last string match. |
| spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); |
| if( spanLength==rest || // Reached the end of the string, or |
| spanLength==0 // neither strings nor span progressed. |
| ) { |
| return pos+spanLength; |
| } |
| pos+=spanLength; |
| rest-=spanLength; |
| continue; // spanLength>0: Match strings from after a span. |
| } else { |
| // Try to match only one code point from after a string match if some |
| // string matched beyond it, so that we try all possible positions |
| // and don't overshoot. |
| spanLength=spanOne(spanSet, s+pos, rest); |
| if(spanLength>0) { |
| if(spanLength==rest) { |
| return length; // Reached the end of the string. |
| } |
| // Match strings after this code point. |
| // There cannot be any increments below it because UnicodeSet strings |
| // contain multiple code points. |
| pos+=spanLength; |
| rest-=spanLength; |
| offsets.shift(spanLength); |
| spanLength=0; |
| continue; // Match strings from after a single code point. |
| } |
| // Match strings from after the next string match. |
| } |
| } |
| int32_t minOffset=offsets.popMinimum(); |
| pos+=minOffset; |
| rest-=minOffset; |
| spanLength=0; // Match strings from after a string match. |
| } |
| } |
| |
| int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { |
| if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| return spanNotBack(s, length); |
| } |
| int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); |
| if(pos==0) { |
| return 0; |
| } |
| int32_t spanLength=length-pos; |
| |
| // Consider strings; they may overlap with the span. |
| OffsetList offsets; |
| if(spanCondition==USET_SPAN_CONTAINED) { |
| // Use offset list to try all possibilities. |
| offsets.setMaxLength(maxLength16); |
| } |
| int32_t i, stringsLength=strings.size(); |
| uint8_t *spanBackLengths=spanLengths; |
| if(all) { |
| spanBackLengths+=stringsLength; |
| } |
| for(;;) { |
| if(spanCondition==USET_SPAN_CONTAINED) { |
| for(i=0; i<stringsLength; ++i) { |
| int32_t overlap=spanBackLengths[i]; |
| if(overlap==ALL_CP_CONTAINED) { |
| continue; // Irrelevant string. (Also the empty string.) |
| } |
| const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); |
| const UChar *s16=string.getBuffer(); |
| int32_t length16=string.length(); |
| U_ASSERT(length>0); |
| |
| // Try to match this string at pos-(length16-overlap)..pos-length16. |
| if(overlap>=LONG_SPAN) { |
| overlap=length16; |
| // While contained: No point matching fully inside the code point span. |
| int32_t len1=0; |
| U16_FWD_1(s16, len1, overlap); |
| overlap-=len1; // Length of the string minus the first code point. |
| } |
| if(overlap>spanLength) { |
| overlap=spanLength; |
| } |
| int32_t dec=length16-overlap; // Keep dec+overlap==length16. |
| for(;;) { |
| if(dec>pos) { |
| break; |
| } |
| // Try to match if the decrement is not listed already. |
| if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { |
| if(dec==pos) { |
| return 0; // Reached the start of the string. |
| } |
| offsets.addOffset(dec); |
| } |
| if(overlap==0) { |
| break; |
| } |
| --overlap; |
| ++dec; |
| } |
| } |
| } else /* USET_SPAN_SIMPLE */ { |
| int32_t maxDec=0, maxOverlap=0; |
| for(i=0; i<stringsLength; ++i) { |
| int32_t overlap=spanBackLengths[i]; |
| // For longest match, we do need to try to match even an all-contained string |
| // to find the match from the latest end. |
| |
| const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); |
| const UChar *s16=string.getBuffer(); |
| int32_t length16=string.length(); |
| if (length16==0) { |
| continue; // skip the empty string |
| } |
| |
| // Try to match this string at pos-(length16-overlap)..pos-length16. |
| if(overlap>=LONG_SPAN) { |
| overlap=length16; |
| // Longest match: Need to match fully inside the code point span |
| // to find the match from the latest end. |
| } |
| if(overlap>spanLength) { |
| overlap=spanLength; |
| } |
| int32_t dec=length16-overlap; // Keep dec+overlap==length16. |
| for(;;) { |
| if(dec>pos || overlap<maxOverlap) { |
| break; |
| } |
| // Try to match if the string is longer or ends later. |
| if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && |
| matches16CPB(s, pos-dec, length, s16, length16) |
| ) { |
| maxDec=dec; // Longest match from latest end. |
| maxOverlap=overlap; |
| break; |
| } |
| --overlap; |
| ++dec; |
| } |
| } |
| |
| if(maxDec!=0 || maxOverlap!=0) { |
| // Longest-match algorithm, and there was a string match. |
| // Simply continue before it. |
| pos-=maxDec; |
| if(pos==0) { |
| return 0; // Reached the start of the string. |
| } |
| spanLength=0; // Match strings from before a string match. |
| continue; |
| } |
| } |
| // Finished trying to match all strings at pos. |
| |
| if(spanLength!=0 || pos==length) { |
| // The position is before an unlimited code point span (spanLength!=0), |
| // not before a string match. |
| // The only position where spanLength==0 before a span is pos==length. |
| // Otherwise, an unlimited code point span is only tried again when no |
| // strings match, and if such a non-initial span fails we stop. |
| if(offsets.isEmpty()) { |
| return pos; // No strings matched before a span. |
| } |
| // Match strings from before the next string match. |
| } else { |
| // The position is before a string match (or a single code point). |
| if(offsets.isEmpty()) { |
| // No more strings matched before a previous string match. |
| // Try another code point span from before the last string match. |
| int32_t oldPos=pos; |
| pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); |
| spanLength=oldPos-pos; |
| if( pos==0 || // Reached the start of the string, or |
| spanLength==0 // neither strings nor span progressed. |
| ) { |
| return pos; |
| } |
| continue; // spanLength>0: Match strings from before a span. |
| } else { |
| // Try to match only one code point from before a string match if some |
| // string matched beyond it, so that we try all possible positions |
| // and don't overshoot. |
| spanLength=spanOneBack(spanSet, s, pos); |
| if(spanLength>0) { |
| if(spanLength==pos) { |
| return 0; // Reached the start of the string. |
| } |
| // Match strings before this code point. |
| // There cannot be any decrements below it because UnicodeSet strings |
| // contain multiple code points. |
| pos-=spanLength; |
| offsets.shift(spanLength); |
| spanLength=0; |
| continue; // Match strings from before a single code point. |
| } |
| // Match strings from before the next string match. |
| } |
| } |
| pos-=offsets.popMinimum(); |
| spanLength=0; // Match strings from before a string match. |
| } |
| } |
| |
| int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { |
| if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| return spanNotUTF8(s, length); |
| } |
| int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); |
| if(spanLength==length) { |
| return length; |
| } |
| |
| // Consider strings; they may overlap with the span. |
| OffsetList offsets; |
| if(spanCondition==USET_SPAN_CONTAINED) { |
| // Use offset list to try all possibilities. |
| offsets.setMaxLength(maxLength8); |
| } |
| int32_t pos=spanLength, rest=length-pos; |
| int32_t i, stringsLength=strings.size(); |
| uint8_t *spanUTF8Lengths=spanLengths; |
| if(all) { |
| spanUTF8Lengths+=2*stringsLength; |
| } |
| for(;;) { |
| const uint8_t *s8=utf8; |
| int32_t length8; |
| if(spanCondition==USET_SPAN_CONTAINED) { |
| for(i=0; i<stringsLength; ++i) { |
| length8=utf8Lengths[i]; |
| if(length8==0) { |
| continue; // String not representable in UTF-8. |
| } |
| int32_t overlap=spanUTF8Lengths[i]; |
| if(overlap==ALL_CP_CONTAINED) { |
| s8+=length8; |
| continue; // Irrelevant string. |
| } |
| |
| // Try to match this string at pos-overlap..pos. |
| if(overlap>=LONG_SPAN) { |
| overlap=length8; |
| // While contained: No point matching fully inside the code point span. |
| U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. |
| } |
| if(overlap>spanLength) { |
| overlap=spanLength; |
| } |
| int32_t inc=length8-overlap; // Keep overlap+inc==length8. |
| for(;;) { |
| if(inc>rest) { |
| break; |
| } |
| // Try to match if the increment is not listed already. |
| // Match at code point boundaries. (The UTF-8 strings were converted |
| // from UTF-16 and are guaranteed to be well-formed.) |
| if(!U8_IS_TRAIL(s[pos-overlap]) && |
| !offsets.containsOffset(inc) && |
| matches8(s+pos-overlap, s8, length8)) { |
| if(inc==rest) { |
| return length; // Reached the end of the string. |
| } |
| offsets.addOffset(inc); |
| } |
| if(overlap==0) { |
| break; |
| } |
| --overlap; |
| ++inc; |
| } |
| s8+=length8; |
| } |
| } else /* USET_SPAN_SIMPLE */ { |
| int32_t maxInc=0, maxOverlap=0; |
| for(i=0; i<stringsLength; ++i) { |
| length8=utf8Lengths[i]; |
| if(length8==0) { |
| continue; // String not representable in UTF-8. |
| } |
| int32_t overlap=spanUTF8Lengths[i]; |
| // For longest match, we do need to try to match even an all-contained string |
| // to find the match from the earliest start. |
| |
| // Try to match this string at pos-overlap..pos. |
| if(overlap>=LONG_SPAN) { |
| overlap=length8; |
| // Longest match: Need to match fully inside the code point span |
| // to find the match from the earliest start. |
| } |
| if(overlap>spanLength) { |
| overlap=spanLength; |
| } |
| int32_t inc=length8-overlap; // Keep overlap+inc==length8. |
| for(;;) { |
| if(inc>rest || overlap<maxOverlap) { |
| break; |
| } |
| // Try to match if the string is longer or starts earlier. |
| // Match at code point boundaries. (The UTF-8 strings were converted |
| // from UTF-16 and are guaranteed to be well-formed.) |
| if(!U8_IS_TRAIL(s[pos-overlap]) && |
| (overlap>maxOverlap || |
| /* redundant overlap==maxOverlap && */ inc>maxInc) && |
| matches8(s+pos-overlap, s8, length8)) { |
| maxInc=inc; // Longest match from earliest start. |
| maxOverlap=overlap; |
| break; |
| } |
| --overlap; |
| ++inc; |
| } |
| s8+=length8; |
| } |
| |
| if(maxInc!=0 || maxOverlap!=0) { |
| // Longest-match algorithm, and there was a string match. |
| // Simply continue after it. |
| pos+=maxInc; |
| rest-=maxInc; |
| if(rest==0) { |
| return length; // Reached the end of the string. |
| } |
| spanLength=0; // Match strings from after a string match. |
| continue; |
| } |
| } |
| // Finished trying to match all strings at pos. |
| |
| if(spanLength!=0 || pos==0) { |
| // The position is after an unlimited code point span (spanLength!=0), |
| // not after a string match. |
| // The only position where spanLength==0 after a span is pos==0. |
| // Otherwise, an unlimited code point span is only tried again when no |
| // strings match, and if such a non-initial span fails we stop. |
| if(offsets.isEmpty()) { |
| return pos; // No strings matched after a span. |
| } |
| // Match strings from after the next string match. |
| } else { |
| // The position is after a string match (or a single code point). |
| if(offsets.isEmpty()) { |
| // No more strings matched after a previous string match. |
| // Try another code point span from after the last string match. |
| spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); |
| if( spanLength==rest || // Reached the end of the string, or |
| spanLength==0 // neither strings nor span progressed. |
| ) { |
| return pos+spanLength; |
| } |
| pos+=spanLength; |
| rest-=spanLength; |
| continue; // spanLength>0: Match strings from after a span. |
| } else { |
| // Try to match only one code point from after a string match if some |
| // string matched beyond it, so that we try all possible positions |
| // and don't overshoot. |
| spanLength=spanOneUTF8(spanSet, s+pos, rest); |
| if(spanLength>0) { |
| if(spanLength==rest) { |
| return length; // Reached the end of the string. |
| } |
| // Match strings after this code point. |
| // There cannot be any increments below it because UnicodeSet strings |
| // contain multiple code points. |
| pos+=spanLength; |
| rest-=spanLength; |
| offsets.shift(spanLength); |
| spanLength=0; |
| continue; // Match strings from after a single code point. |
| } |
| // Match strings from after the next string match. |
| } |
| } |
| int32_t minOffset=offsets.popMinimum(); |
| pos+=minOffset; |
| rest-=minOffset; |
| spanLength=0; // Match strings from after a string match. |
| } |
| } |
| |
| int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { |
| if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| return spanNotBackUTF8(s, length); |
| } |
| int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); |
| if(pos==0) { |
| return 0; |
| } |
| int32_t spanLength=length-pos; |
| |
| // Consider strings; they may overlap with the span. |
| OffsetList offsets; |
| if(spanCondition==USET_SPAN_CONTAINED) { |
| // Use offset list to try all possibilities. |
| offsets.setMaxLength(maxLength8); |
| } |
| int32_t i, stringsLength=strings.size(); |
| uint8_t *spanBackUTF8Lengths=spanLengths; |
| if(all) { |
| spanBackUTF8Lengths+=3*stringsLength; |
| } |
| for(;;) { |
| const uint8_t *s8=utf8; |
| int32_t length8; |
| if(spanCondition==USET_SPAN_CONTAINED) { |
| for(i=0; i<stringsLength; ++i) { |
| length8=utf8Lengths[i]; |
| if(length8==0) { |
| continue; // String not representable in UTF-8. |
| } |
| int32_t overlap=spanBackUTF8Lengths[i]; |
| if(overlap==ALL_CP_CONTAINED) { |
| s8+=length8; |
| continue; // Irrelevant string. |
| } |
| |
| // Try to match this string at pos-(length8-overlap)..pos-length8. |
| if(overlap>=LONG_SPAN) { |
| overlap=length8; |
| // While contained: No point matching fully inside the code point span. |
| int32_t len1=0; |
| U8_FWD_1(s8, len1, overlap); |
| overlap-=len1; // Length of the string minus the first code point. |
| } |
| if(overlap>spanLength) { |
| overlap=spanLength; |
| } |
| int32_t dec=length8-overlap; // Keep dec+overlap==length8. |
| for(;;) { |
| if(dec>pos) { |
| break; |
| } |
| // Try to match if the decrement is not listed already. |
| // Match at code point boundaries. (The UTF-8 strings were converted |
| // from UTF-16 and are guaranteed to be well-formed.) |
| if( !U8_IS_TRAIL(s[pos-dec]) && |
| !offsets.containsOffset(dec) && |
| matches8(s+pos-dec, s8, length8) |
| ) { |
| if(dec==pos) { |
| return 0; // Reached the start of the string. |
| } |
| offsets.addOffset(dec); |
| } |
| if(overlap==0) { |
| break; |
| } |
| --overlap; |
| ++dec; |
| } |
| s8+=length8; |
| } |
| } else /* USET_SPAN_SIMPLE */ { |
| int32_t maxDec=0, maxOverlap=0; |
| for(i=0; i<stringsLength; ++i) { |
| length8=utf8Lengths[i]; |
| if(length8==0) { |
| continue; // String not representable in UTF-8. |
| } |
| int32_t overlap=spanBackUTF8Lengths[i]; |
| // For longest match, we do need to try to match even an all-contained string |
| // to find the match from the latest end. |
| |
| // Try to match this string at pos-(length8-overlap)..pos-length8. |
| if(overlap>=LONG_SPAN) { |
| overlap=length8; |
| // Longest match: Need to match fully inside the code point span |
| // to find the match from the latest end. |
| } |
| if(overlap>spanLength) { |
| overlap=spanLength; |
| } |
| int32_t dec=length8-overlap; // Keep dec+overlap==length8. |
| for(;;) { |
| if(dec>pos || overlap<maxOverlap) { |
| break; |
| } |
| // Try to match if the string is longer or ends later. |
| // Match at code point boundaries. (The UTF-8 strings were converted |
| // from UTF-16 and are guaranteed to be well-formed.) |
| if( !U8_IS_TRAIL(s[pos-dec]) && |
| (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && |
| matches8(s+pos-dec, s8, length8) |
| ) { |
| maxDec=dec; // Longest match from latest end. |
| maxOverlap=overlap; |
| break; |
| } |
| --overlap; |
| ++dec; |
| } |
| s8+=length8; |
| } |
| |
| if(maxDec!=0 || maxOverlap!=0) { |
| // Longest-match algorithm, and there was a string match. |
| // Simply continue before it. |
| pos-=maxDec; |
| if(pos==0) { |
| return 0; // Reached the start of the string. |
| } |
| spanLength=0; // Match strings from before a string match. |
| continue; |
| } |
| } |
| // Finished trying to match all strings at pos. |
| |
| if(spanLength!=0 || pos==length) { |
| // The position is before an unlimited code point span (spanLength!=0), |
| // not before a string match. |
| // The only position where spanLength==0 before a span is pos==length. |
| // Otherwise, an unlimited code point span is only tried again when no |
| // strings match, and if such a non-initial span fails we stop. |
| if(offsets.isEmpty()) { |
| return pos; // No strings matched before a span. |
| } |
| // Match strings from before the next string match. |
| } else { |
| // The position is before a string match (or a single code point). |
| if(offsets.isEmpty()) { |
| // No more strings matched before a previous string match. |
| // Try another code point span from before the last string match. |
| int32_t oldPos=pos; |
| pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); |
| spanLength=oldPos-pos; |
| if( pos==0 || // Reached the start of the string, or |
| spanLength==0 // neither strings nor span progressed. |
| ) { |
| return pos; |
| } |
| continue; // spanLength>0: Match strings from before a span. |
| } else { |
| // Try to match only one code point from before a string match if some |
| // string matched beyond it, so that we try all possible positions |
| // and don't overshoot. |
| spanLength=spanOneBackUTF8(spanSet, s, pos); |
| if(spanLength>0) { |
| if(spanLength==pos) { |
| return 0; // Reached the start of the string. |
| } |
| // Match strings before this code point. |
| // There cannot be any decrements below it because UnicodeSet strings |
| // contain multiple code points. |
| pos-=spanLength; |
| offsets.shift(spanLength); |
| spanLength=0; |
| continue; // Match strings from before a single code point. |
| } |
| // Match strings from before the next string match. |
| } |
| } |
| pos-=offsets.popMinimum(); |
| spanLength=0; // Match strings from before a string match. |
| } |
| } |
| |
| /* |
| * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) |
| * |
| * Theoretical algorithm: |
| * - Iterate through the string, and at each code point boundary: |
| * + If the code point there is in the set, then return with the current position. |
| * + If a set string matches at the current position, then return with the current position. |
| * |
| * Optimized implementation: |
| * |
| * (Same assumption as for span() above.) |
| * |
| * Create and cache a spanNotSet which contains all of the single code points |
| * of the original set but none of its strings. |
| * For each set string add its initial code point to the spanNotSet. |
| * (Also add its final code point for spanNotBack().) |
| * |
| * - Loop: |
| * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). |
| * + If the current code point is in the original set, then |
| * return the current position. |
| * + If any set string matches at the current position, then |
| * return the current position. |
| * + If there is no match at the current position, neither for the code point there |
| * nor for any set string, then skip this code point and continue the loop. |
| * This happens for set-string-initial code points that were added to spanNotSet |
| * when there is not actually a match for such a set string. |
| */ |
| |
| int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { |
| int32_t pos=0, rest=length; |
| int32_t i, stringsLength=strings.size(); |
| do { |
| // Span until we find a code point from the set, |
| // or a code point that starts or ends some string. |
| i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); |
| if(i==rest) { |
| return length; // Reached the end of the string. |
| } |
| pos+=i; |
| rest-=i; |
| |
| // Check whether the current code point is in the original set, |
| // without the string starts and ends. |
| int32_t cpLength=spanOne(spanSet, s+pos, rest); |
| if(cpLength>0) { |
| return pos; // There is a set element at pos. |
| } |
| |
| // Try to match the strings at pos. |
| for(i=0; i<stringsLength; ++i) { |
| if(spanLengths[i]==ALL_CP_CONTAINED) { |
| continue; // Irrelevant string. (Also the empty string.) |
| } |
| const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); |
| const UChar *s16=string.getBuffer(); |
| int32_t length16=string.length(); |
| U_ASSERT(length>0); |
| if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { |
| return pos; // There is a set element at pos. |
| } |
| } |
| |
| // The span(while not contained) ended on a string start/end which is |
| // not in the original set. Skip this code point and continue. |
| // cpLength<0 |
| pos-=cpLength; |
| rest+=cpLength; |
| } while(rest!=0); |
| return length; // Reached the end of the string. |
| } |
| |
| int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { |
| int32_t pos=length; |
| int32_t i, stringsLength=strings.size(); |
| do { |
| // Span until we find a code point from the set, |
| // or a code point that starts or ends some string. |
| pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); |
| if(pos==0) { |
| return 0; // Reached the start of the string. |
| } |
| |
| // Check whether the current code point is in the original set, |
| // without the string starts and ends. |
| int32_t cpLength=spanOneBack(spanSet, s, pos); |
| if(cpLength>0) { |
| return pos; // There is a set element at pos. |
| } |
| |
| // Try to match the strings at pos. |
| for(i=0; i<stringsLength; ++i) { |
| // Use spanLengths rather than a spanBackLengths pointer because |
| // it is easier and we only need to know whether the string is irrelevant |
| // which is the same in either array. |
| if(spanLengths[i]==ALL_CP_CONTAINED) { |
| continue; // Irrelevant string. (Also the empty string.) |
| } |
| const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); |
| const UChar *s16=string.getBuffer(); |
| int32_t length16=string.length(); |
| U_ASSERT(length>0); |
| if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { |
| return pos; // There is a set element at pos. |
| } |
| } |
| |
| // The span(while not contained) ended on a string start/end which is |
| // not in the original set. Skip this code point and continue. |
| // cpLength<0 |
| pos+=cpLength; |
| } while(pos!=0); |
| return 0; // Reached the start of the string. |
| } |
| |
| int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { |
| int32_t pos=0, rest=length; |
| int32_t i, stringsLength=strings.size(); |
| uint8_t *spanUTF8Lengths=spanLengths; |
| if(all) { |
| spanUTF8Lengths+=2*stringsLength; |
| } |
| do { |
| // Span until we find a code point from the set, |
| // or a code point that starts or ends some string. |
| i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); |
| if(i==rest) { |
| return length; // Reached the end of the string. |
| } |
| pos+=i; |
| rest-=i; |
| |
| // Check whether the current code point is in the original set, |
| // without the string starts and ends. |
| int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); |
| if(cpLength>0) { |
| return pos; // There is a set element at pos. |
| } |
| |
| // Try to match the strings at pos. |
| const uint8_t *s8=utf8; |
| int32_t length8; |
| for(i=0; i<stringsLength; ++i) { |
| length8=utf8Lengths[i]; |
| // ALL_CP_CONTAINED: Irrelevant string. |
| if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { |
| return pos; // There is a set element at pos. |
| } |
| s8+=length8; |
| } |
| |
| // The span(while not contained) ended on a string start/end which is |
| // not in the original set. Skip this code point and continue. |
| // cpLength<0 |
| pos-=cpLength; |
| rest+=cpLength; |
| } while(rest!=0); |
| return length; // Reached the end of the string. |
| } |
| |
| int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { |
| int32_t pos=length; |
| int32_t i, stringsLength=strings.size(); |
| uint8_t *spanBackUTF8Lengths=spanLengths; |
| if(all) { |
| spanBackUTF8Lengths+=3*stringsLength; |
| } |
| do { |
| // Span until we find a code point from the set, |
| // or a code point that starts or ends some string. |
| pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); |
| if(pos==0) { |
| return 0; // Reached the start of the string. |
| } |
| |
| // Check whether the current code point is in the original set, |
| // without the string starts and ends. |
| int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); |
| if(cpLength>0) { |
| return pos; // There is a set element at pos. |
| } |
| |
| // Try to match the strings at pos. |
| const uint8_t *s8=utf8; |
| int32_t length8; |
| for(i=0; i<stringsLength; ++i) { |
| length8=utf8Lengths[i]; |
| // ALL_CP_CONTAINED: Irrelevant string. |
| if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { |
| return pos; // There is a set element at pos. |
| } |
| s8+=length8; |
| } |
| |
| // The span(while not contained) ended on a string start/end which is |
| // not in the original set. Skip this code point and continue. |
| // cpLength<0 |
| pos+=cpLength; |
| } while(pos!=0); |
| return 0; // Reached the start of the string. |
| } |
| |
| U_NAMESPACE_END |