| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ************************************************************************** |
| * Copyright (C) 2002-2016 International Business Machines Corporation |
| * and others. All rights reserved. |
| ************************************************************************** |
| */ |
| // |
| // file: rematch.cpp |
| // |
| // Contains the implementation of class RegexMatcher, |
| // which is one of the main API classes for the ICU regular expression package. |
| // |
| |
| #include "unicode/utypes.h" |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| |
| #include "unicode/regex.h" |
| #include "unicode/uniset.h" |
| #include "unicode/uchar.h" |
| #include "unicode/ustring.h" |
| #include "unicode/rbbi.h" |
| #include "unicode/utf.h" |
| #include "unicode/utf16.h" |
| #include "uassert.h" |
| #include "cmemory.h" |
| #include "cstr.h" |
| #include "uvector.h" |
| #include "uvectr32.h" |
| #include "uvectr64.h" |
| #include "regeximp.h" |
| #include "regexst.h" |
| #include "regextxt.h" |
| #include "ucase.h" |
| |
| // #include <malloc.h> // Needed for heapcheck testing |
| |
| |
| U_NAMESPACE_BEGIN |
| |
// Default limit for the size of the backtrack stack, to avoid system
// failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
// This value puts ICU's limits higher than most other regexp implementations,
// which use recursion rather than the heap, and take more storage per
// backtrack point.
//
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;

// Time limit counter constant.
// Time limits for expression evaluation are in terms of quanta of work by
// the engine, each of which is 10,000 state saves.
// This constant determines the number of state saves per tick.
static const int32_t TIMER_INITIAL_VALUE = 10000;
| |
| |
| // Test for any of the Unicode line terminating characters. |
| static inline UBool isLineTerminator(UChar32 c) { |
| if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { |
| return false; |
| } |
| return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029; |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // Constructor and Destructor |
| // |
| //----------------------------------------------------------------------------- |
| RegexMatcher::RegexMatcher(const RegexPattern *pat) { |
| fDeferredStatus = U_ZERO_ERROR; |
| init(fDeferredStatus); |
| if (U_FAILURE(fDeferredStatus)) { |
| return; |
| } |
| if (pat==NULL) { |
| fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| fPattern = pat; |
| init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); |
| } |
| |
| |
| |
| RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, |
| uint32_t flags, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| UParseError pe; |
| fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| fPattern = fPatternOwned; |
| |
| UText inputText = UTEXT_INITIALIZER; |
| utext_openConstUnicodeString(&inputText, &input, &status); |
| init2(&inputText, status); |
| utext_close(&inputText); |
| |
| fInputUniStrMaybeMutable = TRUE; |
| } |
| |
| |
| RegexMatcher::RegexMatcher(UText *regexp, UText *input, |
| uint32_t flags, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| UParseError pe; |
| fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| fPattern = fPatternOwned; |
| init2(input, status); |
| } |
| |
| |
| RegexMatcher::RegexMatcher(const UnicodeString ®exp, |
| uint32_t flags, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| UParseError pe; |
| fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fPattern = fPatternOwned; |
| init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
| } |
| |
| RegexMatcher::RegexMatcher(UText *regexp, |
| uint32_t flags, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| UParseError pe; |
| fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| fPattern = fPatternOwned; |
| init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
| } |
| |
| |
| |
| |
| RegexMatcher::~RegexMatcher() { |
| delete fStack; |
| if (fData != fSmallData) { |
| uprv_free(fData); |
| fData = NULL; |
| } |
| if (fPatternOwned) { |
| delete fPatternOwned; |
| fPatternOwned = NULL; |
| fPattern = NULL; |
| } |
| |
| if (fInput) { |
| delete fInput; |
| } |
| if (fInputText) { |
| utext_close(fInputText); |
| } |
| if (fAltInputText) { |
| utext_close(fAltInputText); |
| } |
| |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| delete fWordBreakItr; |
| delete fGCBreakItr; |
| #endif |
| } |
| |
| // |
| // init() common initialization for use by all constructors. |
| // Initialize all fields, get the object into a consistent state. |
| // This must be done even when the initial status shows an error, |
| // so that the object is initialized sufficiently well for the destructor |
| // to run safely. |
| // |
| void RegexMatcher::init(UErrorCode &status) { |
| fPattern = NULL; |
| fPatternOwned = NULL; |
| fFrameSize = 0; |
| fRegionStart = 0; |
| fRegionLimit = 0; |
| fAnchorStart = 0; |
| fAnchorLimit = 0; |
| fLookStart = 0; |
| fLookLimit = 0; |
| fActiveStart = 0; |
| fActiveLimit = 0; |
| fTransparentBounds = FALSE; |
| fAnchoringBounds = TRUE; |
| fMatch = FALSE; |
| fMatchStart = 0; |
| fMatchEnd = 0; |
| fLastMatchEnd = -1; |
| fAppendPosition = 0; |
| fHitEnd = FALSE; |
| fRequireEnd = FALSE; |
| fStack = NULL; |
| fFrame = NULL; |
| fTimeLimit = 0; |
| fTime = 0; |
| fTickCounter = 0; |
| fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; |
| fCallbackFn = NULL; |
| fCallbackContext = NULL; |
| fFindProgressCallbackFn = NULL; |
| fFindProgressCallbackContext = NULL; |
| fTraceDebug = FALSE; |
| fDeferredStatus = status; |
| fData = fSmallData; |
| fWordBreakItr = NULL; |
| fGCBreakItr = NULL; |
| |
| fStack = NULL; |
| fInputText = NULL; |
| fAltInputText = NULL; |
| fInput = NULL; |
| fInputLength = 0; |
| fInputUniStrMaybeMutable = FALSE; |
| } |
| |
| // |
| // init2() Common initialization for use by RegexMatcher constructors, part 2. |
| // This handles the common setup to be done after the Pattern is available. |
| // |
| void RegexMatcher::init2(UText *input, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| fDeferredStatus = status; |
| return; |
| } |
| |
| if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) { |
| fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); |
| if (fData == NULL) { |
| status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| } |
| |
| fStack = new UVector64(status); |
| if (fStack == NULL) { |
| status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| reset(input); |
| setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); |
| if (U_FAILURE(status)) { |
| fDeferredStatus = status; |
| return; |
| } |
| } |
| |
| |
// UTF-16 code units that are significant when scanning replacement text.
// Note that the two "BRACKET" constants are the curly braces of ${name}.
static const UChar BACKSLASH = 0x5c;      // U+005C REVERSE SOLIDUS
static const UChar DOLLARSIGN = 0x24;     // U+0024 DOLLAR SIGN
static const UChar LEFTBRACKET = 0x7b;    // U+007B LEFT CURLY BRACKET
static const UChar RIGHTBRACKET = 0x7d;   // U+007D RIGHT CURLY BRACKET
| |
| //-------------------------------------------------------------------------------- |
| // |
| // appendReplacement |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, |
| const UnicodeString &replacement, |
| UErrorCode &status) { |
| UText replacementText = UTEXT_INITIALIZER; |
| |
| utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| if (U_SUCCESS(status)) { |
| UText resultText = UTEXT_INITIALIZER; |
| utext_openUnicodeString(&resultText, &dest, &status); |
| |
| if (U_SUCCESS(status)) { |
| appendReplacement(&resultText, &replacementText, status); |
| utext_close(&resultText); |
| } |
| utext_close(&replacementText); |
| } |
| |
| return *this; |
| } |
| |
| // |
| // appendReplacement, UText mode |
| // |
| RegexMatcher &RegexMatcher::appendReplacement(UText *dest, |
| UText *replacement, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return *this; |
| } |
| if (fMatch == FALSE) { |
| status = U_REGEX_INVALID_STATE; |
| return *this; |
| } |
| |
| // Copy input string from the end of previous match to start of current match |
| int64_t destLen = utext_nativeLength(dest); |
| if (fMatchStart > fAppendPosition) { |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, |
| (int32_t)(fMatchStart-fAppendPosition), &status); |
| } else { |
| int32_t len16; |
| if (UTEXT_USES_U16(fInputText)) { |
| len16 = (int32_t)(fMatchStart-fAppendPosition); |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); |
| } |
| UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| if (inputChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return *this; |
| } |
| utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); |
| destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); |
| uprv_free(inputChars); |
| } |
| } |
| fAppendPosition = fMatchEnd; |
| |
| |
| // scan the replacement text, looking for substitutions ($n) and \escapes. |
| // TODO: optimize this loop by efficiently scanning for '$' or '\', |
| // move entire ranges not containing substitutions. |
| UTEXT_SETNATIVEINDEX(replacement, 0); |
| for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) { |
| if (c == BACKSLASH) { |
| // Backslash Escape. Copy the following char out without further checks. |
| // Note: Surrogate pairs don't need any special handling |
| // The second half wont be a '$' or a '\', and |
| // will move to the dest normally on the next |
| // loop iteration. |
| c = UTEXT_CURRENT32(replacement); |
| if (c == U_SENTINEL) { |
| break; |
| } |
| |
| if (c==0x55/*U*/ || c==0x75/*u*/) { |
| // We have a \udddd or \Udddddddd escape sequence. |
| int32_t offset = 0; |
| struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); |
| UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); |
| if (escapedChar != (UChar32)0xFFFFFFFF) { |
| if (U_IS_BMP(escapedChar)) { |
| UChar c16 = (UChar)escapedChar; |
| destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); |
| } else { |
| UChar surrogate[2]; |
| surrogate[0] = U16_LEAD(escapedChar); |
| surrogate[1] = U16_TRAIL(escapedChar); |
| if (U_SUCCESS(status)) { |
| destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); |
| } |
| } |
| // TODO: Report errors for mal-formed \u escapes? |
| // As this is, the original sequence is output, which may be OK. |
| if (context.lastOffset == offset) { |
| (void)UTEXT_PREVIOUS32(replacement); |
| } else if (context.lastOffset != offset-1) { |
| utext_moveIndex32(replacement, offset - context.lastOffset - 1); |
| } |
| } |
| } else { |
| (void)UTEXT_NEXT32(replacement); |
| // Plain backslash escape. Just put out the escaped character. |
| if (U_IS_BMP(c)) { |
| UChar c16 = (UChar)c; |
| destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); |
| } else { |
| UChar surrogate[2]; |
| surrogate[0] = U16_LEAD(c); |
| surrogate[1] = U16_TRAIL(c); |
| if (U_SUCCESS(status)) { |
| destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); |
| } |
| } |
| } |
| } else if (c != DOLLARSIGN) { |
| // Normal char, not a $. Copy it out without further checks. |
| if (U_IS_BMP(c)) { |
| UChar c16 = (UChar)c; |
| destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); |
| } else { |
| UChar surrogate[2]; |
| surrogate[0] = U16_LEAD(c); |
| surrogate[1] = U16_TRAIL(c); |
| if (U_SUCCESS(status)) { |
| destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); |
| } |
| } |
| } else { |
| // We've got a $. Pick up a capture group name or number if one follows. |
| // Consume digits so long as the resulting group number <= the number of |
| // number of capture groups in the pattern. |
| |
| int32_t groupNum = 0; |
| int32_t numDigits = 0; |
| UChar32 nextChar = utext_current32(replacement); |
| if (nextChar == LEFTBRACKET) { |
| // Scan for a Named Capture Group, ${name}. |
| UnicodeString groupName; |
| utext_next32(replacement); |
| while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) { |
| nextChar = utext_next32(replacement); |
| if (nextChar == U_SENTINEL) { |
| status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
| } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z |
| (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z |
| (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9 |
| groupName.append(nextChar); |
| } else if (nextChar == RIGHTBRACKET) { |
| groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0; |
| if (groupNum == 0) { |
| status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
| } |
| } else { |
| // Character was something other than a name char or a closing '}' |
| status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
| } |
| } |
| |
| } else if (u_isdigit(nextChar)) { |
| // $n Scan for a capture group number |
| int32_t numCaptureGroups = fPattern->fGroupMap->size(); |
| for (;;) { |
| nextChar = UTEXT_CURRENT32(replacement); |
| if (nextChar == U_SENTINEL) { |
| break; |
| } |
| if (u_isdigit(nextChar) == FALSE) { |
| break; |
| } |
| int32_t nextDigitVal = u_charDigitValue(nextChar); |
| if (groupNum*10 + nextDigitVal > numCaptureGroups) { |
| // Don't consume the next digit if it makes the capture group number too big. |
| if (numDigits == 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| } |
| break; |
| } |
| (void)UTEXT_NEXT32(replacement); |
| groupNum=groupNum*10 + nextDigitVal; |
| ++numDigits; |
| } |
| } else { |
| // $ not followed by capture group name or number. |
| status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
| } |
| |
| if (U_SUCCESS(status)) { |
| destLen += appendGroup(groupNum, dest, status); |
| } |
| } // End of $ capture group handling |
| } // End of per-character loop through the replacement string. |
| |
| return *this; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // appendTail Intended to be used in conjunction with appendReplacement() |
| // To the destination string, append everything following |
| // the last match position from the input string. |
| // |
| // Note: Match ranges do not affect appendTail or appendReplacement |
| // |
| //-------------------------------------------------------------------------------- |
| UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { |
| UErrorCode status = U_ZERO_ERROR; |
| UText resultText = UTEXT_INITIALIZER; |
| utext_openUnicodeString(&resultText, &dest, &status); |
| |
| if (U_SUCCESS(status)) { |
| appendTail(&resultText, status); |
| utext_close(&resultText); |
| } |
| |
| return dest; |
| } |
| |
| // |
| // appendTail, UText mode |
| // |
| UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return dest; |
| } |
| |
| if (fInputLength > fAppendPosition) { |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| int64_t destLen = utext_nativeLength(dest); |
| utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, |
| (int32_t)(fInputLength-fAppendPosition), &status); |
| } else { |
| int32_t len16; |
| if (UTEXT_USES_U16(fInputText)) { |
| len16 = (int32_t)(fInputLength-fAppendPosition); |
| } else { |
| len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); |
| status = U_ZERO_ERROR; // buffer overflow |
| } |
| |
| UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); |
| if (inputChars == NULL) { |
| fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| } else { |
| utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated |
| int64_t destLen = utext_nativeLength(dest); |
| utext_replace(dest, destLen, destLen, inputChars, len16, &status); |
| uprv_free(inputChars); |
| } |
| } |
| } |
| return dest; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // end |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::end(UErrorCode &err) const { |
| return end(0, err); |
| } |
| |
| int64_t RegexMatcher::end64(UErrorCode &err) const { |
| return end64(0, err); |
| } |
| |
| int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { |
| if (U_FAILURE(err)) { |
| return -1; |
| } |
| if (fMatch == FALSE) { |
| err = U_REGEX_INVALID_STATE; |
| return -1; |
| } |
| if (group < 0 || group > fPattern->fGroupMap->size()) { |
| err = U_INDEX_OUTOFBOUNDS_ERROR; |
| return -1; |
| } |
| int64_t e = -1; |
| if (group == 0) { |
| e = fMatchEnd; |
| } else { |
| // Get the position within the stack frame of the variables for |
| // this capture group. |
| int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
| U_ASSERT(groupOffset < fPattern->fFrameSize); |
| U_ASSERT(groupOffset >= 0); |
| e = fFrame->fExtra[groupOffset + 1]; |
| } |
| |
| return e; |
| } |
| |
| int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { |
| return (int32_t)end64(group, err); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // findProgressInterrupt This function is called once for each advance in the target |
| // string from the find() function, and calls the user progress callback |
| // function if there is one installed. |
| // |
| // Return: TRUE if the find operation is to be terminated. |
| // FALSE if the find operation is to continue running. |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) { |
| if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) { |
| status = U_REGEX_STOPPED_BY_CALLER; |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // find() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::find() { |
| if (U_FAILURE(fDeferredStatus)) { |
| return FALSE; |
| } |
| UErrorCode status = U_ZERO_ERROR; |
| UBool result = find(status); |
| return result; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // find() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::find(UErrorCode &status) { |
| // Start at the position of the last match end. (Will be zero if the |
| // matcher has been reset.) |
| // |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| return findUsingChunk(status); |
| } |
| |
| int64_t startPos = fMatchEnd; |
| if (startPos==0) { |
| startPos = fActiveStart; |
| } |
| |
| if (fMatch) { |
| // Save the position of any previous successful match. |
| fLastMatchEnd = fMatchEnd; |
| |
| if (fMatchStart == fMatchEnd) { |
| // Previous match had zero length. Move start position up one position |
| // to avoid sending find() into a loop on zero-length matches. |
| if (startPos >= fActiveLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| (void)UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| } else { |
| if (fLastMatchEnd >= 0) { |
| // A previous find() failed to match. Don't try again. |
| // (without this test, a pattern with a zero-length match |
| // could match again at the end of an input string.) |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| } |
| |
| |
| // Compute the position in the input string beyond which a match can not begin, because |
| // the minimum length match would extend past the end of the input. |
| // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. |
| // Be aware of possible overflows if making changes here. |
| int64_t testStartLimit; |
| if (UTEXT_USES_U16(fInputText)) { |
| testStartLimit = fActiveLimit - fPattern->fMinMatchLen; |
| if (startPos > testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| } else { |
| // We don't know exactly how long the minimum match length is in native characters. |
| // Treat anything > 0 as 1. |
| testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0); |
| } |
| |
| UChar32 c; |
| U_ASSERT(startPos >= 0); |
| |
| switch (fPattern->fStartType) { |
| case START_NO_INFO: |
| // No optimization was found. |
| // Try a match at each input position. |
| for (;;) { |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| if (startPos >= testStartLimit) { |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| (void)UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testStartLimit the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| UPRV_UNREACHABLE; |
| |
| case START_START: |
| // Matches are only possible at the start of the input string |
| // (pattern begins with ^ or \A) |
| if (startPos > fActiveStart) { |
| fMatch = FALSE; |
| return FALSE; |
| } |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| return fMatch; |
| |
| |
| case START_SET: |
| { |
| // Match may start on any char from a pre-computed set. |
| U_ASSERT(fPattern->fMinMatchLen > 0); |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| for (;;) { |
| int64_t pos = startPos; |
| c = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| // c will be -1 (U_SENTINEL) at end of text, in which case we |
| // skip this next block (so we don't have a negative array index) |
| // and handle end of text in the following block. |
| if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || |
| (c>=256 && fPattern->fInitialChars->contains(c)))) { |
| MatchAt(pos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, pos); |
| } |
| if (startPos > testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| UPRV_UNREACHABLE; |
| |
| case START_STRING: |
| case START_CHAR: |
| { |
| // Match starts on exactly one char. |
| U_ASSERT(fPattern->fMinMatchLen > 0); |
| UChar32 theChar = fPattern->fInitialChar; |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| for (;;) { |
| int64_t pos = startPos; |
| c = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| if (c == theChar) { |
| MatchAt(pos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| } |
| if (startPos > testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| UPRV_UNREACHABLE; |
| |
| case START_LINE: |
| { |
| UChar32 ch; |
| if (startPos == fAnchorStart) { |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| ch = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| ch = UTEXT_PREVIOUS32(fInputText); |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| } |
| |
| if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
| for (;;) { |
| if (ch == 0x0a) { |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| } |
| if (startPos >= testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| ch = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testStartLimit the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } else { |
| for (;;) { |
| if (isLineTerminator(ch)) { |
| if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { |
| (void)UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| } |
| if (startPos >= testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| ch = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testStartLimit the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| } |
| |
| default: |
| UPRV_UNREACHABLE; |
| } |
| |
| UPRV_UNREACHABLE; |
| } |
| |
| |
| |
| UBool RegexMatcher::find(int64_t start, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| this->reset(); // Note: Reset() is specified by Java Matcher documentation. |
| // This will reset the region to be the full input length. |
| if (start < 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| int64_t nativeStart = start; |
| if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| fMatchEnd = nativeStart; |
| return find(status); |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // findUsingChunk() -- like find(), but with the advance knowledge that the |
| // entire string is available in the UText's chunk buffer. |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::findUsingChunk(UErrorCode &status) { |
| // Start at the position of the last match end. (Will be zero if the |
| // matcher has been reset. |
| // |
| |
| int32_t startPos = (int32_t)fMatchEnd; |
| if (startPos==0) { |
| startPos = (int32_t)fActiveStart; |
| } |
| |
| const UChar *inputBuf = fInputText->chunkContents; |
| |
| if (fMatch) { |
| // Save the position of any previous successful match. |
| fLastMatchEnd = fMatchEnd; |
| |
| if (fMatchStart == fMatchEnd) { |
| // Previous match had zero length. Move start position up one position |
| // to avoid sending find() into a loop on zero-length matches. |
| if (startPos >= fActiveLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| U16_FWD_1(inputBuf, startPos, fInputLength); |
| } |
| } else { |
| if (fLastMatchEnd >= 0) { |
| // A previous find() failed to match. Don't try again. |
| // (without this test, a pattern with a zero-length match |
| // could match again at the end of an input string.) |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| } |
| |
| |
| // Compute the position in the input string beyond which a match can not begin, because |
| // the minimum length match would extend past the end of the input. |
| // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. |
| // Be aware of possible overflows if making changes here. |
| // Note: a match can begin at inputBuf + testLen; it is an inclusive limit. |
| int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); |
| if (startPos > testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| |
| UChar32 c; |
| U_ASSERT(startPos >= 0); |
| |
| switch (fPattern->fStartType) { |
| case START_NO_INFO: |
| // No optimization was found. |
| // Try a match at each input position. |
| for (;;) { |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| if (startPos >= testLen) { |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testLen the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| UPRV_UNREACHABLE; |
| |
| case START_START: |
| // Matches are only possible at the start of the input string |
| // (pattern begins with ^ or \A) |
| if (startPos > fActiveStart) { |
| fMatch = FALSE; |
| return FALSE; |
| } |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| return fMatch; |
| |
| |
| case START_SET: |
| { |
| // Match may start on any char from a pre-computed set. |
| U_ASSERT(fPattern->fMinMatchLen > 0); |
| for (;;) { |
| int32_t pos = startPos; |
| U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; |
| if ((c<256 && fPattern->fInitialChars8->contains(c)) || |
| (c>=256 && fPattern->fInitialChars->contains(c))) { |
| MatchChunkAt(pos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| } |
| if (startPos > testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| UPRV_UNREACHABLE; |
| |
| case START_STRING: |
| case START_CHAR: |
| { |
| // Match starts on exactly one char. |
| U_ASSERT(fPattern->fMinMatchLen > 0); |
| UChar32 theChar = fPattern->fInitialChar; |
| for (;;) { |
| int32_t pos = startPos; |
| U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; |
| if (c == theChar) { |
| MatchChunkAt(pos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| } |
| if (startPos > testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| UPRV_UNREACHABLE; |
| |
| case START_LINE: |
| { |
| UChar32 ch; |
| if (startPos == fAnchorStart) { |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| } |
| |
| if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
| for (;;) { |
| ch = inputBuf[startPos-1]; |
| if (ch == 0x0a) { |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| } |
| if (startPos >= testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testLen the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } else { |
| for (;;) { |
| ch = inputBuf[startPos-1]; |
| if (isLineTerminator(ch)) { |
| if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { |
| startPos++; |
| } |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| } |
| if (startPos >= testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testLen the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| } |
| |
| default: |
| UPRV_UNREACHABLE; |
| } |
| |
| UPRV_UNREACHABLE; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // group() |
| // |
| //-------------------------------------------------------------------------------- |
| UnicodeString RegexMatcher::group(UErrorCode &status) const { |
| return group(0, status); |
| } |
| |
| // Return immutable shallow clone |
| UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { |
| return group(0, dest, group_len, status); |
| } |
| |
| // Return immutable shallow clone |
| UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { |
| group_len = 0; |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| } else if (fMatch == FALSE) { |
| status = U_REGEX_INVALID_STATE; |
| } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| } |
| |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| |
| int64_t s, e; |
| if (groupNum == 0) { |
| s = fMatchStart; |
| e = fMatchEnd; |
| } else { |
| int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| U_ASSERT(groupOffset < fPattern->fFrameSize); |
| U_ASSERT(groupOffset >= 0); |
| s = fFrame->fExtra[groupOffset]; |
| e = fFrame->fExtra[groupOffset+1]; |
| } |
| |
| if (s < 0) { |
| // A capture group wasn't part of the match |
| return utext_clone(dest, fInputText, FALSE, TRUE, &status); |
| } |
| U_ASSERT(s <= e); |
| group_len = e - s; |
| |
| dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); |
| if (dest) |
| UTEXT_SETNATIVEINDEX(dest, s); |
| return dest; |
| } |
| |
| UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { |
| UnicodeString result; |
| int64_t groupStart = start64(groupNum, status); |
| int64_t groupEnd = end64(groupNum, status); |
| if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) { |
| return result; |
| } |
| |
| // Get the group length using a utext_extract preflight. |
| // UText is actually pretty efficient at this when underlying encoding is UTF-16. |
| int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status); |
| if (status != U_BUFFER_OVERFLOW_ERROR) { |
| return result; |
| } |
| |
| status = U_ZERO_ERROR; |
| UChar *buf = result.getBuffer(length); |
| if (buf == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } else { |
| int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status); |
| result.releaseBuffer(extractLength); |
| U_ASSERT(length == extractLength); |
| } |
| return result; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // appendGroup() -- currently internal only, appends a group to a UText rather |
| // than replacing its contents |
| // |
| //-------------------------------------------------------------------------------- |
| |
| int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return 0; |
| } |
| int64_t destLen = utext_nativeLength(dest); |
| |
| if (fMatch == FALSE) { |
| status = U_REGEX_INVALID_STATE; |
| return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| } |
| if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| } |
| |
| int64_t s, e; |
| if (groupNum == 0) { |
| s = fMatchStart; |
| e = fMatchEnd; |
| } else { |
| int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| U_ASSERT(groupOffset < fPattern->fFrameSize); |
| U_ASSERT(groupOffset >= 0); |
| s = fFrame->fExtra[groupOffset]; |
| e = fFrame->fExtra[groupOffset+1]; |
| } |
| |
| if (s < 0) { |
| // A capture group wasn't part of the match |
| return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| } |
| U_ASSERT(s <= e); |
| |
| int64_t deltaLen; |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| U_ASSERT(e <= fInputLength); |
| deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); |
| } else { |
| int32_t len16; |
| if (UTEXT_USES_U16(fInputText)) { |
| len16 = (int32_t)(e-s); |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); |
| } |
| UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| if (groupChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
| |
| deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); |
| uprv_free(groupChars); |
| } |
| return deltaLen; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // groupCount() |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::groupCount() const { |
| return fPattern->fGroupMap->size(); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // hasAnchoringBounds() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::hasAnchoringBounds() const { |
| return fAnchoringBounds; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // hasTransparentBounds() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::hasTransparentBounds() const { |
| return fTransparentBounds; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // hitEnd() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::hitEnd() const { |
| return fHitEnd; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // input() |
| // |
| //-------------------------------------------------------------------------------- |
| const UnicodeString &RegexMatcher::input() const { |
| if (!fInput) { |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t len16; |
| if (UTEXT_USES_U16(fInputText)) { |
| len16 = (int32_t)fInputLength; |
| } else { |
| len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); |
| status = U_ZERO_ERROR; // overflow, length status |
| } |
| UnicodeString *result = new UnicodeString(len16, 0, 0); |
| |
| UChar *inputChars = result->getBuffer(len16); |
| utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning |
| result->releaseBuffer(len16); |
| |
| (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= |
| } |
| |
| return *fInput; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // inputText() |
| // |
| //-------------------------------------------------------------------------------- |
| UText *RegexMatcher::inputText() const { |
| return fInputText; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getInput() -- like inputText(), but makes a clone or copies into another UText |
| // |
| //-------------------------------------------------------------------------------- |
| UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return dest; |
| } |
| |
| if (dest) { |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); |
| } else { |
| int32_t input16Len; |
| if (UTEXT_USES_U16(fInputText)) { |
| input16Len = (int32_t)fInputLength; |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error |
| } |
| UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); |
| if (inputChars == NULL) { |
| return dest; |
| } |
| |
| status = U_ZERO_ERROR; |
| utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning |
| status = U_ZERO_ERROR; |
| utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); |
| |
| uprv_free(inputChars); |
| } |
| return dest; |
| } else { |
| return utext_clone(NULL, fInputText, FALSE, TRUE, &status); |
| } |
| } |
| |
| |
| static UBool compat_SyncMutableUTextContents(UText *ut); |
| static UBool compat_SyncMutableUTextContents(UText *ut) { |
| UBool retVal = FALSE; |
| |
| // In the following test, we're really only interested in whether the UText should switch |
| // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents |
| // will still point to the correct data. |
| if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { |
| UnicodeString *us=(UnicodeString *)ut->context; |
| |
| // Update to the latest length. |
| // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). |
| int32_t newLength = us->length(); |
| |
| // Update the chunk description. |
| // The buffer may have switched between stack- and heap-based. |
| ut->chunkContents = us->getBuffer(); |
| ut->chunkLength = newLength; |
| ut->chunkNativeLimit = newLength; |
| ut->nativeIndexingLimit = newLength; |
| retVal = TRUE; |
| } |
| |
| return retVal; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // lookingAt() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::lookingAt(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| |
| if (fInputUniStrMaybeMutable) { |
| if (compat_SyncMutableUTextContents(fInputText)) { |
| fInputLength = utext_nativeLength(fInputText); |
| reset(); |
| } |
| } |
| else { |
| resetPreserveRegion(); |
| } |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| MatchChunkAt((int32_t)fActiveStart, FALSE, status); |
| } else { |
| MatchAt(fActiveStart, FALSE, status); |
| } |
| return fMatch; |
| } |
| |
| |
| UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| reset(); |
| |
| if (start < 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| if (fInputUniStrMaybeMutable) { |
| if (compat_SyncMutableUTextContents(fInputText)) { |
| fInputLength = utext_nativeLength(fInputText); |
| reset(); |
| } |
| } |
| |
| int64_t nativeStart; |
| nativeStart = start; |
| if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| MatchChunkAt((int32_t)nativeStart, FALSE, status); |
| } else { |
| MatchAt(nativeStart, FALSE, status); |
| } |
| return fMatch; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // matches() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::matches(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| |
| if (fInputUniStrMaybeMutable) { |
| if (compat_SyncMutableUTextContents(fInputText)) { |
| fInputLength = utext_nativeLength(fInputText); |
| reset(); |
| } |
| } |
| else { |
| resetPreserveRegion(); |
| } |
| |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| MatchChunkAt((int32_t)fActiveStart, TRUE, status); |
| } else { |
| MatchAt(fActiveStart, TRUE, status); |
| } |
| return fMatch; |
| } |
| |
| |
| UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| reset(); |
| |
| if (start < 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| if (fInputUniStrMaybeMutable) { |
| if (compat_SyncMutableUTextContents(fInputText)) { |
| fInputLength = utext_nativeLength(fInputText); |
| reset(); |
| } |
| } |
| |
| int64_t nativeStart; |
| nativeStart = start; |
| if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| MatchChunkAt((int32_t)nativeStart, TRUE, status); |
| } else { |
| MatchAt(nativeStart, TRUE, status); |
| } |
| return fMatch; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // pattern |
| // |
| //-------------------------------------------------------------------------------- |
| const RegexPattern &RegexMatcher::pattern() const { |
| return *fPattern; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // region |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| |
| if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| |
| int64_t nativeStart = regionStart; |
| int64_t nativeLimit = regionLimit; |
| if (nativeStart > fInputLength || nativeLimit > fInputLength) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| |
| if (startIndex == -1) |
| this->reset(); |
| else |
| resetPreserveRegion(); |
| |
| fRegionStart = nativeStart; |
| fRegionLimit = nativeLimit; |
| fActiveStart = nativeStart; |
| fActiveLimit = nativeLimit; |
| |
| if (startIndex != -1) { |
| if (startIndex < fActiveStart || startIndex > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| } |
| fMatchEnd = startIndex; |
| } |
| |
| if (!fTransparentBounds) { |
| fLookStart = nativeStart; |
| fLookLimit = nativeLimit; |
| } |
| if (fAnchoringBounds) { |
| fAnchorStart = nativeStart; |
| fAnchorLimit = nativeLimit; |
| } |
| return *this; |
| } |
| |
| RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { |
| return region(start, limit, -1, status); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // regionEnd |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::regionEnd() const { |
| return (int32_t)fRegionLimit; |
| } |
| |
| int64_t RegexMatcher::regionEnd64() const { |
| return fRegionLimit; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // regionStart |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::regionStart() const { |
| return (int32_t)fRegionStart; |
| } |
| |
| int64_t RegexMatcher::regionStart64() const { |
| return fRegionStart; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // replaceAll |
| // |
| //-------------------------------------------------------------------------------- |
| UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { |
| UText replacementText = UTEXT_INITIALIZER; |
| UText resultText = UTEXT_INITIALIZER; |
| UnicodeString resultString; |
| if (U_FAILURE(status)) { |
| return resultString; |
| } |
| |
| utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| utext_openUnicodeString(&resultText, &resultString, &status); |
| |
| replaceAll(&replacementText, &resultText, status); |
| |
| utext_close(&resultText); |
| utext_close(&replacementText); |
| |
| return resultString; |
| } |
| |
| |
| // |
| // replaceAll, UText mode |
| // |
| UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return dest; |
| } |
| |
| if (dest == NULL) { |
| UnicodeString emptyString; |
| UText empty = UTEXT_INITIALIZER; |
| |
| utext_openUnicodeString(&empty, &emptyString, &status); |
| dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
| utext_close(&empty); |
| } |
| |
| if (U_SUCCESS(status)) { |
| reset(); |
| while (find()) { |
| appendReplacement(dest, replacement, status); |
| if (U_FAILURE(status)) { |
| break; |
| } |
| } |
| appendTail(dest, status); |
| } |
| |
| return dest; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // replaceFirst |
| // |
| //-------------------------------------------------------------------------------- |
| UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { |
| UText replacementText = UTEXT_INITIALIZER; |
| UText resultText = UTEXT_INITIALIZER; |
| UnicodeString resultString; |
| |
| utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| utext_openUnicodeString(&resultText, &resultString, &status); |
| |
| replaceFirst(&replacementText, &resultText, status); |
| |
| utext_close(&resultText); |
| utext_close(&replacementText); |
| |
| return resultString; |
| } |
| |
| // |
| // replaceFirst, UText mode |
| // |
| UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return dest; |
| } |
| |
| reset(); |
| if (!find()) { |
| return getInput(dest, status); |
| } |
| |
| if (dest == NULL) { |
| UnicodeString emptyString; |
| UText empty = UTEXT_INITIALIZER; |
| |
| utext_openUnicodeString(&empty, &emptyString, &status); |
| dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
| utext_close(&empty); |
| } |
| |
| appendReplacement(dest, replacement, status); |
| appendTail(dest, status); |
| |
| return dest; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // requireEnd |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::requireEnd() const { |
| return fRequireEnd; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // reset |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::reset() { |
| fRegionStart = 0; |
| fRegionLimit = fInputLength; |
| fActiveStart = 0; |
| fActiveLimit = fInputLength; |
| fAnchorStart = 0; |
| fAnchorLimit = fInputLength; |
| fLookStart = 0; |
| fLookLimit = fInputLength; |
| resetPreserveRegion(); |
| return *this; |
| } |
| |
| |
| |
| void RegexMatcher::resetPreserveRegion() { |
| fMatchStart = 0; |
| fMatchEnd = 0; |
| fLastMatchEnd = -1; |
| fAppendPosition = 0; |
| fMatch = FALSE; |
| fHitEnd = FALSE; |
| fRequireEnd = FALSE; |
| fTime = 0; |
| fTickCounter = TIMER_INITIAL_VALUE; |
| //resetStack(); // more expensive than it looks... |
| } |
| |
| |
| RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { |
| fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); |
| if (fPattern->fNeedsAltInput) { |
| fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| return *this; |
| } |
| fInputLength = utext_nativeLength(fInputText); |
| |
| reset(); |
| delete fInput; |
| fInput = NULL; |
| |
| // Do the following for any UnicodeString. |
| // This is for compatibility for those clients who modify the input string "live" during regex operations. |
| fInputUniStrMaybeMutable = TRUE; |
| |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| if (fWordBreakItr) { |
| fWordBreakItr->setText(fInputText, fDeferredStatus); |
| } |
| if (fGCBreakItr) { |
| fGCBreakItr->setText(fInputText, fDeferredStatus); |
| } |
| #endif |
| |
| return *this; |
| } |
| |
| |
| RegexMatcher &RegexMatcher::reset(UText *input) { |
| if (fInputText != input) { |
| fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); |
| if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); |
| if (U_FAILURE(fDeferredStatus)) { |
| return *this; |
| } |
| fInputLength = utext_nativeLength(fInputText); |
| |
| delete fInput; |
| fInput = NULL; |
| |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| if (fWordBreakItr) { |
| fWordBreakItr->setText(input, fDeferredStatus); |
| } |
| if (fGCBreakItr) { |
| fGCBreakItr->setText(fInputText, fDeferredStatus); |
| } |
| #endif |
| } |
| reset(); |
| fInputUniStrMaybeMutable = FALSE; |
| |
| return *this; |
| } |
| |
| /*RegexMatcher &RegexMatcher::reset(const UChar *) { |
| fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; |
| return *this; |
| }*/ |
| |
| RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| reset(); // Reset also resets the region to be the entire string. |
| |
| if (position < 0 || position > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return *this; |
| } |
| fMatchEnd = position; |
| return *this; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // refresh |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| if (input == NULL) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return *this; |
| } |
| if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return *this; |
| } |
| int64_t pos = utext_getNativeIndex(fInputText); |
| // Shallow read-only clone of the new UText into the existing input UText |
| fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| utext_setNativeIndex(fInputText, pos); |
| |
| if (fAltInputText != NULL) { |
| pos = utext_getNativeIndex(fAltInputText); |
| fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| utext_setNativeIndex(fAltInputText, pos); |
| } |
| return *this; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setTrace |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setTrace(UBool state) { |
| fTraceDebug = state; |
| } |
| |
| |
| |
| /** |
| * UText, replace entire contents of the destination UText with a substring of the source UText. |
| * |
| * @param src The source UText |
| * @param dest The destination UText. Must be writable. |
| * May be NULL, in which case a new UText will be allocated. |
| * @param start Start index of source substring. |
| * @param limit Limit index of source substring. |
| * @param status An error code. |
| */ |
| static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) { |
| if (U_FAILURE(*status)) { |
| return dest; |
| } |
| if (start == limit) { |
| if (dest) { |
| utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status); |
| return dest; |
| } else { |
| return utext_openUChars(NULL, NULL, 0, status); |
| } |
| } |
| int32_t length = utext_extract(src, start, limit, NULL, 0, status); |
| if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { |
| return dest; |
| } |
| *status = U_ZERO_ERROR; |
| MaybeStackArray<UChar, 40> buffer; |
| if (length >= buffer.getCapacity()) { |
| UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul. |
| if (newBuf == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| utext_extract(src, start, limit, buffer.getAlias(), length+1, status); |
| if (dest) { |
| utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status); |
| return dest; |
| } |
| |
| // Caller did not provide a prexisting UText. |
| // Open a new one, and have it adopt the text buffer storage. |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| int32_t ownedLength = 0; |
| UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength); |
| if (ownedBuf == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| UText *result = utext_openUChars(NULL, ownedBuf, length, status); |
| if (U_FAILURE(*status)) { |
| uprv_free(ownedBuf); |
| return NULL; |
| } |
| result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT); |
| return result; |
| } |
| |
| |
| //--------------------------------------------------------------------- |
| // |
| // split |
| // |
| //--------------------------------------------------------------------- |
| int32_t RegexMatcher::split(const UnicodeString &input, |
| UnicodeString dest[], |
| int32_t destCapacity, |
| UErrorCode &status) |
| { |
| UText inputText = UTEXT_INITIALIZER; |
| utext_openConstUnicodeString(&inputText, &input, &status); |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| |
| UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); |
| if (destText == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| int32_t i; |
| for (i = 0; i < destCapacity; i++) { |
| destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); |
| } |
| |
| int32_t fieldCount = split(&inputText, destText, destCapacity, status); |
| |
| for (i = 0; i < destCapacity; i++) { |
| utext_close(destText[i]); |
| } |
| |
| uprv_free(destText); |
| utext_close(&inputText); |
| return fieldCount; |
| } |
| |
| // |
| // split, UText mode |
| // |
| int32_t RegexMatcher::split(UText *input, |
| UText *dest[], |
| int32_t destCapacity, |
| UErrorCode &status) |
| { |
| // |
| // Check arguments for validity |
| // |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| |
| if (destCapacity < 1) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| // |
| // Reset for the input text |
| // |
| reset(input); |
| int64_t nextOutputStringStart = 0; |
| if (fActiveLimit == 0) { |
| return 0; |
| } |
| |
| // |
| // Loop through the input text, searching for the delimiter pattern |
| // |
| int32_t i; |
| int32_t numCaptureGroups = fPattern->fGroupMap->size(); |
| for (i=0; ; i++) { |
| if (i>=destCapacity-1) { |
| // There is one or zero output string left. |
| // Fill the last output string with whatever is left from the input, then exit the loop. |
| // ( i will be == destCapacity if we filled the output array while processing |
| // capture groups of the delimiter expression, in which case we will discard the |
| // last capture group saved in favor of the unprocessed remainder of the |
| // input string.) |
| i = destCapacity-1; |
| if (fActiveLimit > nextOutputStringStart) { |
| if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| input->chunkContents+nextOutputStringStart, |
| (int32_t)(fActiveLimit-nextOutputStringStart), &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, |
| fActiveLimit-nextOutputStringStart, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| int32_t remaining16Length = |
| utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); |
| UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); |
| if (remainingChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| |
| utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| |
| uprv_free(remainingChars); |
| } |
| } |
| break; |
| } |
| if (find()) { |
| // We found another delimiter. Move everything from where we started looking |
| // up until the start of the delimiter into the next output string. |
| if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| input->chunkContents+nextOutputStringStart, |
| (int32_t)(fMatchStart-nextOutputStringStart), &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, |
| fMatchStart-nextOutputStringStart, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); |
| UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); |
| if (remainingChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| |
| uprv_free(remainingChars); |
| } |
| nextOutputStringStart = fMatchEnd; |
| |
| // If the delimiter pattern has capturing parentheses, the captured |
| // text goes out into the next n destination strings. |
| int32_t groupNum; |
| for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { |
| if (i >= destCapacity-2) { |
| // Never fill the last available output string with capture group text. |
| // It will filled with the last field, the remainder of the |
| // unsplit input text. |
| break; |
| } |
| i++; |
| dest[i] = utext_extract_replace(fInputText, dest[i], |
| start64(groupNum, status), end64(groupNum, status), &status); |
| } |
| |
| if (nextOutputStringStart == fActiveLimit) { |
| // The delimiter was at the end of the string. We're done, but first |
| // we output one last empty string, for the empty field following |
| // the delimiter at the end of input. |
| if (i+1 < destCapacity) { |
| ++i; |
| if (dest[i] == NULL) { |
| dest[i] = utext_openUChars(NULL, NULL, 0, &status); |
| } else { |
| static const UChar emptyString[] = {(UChar)0}; |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); |
| } |
| } |
| break; |
| |
| } |
| } |
| else |
| { |
| // We ran off the end of the input while looking for the next delimiter. |
| // All the remaining text goes into the current output string. |
| if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| input->chunkContents+nextOutputStringStart, |
| (int32_t)(fActiveLimit-nextOutputStringStart), &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, |
| fActiveLimit-nextOutputStringStart, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); |
| UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); |
| if (remainingChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| |
| utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| |
| uprv_free(remainingChars); |
| } |
| break; |
| } |
| if (U_FAILURE(status)) { |
| break; |
| } |
| } // end of for loop |
| return i+1; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // start |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::start(UErrorCode &status) const { |
| return start(0, status); |
| } |
| |
| int64_t RegexMatcher::start64(UErrorCode &status) const { |
| return start64(0, status); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // start(int32_t group, UErrorCode &status) |
| // |
| //-------------------------------------------------------------------------------- |
| |
| int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { |
| if (U_FAILURE(status)) { |
| return -1; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return -1; |
| } |
| if (fMatch == FALSE) { |
| status = U_REGEX_INVALID_STATE; |
| return -1; |
| } |
| if (group < 0 || group > fPattern->fGroupMap->size()) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return -1; |
| } |
| int64_t s; |
| if (group == 0) { |
| s = fMatchStart; |
| } else { |
| int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
| U_ASSERT(groupOffset < fPattern->fFrameSize); |
| U_ASSERT(groupOffset >= 0); |
| s = fFrame->fExtra[groupOffset]; |
| } |
| |
| return s; |
| } |
| |
| |
| int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { |
| return (int32_t)start64(group, status); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // useAnchoringBounds |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { |
| fAnchoringBounds = b; |
| fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); |
| fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); |
| return *this; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // useTransparentBounds |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { |
| fTransparentBounds = b; |
| fLookStart = (fTransparentBounds ? 0 : fRegionStart); |
| fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); |
| return *this; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setTimeLimit |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return; |
| } |
| if (limit < 0) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| fTimeLimit = limit; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getTimeLimit |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::getTimeLimit() const { |
| return fTimeLimit; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setStackLimit |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return; |
| } |
| if (limit < 0) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| // Reset the matcher. This is needed here in case there is a current match |
| // whose final stack frame (containing the match results, pointed to by fFrame) |
| // would be lost by resizing to a smaller stack size. |
| reset(); |
| |
| if (limit == 0) { |
| // Unlimited stack expansion |
| fStack->setMaxCapacity(0); |
| } else { |
| // Change the units of the limit from bytes to ints, and bump the size up |
| // to be big enough to hold at least one stack frame for the pattern, |
| // if it isn't there already. |
| int32_t adjustedLimit = limit / sizeof(int32_t); |
| if (adjustedLimit < fPattern->fFrameSize) { |
| adjustedLimit = fPattern->fFrameSize; |
| } |
| fStack->setMaxCapacity(adjustedLimit); |
| } |
| fStackLimit = limit; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getStackLimit |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::getStackLimit() const { |
| return fStackLimit; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setMatchCallback |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, |
| const void *context, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fCallbackFn = callback; |
| fCallbackContext = context; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getMatchCallback |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, |
| const void *&context, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| callback = fCallbackFn; |
| context = fCallbackContext; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setFindProgressCallback |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, |
| const void *context, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fFindProgressCallbackFn = callback; |
| fFindProgressCallbackContext = context; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getFindProgressCallback |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, |
| const void *&context, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| callback = fFindProgressCallbackFn; |
| context = fFindProgressCallbackContext; |
| } |
| |
| |
| //================================================================================ |
| // |
| // Code following this point in this file is the internal |
| // Match Engine Implementation. |
| // |
| //================================================================================ |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // resetStack |
| // Discard any previous contents of the state save stack, and initialize a |
| // new stack frame to all -1. The -1s are needed for capture group limits, |
| // where they indicate that a group has not yet matched anything. |
| //-------------------------------------------------------------------------------- |
| REStackFrame *RegexMatcher::resetStack() { |
| // Discard any previous contents of the state save stack, and initialize a |
| // new stack frame with all -1 data. The -1s are needed for capture group limits, |
| // where they indicate that a group has not yet matched anything. |
| fStack->removeAllElements(); |
| |
| REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); |
| if(U_FAILURE(fDeferredStatus)) { |
| return NULL; |
| } |
| |
| int32_t i; |
| for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { |
| iFrame->fExtra[i] = -1; |
| } |
| return iFrame; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // isWordBoundary |
| // in perl, "xab..cd..", \b is true at positions 0,3,5,7 |
| // For us, |
| // If the current char is a combining mark, |
| // \b is FALSE. |
| // Else Scan backwards to the first non-combining char. |
| // We are at a boundary if this char and the original char are |
| // opposite in membership in the \w set |
| // |
| // parameters: pos - the current position in the input buffer |
| // |
| // TODO: double-check edge cases at region boundaries. |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::isWordBoundary(int64_t pos) { |
| UBool isBoundary = FALSE; |
| UBool cIsWord = FALSE; |
| |
| if (pos >= fLookLimit) { |
| fHitEnd = TRUE; |
| } else { |
| // Determine whether char c at current position is a member of the word set of chars. |
| // If we're off the end of the string, behave as though we're not at a word char. |
| UTEXT_SETNATIVEINDEX(fInputText, pos); |
| UChar32 c = UTEXT_CURRENT32(fInputText); |
| if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { |
| // Current char is a combining one. Not a boundary. |
| return FALSE; |
| } |
| cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c); |
| } |
| |
| // Back up until we come to a non-combining char, determine whether |
| // that char is a word char. |
| UBool prevCIsWord = FALSE; |
| for (;;) { |
| if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { |
| break; |
| } |
| UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); |
| if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
| || u_charType(prevChar) == U_FORMAT_CHAR)) { |
| prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar); |
| break; |
| } |
| } |
| isBoundary = cIsWord ^ prevCIsWord; |
| return isBoundary; |
| } |
| |
| UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { |
| UBool isBoundary = FALSE; |
| UBool cIsWord = FALSE; |
| |
| const UChar *inputBuf = fInputText->chunkContents; |
| |
| if (pos >= fLookLimit) { |
| fHitEnd = TRUE; |
| } else { |
| // Determine whether char c at current position is a member of the word set of chars. |
| // If we're off the end of the string, behave as though we're not at a word char. |
| UChar32 c; |
| U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); |
| if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { |
| // Current char is a combining one. Not a boundary. |
| return FALSE; |
| } |
| cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c); |
| } |
| |
| // Back up until we come to a non-combining char, determine whether |
| // that char is a word char. |
| UBool prevCIsWord = FALSE; |
| for (;;) { |
| if (pos <= fLookStart) { |
| break; |
| } |
| UChar32 prevChar; |
| U16_PREV(inputBuf, fLookStart, pos, prevChar); |
| if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
| || u_charType(prevChar) == U_FORMAT_CHAR)) { |
| prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar); |
| break; |
| } |
| } |
| isBoundary = cIsWord ^ prevCIsWord; |
| return isBoundary; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // isUWordBoundary |
| // |
| // Test for a word boundary using RBBI word break. |
| // |
| // parameters: pos - the current position in the input buffer |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) { |
| UBool returnVal = FALSE; |
| |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| // Note: this point will never be reached if break iteration is configured out. |
| // Regex patterns that would require this function will fail to compile. |
| |
| // If we haven't yet created a break iterator for this matcher, do it now. |
| if (fWordBreakItr == nullptr) { |
| fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| fWordBreakItr->setText(fInputText, status); |
| } |
| |
| // Note: zero width boundary tests like \b see through transparent region bounds, |
| // which is why fLookLimit is used here, rather than fActiveLimit. |
| if (pos >= fLookLimit) { |
| fHitEnd = TRUE; |
| returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" |
| // words are not boundaries. All non-word chars stand by themselves, |
| // with word boundaries on both sides. |
| } else { |
| returnVal = fWordBreakItr->isBoundary((int32_t)pos); |
| } |
| #endif |
| return returnVal; |
| } |
| |
| |
| int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) { |
| int64_t result = pos; |
| |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| // Note: this point will never be reached if break iteration is configured out. |
| // Regex patterns that would require this function will fail to compile. |
| |
| // If we haven't yet created a break iterator for this matcher, do it now. |
| if (fGCBreakItr == nullptr) { |
| fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); |
| if (U_FAILURE(status)) { |
| return pos; |
| } |
| fGCBreakItr->setText(fInputText, status); |
| } |
| result = fGCBreakItr->following(pos); |
| if (result == BreakIterator::DONE) { |
| result = pos; |
| } |
| #endif |
| return result; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // IncrementTime This function is called once each TIMER_INITIAL_VALUE state |
| // saves. Increment the "time" counter, and call the |
| // user callback function if there is one installed. |
| // |
| // If the match operation needs to be aborted, either for a time-out |
| // or because the user callback asked for it, just set an error status. |
| // The engine will pick that up and stop in its outer loop. |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::IncrementTime(UErrorCode &status) { |
| fTickCounter = TIMER_INITIAL_VALUE; |
| fTime++; |
| if (fCallbackFn != NULL) { |
| if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { |
| status = U_REGEX_STOPPED_BY_CALLER; |
| return; |
| } |
| } |
| if (fTimeLimit > 0 && fTime >= fTimeLimit) { |
| status = U_REGEX_TIME_OUT; |
| } |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // StateSave |
| // Make a new stack frame, initialized as a copy of the current stack frame. |
| // Set the pattern index in the original stack frame from the operand value |
| // in the opcode. Execution of the engine continues with the state in |
| // the newly created stack frame |
| // |
| // Note that reserveBlock() may grow the stack, resulting in the |
| // whole thing being relocated in memory. |
| // |
| // Parameters: |
| // fp The top frame pointer when called. At return, a new |
| // frame will be present |
| // savePatIdx An index into the compiled pattern. Goes into the original |
| // (not new) frame. If execution ever back-tracks out of the |
| // new frame, this will be where we continue from in the pattern. |
| // Return |
| // The new frame pointer. |
| // |
| //-------------------------------------------------------------------------------- |
| inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return fp; |
| } |
| // push storage for a new frame. |
| int64_t *newFP = fStack->reserveBlock(fFrameSize, status); |
| if (U_FAILURE(status)) { |
| // Failure on attempted stack expansion. |
| // Stack function set some other error code, change it to a more |
| // specific one for regular expressions. |
| status = U_REGEX_STACK_OVERFLOW; |
| // We need to return a writable stack frame, so just return the |
| // previous frame. The match operation will stop quickly |
| // because of the error status, after which the frame will never |
| // be looked at again. |
| return fp; |
| } |
| fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. |
| |
| // New stack frame = copy of old top frame. |
| int64_t *source = (int64_t *)fp; |
| int64_t *dest = newFP; |
| for (;;) { |
| *dest++ = *source++; |
| if (source == newFP) { |
| break; |
| } |
| } |
| |
| fTickCounter--; |
| if (fTickCounter <= 0) { |
| IncrementTime(status); // Re-initializes fTickCounter |
| } |
| fp->fPatIdx = savePatIdx; |
| return (REStackFrame *)newFP; |
| } |
| |
#if defined(REGEX_DEBUG)
namespace {
// Debug-only helper: collect the code points of a UText, read from native
// index 0 until U_SENTINEL, into a UnicodeString.  Moves ut's iteration
// position.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString collected;
    UChar32 codePoint = utext_next32From(ut, 0);
    while (codePoint != U_SENTINEL) {
        collected.append(codePoint);
        codePoint = UTEXT_NEXT32(ut);
    }
    return collected;
}
}  // namespace
#endif // REGEX_DEBUG
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // MatchAt This is the actual matching engine. |
| // |
| // startIdx: begin matching at this index. |
| // toEnd: if true, match must extend to end of the input region |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
| UBool isMatch = FALSE; // True if the we have a match. |
| |
| int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards |
| |
| int32_t op; // Operation from the compiled pattern, split into |
| int32_t opType; // the opcode |
| int32_t opValue; // and the operand value. |
| |
| #ifdef REGEX_RUN_DEBUG |
| if (fTraceDebug) { |
| printf("MatchAt(startIdx=%ld)\n", startIdx); |
| printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); |
| printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); |
| } |
| #endif |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| // Cache frequently referenced items from the compiled pattern |
| // |
| int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
| |
| const UChar *litText = fPattern->fLiteralText.getBuffer(); |
| UVector *fSets = fPattern->fSets; |
| |
| fFrameSize = fPattern->fFrameSize; |
| REStackFrame *fp = resetStack(); |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return; |
| } |
| |
| fp->fPatIdx = 0; |
| fp->fInputIdx = startIdx; |
| |
| // Zero out the pattern's static data |
| int32_t i; |
| for (i = 0; i<fPattern->fDataSize; i++) { |
| fData[i] = 0; |
| } |
| |
| // |
| // Main loop for interpreting the compiled pattern. |
| // One iteration of the loop per pattern operation performed. |
| // |
| for (;;) { |
| op = (int32_t)pat[fp->fPatIdx]; |
| opType = URX_TYPE(op); |
| opValue = URX_VAL(op); |
| #ifdef REGEX_RUN_DEBUG |
| if (fTraceDebug) { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, |
| UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); |
| fPattern->dumpOp(fp->fPatIdx); |
| } |
| #endif |
| fp->fPatIdx++; |
| |
| switch (opType) { |
| |
| |
| case URX_NOP: |
| break; |
| |
| |
| case URX_BACKTRACK: |
| // Force a backtrack. In some circumstances, the pattern compiler |
| // will notice that the pattern can't possibly match anything, and will |
| // emit one of these at that point. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_ONECHAR: |
| if (fp->fInputIdx < fActiveLimit) { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (c == opValue) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| break; |
| } |
| } else { |
| fHitEnd = TRUE; |
| } |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_STRING: |
| { |
| // Test input against a literal string. |
| // Strings require two slots in the compiled pattern, one for the |
| // offset to the string text, and one for the length. |
| |
| int32_t stringStartIdx = opValue; |
| op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand |
| fp->fPatIdx++; |
| opType = URX_TYPE(op); |
| int32_t stringLen = URX_VAL(op); |
| U_ASSERT(opType == URX_STRING_LEN); |
| U_ASSERT(stringLen >= 2); |
| |
| const UChar *patternString = litText+stringStartIdx; |
| int32_t patternStringIndex = 0; |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 inputChar; |
| UChar32 patternChar; |
| UBool success = TRUE; |
| while (patternStringIndex < stringLen) { |
| if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { |
| success = FALSE; |
| fHitEnd = TRUE; |
| break; |
| } |
| inputChar = UTEXT_NEXT32(fInputText); |
| U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); |
| if (patternChar != inputChar) { |
| success = FALSE; |
| break; |
| } |
| } |
| |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_STATE_SAVE: |
| fp = StateSave(fp, opValue, status); |
| break; |
| |
| |
| case URX_END: |
| // The match loop will exit via this path on a successful match, |
| // when we reach the end of the pattern. |
| if (toEnd && fp->fInputIdx != fActiveLimit) { |
| // The pattern matched, but not to the end of input. Try some more. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| isMatch = TRUE; |
| goto breakFromLoop; |
| |
| // Start and End Capture stack frame variables are laid out out like this: |
| // fp->fExtra[opValue] - The start of a completed capture group |
| // opValue+1 - The end of a completed capture group |
| // opValue+2 - the start of a capture group whose end |
| // has not yet been reached (and might not ever be). |
| case URX_START_CAPTURE: |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| fp->fExtra[opValue+2] = fp->fInputIdx; |
| break; |
| |
| |
| case URX_END_CAPTURE: |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. |
| fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. |
| fp->fExtra[opValue+1] = fp->fInputIdx; // End position |
| U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); |
| break; |
| |
| |
| case URX_DOLLAR: // $, test for End of line |
| // or for position before new line at end of input |
| { |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // We really are at the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| // If we are positioned just before a new-line that is located at the |
| // end of input, succeed. |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { |
| if (isLineTerminator(c)) { |
| // If not in the middle of a CR/LF sequence |
| if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { |
| // At new-line at end of input. Success |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| |
| break; |
| } |
| } |
| } else { |
| UChar32 nextC = UTEXT_NEXT32(fInputText); |
| if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; // At CR/LF at end of input. Success |
| } |
| } |
| |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // Off the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } else { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| // Either at the last character of input, or off the end. |
| if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| } |
| |
| // Not at end of input. Back-track out. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_DOLLAR_M: // $, test for End of line in multi-line mode |
| { |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // We really are at the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| // If we are positioned just before a new-line, succeed. |
| // It makes no difference where the new-line is within the input. |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
|