| |
| // |
| // file: regexcmp.cpp |
| // |
| // Copyright (C) 2002, International Business Machines Corporation and others. |
| // All Rights Reserved. |
| // |
| // This file contains the ICU regular expression compiler, which is responsible |
| // for processing a regular expression pattern into the compiled form that |
| // is used by the match finding engine. |
| // |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| |
| #include "unicode/unistr.h" |
| #include "unicode/uniset.h" |
| #include "unicode/uchar.h" |
| #include "unicode/uchriter.h" |
| #include "unicode/parsepos.h" |
| #include "unicode/parseerr.h" |
| #include "unicode/regex.h" |
| #include "uprops.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "uassert.h" |
| #include "ucln_in.h" |
| #include "mutex.h" |
| |
| #include "regeximp.h" |
| #include "regexcst.h" // Contains state table for the regex pattern parser. |
| // generated by a Perl script. |
| #include "regexcmp.h" |
| |
| |
| |
| U_NAMESPACE_BEGIN |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Unicode Sets for each of the character classes needed for parsing a regex pattern. |
| // (Initialized with hex values for portability to EBCDIC based machines. |
| // Really ugly, but there's no good way to avoid it.) |
| // |
| // The sets are referred to by name in the regexcst.txt, which is the |
| // source form of the state transition table. These names are converted |
| // to indices in regexcst.h by the Perl state table building script regexcst.pl. |
| // The indices are used to access the array gRuleSets. |
| // |
| //---------------------------------------------------------------------------------------- |
| |
| // "Rule Char" Characters are those with no special meaning, and therefore do not |
| // need to be escaped to appear as literals in a regexp. Expressed |
| // as the inverse of those needing escaping -- [^\*\?\+\[\(\)\{\}\^\$\|\\\.] |
| static const UChar gRuleSet_rule_char_pattern[] = { |
| // [ ^ \ * \ ? \ + \ [ \ ( / ) |
| 0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29, |
| // \ { \ } \ ^ \ $ \ | \ \ \ . ] |
| 0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0}; |
| |
| |
| static const UChar gRuleSet_digit_char_pattern[] = { |
| // [ 0 - 9 ] |
| 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; |
| |
| |
| static UnicodeSet *gRuleSets[10]; // Array of ptrs to the actual UnicodeSet objects. |
| static UnicodeSet *gUnescapeCharSet; |
| |
| // |
| // Here are the backslash escape characters that ICU's unescape() function |
| // will handle. |
| // |
| static const UChar gUnescapeCharPattern[] = { |
| // [ a c e f n r t u U ] |
| 0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0}; |
| |
| |
| // |
| // White space characters that may appear within a pattern in free-form mode |
| // |
| static const UChar gRuleWhiteSpacePattern[] = { |
| /* "[[:Cf:][:WSpace:]]" */ |
| 91, 91, 58, 67, 102, 58, 93, 91, 58, 87, |
| 83, 112, 97, 99, 101, 58, 93, 93, 0 }; |
| |
| |
| |
| // |
| // Unicode Set Definitions for Regular Expression \w |
| // |
| static const UChar gIsWordPattern[] = { |
| // [ \ p { L l } \ p { L u } |
| 0x5b, 0x5c, 0x70, 0x7b, 0x4c, 0x6c, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x75, 0x7d, |
| // \ p { L t } \ p { L o } |
| 0x5c, 0x70, 0x7b, 0x4c, 0x74, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x6f, 0x7d, |
| // \ p { N d } ] |
| 0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5d, 0}; |
| |
| |
| // |
| // Unicode Set Definitions for Regular Expression \s |
| // |
| static const UChar gIsSpacePattern[] = { |
| // [ \ t \ n \ f \ r \ p { Z } ] |
| 0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d, 0}; |
| |
| static UnicodeSet *gPropSets[URX_LAST_SET]; |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // ThreadSafeUnicodeSetInit Thread safe creation of a shared UnicodeSet. |
| // |
| //---------------------------------------------------------------------------------------- |
| static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UErrorCode &status) { |
| if (*pSet == NULL) { |
| UnicodeSet *t = new UnicodeSet(pattern, status); |
| if (U_FAILURE(status)) { |
| delete t; |
| return; |
| } |
| if (t == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| Mutex lock; |
| if (*pSet == NULL) { |
| *pSet = t; |
| } else { |
| delete t; |
| } |
| } |
| } |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Constructor. |
| // |
| //---------------------------------------------------------------------------------------- |
| RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status) |
| { |
| fStatus = &status; |
| |
| fScanIndex = 0; |
| fNextIndex = 0; |
| fPeekChar = -1; |
| fLineNum = 1; |
| fCharNum = 0; |
| fQuoteMode = FALSE; |
| fFreeForm = FALSE; |
| |
| fMatchOpenParen = -1; |
| fMatchCloseParen = -1; |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| // |
| // Register the I18n library for cleanup, |
| // but only if we haven't initialized our globals yet. |
| if (gRuleSets[kRuleSet_rule_char-128] == NULL) { |
| ucln_i18n_registerCleanup(); |
| } |
| |
| // |
| // Set up the constant (static) Unicode Sets. |
| // TODO: something cleaner for that -128 constant. |
| // |
| ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status); |
| ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status); |
| ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_digit_char-128], gRuleSet_digit_char_pattern, status); |
| ThreadSafeUnicodeSetInit(&gUnescapeCharSet, gUnescapeCharPattern, status); |
| ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET], gIsWordPattern, status); |
| ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status); |
| } |
| |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Destructor |
| // |
| //---------------------------------------------------------------------------------------- |
| RegexCompile::~RegexCompile() { |
| } |
| |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // cleanup. Called (indirectly) by u_cleanup to free all cached memory |
| // |
| //---------------------------------------------------------------------------------------- |
| void RegexCompile::cleanup() { |
| delete gRuleSets[kRuleSet_rule_char-128]; |
| delete gRuleSets[kRuleSet_white_space-128]; |
| delete gRuleSets[kRuleSet_digit_char-128]; |
| delete gUnescapeCharSet; |
| gRuleSets[kRuleSet_rule_char-128] = NULL; |
| gRuleSets[kRuleSet_white_space-128] = NULL; |
| gRuleSets[kRuleSet_digit_char-128] = NULL; |
| gUnescapeCharSet = NULL; |
| int i; |
| for (i=0; i<URX_LAST_SET; i++) { |
| delete (UnicodeSet *)gPropSets[i]; |
| gPropSets[i] = NULL; |
| } |
| return; |
| } |
| |
| |
| //--------------------------------------------------------------------------------- |
| // |
| // Compile regex pattern. The state machine for rexexp pattern parsing is here. |
| // The state tables are hand-written in the file regexcst.txt, |
| // and converted to the form used here by a perl |
| // script regexcst.pl |
| // |
| //--------------------------------------------------------------------------------- |
| void RegexCompile::compile( |
| RegexPattern &rxp, // User level pattern object to receive |
| // the compiled pattern. |
| const UnicodeString &pat, // Source pat to be compiled. |
| UParseError &pp, // Error position info |
| UErrorCode &e) // Error Code |
| { |
| fStatus = &e; |
| fRXPat = &rxp; |
| fParseErr = &pp; |
| fStackPtr = 0; |
| fStack[fStackPtr] = 0; |
| |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| |
| // There should be no pattern stuff in the RegexPattern object. They can not be reused. |
| U_ASSERT(fRXPat->fPattern.length() == 0); |
| |
| // Prepare the RegexPattern object to receive the compiled pattern. |
| fRXPat->fPattern = pat; |
| fRXPat->fStaticSets = gPropSets; |
| |
| |
| // Initialize the pattern scanning state machine |
| fPatternLength = pat.length(); |
| uint16_t state = 1; |
| const RegexTableEl *tableEl; |
| nextChar(fC); // Fetch the first char from the pattern string. |
| |
| // |
| // Main loop for the regex pattern parsing state machine. |
| // Runs once per state transition. |
| // Each time through optionally performs, depending on the state table, |
| // - an advance to the the next pattern char |
| // - an action to be performed. |
| // - pushing or popping a state to/from the local state return stack. |
| // file regexcst.txt is the source for the state table. The logic behind |
| // recongizing the pattern syntax is there, not here. |
| // |
| for (;;) { |
| // Bail out if anything has gone wrong. |
| // Regex pattern parsing stops on the first error encountered. |
| if (U_FAILURE(*fStatus)) { |
| break; |
| } |
| |
| U_ASSERT(state != 0); |
| |
| // Find the state table element that matches the input char from the rule, or the |
| // class of the input character. Start with the first table row for this |
| // state, then linearly scan forward until we find a row that matches the |
| // character. The last row for each state always matches all characters, so |
| // the search will stop there, if not before. |
| // |
| tableEl = &gRuleParseStateTable[state]; |
| REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d) state=%s ", |
| fC.fChar, fLineNum, fCharNum, RegexStateNames[state]); |
| |
| for (;;) { // loop through table rows belonging to this state, looking for one |
| // that matches the current input char. |
| REGEX_SCAN_DEBUG_PRINTF( "."); |
| if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { |
| // Table row specified an individual character, not a set, and |
| // the input character is not quoted, and |
| // the input character matched it. |
| break; |
| } |
| if (tableEl->fCharClass == 255) { |
| // Table row specified default, match anything character class. |
| break; |
| } |
| if (tableEl->fCharClass == 254 && fC.fQuoted) { |
| // Table row specified "quoted" and the char was quoted. |
| break; |
| } |
| if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { |
| // Table row specified eof and we hit eof on the input. |
| break; |
| } |
| |
| if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && |
| fC.fQuoted == FALSE && // char is not escaped && |
| fC.fChar != (UChar32)-1) { // char is not EOF |
| UnicodeSet *uniset = gRuleSets[tableEl->fCharClass-128]; |
| if (uniset->contains(fC.fChar)) { |
| // Table row specified a character class, or set of characters, |
| // and the current char matches it. |
| break; |
| } |
| } |
| |
| // No match on this row, advance to the next row for this state, |
| tableEl++; |
| } |
| REGEX_SCAN_DEBUG_PRINTF("\n"); |
| |
| // |
| // We've found the row of the state table that matches the current input |
| // character from the rules string. |
| // Perform any action specified by this row in the state table. |
| if (doParseActions((EParseAction)tableEl->fAction) == FALSE) { |
| // Break out of the state machine loop if the |
| // the action signalled some kind of error, or |
| // the action was to exit, occurs on normal end-of-rules-input. |
| break; |
| } |
| |
| if (tableEl->fPushState != 0) { |
| fStackPtr++; |
| if (fStackPtr >= kStackSize) { |
| error(U_REGEX_INTERNAL_ERROR); |
| REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n"); |
| fStackPtr--; |
| } |
| fStack[fStackPtr] = tableEl->fPushState; |
| } |
| |
| if (tableEl->fNextChar) { |
| nextChar(fC); |
| } |
| |
| // Get the next state from the table entry, or from the |
| // state stack if the next state was specified as "pop". |
| if (tableEl->fNextState != 255) { |
| state = tableEl->fNextState; |
| } else { |
| state = fStack[fStackPtr]; |
| fStackPtr--; |
| if (fStackPtr < 0) { |
| // state stack underflow |
| // This will occur if the user pattern has mis-matched parentheses, |
| // with extra close parens. |
| // |
| fStackPtr++; |
| error(U_REGEX_MISMATCHED_PAREN); |
| } |
| } |
| |
| } |
| |
| // |
| // The pattern has now been read and processed, and the compiled code generated. |
| // |
| |
| // |
| // Compute the number of digits requried for the largest capture group number. |
| // |
| fRXPat->fMaxCaptureDigits = 1; |
| int32_t n = 10; |
| for (;;) { |
| if (n > fRXPat->fNumCaptureGroups) { |
| break; |
| } |
| fRXPat->fMaxCaptureDigits++; |
| n *= 10; |
| } |
| |
| // |
| // A stupid bit of non-sense to prevent code coverage testing from complaining |
| // about the pattern.dump() debug function. Go through the motions of dumping, |
| // even though, without the #define set, it will do nothing. |
| // |
| #ifndef REGEX_DUMP_DEBUG |
| static UBool phonyDumpDone = FALSE; |
| if (phonyDumpDone==FALSE) { |
| fRXPat->dump(); |
| phonyDumpDone = TRUE; |
| } |
| #endif |
| |
| } |
| |
| |
| |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // doParseAction Do some action during regex pattern parsing. |
| // Called by the parse state machine. |
| // |
| // |
| //---------------------------------------------------------------------------------------- |
| UBool RegexCompile::doParseActions(EParseAction action) |
| { |
| UBool returnVal = TRUE; |
| |
| switch ((Regex_PatternParseAction)action) { |
| |
| case doPatStart: |
| // Start of pattern compiles to: |
| //0 SAVE 2 Fall back to position of FAIL |
| //1 jmp 3 |
| //2 FAIL Stop if we ever reach here. |
| //3 NOP Dummy, so start of pattern looks the same as |
| // the start of an ( grouping. |
| //4 NOP Resreved, will be replaced by a save if there are |
| // OR | operators at the top level |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus); |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| |
| fParenStack.push(-1, *fStatus); // Begin a Paren Stack Frame |
| fParenStack.push( 3, *fStatus); // Push location of first NOP |
| break; |
| |
| case doPatFinish: |
| // We've scanned to the end of the pattern |
| // The end of pattern compiles to: |
| // URX_END |
| // which will stop the runtime match engine. |
| // Encountering end of pattern also behaves like a close paren, |
| // and forces fixups of the State Save at the beginning of the compiled pattern |
| // and of any OR operations at the top level. |
| // |
| handleCloseParen(); |
| if (fParenStack.size() > 0) { |
| // Missing close paren in pattern. |
| error(U_REGEX_MISMATCHED_PAREN); |
| } |
| |
| // add the END operation to the compiled pattern. |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); |
| |
| // Terminate the pattern compilation state machine. |
| returnVal = FALSE; |
| break; |
| |
| |
| |
| case doOrOperator: |
| // Scanning a '|', as in (A|B) |
| { |
| // Insert a SAVE operation at the start of the pattern section preceding |
| // this OR at this level. This SAVE will branch the match forward |
| // to the right hand side of the OR in the event that the left hand |
| // side fails to match and backtracks. Locate the position for the |
| // save from the location on the top of the parentheses stack. |
| int32_t savePosition = fParenStack.popi(); |
| int32_t op = fRXPat->fCompiledPat->elementAti(savePosition); |
| U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location |
| op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); |
| fRXPat->fCompiledPat->setElementAt(op, savePosition); |
| |
| // Append an JMP operation into the compiled pattern. The operand for |
| // the OR will eventually be the location following the ')' for the |
| // group. This will be patched in later, when the ')' is encountered. |
| op = URX_BUILD(URX_JMP, 0); |
| fRXPat->fCompiledPat->addElement(op, *fStatus); |
| |
| // Push the position of the newly added JMP op onto the parentheses stack. |
| // This registers if for fixup when this block's close paren is encountered. |
| fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| |
| // Append a NOP to the compiled pattern. This is the slot reserved |
| // for a SAVE in the event that there is yet another '|' following |
| // this one. |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| } |
| break; |
| |
| |
| case doOpenCaptureParen: |
| // Open Paren. |
| // Compile to a |
| // - NOP, which later may be replaced by a save-state if the |
| // parenthesized group gets a * quantifier, followed by |
| // - START_CAPTURE |
| // - NOP, which may later be replaced by a save-state if there |
| // is an '|' alternation within the parens. |
| { |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| fRXPat->fNumCaptureGroups++; |
| int32_t cop = URX_BUILD(URX_START_CAPTURE, fRXPat->fNumCaptureGroups); |
| fRXPat->fCompiledPat->addElement(cop, *fStatus); |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| |
| // On the Parentheses stack, start a new frame and add the postions |
| // of the two NOPs. Depending on what follows in the pattern, the |
| // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
| // address of the end of the parenthesized group. |
| fParenStack.push(-2, *fStatus); // Begin a new frame. |
| fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP |
| fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP |
| } |
| break; |
| |
| case doOpenNonCaptureParen: |
| // Open non-caputuring (grouping only) Paren. |
| // Compile to a |
| // - NOP, which later may be replaced by a save-state if the |
| // parenthesized group gets a * quantifier, followed by |
| // - NOP, which may later be replaced by a save-state if there |
| // is an '|' alternation within the parens. |
| { |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| |
| // On the Parentheses stack, start a new frame and add the postions |
| // of the two NOPs. |
| fParenStack.push(-1, *fStatus); // Begin a new frame. |
| fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP |
| fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP |
| } |
| break; |
| |
| |
| case doOpenAtomicParen: |
| // Open Atomic Paren. |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| case doOpenLookAhead: |
| // Open Paren. |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| case doOpenLookAheadNeg: |
| // Open Paren. |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| case doOpenLookBehind: |
| // Open Paren. |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| case doOpenLookBehindNeg: |
| // Open Paren. |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| case doCloseParen: |
| handleCloseParen(); |
| if (fParenStack.size() <= 0) { |
| // Extra close paren, or missing open paren. |
| error(U_REGEX_MISMATCHED_PAREN); |
| } |
| break; |
| |
| case doNOP: |
| break; |
| |
| |
| case doBadOpenParenType: |
| case doRuleError: |
| error(U_REGEX_RULE_SYNTAX); |
| returnVal = FALSE; |
| break; |
| |
| |
| case doMismatchedParenErr: |
| error(U_REGEX_MISMATCHED_PAREN); |
| returnVal = FALSE; |
| break; |
| |
| case doPlus: |
| // Normal '+' compiles to |
| // 1. stuff to be repeated (already built) |
| // 2. state-save 4 |
| // 3. jmp 1 |
| // 4. ... |
| { |
| int32_t topLoc = blockTopLoc(FALSE); // location of item #1 |
| |
| // Locate the position in the compiled pattern where the match will continue |
| // after completing the + (4 in the comment above) |
| int32_t continueLoc = fRXPat->fCompiledPat->size()+2; |
| |
| // Emit the STATE_SAVE |
| int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); |
| fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); |
| |
| // Emit the JMP |
| int32_t jmpOp = URX_BUILD(URX_JMP, topLoc); |
| fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
| } |
| break; |
| |
| case doNGPlus: |
| // Non-greedy '+?' compiles to |
| // 1. stuff to be repeated (already built) |
| // 2. state-save 1 |
| // 3. ... |
| { |
| int32_t topLoc = blockTopLoc(FALSE); |
| int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); |
| fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); |
| } |
| break; |
| |
| |
| case doOpt: |
| // Normal (greedy) ? quantifier. |
| // Compiles to |
| // 1. state save 3 |
| // 2. body of optional block |
| // 3. ... |
| // Insert the state save into the compiled pattern, and we're done. |
| { |
| int32_t saveStateLoc = blockTopLoc(TRUE); |
| int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); |
| fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| } |
| break; |
| |
| case doNGOpt: |
| // Non-greedy ?? quantifier |
| // compiles to |
| // 1. jmp 4 |
| // 2. body of optional block |
| // 3 jmp 5 |
| // 4. state save 2 |
| // 5 ... |
| // This code is less than ideal, with two jmps instead of one, because we can only |
| // insert one instruction at the top of the block being iterated. |
| { |
| int32_t jmp1_loc = blockTopLoc(TRUE); |
| int32_t jmp2_loc = fRXPat->fCompiledPat->size(); |
| |
| int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); |
| fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); |
| |
| int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); |
| fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); |
| |
| int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); |
| fRXPat->fCompiledPat->addElement(save_op, *fStatus); |
| } |
| break; |
| |
| |
| case doStar: |
| // Normal (greedy) * quantifier. |
| // Compiles to |
| // 1. STATE_SAVE 3 |
| // 2. body of stuff being iterated over |
| // 3. JMP 0 |
| // 4. ... |
| // |
| { |
| // location of item #1, the STATE_SAVE |
| int32_t saveStateLoc = blockTopLoc(TRUE); |
| |
| // Locate the position in the compiled pattern where the match will continue |
| // after completing the *. (4 in the comment above) |
| int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| |
| // Put together the save state op store it into the compiled code. |
| int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); |
| fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| |
| // Append the URX_JMP operation to the compiled pattern. Its target |
| // is the locaton of the state-save, above. |
| int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc); |
| fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
| } |
| break; |
| |
| case doNGStar: |
| // Non-greedy *? quantifier |
| // compiles to |
| // 1. JMP 3 |
| // 2. body of stuff being iterated over |
| // 3. STATE_SAVE 2 |
| // 4 ... |
| { |
| int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. |
| int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. |
| int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); |
| int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); |
| fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); |
| fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); |
| } |
| break; |
| |
| |
| |
| case doLiteralChar: |
| // We've just scanned a "normal" character from the pattern, |
| literalChar(); |
| break; |
| |
| |
| |
| case doDotAny: |
| // scanned a ".", match any single character. |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus); |
| break; |
| |
| case doCaret: // TODO: multi-line mode flag. |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); |
| break; |
| |
| |
| case doDollar: // TODO: multi-line mode flag. |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); |
| break; |
| |
| case doBackslashA: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus); |
| break; |
| |
| case doBackslashB: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus); |
| break; |
| |
| case doBackslashb: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus); |
| break; |
| |
| case doBackslashD: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus); |
| break; |
| |
| case doBackslashd: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus); |
| break; |
| |
| case doBackslashG: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); |
| break; |
| |
| case doBackslashS: |
| fRXPat->fCompiledPat->addElement( |
| URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus); |
| break; |
| |
| case doBackslashs: |
| fRXPat->fCompiledPat->addElement( |
| URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); |
| break; |
| |
| case doBackslashW: |
| fRXPat->fCompiledPat->addElement( |
| URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus); |
| break; |
| |
| case doBackslashw: |
| fRXPat->fCompiledPat->addElement( |
| URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); |
| break; |
| |
| case doBackslashX: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); |
| break; |
| |
| case doBackslashx: // \x{abcd} alternate hex format |
| // TODO: implement |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| |
| |
| case doBackslashZ: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); |
| break; |
| |
| case doBackslashz: |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); |
| break; |
| |
| case doExit: |
| returnVal = FALSE; |
| break; |
| |
| case doProperty: |
| { |
| UnicodeSet *theSet = scanProp(); |
| compileSet(theSet); |
| } |
| break; |
| |
| |
| case doScanUnicodeSet: |
| { |
| UnicodeSet *theSet = scanSet(); |
| compileSet(theSet); |
| } |
| break; |
| |
| case doEnterQuoteMode: |
| // Just scanned a \Q. Put character scanner into quote mode. |
| fQuoteMode = TRUE; |
| break; |
| |
| case doBackRef: |
| // TODO: implement back references. |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| case doNamedChar: // \N{NAMED_CHAR} |
| // TODO: implement |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| case doMatchMode: // (?i) and similar |
| // TODO: implement |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| case doNotImplementedError: |
| // TODO: get rid of this once everything is implemented. |
| error(U_REGEX_UNIMPLEMENTED); |
| break; |
| |
| |
| default: |
| error(U_REGEX_INTERNAL_ERROR); |
| returnVal = FALSE; |
| break; |
| } |
| return returnVal; |
| }; |
| |
| |
| |
| //------------------------------------------------------------------------------ |
| // |
| // literalChar We've encountered a literal character from the pattern, |
| // or an escape sequence that reduces to a character. |
| // Add it to the string containing all literal chars/strings from |
| // the pattern. |
| // If we are in a pattern string already, add the new char to it. |
| // If we aren't in a pattern string, begin one now. |
| // |
| //------------------------------------------------------------------------------ |
| void RegexCompile::literalChar() { |
| int32_t op; // An operation in the compiled pattern. |
| int32_t opType; |
| int32_t patternLoc; // A position in the compiled pattern. |
| int32_t stringLen; |
| |
| |
| // If the last thing compiled into the pattern was not a literal char, |
| // force this new literal char to begin a new string, and not append to the previous. |
| op = fRXPat->fCompiledPat->lastElementi(); |
| opType = URX_TYPE(op); |
| if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR)) { |
| fixLiterals(); |
| } |
| |
| if (fStringOpStart == -1) { |
| // First char of a string in the pattern. |
| // Emit a OneChar op into the compiled pattern. |
| op = URX_BUILD(URX_ONECHAR, fC.fChar); |
| fRXPat->fCompiledPat->addElement(op, *fStatus); |
| |
| // Also add it to the string pool, in case we get a second adjacent literal |
| // and want to change form ONE_CHAR to STRING |
| fStringOpStart = fRXPat->fLiteralText.length(); |
| fRXPat->fLiteralText.append(fC.fChar); |
| return; |
| } |
| |
| // We are adding onto an existing string |
| fRXPat->fLiteralText.append(fC.fChar); |
| |
| // If the most recently emitted op is a URX_ONECHAR, change it to a string op. |
| op = fRXPat->fCompiledPat->lastElementi(); |
| opType = URX_TYPE(op); |
| U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN); |
| if (opType == URX_ONECHAR) { |
| op = URX_BUILD(URX_STRING, fStringOpStart); |
| patternLoc = fRXPat->fCompiledPat->size() - 1; |
| fRXPat->fCompiledPat->setElementAt(op, patternLoc); |
| op = URX_BUILD(URX_STRING_LEN, 0); |
| fRXPat->fCompiledPat->addElement(op, *fStatus); |
| } |
| |
| // The pattern contains a URX_SRING / URX_STRING_LEN. Update the |
| // string length to reflect the new char we just added to the string. |
| stringLen = fRXPat->fLiteralText.length() - fStringOpStart; |
| op = URX_BUILD(URX_STRING_LEN, stringLen); |
| patternLoc = fRXPat->fCompiledPat->size() - 1; |
| fRXPat->fCompiledPat->setElementAt(op, patternLoc); |
| } |
| |
| |
| |
| //------------------------------------------------------------------------------ |
| // |
| // fixLiterals When compiling something that can follow a literal |
| // string in a pattern, we need to "fix" any preceding |
| // string, which will cause any subsequent literals to |
| // begin a new string, rather than appending to the |
| // old one. |
| // |
| // Optionally, split the last char of the string off into |
| // a single "ONE_CHAR" operation, so that quantifiers can |
| // apply to that char alone. Example: abc* |
| // The * needs to apply to the 'c' only. |
| // |
| //------------------------------------------------------------------------------ |
| void RegexCompile::fixLiterals(UBool split) { |
| int32_t stringStart = fStringOpStart; // start index of the current literal string |
| int32_t op; // An op from/for the compiled pattern. |
| int32_t opType; // An opcode type from the compiled pattern. |
| int32_t stringLastCharIdx; |
| UChar32 lastChar; |
| int32_t stringNextToLastCharIdx; |
| UChar32 nextToLastChar; |
| int32_t stringLen; |
| |
| fStringOpStart = -1; |
| if (!split) { |
| return; |
| } |
| |
| // Split: We need to ensure that the last item in the compiled pattern does |
| // not refer to a literal string of more than one char. If it does, |
| // separate the last char from the rest of the string. |
| |
| // If the last operation from the compiled pattern is not a string, |
| // nothing needs to be done |
| op = fRXPat->fCompiledPat->lastElementi(); |
| opType = URX_TYPE(op); |
| if (opType != URX_STRING_LEN) { |
| return; |
| } |
| stringLen = URX_VAL(op); |
| |
| // |
| // Find the position of the last code point in the string (might be a surrogate pair) |
| // |
| stringLastCharIdx = fRXPat->fLiteralText.length(); |
| stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); |
| lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx); |
| |
| // The string should always be at least two code points long, meaning that there |
| // should be something before the last char position that we just found. |
| U_ASSERT(stringLastCharIdx > stringStart); |
| stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); |
| U_ASSERT(stringNextToLastCharIdx >= stringStart); |
| nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx); |
| |
| if (stringNextToLastCharIdx > stringStart) { |
| // The length of string remaining after removing one char is two or more. |
| // Leave the string in the compiled pattern, shorten it by one char, |
| // and append a URX_ONECHAR op for the last char. |
| stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx); |
| op = URX_BUILD(URX_STRING_LEN, stringLen); |
| fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1); |
| op = URX_BUILD(URX_ONECHAR, lastChar); |
| fRXPat->fCompiledPat->addElement(op, *fStatus); |
| } else { |
| // The original string consisted of exactly two characters. Replace |
| // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair |
| // of URX_ONECHARs. |
| op = URX_BUILD(URX_ONECHAR, nextToLastChar); |
| fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -2); |
| op = URX_BUILD(URX_ONECHAR, lastChar); |
| fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1); |
| } |
| } |
| |
| |
| |
| |
| |
| |
| //------------------------------------------------------------------------------ |
| // |
| // blockTopLoc() Find or create a location in the compiled pattern |
| // at the start of the operation or block that has |
| // just been compiled. Needed when a quantifier (* or |
| // whatever) appears, and we need to add an operation |
| // at the start of the thing being quantified. |
| // |
| // (Parenthesized Blocks) have a slot with a NOP that |
| // is reserved for this purpose. .* or similar don't |
| // and a slot needs to be added. |
| // |
| // parameter reserveLoc : TRUE - ensure that there is space to add an opcode |
| // at the returned location. |
| // FALSE - just return the address, do not reserve a location there. |
| // |
| //------------------------------------------------------------------------------ |
| int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { |
| int32_t theLoc; |
| if (fRXPat->fCompiledPat->size() == fMatchCloseParen) |
| { |
| // The item just processed is a parenthesized block. |
| theLoc = fMatchOpenParen; // A slot is already reserved for us. |
| U_ASSERT(theLoc > 0); |
| uint32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc); |
| U_ASSERT(URX_TYPE(opAtTheLoc) == URX_NOP); |
| } |
| else { |
| // Item just compiled is a single thing, a ".", or a single char, or a set reference. |
| // No slot for STATE_SAVE was pre-reserved in the compiled code. |
| // We need to make space now. |
| fixLiterals(TRUE); // If last item was a string, separate the last char. |
| theLoc = fRXPat->fCompiledPat->size()-1; |
| if (reserveLoc) { |
| int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc); |
| int32_t prevType = URX_TYPE(opAtTheLoc); |
| int32_t nop = URX_BUILD(URX_NOP, 0); |
| fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); |
| } |
| } |
| return theLoc; |
| } |
| |
| |
| |
| //------------------------------------------------------------------------------ |
| // |
| // handleCloseParen When compiling a close paren, we need to go back |
| // and fix up any JMP or SAVE operations within the |
| // parenthesized block that need to target the end |
| // of the block. The locations of these are kept on |
| // the parentheses stack. |
| // |
| // This function is called both when encountering a |
| // real ) and at the end of the pattern. |
| // |
| //------------------------------------------------------------------------------- |
| void RegexCompile::handleCloseParen() { |
| int32_t patIdx; |
| int32_t patOp; |
| if (fParenStack.size() <= 0) { |
| error(U_REGEX_MISMATCHED_PAREN); |
| return; |
| } |
| |
| // Force any literal chars that may follow the close paren to start a new string, |
| // and not attach to any preceding it. |
| fixLiterals(FALSE); |
| |
| // Fixup any operations within the just-closed parenthesized group |
| // that need to reference the end of the (block). |
| // (The first one on popped from the stack is an unused slot for |
| // alternation (OR) state save, but applying the fixup to it does no harm.) |
| for (;;) { |
| patIdx = fParenStack.popi(); |
| if (patIdx < 0) { |
| break; |
| } |
| U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); |
| patOp = fRXPat->fCompiledPat->elementAti(patIdx); |
| U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. |
| patOp |= fRXPat->fCompiledPat->size(); // Set it now. |
| fRXPat->fCompiledPat->setElementAt(patOp, patIdx); |
| fMatchOpenParen = patIdx; |
| } |
| |
| // DO any additional fixups, depending on the specific kind of |
| // parentesized grouping this is |
| |
| switch (patIdx) { |
| case -1: |
| // No additional fixups required. |
| // This is the case with most kinds of groupings. |
| break; |
| case -2: |
| // Capturing Parentheses. |
| // Insert a End Capture op into the pattern. |
| // Grab the group number from the start capture op |
| // and put it into the end-capture op. |
| { |
| int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); |
| U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); |
| int32_t captureGroupNumber = URX_VAL(captureOp); |
| U_ASSERT(captureGroupNumber > 0); |
| int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, captureGroupNumber); |
| fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); |
| } |
| break; |
| default: |
| U_ASSERT(FALSE); |
| } |
| |
| // remember the next location in the compiled pattern. |
| // The compilation of Quantifiers will look at this to see whether its looping |
| // over a parenthesized block or a single item |
| fMatchCloseParen = fRXPat->fCompiledPat->size(); |
| } |
| |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // compileSet Compile the pattern operations for a reference to a |
| // UnicodeSet. |
| // |
| //---------------------------------------------------------------------------------------- |
| void RegexCompile::compileSet(UnicodeSet *theSet) |
| { |
| if (theSet == NULL) { |
| return; |
| } |
| int32_t setSize = theSet->size(); |
| UChar32 firstSetChar = theSet->charAt(0); |
| if (firstSetChar == -1) { |
| // Sets that contain only strings, but no individual chars, |
| // will end up here. TODO: figure out what to with sets containing strings. |
| setSize = 0; |
| } |
| |
| switch (setSize) { |
| case 0: |
| { |
| // Set of no elements. Always fails to match. |
| fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); |
| delete theSet; |
| } |
| break; |
| |
| case 1: |
| { |
| // The set contains only a single code point. Put it into |
| // the compiled pattern as a single char operation rather |
| // than a set, and discard the set itself. |
| int32_t charToken = URX_BUILD(URX_ONECHAR, firstSetChar); |
| fRXPat->fCompiledPat->addElement(charToken, *fStatus); |
| delete theSet; |
| } |
| break; |
| |
| default: |
| { |
| // The set contains two or more chars. (the normal case) |
| // Put it into the compiled pattern as a set. |
| int32_t setNumber = fRXPat->fSets->size(); |
| fRXPat->fSets->addElement(theSet, *fStatus); |
| int32_t setOp = URX_BUILD(URX_SETREF, setNumber); |
| fRXPat->fCompiledPat->addElement(setOp, *fStatus); |
| } |
| } |
| } |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Error Report a rule parse error. |
| // Only report it if no previous error has been recorded. |
| // |
| //---------------------------------------------------------------------------------------- |
| void RegexCompile::error(UErrorCode e) { |
| if (U_SUCCESS(*fStatus)) { |
| *fStatus = e; |
| fParseErr->line = fLineNum; |
| fParseErr->offset = fCharNum; |
| fParseErr->preContext[0] = 0; // TODO: copy in some input pattern text |
| fParseErr->preContext[0] = 0; |
| } |
| } |
| |
| |
| |
| |
| // |
| // Assorted Unicode character constants. |
| // Numeric because there is no portable way to enter them as literals. |
| // (Think EBCDIC). |
| // |
| static const UChar chCR = 0x0d; // New lines, for terminating comments. |
| static const UChar chLF = 0x0a; |
| static const UChar chNEL = 0x85; // NEL newline variant |
| static const UChar chLS = 0x2028; // Unicode Line Separator |
| static const UChar chApos = 0x27; // single quote, for quoted chars. |
| static const UChar chPound = 0x23; // '#', introduces a comment. |
| static const UChar chE = 0x45; // 'E' |
| static const UChar chBackSlash = 0x5c; // '\' introduces a char escape |
| static const UChar chLParen = 0x28; |
| static const UChar chRParen = 0x29; |
| static const UChar chLBracket = 0x5b; |
| static const UChar chRBracket = 0x5d; |
| static const UChar chRBrace = 0x7d; |
| static const UChar chLowerP = 0x70; |
| static const UChar chUpperP = 0x50; |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // nextCharLL Low Level Next Char from the regex pattern. |
| // Get a char from the string, |
| // keep track of input position for error reporting. |
| // |
| //---------------------------------------------------------------------------------------- |
| UChar32 RegexCompile::nextCharLL() { |
| UChar32 ch; |
| UnicodeString &pattern = fRXPat->fPattern; |
| |
| if (fPeekChar != -1) { |
| ch = fPeekChar; |
| fPeekChar = -1; |
| return ch; |
| } |
| if (fPatternLength==0 || fNextIndex >= fPatternLength) { |
| return (UChar32)-1; |
| } |
| ch = pattern.char32At(fNextIndex); |
| fNextIndex = pattern.moveIndex32(fNextIndex, 1); |
| |
| if (ch == chCR || |
| ch == chNEL || |
| ch == chLS || |
| ch == chLF && fLastChar != chCR) { |
| // Character is starting a new line. Bump up the line number, and |
| // reset the column to 0. |
| fLineNum++; |
| fCharNum=0; |
| if (fQuoteMode) { |
| error(U_REGEX_RULE_SYNTAX); |
| fQuoteMode = FALSE; |
| } |
| } |
| else { |
| // Character is not starting a new line. Except in the case of a |
| // LF following a CR, increment the column position. |
| if (ch != chLF) { |
| fCharNum++; |
| } |
| } |
| fLastChar = ch; |
| return ch; |
| } |
| |
| //--------------------------------------------------------------------------------- |
| // |
| // peekCharLL Low Level Character Scanning, sneak a peek at the next |
| // character without actually getting it. |
| // |
| //--------------------------------------------------------------------------------- |
| UChar32 RegexCompile::peekCharLL() { |
| if (fPeekChar == -1) { |
| fPeekChar = nextCharLL(); |
| } |
| return fPeekChar; |
| } |
| |
| |
| //--------------------------------------------------------------------------------- |
| // |
| // nextChar for pattern scanning. At this level, we handle stripping |
| // out comments and processing some backslash character escapes. |
| // The rest of the pattern grammar is handled at the next level up. |
| // |
| //--------------------------------------------------------------------------------- |
| void RegexCompile::nextChar(RegexPatternChar &c) { |
| |
| // Unicode Character constants needed for the processing done by nextChar(), |
| // in hex because literals wont work on EBCDIC machines. |
| |
| fScanIndex = fNextIndex; |
| c.fChar = nextCharLL(); |
| c.fQuoted = FALSE; |
| |
| if (fQuoteMode) { |
| c.fQuoted = TRUE; |
| if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) { |
| fQuoteMode = FALSE; // Exit quote mode, |
| nextCharLL(); // discard the E |
| nextChar(c); // recurse to get the real next char |
| } |
| } |
| else |
| { |
| // We are not in a 'quoted region' of the source. |
| // |
| if (fFreeForm && c.fChar == chPound) { |
| // Start of a comment. Consume the rest of it. |
| // The new-line char that terminates the comment is always returned. |
| // It will be treated as white-space, and serves to break up anything |
| // that might otherwise incorrectly clump together with a comment in |
| // the middle (a variable name, for example.) |
| for (;;) { |
| c.fChar = nextCharLL(); |
| if (c.fChar == (UChar32)-1 || // EOF |
| c.fChar == chCR || |
| c.fChar == chLF || |
| c.fChar == chNEL || |
| c.fChar == chLS) {break;} |
| } |
| } |
| if (c.fChar == (UChar32)-1) { |
| return; |
| } |
| |
| // |
| // check for backslash escaped characters. |
| // Use UnicodeString::unescapeAt() to handle those that it can. |
| // Otherwise just return the '\', and let the pattern parser deal with it. |
| // |
| int32_t startX = fNextIndex; // start and end positions of the |
| int32_t endX = fNextIndex; // sequence following the '\' |
| if (c.fChar == chBackSlash) { |
| if (gUnescapeCharSet->contains(peekCharLL())) { |
| nextCharLL(); // get & discard the peeked char. |
| c.fQuoted = TRUE; |
| c.fChar = fRXPat->fPattern.unescapeAt(endX); |
| if (startX == endX) { |
| error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| } |
| fCharNum += endX - startX; |
| fNextIndex = endX; |
| } |
| } |
| } |
| // putc(c.fChar, stdout); |
| } |
| |
| |
| |
| //--------------------------------------------------------------------------------- |
| // |
| // scanSet Construct a UnicodeSet from the text at the current scan |
| // position. Advance the scan position to the first character |
| // after the set. |
| // |
| // The scan position is normally under the control of the state machine |
| // that controls pattern parsing. UnicodeSets, however, are parsed by |
| // the UnicodeSet constructor, not by the Regex pattern parser. |
| // |
| //--------------------------------------------------------------------------------- |
| UnicodeSet *RegexCompile::scanSet() { |
| UnicodeSet *uset = NULL; |
| ParsePosition pos; |
| int startPos; |
| int i; |
| |
| if (U_FAILURE(*fStatus)) { |
| return NULL; |
| } |
| |
| pos.setIndex(fScanIndex); |
| startPos = fScanIndex; |
| UErrorCode localStatus = U_ZERO_ERROR; |
| uset = new UnicodeSet(fRXPat->fPattern, pos, |
| localStatus); |
| if (U_FAILURE(localStatus)) { |
| // TODO: Get more accurate position of the error from UnicodeSet's return info. |
| // UnicodeSet appears to not be reporting correctly at this time. |
| REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); |
| error(localStatus); |
| delete uset; |
| return NULL; |
| } |
| |
| // Advance the current scan postion over the UnicodeSet. |
| // Don't just set fScanIndex because the line/char positions maintained |
| // for error reporting would be thrown off. |
| i = pos.getIndex(); |
| for (;;) { |
| if (fNextIndex >= i) { |
| break; |
| } |
| nextCharLL(); |
| } |
| |
| return uset; |
| }; |
| |
| |
| //--------------------------------------------------------------------------------- |
| // |
| // scanProp Construct a UnicodeSet from the text at the current scan |
| // position, which will be of the form \p{whatever} |
| // |
| // The scan position will be at the 'p' or 'P'. On return |
| // the scan position should be just after the '}' |
| // |
| // Return a UnicodeSet, constructed from the \P pattern, |
| // or NULL if the pattern is invalid. |
| // |
| //--------------------------------------------------------------------------------- |
| UnicodeSet *RegexCompile::scanProp() { |
| UnicodeSet *uset = NULL; |
| |
| if (U_FAILURE(*fStatus)) { |
| return NULL; |
| } |
| |
| U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP); |
| |
| // enclose the \p{property} from the regex pattern source in [brackets] |
| UnicodeString setPattern; |
| setPattern.append(chLBracket); |
| setPattern.append(chBackSlash); |
| for (;;) { |
| setPattern.append(fC.fChar); |
| if (fC.fChar == chRBrace) { |
| break; |
| } |
| nextChar(fC); |
| if (fC.fChar == -1) { |
| // Hit the end of the input string without finding the closing '}' |
| *fStatus = U_REGEX_PROPERTY_SYNTAX; |
| return NULL; |
| } |
| } |
| setPattern.append(chRBracket); |
| |
| // Build the UnicodeSet from the set pattern we just built up in a string. |
| uset = new UnicodeSet(setPattern, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| delete uset; |
| uset = NULL; |
| } |
| |
| nextChar(fC); // Continue overall regex pattern processing with char after the '}' |
| return uset; |
| }; |
| |
| U_NAMESPACE_END |
| #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |