|  | 
 | // | 
 | //  file:  regexcmp.cpp | 
 | // | 
 | //  Copyright (C) 2002-2006 International Business Machines Corporation and others. | 
 | //  All Rights Reserved. | 
 | // | 
 | //  This file contains the ICU regular expression compiler, which is responsible | 
 | //  for processing a regular expression pattern into the compiled form that | 
 | //  is used by the match finding engine. | 
 | // | 
 |  | 
 | #include "unicode/utypes.h" | 
 |  | 
 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 
 |  | 
 | #include "unicode/unistr.h" | 
 | #include "unicode/uniset.h" | 
 | #include "unicode/uchar.h" | 
 | #include "unicode/uchriter.h" | 
 | #include "unicode/parsepos.h" | 
 | #include "unicode/parseerr.h" | 
 | #include "unicode/regex.h" | 
 | #include "util.h" | 
 | #include "cmemory.h" | 
 | #include "cstring.h" | 
 | #include "uvectr32.h" | 
 | #include "uassert.h" | 
 | #include "ucln_in.h" | 
 | #include "mutex.h" | 
 |  | 
 | #include "regeximp.h" | 
 | #include "regexcst.h"   // Contains state table for the regex pattern parser. | 
 |                         //   generated by a Perl script. | 
 | #include "regexcmp.h" | 
 | #include "regexst.h" | 
 |  | 
 |  | 
 |  | 
 | U_NAMESPACE_BEGIN | 
 |  | 
 |  | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //  Constructor. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status) | 
 | { | 
 |     fStatus           = &status; | 
 |  | 
 |     fRXPat            = rxp; | 
 |     fScanIndex        = 0; | 
 |     fNextIndex        = 0; | 
 |     fPeekChar         = -1; | 
 |     fLineNum          = 1; | 
 |     fCharNum          = 0; | 
 |     fQuoteMode        = FALSE; | 
 |     fInBackslashQuote = FALSE; | 
 |     fModeFlags        = fRXPat->fFlags | 0x80000000; | 
 |     fEOLComments      = TRUE; | 
 |  | 
 |     fMatchOpenParen   = -1; | 
 |     fMatchCloseParen  = -1; | 
 |     fStringOpStart    = -1; | 
 |  | 
 |     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { | 
 |         status = rxp->fDeferredStatus; | 
 |     } | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //  Destructor | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | RegexCompile::~RegexCompile() { | 
 | } | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //  Compile regex pattern.   The state machine for rexexp pattern parsing is here. | 
 | //                           The state tables are hand-written in the file regexcst.txt, | 
 | //                           and converted to the form used here by a perl | 
 | //                           script regexcst.pl | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void    RegexCompile::compile( | 
 |                          const UnicodeString &pat,   // Source pat to be compiled. | 
 |                          UParseError &pp,            // Error position info | 
 |                          UErrorCode &e)              // Error Code | 
 | { | 
 |     fStatus             = &e; | 
 |     fParseErr           = &pp; | 
 |     fStackPtr           = 0; | 
 |     fStack[fStackPtr]   = 0; | 
 |  | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     // There should be no pattern stuff in the RegexPattern object.  They can not be reused. | 
 |     U_ASSERT(fRXPat->fPattern.length() == 0); | 
 |  | 
 |     // Prepare the RegexPattern object to receive the compiled pattern. | 
 |     //   TODO:  remove per-instance field, and just use globals directly.  (But check perf) | 
 |     fRXPat->fPattern        = pat; | 
 |     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets; | 
 |     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8; | 
 |  | 
 |  | 
 |     // Initialize the pattern scanning state machine | 
 |     fPatternLength = pat.length(); | 
 |     uint16_t                state = 1; | 
 |     const RegexTableEl      *tableEl; | 
 |     nextChar(fC);                        // Fetch the first char from the pattern string. | 
 |  | 
 |     // | 
 |     // Main loop for the regex pattern parsing state machine. | 
 |     //   Runs once per state transition. | 
 |     //   Each time through optionally performs, depending on the state table, | 
 |     //      - an advance to the the next pattern char | 
 |     //      - an action to be performed. | 
 |     //      - pushing or popping a state to/from the local state return stack. | 
 |     //   file regexcst.txt is the source for the state table.  The logic behind | 
 |     //     recongizing the pattern syntax is there, not here. | 
 |     // | 
 |     for (;;) { | 
 |         //  Bail out if anything has gone wrong. | 
 |         //  Regex pattern parsing stops on the first error encountered. | 
 |         if (U_FAILURE(*fStatus)) { | 
 |             break; | 
 |         } | 
 |  | 
 |         U_ASSERT(state != 0); | 
 |  | 
 |         // Find the state table element that matches the input char from the pattern, or the | 
 |         //    class of the input character.  Start with the first table row for this | 
 |         //    state, then linearly scan forward until we find a row that matches the | 
 |         //    character.  The last row for each state always matches all characters, so | 
 |         //    the search will stop there, if not before. | 
 |         // | 
 |         tableEl = &gRuleParseStateTable[state]; | 
 |         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",  | 
 |             fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); | 
 |  | 
 |         for (;;) {    // loop through table rows belonging to this state, looking for one | 
 |                       //   that matches the current input char. | 
 |             REGEX_SCAN_DEBUG_PRINTF((".")); | 
 |             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) { | 
 |                 // Table row specified an individual character, not a set, and | 
 |                 //   the input character is not quoted, and | 
 |                 //   the input character matched it. | 
 |                 break; | 
 |             } | 
 |             if (tableEl->fCharClass == 255) { | 
 |                 // Table row specified default, match anything character class. | 
 |                 break; | 
 |             } | 
 |             if (tableEl->fCharClass == 254 && fC.fQuoted)  { | 
 |                 // Table row specified "quoted" and the char was quoted. | 
 |                 break; | 
 |             } | 
 |             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  { | 
 |                 // Table row specified eof and we hit eof on the input. | 
 |                 break; | 
 |             } | 
 |  | 
 |             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class && | 
 |                 fC.fQuoted == FALSE &&                                       //   char is not escaped && | 
 |                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF | 
 |                 UnicodeSet *uniset = RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128]; | 
 |                 if (uniset->contains(fC.fChar)) { | 
 |                     // Table row specified a character class, or set of characters, | 
 |                     //   and the current char matches it. | 
 |                     break; | 
 |                 } | 
 |             } | 
 |  | 
 |             // No match on this row, advance to the next  row for this state, | 
 |             tableEl++; | 
 |         } | 
 |         REGEX_SCAN_DEBUG_PRINTF(("\n")); | 
 |  | 
 |         // | 
 |         // We've found the row of the state table that matches the current input | 
 |         //   character from the rules string. | 
 |         // Perform any action specified  by this row in the state table. | 
 |         if (doParseActions((EParseAction)tableEl->fAction) == FALSE) { | 
 |             // Break out of the state machine loop if the | 
 |             //   the action signalled some kind of error, or | 
 |             //   the action was to exit, occurs on normal end-of-rules-input. | 
 |             break; | 
 |         } | 
 |  | 
 |         if (tableEl->fPushState != 0) { | 
 |             fStackPtr++; | 
 |             if (fStackPtr >= kStackSize) { | 
 |                 error(U_REGEX_INTERNAL_ERROR); | 
 |                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); | 
 |                 fStackPtr--; | 
 |             } | 
 |             fStack[fStackPtr] = tableEl->fPushState; | 
 |         } | 
 |  | 
 |         // | 
 |         //  NextChar.  This is where characters are actually fetched from the pattern. | 
 |         //             Happens under control of the 'n' tag in the state table. | 
 |         // | 
 |         if (tableEl->fNextChar) { | 
 |             nextChar(fC); | 
 |         } | 
 |  | 
 |         // Get the next state from the table entry, or from the | 
 |         //   state stack if the next state was specified as "pop". | 
 |         if (tableEl->fNextState != 255) { | 
 |             state = tableEl->fNextState; | 
 |         } else { | 
 |             state = fStack[fStackPtr]; | 
 |             fStackPtr--; | 
 |             if (fStackPtr < 0) { | 
 |                 // state stack underflow | 
 |                 // This will occur if the user pattern has mis-matched parentheses, | 
 |                 //   with extra close parens. | 
 |                 //  | 
 |                 fStackPtr++; | 
 |                 error(U_REGEX_MISMATCHED_PAREN); | 
 |             } | 
 |         } | 
 |  | 
 |     } | 
 |  | 
 |     // | 
 |     // The pattern has now been read and processed, and the compiled code generated. | 
 |     // | 
 |  | 
 |     // Back-reference fixup | 
 |     // | 
 |     int32_t loc; | 
 |     for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) { | 
 |         int32_t op = fRXPat->fCompiledPat->elementAti(loc); | 
 |         int32_t opType = URX_TYPE(op); | 
 |         if (opType == URX_BACKREF || opType == URX_BACKREF_I) { | 
 |             int32_t where = URX_VAL(op); | 
 |             if (where > fRXPat->fGroupMap->size()) { | 
 |                 error(U_REGEX_INVALID_BACK_REF); | 
 |                 break; | 
 |             } | 
 |             where = fRXPat->fGroupMap->elementAti(where-1); | 
 |             op    = URX_BUILD(opType, where); | 
 |             fRXPat->fCompiledPat->setElementAt(op, loc); | 
 |         } | 
 |     } | 
 |  | 
 |  | 
 |     // | 
 |     // Compute the number of digits requried for the largest capture group number. | 
 |     // | 
 |     fRXPat->fMaxCaptureDigits = 1; | 
 |     int32_t  n = 10; | 
 |     for (;;) { | 
 |         if (n > fRXPat->fGroupMap->size()) { | 
 |             break; | 
 |         } | 
 |         fRXPat->fMaxCaptureDigits++; | 
 |         n *= 10; | 
 |     } | 
 |  | 
 |     // | 
 |     // The pattern's fFrameSize so far has accumulated the requirements for | 
 |     //   storage for capture parentheses, counters, etc. that are encountered | 
 |     //   in the pattern.  Add space for the two variables that are always | 
 |     //   present in the saved state:  the input string position and the | 
 |     //   position in the compiled pattern. | 
 |     // | 
 |     fRXPat->fFrameSize+=2; | 
 |  | 
 |     // | 
 |     // Get bounds for the minimum and maximum length of a string that this | 
 |     //   pattern can match.  Used to avoid looking for matches in strings that | 
 |     //   are too short. | 
 |     // | 
 |     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); | 
 |  | 
 |     // | 
 |     // Optimization passes | 
 |     //   | 
 |     matchStartType();   | 
 |     OptDotStar(); | 
 |     stripNOPs(); | 
 |  | 
 |     // | 
 |     // Set up fast latin-1 range sets | 
 |     // | 
 |     int32_t numSets = fRXPat->fSets->size(); | 
 |     fRXPat->fSets8 = new Regex8BitSet[numSets]; | 
 |     int32_t i; | 
 |     for (i=0; i<numSets; i++) { | 
 |         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); | 
 |         fRXPat->fSets8[i].init(s); | 
 |     } | 
 |  | 
 | } | 
 |  | 
 |  | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //  doParseAction        Do some action during regex pattern parsing. | 
 | //                       Called by the parse state machine. | 
 | // | 
 | //                       Generation of the match engine PCode happens here, or | 
 | //                       in functions called from the parse actions defined here. | 
 | // | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | UBool RegexCompile::doParseActions(EParseAction action) | 
 | { | 
 |     UBool   returnVal = TRUE; | 
 |  | 
 |     switch ((Regex_PatternParseAction)action) { | 
 |  | 
 |     case doPatStart: | 
 |         // Start of pattern compiles to: | 
 |         //0   SAVE   2        Fall back to position of FAIL | 
 |         //1   jmp    3 | 
 |         //2   FAIL            Stop if we ever reach here. | 
 |         //3   NOP             Dummy, so start of pattern looks the same as | 
 |         //                    the start of an ( grouping. | 
 |         //4   NOP             Resreved, will be replaced by a save if there are | 
 |         //                    OR | operators at the top level | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus); | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP,  3), *fStatus); | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); | 
 |  | 
 |         // Standard open nonCapture paren action emits the two NOPs and | 
 |         //   sets up the paren stack frame. | 
 |         doParseActions((EParseAction)doOpenNonCaptureParen); | 
 |         break; | 
 |  | 
 |     case doPatFinish: | 
 |         // We've scanned to the end of the pattern | 
 |         //  The end of pattern compiles to: | 
 |         //        URX_END | 
 |         //    which will stop the runtime match engine. | 
 |         //  Encountering end of pattern also behaves like a close paren, | 
 |         //   and forces fixups of the State Save at the beginning of the compiled pattern | 
 |         //   and of any OR operations at the top level. | 
 |         // | 
 |         handleCloseParen(); | 
 |         if (fParenStack.size() > 0) { | 
 |             // Missing close paren in pattern. | 
 |             error(U_REGEX_MISMATCHED_PAREN); | 
 |         } | 
 |  | 
 |         // add the END operation to the compiled pattern. | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); | 
 |  | 
 |         // Terminate the pattern compilation state machine. | 
 |         returnVal = FALSE; | 
 |         break; | 
 |  | 
 |  | 
 |  | 
 |     case doOrOperator: | 
 |         // Scanning a '|', as in (A|B) | 
 |         { | 
 |             // Insert a SAVE operation at the start of the pattern section preceding | 
 |             //   this OR at this level.  This SAVE will branch the match forward | 
 |             //   to the right hand side of the OR in the event that the left hand | 
 |             //   side fails to match and backtracks.  Locate the position for the | 
 |             //   save from the location on the top of the parentheses stack. | 
 |             int32_t savePosition = fParenStack.popi(); | 
 |             int32_t op = fRXPat->fCompiledPat->elementAti(savePosition); | 
 |             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location | 
 |             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | 
 |             fRXPat->fCompiledPat->setElementAt(op, savePosition); | 
 |  | 
 |             // Append an JMP operation into the compiled pattern.  The operand for | 
 |             //  the JMP will eventually be the location following the ')' for the | 
 |             //  group.  This will be patched in later, when the ')' is encountered. | 
 |             op = URX_BUILD(URX_JMP, 0); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             // Push the position of the newly added JMP op onto the parentheses stack. | 
 |             // This registers if for fixup when this block's close paren is encountered. | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 
 |  | 
 |             // Append a NOP to the compiled pattern.  This is the slot reserved | 
 |             //   for a SAVE in the event that there is yet another '|' following | 
 |             //   this one. | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doOpenCaptureParen: | 
 |         // Open Paren. | 
 |         //   Compile to a | 
 |         //      - NOP, which later may be replaced by a save-state if the | 
 |         //         parenthesized group gets a * quantifier, followed by | 
 |         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables. | 
 |         //      - NOP, which may later be replaced by a save-state if there | 
 |         //             is an '|' alternation within the parens. | 
 |         // | 
 |         //    Each capture group gets three slots in the save stack frame: | 
 |         //         0:   Capture Group start position (in input string being matched.) | 
 |         //         1:   Capture Group end   positino. | 
 |         //         2:   Start of Match-in-progress. | 
 |         //    The first two locations are for a completed capture group, and are | 
 |         //     referred to by back references and the like. | 
 |         //    The third location stores the capture start position when an START_CAPTURE is | 
 |         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding | 
 |         //      END_CAPure is encountered. | 
 |         { | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |             int32_t  varsLoc    = fRXPat->fFrameSize;    // Reserve three slots in match stack frame. | 
 |             fRXPat->fFrameSize += 3; | 
 |             int32_t  cop        = URX_BUILD(URX_START_CAPTURE, varsLoc); | 
 |             fRXPat->fCompiledPat->addElement(cop, *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |  | 
 |             // On the Parentheses stack, start a new frame and add the postions | 
 |             //   of the two NOPs.  Depending on what follows in the pattern, the | 
 |             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target | 
 |             //   address of the end of the parenthesized group. | 
 |             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state | 
 |             fParenStack.push(capturing, *fStatus);                        // Frame type. | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc | 
 |  | 
 |             // Save the mapping from group number to stack frame variable position. | 
 |             fRXPat->fGroupMap->addElement(varsLoc, *fStatus); | 
 |         } | 
 |          break; | 
 |  | 
 |     case doOpenNonCaptureParen: | 
 |         // Open non-caputuring (grouping only) Paren. | 
 |         //   Compile to a | 
 |         //      - NOP, which later may be replaced by a save-state if the | 
 |         //         parenthesized group gets a * quantifier, followed by | 
 |         //      - NOP, which may later be replaced by a save-state if there | 
 |         //             is an '|' alternation within the parens. | 
 |         { | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |  | 
 |             // On the Parentheses stack, start a new frame and add the postions | 
 |             //   of the two NOPs. | 
 |             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state | 
 |             fParenStack.push(plain,      *fStatus);                       // Begin a new frame. | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc | 
 |         } | 
 |          break; | 
 |  | 
 |  | 
 |     case doOpenAtomicParen: | 
 |         // Open Atomic Paren.  (?> | 
 |         //   Compile to a | 
 |         //      - NOP, which later may be replaced if the parenthesized group  | 
 |         //         has a quantifier, followed by | 
 |         //      - STO_SP  save state stack position, so it can be restored at the ")" | 
 |         //      - NOP, which may later be replaced by a save-state if there | 
 |         //             is an '|' alternation within the parens. | 
 |         { | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |             int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the | 
 |             fRXPat->fDataSize += 1;                    //  state stack ptr. | 
 |             int32_t  stoOp     = URX_BUILD(URX_STO_SP, varLoc); | 
 |             fRXPat->fCompiledPat->addElement(stoOp, *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |  | 
 |             // On the Parentheses stack, start a new frame and add the postions | 
 |             //   of the two NOPs.  Depending on what follows in the pattern, the | 
 |             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target | 
 |             //   address of the end of the parenthesized group. | 
 |             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state | 
 |             fParenStack.push(atomic, *fStatus);                           // Frame type. | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doOpenLookAhead: | 
 |         // Positive Look-ahead   (?=  stuff  ) | 
 |         // Compiles to | 
 |         //    1    START_LA     dataLoc | 
 |         //    2.   NOP              reserved for use by quantifiers on the block. | 
 |         //                          Look-ahead can't have quantifiers, but paren stack | 
 |         //                             compile time conventions require the slot anyhow. | 
 |         //    3.   NOP              may be replaced if there is are '|' ops in the block. | 
 |         //    4.     code for parenthesized stuff. | 
 |         //    5.   ENDLA | 
 |         //      | 
 |         //  Two data slots are reserved, for saving the stack ptr and the input position. | 
 |         { | 
 |             int32_t dataLoc = fRXPat->fDataSize; | 
 |             fRXPat->fDataSize += 2;  | 
 |             int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             op = URX_BUILD(URX_NOP, 0); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             // On the Parentheses stack, start a new frame and add the postions | 
 |             //   of the NOPs.   | 
 |             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state | 
 |             fParenStack.push(lookAhead, *fStatus);                        // Frame type. | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location | 
 |         } | 
 |         break; | 
 |  | 
 |     case doOpenLookAheadNeg: | 
 |         // Negated Lookahead.   (?! stuff ) | 
 |         // Compiles to | 
 |         //    1.    START_LA    dataloc | 
 |         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state, | 
 |         //                                //   which continues with the match. | 
 |         //    3.    NOP                   // Std. Open Paren sequence, for possible '|' | 
 |         //    4.       code for parenthesized stuff. | 
 |         //    5.    END_LA                // Cut back stack, remove saved state from step 2. | 
 |         //    6.    FAIL                  // code in block succeeded, so neg. lookahead fails. | 
 |         //    7.    ... | 
 |         { | 
 |             int32_t dataLoc = fRXPat->fDataSize; | 
 |             fRXPat->fDataSize += 2;  | 
 |             int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             op = URX_BUILD(URX_STATE_SAVE, 0);    // dest address will be patched later. | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             op = URX_BUILD(URX_NOP, 0); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             // On the Parentheses stack, start a new frame and add the postions | 
 |             //   of the StateSave and NOP.   | 
 |             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state | 
 |             fParenStack.push( negLookAhead, *fStatus);                    // Frame type | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location | 
 |              | 
 |             // Instructions #5 and #6 will be added when the ')' is encountered. | 
 |         } | 
 |         break; | 
 |  | 
 |     case doOpenLookBehind: | 
 |         { | 
 |             //   Compile a (?<= look-behind open paren. | 
 |             // | 
 |             //          Compiles to | 
 |             //              0       URX_LB_START     dataLoc | 
 |             //              1       URX_LB_CONT      dataLoc | 
 |             //              2                        MinMatchLen | 
 |             //              3                        MaxMatchLen | 
 |             //              4       URX_NOP          Standard '(' boilerplate. | 
 |             //              5       URX_NOP          Reserved slot for use with '|' ops within (block). | 
 |             //              6         <code for LookBehind expression> | 
 |             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len | 
 |             //              8       URX_LA_END       dataLoc    # Restore stack, input pos | 
 |             // | 
 |             //          Allocate a block of matcher data, to contain (when running a match) | 
 |             //              0:    Stack ptr on entry | 
 |             //              1:    Input Index on entry | 
 |             //              2:    Start index of match current match attempt. | 
 |             //              3:    Original Input String len.   | 
 |  | 
 |             // Allocate data space | 
 |             int32_t dataLoc = fRXPat->fDataSize; | 
 |             fRXPat->fDataSize += 4;  | 
 |              | 
 |             // Emit URX_LB_START | 
 |             int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |              | 
 |             // Emit URX_LB_CONT | 
 |             op = URX_BUILD(URX_LB_CONT, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MinMatchLength.  To be filled later. | 
 |             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MaxMatchLength.  To be filled later. | 
 |              | 
 |             // Emit the NOP | 
 |             op = URX_BUILD(URX_NOP, 0); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |              | 
 |             // On the Parentheses stack, start a new frame and add the postions | 
 |             //   of the URX_LB_CONT and the NOP.   | 
 |             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state | 
 |             fParenStack.push(lookBehind, *fStatus);                       // Frame type | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location | 
 |              | 
 |             // The final two instructions will be added when the ')' is encountered. | 
 |         } | 
 |  | 
 |         break; | 
 |  | 
 |     case doOpenLookBehindNeg: | 
 |         { | 
 |             //   Compile a (?<! negated look-behind open paren. | 
 |             // | 
 |             //          Compiles to | 
 |             //              0       URX_LB_START     dataLoc    # Save entry stack, input len | 
 |             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions | 
 |             //              2                        MinMatchLen | 
 |             //              3                        MaxMatchLen | 
 |             //              4                        continueLoc (9) | 
 |             //              5       URX_NOP          Standard '(' boilerplate. | 
 |             //              6       URX_NOP          Reserved slot for use with '|' ops within (block). | 
 |             //              7         <code for LookBehind expression> | 
 |             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL | 
 |             //              9       ... | 
 |             // | 
 |             //          Allocate a block of matcher data, to contain (when running a match) | 
 |             //              0:    Stack ptr on entry | 
 |             //              1:    Input Index on entry | 
 |             //              2:    Start index of match current match attempt. | 
 |             //              3:    Original Input String len.   | 
 |  | 
 |             // Allocate data space | 
 |             int32_t dataLoc = fRXPat->fDataSize; | 
 |             fRXPat->fDataSize += 4;  | 
 |              | 
 |             // Emit URX_LB_START | 
 |             int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |              | 
 |             // Emit URX_LBN_CONT | 
 |             op = URX_BUILD(URX_LBN_CONT, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MinMatchLength.  To be filled later. | 
 |             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MaxMatchLength.  To be filled later. | 
 |             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // Continue Loc.    To be filled later. | 
 |              | 
 |             // Emit the NOP | 
 |             op = URX_BUILD(URX_NOP, 0); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |              | 
 |             // On the Parentheses stack, start a new frame and add the postions | 
 |             //   of the URX_LB_CONT and the NOP.   | 
 |             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state | 
 |             fParenStack.push(lookBehindN, *fStatus);                      // Frame type | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location | 
 |              | 
 |             // The final two instructions will be added when the ')' is encountered. | 
 |         } | 
 |         break; | 
 |  | 
 |     case doConditionalExpr: | 
 |         // Conditionals such as (?(1)a:b) | 
 |     case doPerlInline: | 
 |         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them. | 
 |         error(U_REGEX_UNIMPLEMENTED); | 
 |         break; | 
 |  | 
 |  | 
 |     case doCloseParen: | 
 |         handleCloseParen(); | 
 |         if (fParenStack.size() <= 0) { | 
 |             //  Extra close paren, or missing open paren. | 
 |             error(U_REGEX_MISMATCHED_PAREN); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doNOP: | 
 |         break; | 
 |  | 
 |  | 
 |     case doBadOpenParenType: | 
 |     case doRuleError: | 
 |         error(U_REGEX_RULE_SYNTAX); | 
 |         break; | 
 |  | 
 |  | 
 |     case doMismatchedParenErr: | 
 |         error(U_REGEX_MISMATCHED_PAREN); | 
 |         break; | 
 |  | 
 |     case doPlus: | 
 |         //  Normal '+'  compiles to | 
 |         //     1.   stuff to be repeated  (already built) | 
 |         //     2.   jmp-sav 1 | 
 |         //     3.   ... | 
 |         // | 
 |         //  Or, if the item to be repeated can match a zero length string, | 
 |         //     1.   STO_INP_LOC  data-loc | 
 |         //     2.      body of stuff to be repeated | 
 |         //     3.   JMP_SAV_X    2 | 
 |         //     4.   ... | 
 |  | 
 |         // | 
 |         //  Or, if the item to be repeated is simple | 
 |         //     1.   Item to be repeated. | 
 |         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref) | 
 |         //     3.   LOOP_C       stack location | 
 |         { | 
 |             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1 | 
 |             int32_t  frameLoc; | 
 |  | 
 |             // Check for simple constructs, which may get special optimized code. | 
 |             if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 
 |                 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc); | 
 |  | 
 |                 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 
 |                     // Emit optimized code for [char set]+ | 
 |                     int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); | 
 |                     fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | 
 |                     frameLoc = fRXPat->fFrameSize; | 
 |                     fRXPat->fFrameSize++; | 
 |                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | 
 |                     fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | 
 |                     break; | 
 |                 } | 
 |  | 
 |                 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 
 |                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 
 |                     // Emit Optimized code for .+ operations. | 
 |                     int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 
 |                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 
 |                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. | 
 |                         loopOpI |= 1; | 
 |                     } | 
 |                     fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | 
 |                     frameLoc = fRXPat->fFrameSize; | 
 |                     fRXPat->fFrameSize++; | 
 |                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | 
 |                     fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | 
 |                     break; | 
 |                 } | 
 |  | 
 |             } | 
 |  | 
 |             // General case. | 
 |  | 
 |             // Check for minimum match length of zero, which requires | 
 |             //    extra loop-breaking code. | 
 |             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { | 
 |                 // Zero length match is possible. | 
 |                 // Emit the code sequence that can handle it. | 
 |                 insertOp(topLoc); | 
 |                 frameLoc =  fRXPat->fFrameSize; | 
 |                 fRXPat->fFrameSize++; | 
 |  | 
 |                 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); | 
 |                 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 
 |  | 
 |                 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); | 
 |                 fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |             } else { | 
 |                 // Simpler code when the repeated body must match something non-empty | 
 |                 int32_t  jmpOp  = URX_BUILD(URX_JMP_SAV, topLoc); | 
 |                 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | 
 |             } | 
 |         } | 
 |         break; | 
 |  | 
 |     case doNGPlus: | 
 |         //  Non-greedy '+?'  compiles to | 
 |         //     1.   stuff to be repeated  (already built) | 
 |         //     2.   state-save  1 | 
 |         //     3.   ... | 
 |         { | 
 |             int32_t topLoc      = blockTopLoc(FALSE); | 
 |             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); | 
 |             fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doOpt: | 
 |         // Normal (greedy) ? quantifier. | 
 |         //  Compiles to | 
 |         //     1. state save 3 | 
 |         //     2.    body of optional block | 
 |         //     3. ... | 
 |         // Insert the state save into the compiled pattern, and we're done. | 
 |         { | 
 |             int32_t   saveStateLoc = blockTopLoc(TRUE); | 
 |             int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); | 
 |             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doNGOpt: | 
 |         // Non-greedy ?? quantifier | 
 |         //   compiles to | 
 |         //    1.  jmp   4 | 
 |         //    2.     body of optional block | 
 |         //    3   jmp   5 | 
 |         //    4.  state save 2 | 
 |         //    5    ... | 
 |         //  This code is less than ideal, with two jmps instead of one, because we can only | 
 |         //  insert one instruction at the top of the block being iterated. | 
 |         { | 
 |             int32_t  jmp1_loc = blockTopLoc(TRUE); | 
 |             int32_t  jmp2_loc = fRXPat->fCompiledPat->size(); | 
 |  | 
 |             int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1); | 
 |             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); | 
 |  | 
 |             int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2); | 
 |             fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); | 
 |  | 
 |             int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); | 
 |             fRXPat->fCompiledPat->addElement(save_op, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doStar: | 
 |         // Normal (greedy) * quantifier. | 
 |         // Compiles to | 
 |         //       1.   STATE_SAVE   4 | 
 |         //       2.      body of stuff being iterated over | 
 |         //       3.   JMP_SAV      2 | 
 |         //       4.   ... | 
 |         // | 
 |         // Or, if the body is a simple [Set], | 
 |         //       1.   LOOP_SR_I    set number | 
 |         //       2.   LOOP_C       stack location | 
 |         //       ... | 
 |         // | 
 |         // Or if this is a .*  | 
 |         //       1.   LOOP_DOT_I    (. matches all mode flag) | 
 |         //       2.   LOOP_C        stack location | 
 |         // | 
 |         // Or, if the body can match a zero-length string, to inhibit infinite loops, | 
 |         //       1.   STATE_SAVE   5 | 
 |         //       2.   STO_INP_LOC  data-loc | 
 |         //       3.      body of stuff | 
 |         //       4.   JMP_SAV_X    2 | 
 |         //       5.   ... | 
 |         { | 
 |             // location of item #1, the STATE_SAVE | 
 |             int32_t   topLoc = blockTopLoc(FALSE); | 
 |             int32_t   dataLoc = -1; | 
 |  | 
 |             // Check for simple *, where the construct being repeated | 
 |             //   compiled to single opcode, and might be optimizable. | 
 |             if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 
 |                 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc); | 
 |  | 
 |                 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 
 |                     // Emit optimized code for a [char set]*  | 
 |                     int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); | 
 |                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 
 |                     dataLoc = fRXPat->fFrameSize; | 
 |                     fRXPat->fFrameSize++; | 
 |                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | 
 |                     fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | 
 |                     break; | 
 |                 } | 
 |  | 
 |                 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 
 |                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 
 |                     // Emit Optimized code for .* operations. | 
 |                     int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 
 |                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 
 |                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. | 
 |                         loopOpI |= 1; | 
 |                     } | 
 |                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 
 |                     dataLoc = fRXPat->fFrameSize; | 
 |                     fRXPat->fFrameSize++; | 
 |                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | 
 |                     fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | 
 |                     break; | 
 |                 } | 
 |             } | 
 |  | 
 |             // Emit general case code for this * | 
 |             // The optimizations did not apply. | 
 |  | 
 |             int32_t   saveStateLoc = blockTopLoc(TRUE); | 
 |             int32_t   jmpOp        = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); | 
 |  | 
 |             // Check for minimum match length of zero, which requires | 
 |             //    extra loop-breaking code. | 
 |             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { | 
 |                 insertOp(saveStateLoc); | 
 |                 dataLoc =  fRXPat->fFrameSize; | 
 |                 fRXPat->fFrameSize++; | 
 |  | 
 |                 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); | 
 |                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); | 
 |                 jmpOp      = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); | 
 |             } | 
 |                  | 
 |             // Locate the position in the compiled pattern where the match will continue | 
 |             //   after completing the *.   (4 or 5 in the comment above) | 
 |             int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 
 |  | 
 |             // Put together the save state op store it into the compiled code. | 
 |             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); | 
 |             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | 
 |  | 
 |             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. | 
 |             fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doNGStar: | 
 |         // Non-greedy *? quantifier | 
 |         // compiles to | 
 |         //     1.   JMP    3 | 
 |         //     2.      body of stuff being iterated over | 
 |         //     3.   STATE_SAVE  2 | 
 |         //     4    ... | 
 |         { | 
 |             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1. | 
 |             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3. | 
 |             int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc); | 
 |             int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); | 
 |             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); | 
 |             fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doIntervalInit: | 
 |         // The '{' opening an interval quantifier was just scanned. | 
 |         // Init the counter varaiables that will accumulate the values as the digits | 
 |         //    are scanned. | 
 |         fIntervalLow = 0; | 
 |         fIntervalUpper = -1; | 
 |         break; | 
 |  | 
 |     case doIntevalLowerDigit: | 
 |         // Scanned a digit from the lower value of an {lower,upper} interval | 
 |         { | 
 |             int32_t digitValue = u_charDigitValue(fC.fChar); | 
 |             U_ASSERT(digitValue >= 0); | 
 |             fIntervalLow = fIntervalLow*10 + digitValue; | 
 |             if (fIntervalLow < 0) { | 
 |                 error(U_REGEX_NUMBER_TOO_BIG); | 
 |             } | 
 |         } | 
 |         break; | 
 |  | 
 |     case doIntervalUpperDigit: | 
 |         // Scanned a digit from the upper value of an {lower,upper} interval | 
 |         { | 
 |             if (fIntervalUpper < 0) { | 
 |                 fIntervalUpper = 0; | 
 |             } | 
 |             int32_t digitValue = u_charDigitValue(fC.fChar); | 
 |             U_ASSERT(digitValue >= 0); | 
 |             fIntervalUpper = fIntervalUpper*10 + digitValue; | 
 |             if (fIntervalUpper < 0) { | 
 |                 error(U_REGEX_NUMBER_TOO_BIG); | 
 |             } | 
 |         } | 
 |         break; | 
 |  | 
 |     case doIntervalSame: | 
 |         // Scanned a single value interval like {27}.  Upper = Lower. | 
 |         fIntervalUpper = fIntervalLow; | 
 |         break; | 
 |  | 
 |     case doInterval: | 
 |         // Finished scanning a normal {lower,upper} interval.  Generate the code for it. | 
 |         if (compileInlineInterval() == FALSE) { | 
 |             compileInterval(URX_CTR_INIT, URX_CTR_LOOP); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doPossessiveInterval: | 
 |         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it. | 
 |         { | 
 |             // Remember the loc for the top of the block being looped over. | 
 |             //   (Can not reserve a slot in the compiled pattern at this time, becuase  | 
 |             //    compileInterval needs to reserve also, and blockTopLoc can only reserve  | 
 |             //    once per block.) | 
 |             int32_t topLoc = blockTopLoc(FALSE); | 
 |  | 
 |             // Produce normal looping code. | 
 |             compileInterval(URX_CTR_INIT, URX_CTR_LOOP); | 
 |  | 
 |             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP | 
 |             //  just as if the loop was inclosed in atomic parentheses. | 
 |  | 
 |             // First the STO_SP before the start of the loop | 
 |             insertOp(topLoc); | 
 |             int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the | 
 |             fRXPat->fDataSize += 1;                    //  state stack ptr. | 
 |             int32_t  op        = URX_BUILD(URX_STO_SP, varLoc); | 
 |             fRXPat->fCompiledPat->setElementAt(op, topLoc); | 
 |  | 
 |             int32_t loopOp = fRXPat->fCompiledPat->popi(); | 
 |             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); | 
 |             loopOp++;     // point LoopOp after the just-inserted STO_SP | 
 |             fRXPat->fCompiledPat->push(loopOp, *fStatus); | 
 |  | 
 |             // Then the LD_SP after the end of the loop | 
 |             op = URX_BUILD(URX_LD_SP, varLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |         } | 
 |  | 
 |         break; | 
 |  | 
 |     case doNGInterval: | 
 |         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it. | 
 |         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); | 
 |         break; | 
 |  | 
 |     case doIntervalError: | 
 |         error(U_REGEX_BAD_INTERVAL); | 
 |         break; | 
 |  | 
 |     case doLiteralChar: | 
 |         // We've just scanned a "normal" character from the pattern,  | 
 |         literalChar(fC.fChar); | 
 |         break; | 
 |  | 
 |  | 
 |  | 
 |     case doDotAny: | 
 |         // scanned a ".",  match any single character. | 
 |         { | 
 |             int32_t   op; | 
 |             if (fModeFlags & UREGEX_DOTALL) { | 
 |                 op = URX_BUILD(URX_DOTANY_ALL, 0); | 
 |             } else { | 
 |                 op = URX_BUILD(URX_DOTANY, 0); | 
 |             } | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doCaret:  | 
 |         { | 
 |             int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET; | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doDollar:   | 
 |         { | 
 |             int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR; | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doBackslashA: | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashB: | 
 |         { | 
 |             #if  UCONFIG_NO_BREAK_ITERATION==1 | 
 |             if (fModeFlags & UREGEX_UWORD) { | 
 |                 error(U_UNSUPPORTED_ERROR); | 
 |             } | 
 |             #endif | 
 |             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doBackslashb: | 
 |         { | 
 |             #if  UCONFIG_NO_BREAK_ITERATION==1 | 
 |             if (fModeFlags & UREGEX_UWORD) { | 
 |                 error(U_UNSUPPORTED_ERROR); | 
 |             } | 
 |             #endif | 
 |             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doBackslashD: | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashd: | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashG: | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashS: | 
 |         fRXPat->fCompiledPat->addElement( | 
 |             URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashs: | 
 |         fRXPat->fCompiledPat->addElement( | 
 |             URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashW: | 
 |         fRXPat->fCompiledPat->addElement( | 
 |             URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashw: | 
 |         fRXPat->fCompiledPat->addElement( | 
 |             URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashX: | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); | 
 |         break; | 
 |  | 
 |  | 
 |     case doBackslashZ: | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); | 
 |         break; | 
 |  | 
 |     case doBackslashz: | 
 |         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); | 
 |         break; | 
 |  | 
 |     case doEscapeError: | 
 |         error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 
 |         break; | 
 |  | 
 |     case doExit: | 
 |         returnVal = FALSE; | 
 |         break; | 
 |  | 
 |     case doProperty: | 
 |         { | 
 |             UnicodeSet *theSet = scanProp(); | 
 |             compileSet(theSet); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doScanUnicodeSet: | 
 |         { | 
 |             UnicodeSet *theSet = scanSet(); | 
 |             compileSet(theSet); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doEnterQuoteMode: | 
 |         // Just scanned a \Q.  Put character scanner into quote mode. | 
 |         fQuoteMode = TRUE; | 
 |         break; | 
 |  | 
 |     case doBackRef: | 
 |         // BackReference.  Somewhat unusual in that the front-end can not completely parse | 
 |         //                 the regular expression, because the number of digits to be consumed | 
 |         //                 depends on the number of capture groups that have been defined.  So | 
 |         //                 we have to do it here instead. | 
 |         { | 
 |             int32_t  numCaptureGroups = fRXPat->fGroupMap->size(); | 
 |             int32_t  groupNum = 0; | 
 |             UChar32  c        = fC.fChar; | 
 |  | 
 |             for (;;) { | 
 |                 // Loop once per digit, for max allowed number of digits in a back reference. | 
 |                 int32_t digit = u_charDigitValue(c); | 
 |                 groupNum = groupNum * 10 + digit; | 
 |                 if (groupNum >= numCaptureGroups) { | 
 |                     break; | 
 |                 } | 
 |                 c = peekCharLL(); | 
 |                 if (RegexStaticSets::gStaticSets->fRuleDigits->contains(c) == FALSE) { | 
 |                     break; | 
 |                 } | 
 |                 nextCharLL(); | 
 |             } | 
 |  | 
 |             // Scan of the back reference in the source regexp is complete.  Now generate | 
 |             //  the compiled code for it.  | 
 |             // Because capture groups can be forward-referenced by back-references, | 
 |             //  we fill the operand with the capture group number.  At the end | 
 |             //  of compilation, it will be changed to the variable's location. | 
 |             U_ASSERT(groupNum > 0); | 
 |             int32_t  op; | 
 |             if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 
 |                 op = URX_BUILD(URX_BACKREF_I, groupNum); | 
 |             } else { | 
 |                 op = URX_BUILD(URX_BACKREF, groupNum); | 
 |             } | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doPossessivePlus: | 
 |         // Possessive ++ quantifier. | 
 |         // Compiles to | 
 |         //       1.   STO_SP | 
 |         //       2.      body of stuff being iterated over | 
 |         //       3.   STATE_SAVE 5 | 
 |         //       4.   JMP        2 | 
 |         //       5.   LD_SP | 
 |         //       6.   ... | 
 |         // | 
 |         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up | 
 |         //                then unconditionally discarded.  Perhaps introduce a new opcode | 
 |         // | 
 |         { | 
 |             // Emit the STO_SP | 
 |             int32_t   topLoc = blockTopLoc(TRUE); | 
 |             int32_t   stoLoc = fRXPat->fDataSize; | 
 |             fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr. | 
 |             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc); | 
 |             fRXPat->fCompiledPat->setElementAt(op, topLoc); | 
 |  | 
 |             // Emit the STATE_SAVE | 
 |             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |              | 
 |             // Emit the JMP | 
 |             op = URX_BUILD(URX_JMP, topLoc+1); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             // Emit the LD_SP | 
 |             op = URX_BUILD(URX_LD_SP, stoLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doPossessiveStar: | 
 |         // Possessive *+ quantifier. | 
 |         // Compiles to | 
 |         //       1.   STO_SP       loc | 
 |         //       2.   STATE_SAVE   5 | 
 |         //       3.      body of stuff being iterated over | 
 |         //       4.   JMP          2 | 
 |         //       5.   LD_SP        loc | 
 |         //       6    ... | 
 |         // TODO:  do something to cut back the state stack each time through the loop. | 
 |         { | 
 |             // Reserve two slots at the top of the block. | 
 |             int32_t   topLoc = blockTopLoc(TRUE); | 
 |             insertOp(topLoc); | 
 |  | 
 |             // emit   STO_SP     loc | 
 |             int32_t   stoLoc = fRXPat->fDataSize; | 
 |             fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr. | 
 |             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc); | 
 |             fRXPat->fCompiledPat->setElementAt(op, topLoc); | 
 |  | 
 |             // Emit the SAVE_STATE   5 | 
 |             int32_t L7 = fRXPat->fCompiledPat->size()+1; | 
 |             op = URX_BUILD(URX_STATE_SAVE, L7); | 
 |             fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 
 |  | 
 |             // Append the JMP operation.  | 
 |             op = URX_BUILD(URX_JMP, topLoc+1); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             // Emit the LD_SP       loc | 
 |             op = URX_BUILD(URX_LD_SP, stoLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case doPossessiveOpt: | 
 |         // Possessive  ?+ quantifier. | 
 |         //  Compiles to | 
 |         //     1. STO_SP      loc | 
 |         //     2. SAVE_STATE  5 | 
 |         //     3.    body of optional block | 
 |         //     4. LD_SP       loc | 
 |         //     5. ... | 
 |         // | 
 |         { | 
 |             // Reserve two slots at the top of the block. | 
 |             int32_t   topLoc = blockTopLoc(TRUE); | 
 |             insertOp(topLoc); | 
 |  | 
 |             // Emit the STO_SP | 
 |             int32_t   stoLoc = fRXPat->fDataSize; | 
 |             fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr. | 
 |             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc); | 
 |             fRXPat->fCompiledPat->setElementAt(op, topLoc); | 
 |  | 
 |             // Emit the SAVE_STATE | 
 |             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1; | 
 |             op = URX_BUILD(URX_STATE_SAVE, continueLoc); | 
 |             fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 
 |  | 
 |             // Emit the LD_SP | 
 |             op = URX_BUILD(URX_LD_SP, stoLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |     case doBeginMatchMode: | 
 |         fNewModeFlags = fModeFlags; | 
 |         fSetModeFlag  = TRUE; | 
 |         break; | 
 |  | 
 |     case doMatchMode:   //  (?i)    and similar | 
 |         { | 
 |             int32_t  bit = 0; | 
 |             switch (fC.fChar) { | 
 |             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break; | 
 |             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break; | 
 |             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break; | 
 |             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break; | 
 |             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break; | 
 |             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break; | 
 |             default: | 
 |                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out | 
 |                                    // by the scanner. | 
 |             } | 
 |             if (fSetModeFlag) { | 
 |                 fNewModeFlags |= bit; | 
 |             } else { | 
 |                 fNewModeFlags &= ~bit; | 
 |             } | 
 |         } | 
 |         break; | 
 |  | 
 |     case doSetMatchMode: | 
 |         // We've got a (?i) or similar.  The match mode is being changed, but | 
 |         //   the change is not scoped to a parenthesized block. | 
 |         U_ASSERT(fNewModeFlags < 0); | 
 |         fModeFlags = fNewModeFlags; | 
 |  | 
 |         // Prevent any string from spanning across the change of match mode. | 
 |         //   Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"  | 
 |         fixLiterals();      | 
 |         break; | 
 |  | 
 |  | 
 |     case doMatchModeParen: | 
 |         // We've got a (?i: or similar.  Begin a parenthesized block, save old | 
 |         //   mode flags so they can be restored at the close of the block. | 
 |         // | 
 |         //   Compile to a | 
 |         //      - NOP, which later may be replaced by a save-state if the | 
 |         //         parenthesized group gets a * quantifier, followed by | 
 |         //      - NOP, which may later be replaced by a save-state if there | 
 |         //             is an '|' alternation within the parens. | 
 |         { | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 
 |  | 
 |             // On the Parentheses stack, start a new frame and add the postions | 
 |             //   of the two NOPs (a normal non-capturing () frame, except for the | 
 |             //   saving of the orignal mode flags.) | 
 |             fParenStack.push(fModeFlags, *fStatus); | 
 |             fParenStack.push(flags, *fStatus);                            // Frame Marker | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP | 
 |             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP | 
 |  | 
 |             // Set the current mode flags to the new values. | 
 |             U_ASSERT(fNewModeFlags < 0); | 
 |             fModeFlags = fNewModeFlags; | 
 |         } | 
 |         break; | 
 |  | 
 |     case doBadModeFlag: | 
 |         error(U_REGEX_INVALID_FLAG); | 
 |         break; | 
 |  | 
 |     case doSuppressComments: | 
 |         // We have just scanned a '(?'.  We now need to prevent the character scanner from | 
 |         // treating a '#' as a to-the-end-of-line comment. | 
 |         //   (This Perl compatibility just gets uglier and uglier to do...) | 
 |         fEOLComments = FALSE; | 
 |         break; | 
 |  | 
 |  | 
 |  | 
 |     default: | 
 |         U_ASSERT(FALSE); | 
 |         error(U_REGEX_INTERNAL_ERROR); | 
 |         break; | 
 |     } | 
 |  | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         returnVal = FALSE; | 
 |     } | 
 |  | 
 |     return returnVal; | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   literalChar           We've encountered a literal character from the pattern, | 
 | //                             or an escape sequence that reduces to a character. | 
 | //                         Add it to the string containing all literal chars/strings from | 
 | //                             the pattern. | 
 | //                         If we are in a pattern string already, add the new char to it. | 
 | //                         If we aren't in a pattern string, begin one now. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void RegexCompile::literalChar(UChar32 c)  { | 
 |     int32_t           op;            // An operation in the compiled pattern. | 
 |     int32_t           opType; | 
 |     int32_t           patternLoc;   // A position in the compiled pattern. | 
 |     int32_t           stringLen; | 
 |  | 
 |  | 
 |     // If the last thing compiled into the pattern was not a literal char, | 
 |     //   force this new literal char to begin a new string, and not append to the previous. | 
 |     op     = fRXPat->fCompiledPat->lastElementi(); | 
 |     opType = URX_TYPE(op); | 
 |     if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) { | 
 |         fixLiterals(); | 
 |     } | 
 |  | 
 |     if (fStringOpStart == -1) { | 
 |         // First char of a string in the pattern. | 
 |         // Emit a OneChar op into the compiled pattern. | 
 |         emitONE_CHAR(c); | 
 |  | 
 |         // Also add it to the string pool, in case we get a second adjacent literal | 
 |         //   and want to change form ONE_CHAR to STRING | 
 |         fStringOpStart = fRXPat->fLiteralText.length(); | 
 |         fRXPat->fLiteralText.append(c); | 
 |         return; | 
 |     } | 
 |      | 
 |     // We are adding onto an existing string | 
 |     fRXPat->fLiteralText.append(c); | 
 |  | 
 |     op     = fRXPat->fCompiledPat->lastElementi(); | 
 |     opType = URX_TYPE(op); | 
 |     U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN); | 
 |  | 
 |     // If the most recently emitted op is a URX_ONECHAR,  | 
 |     if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) { | 
 |         if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) { | 
 |             // The most recently emitted op is a ONECHAR that was the first half | 
 |             //   of a surrogate pair.  Update the ONECHAR's operand to be the | 
 |             //   supplementary code point resulting from both halves of the pair. | 
 |             c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c); | 
 |             op = URX_BUILD(opType, c); | 
 |             patternLoc = fRXPat->fCompiledPat->size() - 1; | 
 |             fRXPat->fCompiledPat->setElementAt(op, patternLoc); | 
 |             return; | 
 |         } | 
 |          | 
 |         // The most recently emitted op is a ONECHAR. | 
 |         //  We've now received another adjacent char.  Change the ONECHAR op | 
 |         //   to a string op. | 
 |         if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 
 |             op     = URX_BUILD(URX_STRING_I, fStringOpStart); | 
 |         } else { | 
 |             op     = URX_BUILD(URX_STRING, fStringOpStart); | 
 |         } | 
 |         patternLoc = fRXPat->fCompiledPat->size() - 1; | 
 |         fRXPat->fCompiledPat->setElementAt(op, patternLoc); | 
 |         op         = URX_BUILD(URX_STRING_LEN, 0); | 
 |         fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |     } | 
 |      | 
 |     // The pattern contains a URX_SRING / URX_STRING_LEN.  Update the | 
 |     //  string length to reflect the new char we just added to the string. | 
 |     stringLen  = fRXPat->fLiteralText.length() - fStringOpStart; | 
 |     op         = URX_BUILD(URX_STRING_LEN, stringLen); | 
 |     patternLoc = fRXPat->fCompiledPat->size() - 1; | 
 |     fRXPat->fCompiledPat->setElementAt(op, patternLoc); | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //    emitONE_CHAR         emit a ONE_CHAR op into the generated code. | 
 | //                         Choose cased or uncased version, depending on the | 
 | //                         match mode and whether the character itself is cased. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void RegexCompile::emitONE_CHAR(UChar32  c) { | 
 |     int32_t op; | 
 |     if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && | 
 |         u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { | 
 |         // We have a cased character, and are in case insensitive matching mode. | 
 |         c  = u_foldCase(c, U_FOLD_CASE_DEFAULT); | 
 |         op = URX_BUILD(URX_ONECHAR_I, c); | 
 |     } else { | 
 |         // Uncased char, or case sensitive match mode. | 
 |         //  Either way, just generate a literal compare of the char. | 
 |         op = URX_BUILD(URX_ONECHAR, c); | 
 |     } | 
 |     fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 | } | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //    fixLiterals           When compiling something that can follow a literal | 
 | //                          string in a pattern, we need to "fix" any preceding | 
 | //                          string, which will cause any subsequent literals to | 
 | //                          begin a new string, rather than appending to the | 
 | //                          old one. | 
 | // | 
 | //                          Optionally, split the last char of the string off into | 
 | //                          a single "ONE_CHAR" operation, so that quantifiers can | 
 | //                          apply to that char alone.  Example:   abc* | 
 | //                          The * must apply to the 'c' only. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void    RegexCompile::fixLiterals(UBool split) { | 
 |     int32_t  stringStart = fStringOpStart;    // start index of the current literal string | 
 |     int32_t  op;                              // An op from/for the compiled pattern. | 
 |     int32_t  opType;                          // An opcode type from the compiled pattern. | 
 |     int32_t  stringLastCharIdx; | 
 |     UChar32  lastChar; | 
 |     int32_t  stringNextToLastCharIdx; | 
 |     UChar32  nextToLastChar; | 
 |     int32_t  stringLen; | 
 |  | 
 |     fStringOpStart = -1;     | 
 |     if (!split) { | 
 |         return; | 
 |     } | 
 |  | 
 |     // Split:  We need to  ensure that the last item in the compiled pattern does | 
 |     //   not refer to a literal string of more than one char.  If it does, | 
 |     //   separate the last char from the rest of the string. | 
 |  | 
 |     // If the last operation from the compiled pattern is not a string, | 
 |     //   nothing needs to be done   | 
 |     op     = fRXPat->fCompiledPat->lastElementi(); | 
 |     opType = URX_TYPE(op); | 
 |     if (opType != URX_STRING_LEN) { | 
 |         return; | 
 |     } | 
 |     stringLen = URX_VAL(op); | 
 |  | 
 |     // | 
 |     // Find the position of the last code point in the string  (might be a surrogate pair) | 
 |     // | 
 |     stringLastCharIdx = fRXPat->fLiteralText.length(); | 
 |     stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); | 
 |     lastChar          = fRXPat->fLiteralText.char32At(stringLastCharIdx); | 
 |  | 
 |     // The string should always be at least two code points long, meaning that there | 
 |     //   should be something before the last char position that we just found. | 
 |     U_ASSERT(stringLastCharIdx > stringStart); | 
 |     stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); | 
 |     U_ASSERT(stringNextToLastCharIdx >= stringStart); | 
 |     nextToLastChar          = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx); | 
 |  | 
 |     if (stringNextToLastCharIdx > stringStart) { | 
 |         // The length of string remaining after removing one char is two or more. | 
 |         // Leave the string in the compiled pattern, shorten it by one char, | 
 |         //   and append a URX_ONECHAR op for the last char. | 
 |         stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx); | 
 |         op = URX_BUILD(URX_STRING_LEN, stringLen); | 
 |         fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1); | 
 |         emitONE_CHAR(lastChar); | 
 |     } else { | 
 |         // The original string consisted of exactly two characters.  Replace | 
 |         // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair | 
 |         // of URX_ONECHARs. | 
 |         fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2); | 
 |         emitONE_CHAR(nextToLastChar); | 
 |         emitONE_CHAR(lastChar); | 
 |     } | 
 | } | 
 |  | 
 |  | 
 |  | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   insertOp()             Insert a slot for a new opcode into the already | 
 | //                          compiled pattern code. | 
 | // | 
 | //                          Fill the slot with a NOP.  Our caller will replace it | 
 | //                          with what they really wanted. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void   RegexCompile::insertOp(int32_t where) { | 
 |     UVector32 *code = fRXPat->fCompiledPat; | 
 |     U_ASSERT(where>0 && where < code->size()); | 
 |  | 
 |     int32_t  nop = URX_BUILD(URX_NOP, 0); | 
 |     code->insertElementAt(nop, where, *fStatus); | 
 |  | 
 |     // Walk through the pattern, looking for any ops with targets that | 
 |     //  were moved down by the insert.  Fix them. | 
 |     int32_t loc; | 
 |     for (loc=0; loc<code->size(); loc++) { | 
 |         int32_t op = code->elementAti(loc); | 
 |         int32_t opType = URX_TYPE(op); | 
 |         int32_t opValue = URX_VAL(op); | 
 |         if ((opType == URX_JMP         || | 
 |             opType == URX_JMPX         || | 
 |             opType == URX_STATE_SAVE   || | 
 |             opType == URX_CTR_LOOP     || | 
 |             opType == URX_CTR_LOOP_NG  || | 
 |             opType == URX_JMP_SAV      || | 
 |             opType == URX_RELOC_OPRND)    && opValue > where) { | 
 |             // Target location for this opcode is after the insertion point and | 
 |             //   needs to be incremented to adjust for the insertion. | 
 |             opValue++; | 
 |             op = URX_BUILD(opType, opValue); | 
 |             code->setElementAt(op, loc); | 
 |         } | 
 |     } | 
 |  | 
 |     // Now fix up the parentheses stack.  All positive values in it are locations in | 
 |     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.) | 
 |     for (loc=0; loc<fParenStack.size(); loc++) { | 
 |         int32_t x = fParenStack.elementAti(loc); | 
 |         U_ASSERT(x < code->size()); | 
 |         if (x>where) { | 
 |             x++; | 
 |             fParenStack.setElementAt(x, loc); | 
 |         } | 
 |     } | 
 |  | 
 |     if (fMatchCloseParen > where) { | 
 |         fMatchCloseParen++; | 
 |     } | 
 |     if (fMatchOpenParen > where) { | 
 |         fMatchOpenParen++; | 
 |     } | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   blockTopLoc()          Find or create a location in the compiled pattern | 
 | //                          at the start of the operation or block that has | 
 | //                          just been compiled.  Needed when a quantifier (* or | 
 | //                          whatever) appears, and we need to add an operation | 
 | //                          at the start of the thing being quantified. | 
 | // | 
 | //                          (Parenthesized Blocks) have a slot with a NOP that | 
 | //                          is reserved for this purpose.  .* or similar don't | 
 | //                          and a slot needs to be added. | 
 | // | 
 | //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode | 
 | //                                         at the returned location. | 
 | //                                 FALSE - just return the address,  | 
 | //                                         do not reserve a location there. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) { | 
 |     int32_t   theLoc; | 
 |     if (fRXPat->fCompiledPat->size() == fMatchCloseParen) | 
 |     { | 
 |         // The item just processed is a parenthesized block. | 
 |         theLoc = fMatchOpenParen;   // A slot is already reserved for us. | 
 |         U_ASSERT(theLoc > 0); | 
 |         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); | 
 |     } | 
 |     else { | 
 |         // Item just compiled is a single thing, a ".", or a single char, or a set reference. | 
 |         // No slot for STATE_SAVE was pre-reserved in the compiled code. | 
 |         // We need to make space now. | 
 |         fixLiterals(TRUE);  // If last item was a string, separate the last char. | 
 |         theLoc = fRXPat->fCompiledPat->size()-1; | 
 |         if (reserveLoc) { | 
 |             /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/ | 
 |             int32_t  nop = URX_BUILD(URX_NOP, 0); | 
 |             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); | 
 |         } | 
 |     } | 
 |     return theLoc; | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //    handleCloseParen      When compiling a close paren, we need to go back | 
 | //                          and fix up any JMP or SAVE operations within the | 
 | //                          parenthesized block that need to target the end | 
 | //                          of the block.  The locations of these are kept on | 
 | //                          the paretheses stack. | 
 | // | 
 | //                          This function is called both when encountering a | 
 | //                          real ) and at the end of the pattern. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void  RegexCompile::handleCloseParen() { | 
 |     int32_t   patIdx; | 
 |     int32_t   patOp; | 
 |     if (fParenStack.size() <= 0) { | 
 |         error(U_REGEX_MISMATCHED_PAREN); | 
 |         return; | 
 |     } | 
 |  | 
 |     // Force any literal chars that may follow the close paren to start a new string, | 
 |     //   and not attach to any preceding it. | 
 |     fixLiterals(FALSE); | 
 |  | 
 |     // Fixup any operations within the just-closed parenthesized group | 
 |     //    that need to reference the end of the (block). | 
 |     //    (The first one popped from the stack is an unused slot for | 
 |     //     alternation (OR) state save, but applying the fixup to it does no harm.) | 
 |     for (;;) { | 
 |         patIdx = fParenStack.popi(); | 
 |         if (patIdx < 0) { | 
 |             // value < 0 flags the start of the frame on the paren stack. | 
 |             break; | 
 |         } | 
 |         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); | 
 |         patOp = fRXPat->fCompiledPat->elementAti(patIdx); | 
 |         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set. | 
 |         patOp |= fRXPat->fCompiledPat->size();  // Set it now. | 
 |         fRXPat->fCompiledPat->setElementAt(patOp, patIdx); | 
 |         fMatchOpenParen     = patIdx; | 
 |     } | 
 |  | 
 |     //  At the close of any parenthesized block, restore the match mode flags  to | 
 |     //  the value they had at the open paren.  Saved value is | 
 |     //  at the top of the paren stack.   | 
 |     fModeFlags = fParenStack.popi(); | 
 |     U_ASSERT(fModeFlags < 0); | 
 |      | 
 |     // DO any additional fixups, depending on the specific kind of | 
 |     // parentesized grouping this is | 
 |  | 
 |     switch (patIdx) { | 
 |     case plain: | 
 |     case flags: | 
 |         // No additional fixups required. | 
 |         //   (Grouping-only parentheses) | 
 |         break; | 
 |     case capturing: | 
 |         // Capturing Parentheses. | 
 |         //   Insert a End Capture op into the pattern. | 
 |         //   The frame offset of the variables for this cg is obtained from the | 
 |         //       start capture op and put it into the end-capture op. | 
 |         { | 
 |             int32_t   captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); | 
 |             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); | 
 |  | 
 |             int32_t   frameVarLocation = URX_VAL(captureOp); | 
 |             int32_t   endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); | 
 |             fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); | 
 |         } | 
 |         break; | 
 |     case atomic: | 
 |         // Atomic Parenthesis. | 
 |         //   Insert a LD_SP operation to restore the state stack to the position | 
 |         //   it was when the atomic parens were entered. | 
 |         { | 
 |             int32_t   stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); | 
 |             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); | 
 |             int32_t   stoLoc = URX_VAL(stoOp); | 
 |             int32_t   ldOp   = URX_BUILD(URX_LD_SP, stoLoc); | 
 |             fRXPat->fCompiledPat->addElement(ldOp, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case lookAhead: | 
 |         { | 
 |             int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); | 
 |             U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 
 |             int32_t dataLoc  = URX_VAL(startOp); | 
 |             int32_t op       = URX_BUILD(URX_LA_END, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |         } | 
 |         break; | 
 |  | 
 |     case negLookAhead: | 
 |         { | 
 |             // See comment at doOpenLookAheadNeg | 
 |             int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); | 
 |             U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 
 |             int32_t dataLoc  = URX_VAL(startOp); | 
 |             int32_t op       = URX_BUILD(URX_LA_END, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |              op              = URX_BUILD(URX_FAIL, 0); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             // Patch the URX_SAVE near the top of the block. | 
 |             int32_t saveOp   = fRXPat->fCompiledPat->elementAti(fMatchOpenParen); | 
 |             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); | 
 |             int32_t dest     = fRXPat->fCompiledPat->size(); | 
 |             saveOp           = URX_BUILD(URX_STATE_SAVE, dest); | 
 |             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); | 
 |         } | 
 |         break; | 
 |  | 
 |     case lookBehind: | 
 |         { | 
 |             // See comment at doOpenLookBehind. | 
 |              | 
 |             // Append the URX_LB_END and URX_LA_END to the compiled pattern. | 
 |             int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); | 
 |             U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 
 |             int32_t dataLoc  = URX_VAL(startOp); | 
 |             int32_t op       = URX_BUILD(URX_LB_END, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |                     op       = URX_BUILD(URX_LA_END, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             // Determine the min and max bounds for the length of the | 
 |             //  string that the pattern can match. | 
 |             //  An unbounded upper limit is an error. | 
 |             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1; | 
 |             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd); | 
 |             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd); | 
 |             if (maxML == INT32_MAX) { | 
 |                 error(U_REGEX_LOOK_BEHIND_LIMIT); | 
 |                 break; | 
 |             } | 
 |             U_ASSERT(minML <= maxML); | 
 |  | 
 |             // Insert the min and max match len bounds into the URX_LB_CONT op that | 
 |             //  appears at the top of the look-behind block, at location fMatchOpenParen+1 | 
 |             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2); | 
 |             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1); | 
 |  | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |  | 
 |     case lookBehindN: | 
 |         { | 
 |             // See comment at doOpenLookBehindNeg. | 
 |              | 
 |             // Append the URX_LBN_END to the compiled pattern. | 
 |             int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); | 
 |             U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 
 |             int32_t dataLoc  = URX_VAL(startOp); | 
 |             int32_t op       = URX_BUILD(URX_LBN_END, dataLoc); | 
 |             fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |             // Determine the min and max bounds for the length of the | 
 |             //  string that the pattern can match. | 
 |             //  An unbounded upper limit is an error. | 
 |             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1; | 
 |             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd); | 
 |             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd); | 
 |             if (maxML == INT32_MAX) { | 
 |                 error(U_REGEX_LOOK_BEHIND_LIMIT); | 
 |                 break; | 
 |             } | 
 |             U_ASSERT(minML <= maxML); | 
 |  | 
 |             // Insert the min and max match len bounds into the URX_LB_CONT op that | 
 |             //  appears at the top of the look-behind block, at location fMatchOpenParen+1 | 
 |             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3); | 
 |             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2); | 
 |  | 
 |             // Insert the pattern location to continue at after a successful match | 
 |             //  as the last operand of the URX_LBN_CONT | 
 |             op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); | 
 |             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1); | 
 |         } | 
 |         break; | 
 |  | 
 |  | 
 |  | 
 |     default: | 
 |         U_ASSERT(FALSE); | 
 |     } | 
 |  | 
 |     // remember the next location in the compiled pattern. | 
 |     // The compilation of Quantifiers will look at this to see whether its looping | 
 |     //   over a parenthesized block or a single item | 
 |     fMatchCloseParen = fRXPat->fCompiledPat->size(); | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   compileSet       Compile the pattern operations for a reference to a | 
 | //                    UnicodeSet. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void        RegexCompile::compileSet(UnicodeSet *theSet) | 
 | { | 
 |     if (theSet == NULL) { | 
 |         return; | 
 |     } | 
 |     int32_t  setSize = theSet->size(); | 
 |     UChar32  firstSetChar = theSet->charAt(0); | 
 |     if (firstSetChar == -1) { | 
 |         // Sets that contain only strings, but no individual chars, | 
 |         // will end up here. | 
 |         error(U_REGEX_SET_CONTAINS_STRING); | 
 |         setSize = 0; | 
 |     } | 
 |  | 
 |     switch (setSize) { | 
 |     case 0:       | 
 |         { | 
 |             // Set of no elements.   Always fails to match.   | 
 |             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); | 
 |             delete theSet; | 
 |         } | 
 |         break; | 
 |          | 
 |     case 1: | 
 |         { | 
 |             // The set contains only a single code point.  Put it into | 
 |             //   the compiled pattern as a single char operation rather | 
 |             //   than a set, and discard the set itself. | 
 |             literalChar(firstSetChar); | 
 |             delete theSet; | 
 |         } | 
 |         break; | 
 |          | 
 |     default:  | 
 |         { | 
 |             //  The set contains two or more chars.  (the normal case) | 
 |             //  Put it into the compiled pattern as a set. | 
 |             int32_t setNumber = fRXPat->fSets->size(); | 
 |             fRXPat->fSets->addElement(theSet, *fStatus); | 
 |             int32_t setOp = URX_BUILD(URX_SETREF, setNumber); | 
 |             fRXPat->fCompiledPat->addElement(setOp, *fStatus); | 
 |         } | 
 |     } | 
 | } | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   compileInterval    Generate the code for a {min, max} style interval quantifier. | 
 | //                      Except for the specific opcodes used, the code is the same | 
 | //                      for all three types (greedy, non-greedy, possessive) of | 
 | //                      intervals.  The opcodes are supplied as parameters. | 
 | // | 
 | //                      The code for interval loops has this form: | 
 | //                         0  CTR_INIT   counter loc (in stack frame) | 
 | //                         1             5  patt address of CTR_LOOP at bottom of block | 
 | //                         2             min count | 
 | //                         3             max count   (-1 for unbounded) | 
 | //                         4  ...        block to be iterated over | 
 | //                         5  CTR_LOOP    | 
 | //     | 
 | //                       In                                  | 
 | //------------------------------------------------------------------------------ | 
 | void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp) | 
 | { | 
 |     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes | 
 |     //   four slots in the compiled code.  Reserve them. | 
 |     int32_t   topOfBlock = blockTopLoc(TRUE); | 
 |     insertOp(topOfBlock); | 
 |     insertOp(topOfBlock); | 
 |     insertOp(topOfBlock); | 
 |  | 
 |     // The operands for the CTR_INIT opcode include the index in the matcher data | 
 |     //   of the counter.  Allocate it now. | 
 |     int32_t   counterLoc = fRXPat->fFrameSize; | 
 |     fRXPat->fFrameSize++; | 
 |  | 
 |     int32_t   op = URX_BUILD(InitOp, counterLoc); | 
 |     fRXPat->fCompiledPat->setElementAt(op, topOfBlock); | 
 |  | 
 |     // The second operand of CTR_INIT is the location following the end of the loop. | 
 |     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the | 
 |     //   compilation of something later on causes the code to grow and the target | 
 |     //   position to move. | 
 |     int32_t loopEnd = fRXPat->fCompiledPat->size(); | 
 |     op = URX_BUILD(URX_RELOC_OPRND, loopEnd); | 
 |     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); | 
 |  | 
 |     // Followed by the min and max counts. | 
 |     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); | 
 |     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); | 
 |  | 
 |     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op. | 
 |     //   Goes at end of the block being looped over, so just append to the code so far. | 
 |     op = URX_BUILD(LoopOp, topOfBlock); | 
 |     fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |  | 
 |     if ((fIntervalLow & 0xff000000) != 0 || | 
 |         fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0) { | 
 |             error(U_REGEX_NUMBER_TOO_BIG); | 
 |         } | 
 |  | 
 |     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { | 
 |         error(U_REGEX_MAX_LT_MIN); | 
 |     } | 
 | } | 
 |  | 
 |  | 
 |  | 
 | UBool RegexCompile::compileInlineInterval() { | 
 |     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { | 
 |         // Too big to inline.  Fail, which will cause looping code to be generated. | 
 |         //   (Upper < Lower picks up unbounded upper and errors, both.) | 
 |         return FALSE; | 
 |     } | 
 |  | 
 |     int32_t   topOfBlock = blockTopLoc(FALSE); | 
 |     if (fIntervalUpper == 0) { | 
 |         // Pathological case.  Attempt no matches, as if the block doesn't exist. | 
 |         fRXPat->fCompiledPat->setSize(topOfBlock); | 
 |         return TRUE; | 
 |     } | 
 |  | 
 |     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { | 
 |         // The thing being repeated is not a single op, but some | 
 |         //   more complex block.  Do it as a loop, not inlines. | 
 |         //   Note that things "repeated" a max of once are handled as inline, because | 
 |         //     the one copy of the code already generated is just fine. | 
 |         return FALSE; | 
 |     } | 
 |  | 
 |     // Pick up the opcode that is to be repeated | 
 |     // | 
 |     int32_t op = fRXPat->fCompiledPat->elementAti(topOfBlock); | 
 |  | 
 |     // Compute the pattern location where the inline sequence  | 
 |     //   will end, and set up the state save op that will be needed. | 
 |     //    | 
 |     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 | 
 |                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow); | 
 |     int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); | 
 |     if (fIntervalLow == 0) { | 
 |         insertOp(topOfBlock); | 
 |         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); | 
 |     } | 
 |  | 
 |  | 
 |  | 
 |     //  Loop, emitting the op for the thing being repeated each time. | 
 |     //    Loop starts at 1 because one instance of the op already exists in the pattern, | 
 |     //    it was put there when it was originally encountered. | 
 |     int32_t i; | 
 |     for (i=1; i<fIntervalUpper; i++ ) { | 
 |         if (i == fIntervalLow) { | 
 |             fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | 
 |         } | 
 |         if (i > fIntervalLow) { | 
 |             fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | 
 |         } | 
 |         fRXPat->fCompiledPat->addElement(op, *fStatus); | 
 |     } | 
 |     return TRUE; | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   matchStartType    Determine how a match can start. | 
 | //                     Used to optimize find() operations. | 
 | // | 
 | //                     Operation is very similar to minMatchLength().  Walk the compiled | 
 | //                     pattern, keeping an on-going minimum-match-length.  For any | 
 | //                     op where the min match coming in is zero, add that ops possible | 
 | //                     starting matches to the possible starts for the overall pattern. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void   RegexCompile::matchStartType() { | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         return; | 
 |     } | 
 |  | 
 |  | 
 |     int32_t    loc;                    // Location in the pattern of the current op being processed. | 
 |     int32_t    op;                     // The op being processed | 
 |     int32_t    opType;                 // The opcode type of the op | 
 |     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern | 
 |     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start. | 
 |  | 
 |     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered | 
 |                                        //   could have advanced the position in a match. | 
 |                                        //   (Maximum match length so far == 0) | 
 |  | 
 |     // forwardedLength is a vector holding minimum-match-length values that | 
 |     //   are propagated forward in the pattern by JMP or STATE_SAVE operations. | 
 |     //   It must be one longer than the pattern being checked because some  ops | 
 |     //   will jmp to a end-of-block+1 location from within a block, and we must | 
 |     //   count those when checking the block. | 
 |     int32_t end = fRXPat->fCompiledPat->size(); | 
 |     UVector32  forwardedLength(end+1, *fStatus); | 
 |     forwardedLength.setSize(end+1); | 
 |     for (loc=3; loc<end; loc++) { | 
 |         forwardedLength.setElementAt(INT32_MAX, loc); | 
 |     } | 
 |  | 
 |     for (loc = 3; loc<end; loc++) { | 
 |         op = fRXPat->fCompiledPat->elementAti(loc); | 
 |         opType = URX_TYPE(op); | 
 |  | 
 |         // The loop is advancing linearly through the pattern. | 
 |         // If the op we are now at was the destination of a branch in the pattern, | 
 |         // and that path has a shorter minimum length than the current accumulated value, | 
 |         // replace the current accumulated value. | 
 |         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); | 
 |         if (forwardedLength.elementAti(loc) < currentLen) { | 
 |             currentLen = forwardedLength.elementAti(loc); | 
 |             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); | 
 |         } | 
 |  | 
 |         switch (opType) { | 
 |             // Ops that don't change the total length matched | 
 |         case URX_RESERVED_OP: | 
 |         case URX_END: | 
 |         case URX_STRING_LEN: | 
 |         case URX_NOP: | 
 |         case URX_START_CAPTURE: | 
 |         case URX_END_CAPTURE: | 
 |         case URX_BACKSLASH_B: | 
 |         case URX_BACKSLASH_BU: | 
 |         case URX_BACKSLASH_G: | 
 |         case URX_BACKSLASH_Z: | 
 |         case URX_DOLLAR: | 
 |         case URX_RELOC_OPRND: | 
 |         case URX_STO_INP_LOC: | 
 |         case URX_DOLLAR_M: | 
 |         case URX_BACKTRACK: | 
 |         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match | 
 |         case URX_BACKREF_I: | 
 |  | 
 |         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match. | 
 |         case URX_LD_SP: | 
 |             break; | 
 |              | 
 |         case URX_CARET: | 
 |             if (atStart) { | 
 |                 fRXPat->fStartType = START_START; | 
 |             } | 
 |             break; | 
 |  | 
 |         case URX_CARET_M: | 
 |             if (atStart) { | 
 |                 fRXPat->fStartType = START_LINE; | 
 |             } | 
 |             break; | 
 |                  | 
 |         case URX_ONECHAR: | 
 |             if (currentLen == 0) { | 
 |                 // This character could appear at the start of a match. | 
 |                 //   Add it to the set of possible starting characters. | 
 |                 fRXPat->fInitialChars->add(URX_VAL(op)); | 
 |                 numInitialStrings += 2; | 
 |             } | 
 |             currentLen++; | 
 |             atStart = FALSE; | 
 |             break; | 
 |              | 
 |  | 
 |         case URX_SETREF:       | 
 |             if (currentLen == 0) { | 
 |                 int32_t  sn = URX_VAL(op); | 
 |                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); | 
 |                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); | 
 |                 fRXPat->fInitialChars->addAll(*s); | 
 |                 numInitialStrings += 2; | 
 |             } | 
 |             currentLen++; | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |         case URX_LOOP_SR_I: | 
 |             // [Set]*, like a SETREF, above, in what it can match, | 
 |             //  but may not match at all, so currentLen is not incremented. | 
 |             if (currentLen == 0) { | 
 |                 int32_t  sn = URX_VAL(op); | 
 |                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); | 
 |                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); | 
 |                 fRXPat->fInitialChars->addAll(*s); | 
 |                 numInitialStrings += 2; | 
 |             } | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |         case URX_LOOP_DOT_I: | 
 |             if (currentLen == 0) { | 
 |                 // .* at the start of a pattern. | 
 |                 //    Any character can begin the match. | 
 |                 fRXPat->fInitialChars->clear(); | 
 |                 fRXPat->fInitialChars->complement(); | 
 |                 numInitialStrings += 2; | 
 |             } | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_STATIC_SETREF:     | 
 |             if (currentLen == 0) { | 
 |                 int32_t  sn = URX_VAL(op); | 
 |                 U_ASSERT(sn>0 && sn<URX_LAST_SET); | 
 |                 const UnicodeSet *s = fRXPat->fStaticSets[sn]; | 
 |                 fRXPat->fInitialChars->addAll(*s); | 
 |                 numInitialStrings += 2; | 
 |             } | 
 |             currentLen++; | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |  | 
 |  | 
 |         case URX_STAT_SETREF_N:     | 
 |             if (currentLen == 0) { | 
 |                 int32_t  sn = URX_VAL(op); | 
 |                 const UnicodeSet *s = fRXPat->fStaticSets[sn]; | 
 |                 UnicodeSet sc(*s); | 
 |                 sc.complement(); | 
 |                 fRXPat->fInitialChars->addAll(sc); | 
 |                 numInitialStrings += 2; | 
 |             } | 
 |             currentLen++; | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |  | 
 |  | 
 |         case URX_BACKSLASH_D: | 
 |             // Digit Char | 
 |              if (currentLen == 0) { | 
 |                  UnicodeSet s;    | 
 |                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); | 
 |                  if (URX_VAL(op) != 0) { | 
 |                      s.complement(); | 
 |                  } | 
 |                  fRXPat->fInitialChars->addAll(s); | 
 |                  numInitialStrings += 2; | 
 |             } | 
 |             currentLen++; | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_ONECHAR_I: | 
 |             // Case Insensitive Single Character. | 
 |             if (currentLen == 0) { | 
 |                 UChar32  c = URX_VAL(op); | 
 |                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { | 
 |                     // character may have distinct cased forms.  Add all of them | 
 |                     //   to the set of possible starting match chars. | 
 |                     UnicodeSet s(c, c); | 
 |                     s.closeOver(USET_CASE_INSENSITIVE); | 
 |                     fRXPat->fInitialChars->addAll(s); | 
 |                 } else { | 
 |                     // Char has no case variants.  Just add it as-is to the | 
 |                     //   set of possible starting chars. | 
 |                     fRXPat->fInitialChars->add(c); | 
 |                 } | 
 |                 numInitialStrings += 2; | 
 |             } | 
 |             currentLen++; | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded. | 
 |         case URX_DOTANY_ALL:    // . matches one or two. | 
 |         case URX_DOTANY: | 
 |         case URX_DOTANY_ALL_PL: | 
 |         case URX_DOTANY_PL: | 
 |             if (currentLen == 0) { | 
 |                 // These constructs are all bad news when they appear at the start | 
 |                 //   of a match.  Any character can begin the match. | 
 |                 fRXPat->fInitialChars->clear(); | 
 |                 fRXPat->fInitialChars->complement(); | 
 |                 numInitialStrings += 2; | 
 |             } | 
 |             currentLen++; | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_JMPX: | 
 |             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP. | 
 |         case URX_JMP: | 
 |             { | 
 |                 int32_t  jmpDest = URX_VAL(op); | 
 |                 if (jmpDest < loc) { | 
 |                     // Loop of some kind.  Can safely ignore, the worst that will happen | 
 |                     //  is that we understate the true minimum length | 
 |                     currentLen = forwardedLength.elementAti(loc+1); | 
 |                     | 
 |                 } else { | 
 |                     // Forward jump.  Propagate the current min length to the target loc of the jump. | 
 |                     U_ASSERT(jmpDest <= end+1); | 
 |                     if (forwardedLength.elementAti(jmpDest) > currentLen) { | 
 |                         forwardedLength.setElementAt(currentLen, jmpDest); | 
 |                     } | 
 |                 } | 
 |             } | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |         case URX_JMP_SAV: | 
 |         case URX_JMP_SAV_X: | 
 |             // Combo of state save to the next loc, + jmp backwards. | 
 |             //   Net effect on min. length computation is nothing. | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |         case URX_FAIL: | 
 |             // Fails are kind of like a branch, except that the min length was | 
 |             //   propagated already, by the state save. | 
 |             currentLen = forwardedLength.elementAti(loc+1); | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_STATE_SAVE: | 
 |             { | 
 |                 // State Save, for forward jumps, propagate the current minimum. | 
 |                 //             of the state save. | 
 |                 int32_t  jmpDest = URX_VAL(op); | 
 |                 if (jmpDest > loc) { | 
 |                     if (currentLen < forwardedLength.elementAti(jmpDest)) { | 
 |                         forwardedLength.setElementAt(currentLen, jmpDest); | 
 |                     } | 
 |                 }  | 
 |             } | 
 |             atStart = FALSE; | 
 |             break; | 
 |              | 
 |  | 
 |  | 
 |  | 
 |         case URX_STRING: | 
 |             { | 
 |                 loc++; | 
 |                 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc); | 
 |                 int32_t stringLen   = URX_VAL(stringLenOp); | 
 |                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); | 
 |                 U_ASSERT(stringLenOp >= 2); | 
 |                 if (currentLen == 0) { | 
 |                     // Add the starting character of this string to the set of possible starting | 
 |                     //   characters for this pattern. | 
 |                     int32_t stringStartIdx = URX_VAL(op); | 
 |                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx); | 
 |                     fRXPat->fInitialChars->add(c); | 
 |  | 
 |                     // Remember this string.  After the entire pattern has been checked, | 
 |                     //  if nothing else is identified that can start a match, we'll use it. | 
 |                     numInitialStrings++; | 
 |                     fRXPat->fInitialStringIdx = stringStartIdx; | 
 |                     fRXPat->fInitialStringLen = stringLen; | 
 |                 } | 
 |                      | 
 |                 currentLen += stringLen; | 
 |                 atStart = FALSE; | 
 |             } | 
 |             break; | 
 |  | 
 |         case URX_STRING_I: | 
 |             { | 
 |                 // Case-insensitive string.  Unlike exact-match strings, we won't | 
 |                 //   attempt a string search for possible match positions.  But we | 
 |                 //   do update the set of possible starting characters. | 
 |                 loc++; | 
 |                 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc); | 
 |                 int32_t stringLen   = URX_VAL(stringLenOp); | 
 |                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); | 
 |                 U_ASSERT(stringLenOp >= 2); | 
 |                 if (currentLen == 0) { | 
 |                     // Add the starting character of this string to the set of possible starting | 
 |                     //   characters for this pattern. | 
 |                     int32_t stringStartIdx = URX_VAL(op); | 
 |                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx); | 
 |                     UnicodeSet s(c, c); | 
 |                     s.closeOver(USET_CASE_INSENSITIVE); | 
 |                     fRXPat->fInitialChars->addAll(s); | 
 |                     numInitialStrings += 2;  // Matching on an initial string not possible. | 
 |                 } | 
 |                 currentLen += stringLen; | 
 |                 atStart = FALSE; | 
 |             } | 
 |             break; | 
 |  | 
 |         case URX_CTR_INIT: | 
 |         case URX_CTR_INIT_NG: | 
 |             { | 
 |                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops | 
 |                 //   so location must be updated accordingly. | 
 |                 // Loop Init Ops.   | 
 |                 //   If the min loop count == 0 | 
 |                 //      move loc forwards to the end of the loop, skipping over the body. | 
 |                 //   If the min count is > 0,  | 
 |                 //      continue normal processing of the body of the loop. | 
 |                 int32_t loopEndLoc   = fRXPat->fCompiledPat->elementAti(loc+1); | 
 |                         loopEndLoc   = URX_VAL(loopEndLoc); | 
 |                 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2); | 
 |                 if (minLoopCount == 0) { | 
 |                     // Min Loop Count of 0, treat like a forward branch and | 
 |                     //   move the current minimum length up to the target | 
 |                     //   (end of loop) location. | 
 |                     U_ASSERT(loopEndLoc <= end+1); | 
 |                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) { | 
 |                         forwardedLength.setElementAt(currentLen, loopEndLoc); | 
 |                     } | 
 |                 }  | 
 |                 loc+=3;  // Skips over operands of CTR_INIT | 
 |             } | 
 |             atStart = FALSE; | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_CTR_LOOP: | 
 |         case URX_CTR_LOOP_NG: | 
 |             // Loop ops.  | 
 |             //  The jump is conditional, backwards only. | 
 |             atStart = FALSE; | 
 |             break; | 
 |              | 
 |         case URX_LOOP_C: | 
 |             // More loop ops.  These state-save to themselves. | 
 |             //   don't change the minimum match | 
 |             atStart = FALSE; | 
 |             break; | 
 |              | 
 |  | 
 |         case URX_LA_START: | 
 |         case URX_LB_START: | 
 |             { | 
 |                 // Look-around.  Scan forward until the matching look-ahead end, | 
 |                 //   without processing the look-around block.  This is overly pessimistic. | 
 |                 int32_t  depth = 0; | 
 |                 for (;;) { | 
 |                     loc++; | 
 |                     op = fRXPat->fCompiledPat->elementAti(loc); | 
 |                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { | 
 |                         depth++; | 
 |                     } | 
 |                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { | 
 |                         if (depth == 0) { | 
 |                             break; | 
 |                         } | 
 |                         depth--; | 
 |                     } | 
 |                     if (URX_TYPE(op) == URX_STATE_SAVE) { | 
 |                         // Need this because neg lookahead blocks will FAIL to outside | 
 |                         //   of the block. | 
 |                         int32_t  jmpDest = URX_VAL(op); | 
 |                         if (jmpDest > loc) { | 
 |                             if (currentLen < forwardedLength.elementAti(jmpDest)) { | 
 |                                 forwardedLength.setElementAt(currentLen, jmpDest); | 
 |                             } | 
 |                         } | 
 |                     } | 
 |                     U_ASSERT(loc <= end);   | 
 |                 } | 
 |             } | 
 |             break; | 
 |              | 
 |         case URX_LA_END: | 
 |         case URX_LB_CONT: | 
 |         case URX_LB_END: | 
 |         case URX_LBN_CONT: | 
 |         case URX_LBN_END: | 
 |             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be  | 
 |                                  //  consumed by the scan in URX_LA_START and LB_START | 
 |  | 
 |             break; | 
 |              | 
 |         default: | 
 |             U_ASSERT(FALSE); | 
 |             } | 
 |              | 
 |         } | 
 |  | 
 |  | 
 |     // We have finished walking through the ops.  Check whether some forward jump | 
 |     //   propagated a shorter length to location end+1. | 
 |     if (forwardedLength.elementAti(end+1) < currentLen) { | 
 |         currentLen = forwardedLength.elementAti(end+1); | 
 |     } | 
 |  | 
 |  | 
 |     fRXPat->fInitialChars8->init(fRXPat->fInitialChars); | 
 |  | 
 |  | 
 |     // Sort out what we should check for when looking for candidate match start positions. | 
 |     // In order of preference, | 
 |     //     1.   Start of input text buffer. | 
 |     //     2.   A literal string. | 
 |     //     3.   Start of line in multi-line mode. | 
 |     //     4.   A single literal character. | 
 |     //     5.   A character from a set of characters. | 
 |     // | 
 |     if (fRXPat->fStartType == START_START) { | 
 |         // Match only at the start of an input text string. | 
 |         //    start type is already set.  We're done. | 
 |     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { | 
 |         // Match beginning only with a literal string. | 
 |         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); | 
 |         U_ASSERT(fRXPat->fInitialChars->contains(c)); | 
 |         fRXPat->fStartType   = START_STRING; | 
 |         fRXPat->fInitialChar = c; | 
 |     } else if (fRXPat->fStartType == START_LINE) { | 
 |         // Match at start of line in Multi-Line mode. | 
 |         // Nothing to do here; everything is already set. | 
 |     } else if (fRXPat->fMinMatchLen == 0) { | 
 |         // Zero length match possible.  We could start anywhere. | 
 |         fRXPat->fStartType = START_NO_INFO; | 
 |     } else if (fRXPat->fInitialChars->size() == 1) { | 
 |         // All matches begin with the same char. | 
 |         fRXPat->fStartType   = START_CHAR; | 
 |         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); | 
 |         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); | 
 |     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE && | 
 |         fRXPat->fMinMatchLen > 0) { | 
 |         // Matches start with a set of character smaller than the set of all chars. | 
 |         fRXPat->fStartType = START_SET; | 
 |     } else { | 
 |         // Matches can start with anything | 
 |         fRXPat->fStartType = START_NO_INFO; | 
 |     } | 
 |  | 
 |     return; | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   minMatchLength    Calculate the length of the shortest string that could | 
 | //                     match the specified pattern.    | 
 | //                     Length is in 16 bit code units, not code points. | 
 | // | 
 | //                     The calculated length may not be exact.  The returned | 
 | //                     value may be shorter than the actual minimum; it must | 
 | //                     never be longer. | 
 | // | 
 | //                     start and end are the range of p-code operations to be | 
 | //                     examined.  The endpoints are included in the range. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) { | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     U_ASSERT(start <= end); | 
 |     U_ASSERT(end < fRXPat->fCompiledPat->size()); | 
 |  | 
 |  | 
 |     int32_t    loc; | 
 |     int32_t    op; | 
 |     int32_t    opType; | 
 |     int32_t    currentLen = 0; | 
 |  | 
 |  | 
 |     // forwardedLength is a vector holding minimum-match-length values that | 
 |     //   are propagated forward in the pattern by JMP or STATE_SAVE operations. | 
 |     //   It must be one longer than the pattern being checked because some  ops | 
 |     //   will jmp to a end-of-block+1 location from within a block, and we must | 
 |     //   count those when checking the block. | 
 |     UVector32  forwardedLength(end+2, *fStatus); | 
 |     forwardedLength.setSize(end+2); | 
 |     for (loc=start; loc<=end+1; loc++) { | 
 |         forwardedLength.setElementAt(INT32_MAX, loc); | 
 |     } | 
 |  | 
 |     for (loc = start; loc<=end; loc++) { | 
 |         op = fRXPat->fCompiledPat->elementAti(loc); | 
 |         opType = URX_TYPE(op); | 
 |  | 
 |         // The loop is advancing linearly through the pattern. | 
 |         // If the op we are now at was the destination of a branch in the pattern, | 
 |         // and that path has a shorter minimum length than the current accumulated value, | 
 |         // replace the current accumulated value. | 
 |         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); | 
 |         if (forwardedLength.elementAti(loc) < currentLen) { | 
 |             currentLen = forwardedLength.elementAti(loc); | 
 |             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); | 
 |         } | 
 |  | 
 |         switch (opType) { | 
 |             // Ops that don't change the total length matched | 
 |         case URX_RESERVED_OP: | 
 |         case URX_END: | 
 |         case URX_STRING_LEN: | 
 |         case URX_NOP: | 
 |         case URX_START_CAPTURE: | 
 |         case URX_END_CAPTURE: | 
 |         case URX_BACKSLASH_B: | 
 |         case URX_BACKSLASH_BU: | 
 |         case URX_BACKSLASH_G: | 
 |         case URX_BACKSLASH_Z: | 
 |         case URX_CARET: | 
 |         case URX_DOLLAR: | 
 |         case URX_RELOC_OPRND: | 
 |         case URX_STO_INP_LOC: | 
 |         case URX_DOLLAR_M: | 
 |         case URX_CARET_M: | 
 |         case URX_BACKTRACK: | 
 |         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match | 
 |         case URX_BACKREF_I: | 
 |  | 
 |         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match. | 
 |         case URX_LD_SP: | 
 |  | 
 |         case URX_JMP_SAV: | 
 |         case URX_JMP_SAV_X: | 
 |             break; | 
 |              | 
 |  | 
 |             // Ops that match a minimum of one character (one or two 16 bit code units.) | 
 |             //    | 
 |         case URX_ONECHAR: | 
 |         case URX_STATIC_SETREF: | 
 |         case URX_STAT_SETREF_N: | 
 |         case URX_SETREF: | 
 |         case URX_BACKSLASH_D: | 
 |         case URX_ONECHAR_I: | 
 |         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded. | 
 |         case URX_DOTANY_ALL:    // . matches one or two. | 
 |         case URX_DOTANY: | 
 |         case URX_DOTANY_PL: | 
 |         case URX_DOTANY_ALL_PL: | 
 |             currentLen++; | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_JMPX: | 
 |             loc++;              // URX_JMPX has an extra operand, ignored here, | 
 |                                 //   otherwise processed identically to URX_JMP. | 
 |         case URX_JMP: | 
 |             { | 
 |                 int32_t  jmpDest = URX_VAL(op); | 
 |                 if (jmpDest < loc) { | 
 |                     // Loop of some kind.  Can safely ignore, the worst that will happen | 
 |                     //  is that we understate the true minimum length | 
 |                     currentLen = forwardedLength.elementAti(loc+1); | 
 |                 } else { | 
 |                     // Forward jump.  Propagate the current min length to the target loc of the jump. | 
 |                     U_ASSERT(jmpDest <= end+1); | 
 |                     if (forwardedLength.elementAti(jmpDest) > currentLen) { | 
 |                         forwardedLength.setElementAt(currentLen, jmpDest); | 
 |                     } | 
 |                 } | 
 |             } | 
 |             break; | 
 |  | 
 |         case URX_FAIL: | 
 |             { | 
 |                 // Fails are kind of like a branch, except that the min length was | 
 |                 //   propagated already, by the state save. | 
 |                 currentLen = forwardedLength.elementAti(loc+1); | 
 |                 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); | 
 |             } | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_STATE_SAVE: | 
 |             { | 
 |                 // State Save, for forward jumps, propagate the current minimum. | 
 |                 //             of the state save. | 
 |                 int32_t  jmpDest = URX_VAL(op); | 
 |                 if (jmpDest > loc) { | 
 |                     if (currentLen < forwardedLength.elementAti(jmpDest)) { | 
 |                         forwardedLength.setElementAt(currentLen, jmpDest); | 
 |                     } | 
 |                 }  | 
 |             } | 
 |             break; | 
 |              | 
 |  | 
 |         case URX_STRING: | 
 |         case URX_STRING_I: | 
 |             { | 
 |                 loc++; | 
 |                 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc); | 
 |                 currentLen += URX_VAL(stringLenOp); | 
 |             } | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_CTR_INIT: | 
 |         case URX_CTR_INIT_NG: | 
 |             { | 
 |                 // Loop Init Ops.   | 
 |                 //   If the min loop count == 0 | 
 |                 //      move loc forwards to the end of the loop, skipping over the body. | 
 |                 //   If the min count is > 0,  | 
 |                 //      continue normal processing of the body of the loop. | 
 |                 int32_t loopEndLoc   = fRXPat->fCompiledPat->elementAti(loc+1); | 
 |                         loopEndLoc   = URX_VAL(loopEndLoc); | 
 |                 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2); | 
 |                 if (minLoopCount == 0) { | 
 |                     loc = loopEndLoc; | 
 |                 } else { | 
 |                     loc+=3;  // Skips over operands of CTR_INIT | 
 |                 } | 
 |             } | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_CTR_LOOP: | 
 |         case URX_CTR_LOOP_NG: | 
 |             // Loop ops.  | 
 |             //  The jump is conditional, backwards only. | 
 |             break; | 
 |              | 
 |         case URX_LOOP_SR_I: | 
 |         case URX_LOOP_DOT_I: | 
 |         case URX_LOOP_C: | 
 |             // More loop ops.  These state-save to themselves. | 
 |             //   don't change the minimum match - could match nothing at all. | 
 |             break; | 
 |              | 
 |  | 
 |         case URX_LA_START: | 
 |         case URX_LB_START: | 
 |             { | 
 |                 // Look-around.  Scan forward until the matching look-ahead end, | 
 |                 //   without processing the look-around block.  This is overly pessimistic. | 
 |                 //   TODO:  Positive lookahead could recursively do the block, then continue | 
 |                 //          with the longer of the block or the value coming in. | 
 |                 int32_t  depth = 0; | 
 |                 for (;;) { | 
 |                     loc++; | 
 |                     op = fRXPat->fCompiledPat->elementAti(loc); | 
 |                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { | 
 |                         depth++; | 
 |                     } | 
 |                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { | 
 |                         if (depth == 0) { | 
 |                             break; | 
 |                         } | 
 |                         depth--; | 
 |                     } | 
 |                     if (URX_TYPE(op) == URX_STATE_SAVE) { | 
 |                         // Need this because neg lookahead blocks will FAIL to outside | 
 |                         //   of the block. | 
 |                         int32_t  jmpDest = URX_VAL(op); | 
 |                         if (jmpDest > loc) { | 
 |                             if (currentLen < forwardedLength.elementAti(jmpDest)) { | 
 |                                 forwardedLength.setElementAt(currentLen, jmpDest); | 
 |                             } | 
 |                         } | 
 |                     } | 
 |                          | 
 |                     U_ASSERT(loc <= end);   | 
 |                 } | 
 |             } | 
 |             break; | 
 |              | 
 |         case URX_LA_END: | 
 |         case URX_LB_CONT: | 
 |         case URX_LB_END: | 
 |         case URX_LBN_CONT: | 
 |         case URX_LBN_END: | 
 |             // Only come here if the matching URX_LA_START or URX_LB_START was not in the | 
 |             //   range being sized, which happens when measuring size of look-behind blocks. | 
 |             break; | 
 |              | 
 |         default: | 
 |             U_ASSERT(FALSE); | 
 |             } | 
 |              | 
 |         } | 
 |  | 
 |     // We have finished walking through the ops.  Check whether some forward jump | 
 |     //   propagated a shorter length to location end+1. | 
 |     if (forwardedLength.elementAti(end+1) < currentLen) { | 
 |         currentLen = forwardedLength.elementAti(end+1); | 
 |         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); | 
 |     } | 
 |              | 
 |     return currentLen; | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   maxMatchLength    Calculate the length of the longest string that could | 
 | //                     match the specified pattern.    | 
 | //                     Length is in 16 bit code units, not code points. | 
 | // | 
 | //                     The calculated length may not be exact.  The returned | 
 | //                     value may be longer than the actual maximum; it must | 
 | //                     never be shorter. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) { | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         return 0; | 
 |     } | 
 |     U_ASSERT(start <= end); | 
 |     U_ASSERT(end < fRXPat->fCompiledPat->size()); | 
 |  | 
 |  | 
 |     int32_t    loc; | 
 |     int32_t    op; | 
 |     int32_t    opType; | 
 |     int32_t    currentLen = 0; | 
 |     UVector32  forwardedLength(end+1, *fStatus); | 
 |     forwardedLength.setSize(end+1); | 
 |  | 
 |     for (loc=start; loc<=end; loc++) { | 
 |         forwardedLength.setElementAt(0, loc); | 
 |     } | 
 |  | 
 |     for (loc = start; loc<=end; loc++) { | 
 |         op = fRXPat->fCompiledPat->elementAti(loc); | 
 |         opType = URX_TYPE(op); | 
 |  | 
 |         // The loop is advancing linearly through the pattern. | 
 |         // If the op we are now at was the destination of a branch in the pattern, | 
 |         // and that path has a longer maximum length than the current accumulated value, | 
 |         // replace the current accumulated value. | 
 |         if (forwardedLength.elementAti(loc) > currentLen) { | 
 |             currentLen = forwardedLength.elementAti(loc); | 
 |         } | 
 |  | 
 |         switch (opType) { | 
 |             // Ops that don't change the total length matched | 
 |         case URX_RESERVED_OP: | 
 |         case URX_END: | 
 |         case URX_STRING_LEN: | 
 |         case URX_NOP: | 
 |         case URX_START_CAPTURE: | 
 |         case URX_END_CAPTURE: | 
 |         case URX_BACKSLASH_B: | 
 |         case URX_BACKSLASH_BU: | 
 |         case URX_BACKSLASH_G: | 
 |         case URX_BACKSLASH_Z: | 
 |         case URX_CARET: | 
 |         case URX_DOLLAR: | 
 |         case URX_RELOC_OPRND: | 
 |         case URX_STO_INP_LOC: | 
 |         case URX_DOLLAR_M: | 
 |         case URX_CARET_M: | 
 |         case URX_BACKTRACK: | 
 |  | 
 |         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match. | 
 |         case URX_LD_SP: | 
 |  | 
 |         case URX_LB_END: | 
 |         case URX_LB_CONT: | 
 |         case URX_LBN_CONT: | 
 |         case URX_LBN_END: | 
 |             break; | 
 |              | 
 |  | 
 |             // Ops that increase that cause an unbounded increase in the length | 
 |             //   of a matched string, or that increase it a hard to characterize way. | 
 |             //   Call the max length unbounded, and stop further checking. | 
 |         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match | 
 |         case URX_BACKREF_I: | 
 |         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded. | 
 |         case URX_DOTANY_PL: | 
 |         case URX_DOTANY_ALL_PL: | 
 |             currentLen = INT32_MAX; | 
 |             break; | 
 |  | 
 |  | 
 |             // Ops that match a max of one character (possibly two 16 bit code units.) | 
 |             //    | 
 |         case URX_STATIC_SETREF: | 
 |         case URX_STAT_SETREF_N: | 
 |         case URX_SETREF: | 
 |         case URX_BACKSLASH_D: | 
 |         case URX_ONECHAR_I: | 
 |         case URX_DOTANY_ALL:   | 
 |         case URX_DOTANY: | 
 |             currentLen+=2; | 
 |             break; | 
 |  | 
 |             // Single literal character.  Increase current max length by one or two, | 
 |             //       depending on whether the char is in the supplementary range. | 
 |         case URX_ONECHAR: | 
 |             currentLen++; | 
 |             if (URX_VAL(op) > 0x10000) { | 
 |                 currentLen++; | 
 |             } | 
 |             break; | 
 |  | 
 |             // Jumps.   | 
 |             // | 
 |         case URX_JMP: | 
 |         case URX_JMPX: | 
 |         case URX_JMP_SAV: | 
 |         case URX_JMP_SAV_X: | 
 |             { | 
 |                 int32_t  jmpDest = URX_VAL(op); | 
 |                 if (jmpDest < loc) { | 
 |                     // Loop of some kind.  Max match length is unbounded. | 
 |                     currentLen = INT32_MAX; | 
 |                 } else { | 
 |                     // Forward jump.  Propagate the current min length to the target loc of the jump. | 
 |                     if (forwardedLength.elementAti(jmpDest) < currentLen) { | 
 |                         forwardedLength.setElementAt(currentLen, jmpDest); | 
 |                     } | 
 |                     currentLen = 0; | 
 |                 } | 
 |             } | 
 |             break; | 
 |  | 
 |         case URX_FAIL: | 
 |             // Fails are kind of like a branch, except that the max length was | 
 |             //   propagated already, by the state save. | 
 |             currentLen = forwardedLength.elementAti(loc+1); | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_STATE_SAVE: | 
 |             { | 
 |                 // State Save, for forward jumps, propagate the current minimum. | 
 |                 //               of the state save. | 
 |                 //             For backwards jumps, they create a loop, maximum | 
 |                 //               match length is unbounded. | 
 |                 int32_t  jmpDest = URX_VAL(op); | 
 |                 if (jmpDest > loc) { | 
 |                     if (currentLen > forwardedLength.elementAti(jmpDest)) { | 
 |                         forwardedLength.setElementAt(currentLen, jmpDest); | 
 |                     } | 
 |                 } else { | 
 |                     currentLen = INT32_MAX; | 
 |                 } | 
 |             } | 
 |             break; | 
 |              | 
 |  | 
 |  | 
 |  | 
 |         case URX_STRING: | 
 |         case URX_STRING_I: | 
 |             { | 
 |                 loc++; | 
 |                 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc); | 
 |                 currentLen += URX_VAL(stringLenOp); | 
 |             } | 
 |             break; | 
 |  | 
 |  | 
 |         case URX_CTR_INIT: | 
 |         case URX_CTR_INIT_NG: | 
 |         case URX_CTR_LOOP: | 
 |         case URX_CTR_LOOP_NG: | 
 |         case URX_LOOP_SR_I: | 
 |         case URX_LOOP_DOT_I: | 
 |         case URX_LOOP_C: | 
 |             // For anything to do with loops, make the match length unbounded. | 
 |             //   Note:  INIT instructions are multi-word.  Can ignore because | 
 |             //          INT32_MAX length will stop the per-instruction loop. | 
 |             currentLen = INT32_MAX; | 
 |             break; | 
 |              | 
 |              | 
 |  | 
 |         case URX_LA_START: | 
 |         case URX_LA_END: | 
 |             // Look-ahead.  Just ignore, treat the look-ahead block as if | 
 |             // it were normal pattern.  Gives a too-long match length, | 
 |             //  but good enough for now. | 
 |             break; | 
 |              | 
 |             // End of look-ahead ops should always be consumed by the processing at | 
 |             //  the URX_LA_START op. | 
 |             // U_ASSERT(FALSE); | 
 |             // break; | 
 |              | 
 |         case URX_LB_START: | 
 |             { | 
 |                 // Look-behind.  Scan forward until the matching look-around end, | 
 |                 //   without processing the look-behind block.   | 
 |                 int32_t  depth = 0; | 
 |                 for (;;) { | 
 |                     loc++; | 
 |                     op = fRXPat->fCompiledPat->elementAti(loc); | 
 |                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { | 
 |                         depth++; | 
 |                     } | 
 |                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { | 
 |                         if (depth == 0) { | 
 |                             break; | 
 |                         } | 
 |                         depth--; | 
 |                     } | 
 |                     U_ASSERT(loc < end);   | 
 |                 } | 
 |             } | 
 |             break; | 
 |  | 
 |         default: | 
 |             U_ASSERT(FALSE); | 
 |         } | 
 |  | 
 |              | 
 |         if (currentLen == INT32_MAX) { | 
 |             //  The maximum length is unbounded. | 
 |             //  Stop further processing of the pattern. | 
 |             break; | 
 |         } | 
 |          | 
 |     } | 
 |     return currentLen; | 
 |      | 
 | } | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   stripNOPs    Remove any NOP operations from the compiled pattern code. | 
 | //                Extra NOPs are inserted for some constructs during the initial | 
 | //                code generation to provide locations that may be patched later. | 
 | //                Many end up unneeded, and are removed by this function. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void RegexCompile::stripNOPs() { | 
 |  | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     int32_t    end = fRXPat->fCompiledPat->size(); | 
 |     UVector32  deltas(end, *fStatus); | 
 |  | 
 |     // Make a first pass over the code, computing the amount that things | 
 |     //   will be offset at each location in the original code. | 
 |     int32_t   loc; | 
 |     int32_t   d = 0; | 
 |     for (loc=0; loc<end; loc++) { | 
 |         deltas.addElement(d, *fStatus); | 
 |         int32_t op = fRXPat->fCompiledPat->elementAti(loc); | 
 |         if (URX_TYPE(op) == URX_NOP) { | 
 |             d++; | 
 |         } | 
 |     } | 
 |  | 
 |     // Make a second pass over the code, removing the NOPs by moving following | 
 |     //  code up, and patching operands that refer to code locations that | 
 |     //  are being moved.  The array of offsets from the first step is used | 
 |     //  to compute the new operand values. | 
 |     int32_t src; | 
 |     int32_t dst = 0; | 
 |     for (src=0; src<end; src++) { | 
 |         int32_t op = fRXPat->fCompiledPat->elementAti(src); | 
 |         int32_t opType = URX_TYPE(op); | 
 |         switch (opType) { | 
 |         case URX_NOP: | 
 |             break; | 
 |  | 
 |         case URX_STATE_SAVE: | 
 |         case URX_JMP: | 
 |         case URX_CTR_LOOP: | 
 |         case URX_CTR_LOOP_NG: | 
 |         case URX_RELOC_OPRND: | 
 |         case URX_JMPX: | 
 |         case URX_JMP_SAV: | 
 |         case URX_JMP_SAV_X: | 
 |             // These are instructions with operands that refer to code locations. | 
 |             { | 
 |                 int32_t  operandAddress = URX_VAL(op); | 
 |                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); | 
 |                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); | 
 |                 op = URX_BUILD(opType, fixedOperandAddress); | 
 |                 fRXPat->fCompiledPat->setElementAt(op, dst); | 
 |                 dst++; | 
 |                 break; | 
 |             } | 
 |  | 
 |         case URX_RESERVED_OP: | 
 |         case URX_RESERVED_OP_N: | 
 |         case URX_BACKTRACK: | 
 |         case URX_END: | 
 |         case URX_ONECHAR: | 
 |         case URX_STRING: | 
 |         case URX_STRING_LEN: | 
 |         case URX_START_CAPTURE: | 
 |         case URX_END_CAPTURE: | 
 |         case URX_STATIC_SETREF: | 
 |         case URX_STAT_SETREF_N: | 
 |         case URX_SETREF: | 
 |         case URX_DOTANY: | 
 |         case URX_FAIL: | 
 |         case URX_BACKSLASH_B: | 
 |         case URX_BACKSLASH_BU: | 
 |         case URX_BACKSLASH_G: | 
 |         case URX_BACKSLASH_X: | 
 |         case URX_BACKSLASH_Z: | 
 |         case URX_DOTANY_ALL: | 
 |         case URX_DOTANY_ALL_PL: | 
 |         case URX_DOTANY_PL: | 
 |         case URX_BACKSLASH_D: | 
 |         case URX_CARET: | 
 |         case URX_DOLLAR: | 
 |         case URX_CTR_INIT: | 
 |         case URX_CTR_INIT_NG: | 
 |         case URX_STO_SP: | 
 |         case URX_LD_SP: | 
 |         case URX_BACKREF: | 
 |         case URX_STO_INP_LOC: | 
 |         case URX_LA_START: | 
 |         case URX_LA_END: | 
 |         case URX_ONECHAR_I: | 
 |         case URX_STRING_I: | 
 |         case URX_BACKREF_I: | 
 |         case URX_DOLLAR_M: | 
 |         case URX_CARET_M: | 
 |         case URX_LB_START: | 
 |         case URX_LB_CONT: | 
 |         case URX_LB_END: | 
 |         case URX_LBN_CONT: | 
 |         case URX_LBN_END: | 
 |         case URX_LOOP_SR_I: | 
 |         case URX_LOOP_DOT_I: | 
 |         case URX_LOOP_C: | 
 |             // These instructions are unaltered by the relocation. | 
 |             fRXPat->fCompiledPat->setElementAt(op, dst); | 
 |             dst++; | 
 |             break; | 
 |  | 
 |         default: | 
 |             // Some op is unaccounted for. | 
 |             U_ASSERT(FALSE); | 
 |             error(U_REGEX_INTERNAL_ERROR); | 
 |         } | 
 |     } | 
 |  | 
 |     fRXPat->fCompiledPat->setSize(dst); | 
 | } | 
 |  | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   OptDotStar       Optimize patterns that end with a '.*' or '.+' to | 
 | //                    just advance the input to the end. | 
 | // | 
 | //         Transform this compiled sequence | 
 | //            [DOT_ANY | DOT_ANY_ALL] | 
 | //            JMP_SAV  to previous instruction | 
 | //            [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]* | 
 | //            END | 
 | // | 
 | //         To | 
 | //            NOP | 
 | //            [DOT_ANY_PL | DOT_ANY_ALL_PL] | 
 | //            [NOP | END_CAPTURE | DOLLAR | BACKSLASH_Z]* | 
 | //            END | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void RegexCompile::OptDotStar() { | 
 |     // Scan backwards in the pattern, looking for a JMP_SAV near the end. | 
 |     int32_t  jmpLoc; | 
 |     int32_t  op = 0; | 
 |     int32_t  opType; | 
 |     for (jmpLoc=fRXPat->fCompiledPat->size(); jmpLoc--;) { | 
 |         U_ASSERT(jmpLoc>0); | 
 |         op     = fRXPat->fCompiledPat->elementAti(jmpLoc); | 
 |         opType = URX_TYPE(op); | 
 |         switch(opType) {  | 
 |  | 
 |              | 
 |         case URX_END: | 
 |         case URX_NOP: | 
 |         case URX_END_CAPTURE: | 
 |         case URX_DOLLAR_M: | 
 |         case URX_DOLLAR: | 
 |         case URX_BACKSLASH_Z: | 
 |             // These ops may follow the JMP_SAV without preventing us from | 
 |             //   doing this optimization. | 
 |             continue; | 
 |  | 
 |         case URX_JMP_SAV: | 
 |             // Got a trailing JMP_SAV that's a candidate for optimization. | 
 |             break; | 
 |  | 
 |         default: | 
 |             // This optimization not possible. | 
 |             return; | 
 |         } | 
 |         break;   // from the for loop. | 
 |     } | 
 |  | 
 |     // We found in URX_JMP_SAV near the end that is a candidate for optimizing. | 
 |     // Is the target address the previous instruction? | 
 |     // Is the previous instruction a flavor of URX_DOTANY | 
 |     int32_t  loopTopLoc = URX_VAL(op); | 
 |     if (loopTopLoc != jmpLoc-1) { | 
 |         return; | 
 |     } | 
 |     int32_t newOp; | 
 |     int32_t oldOp     = fRXPat->fCompiledPat->elementAti(loopTopLoc); | 
 |     int32_t oldOpType = opType = URX_TYPE(oldOp); | 
 |     if (oldOpType == URX_DOTANY) { | 
 |         newOp = URX_BUILD(URX_DOTANY_PL, 0); | 
 |     } | 
 |     else if (oldOpType == URX_DOTANY_ALL) { | 
 |         newOp = URX_BUILD(URX_DOTANY_ALL_PL, 0); | 
 |     } else { | 
 |         return;    // Sequence we were looking for isn't there. | 
 |     } | 
 |  | 
 |     // Substitute the new instructions into the pattern. | 
 |     // The NOP will be removed in a later optimization step. | 
 |     fRXPat->fCompiledPat->setElementAt(URX_BUILD(URX_NOP, 0), loopTopLoc); | 
 |     fRXPat->fCompiledPat->setElementAt(newOp, jmpLoc); | 
 | } | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //  Error         Report a rule parse error. | 
 | //                Only report it if no previous error has been recorded. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void RegexCompile::error(UErrorCode e) { | 
 |     if (U_SUCCESS(*fStatus)) { | 
 |         *fStatus = e; | 
 |         fParseErr->line   = fLineNum; | 
 |         fParseErr->offset = fCharNum; | 
 |  | 
 |         // Fill in the context. | 
 |         //   Note: extractBetween() pins supplied indicies to the string bounds. | 
 |         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext)); | 
 |         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); | 
 |         fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, | 
 |             fParseErr->preContext,  0); | 
 |         fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, | 
 |             fParseErr->postContext, 0); | 
 |     } | 
 | } | 
 |  | 
 |  | 
 | // | 
 | //  Assorted Unicode character constants. | 
 | //     Numeric because there is no portable way to enter them as literals. | 
 | //     (Think EBCDIC). | 
 | // | 
 | static const UChar      chCR        = 0x0d;      // New lines, for terminating comments. | 
 | static const UChar      chLF        = 0x0a; | 
 | static const UChar      chNEL       = 0x85;      //    NEL newline variant | 
 | static const UChar      chLS        = 0x2028;    //    Unicode Line Separator | 
 | static const UChar      chPound     = 0x23;      // '#', introduces a comment. | 
 | static const UChar      chE         = 0x45;      // 'E' | 
 | static const UChar      chUpperN    = 0x4E; | 
 | static const UChar      chLowerP    = 0x70; | 
 | static const UChar      chUpperP    = 0x50; | 
 | static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape | 
 | static const UChar      chLBracket  = 0x5b; | 
 | static const UChar      chRBracket  = 0x5d; | 
 | static const UChar      chRBrace    = 0x7d; | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //  nextCharLL    Low Level Next Char from the regex pattern. | 
 | //                Get a char from the string, keep track of input position | 
 | //                     for error reporting. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | UChar32  RegexCompile::nextCharLL() { | 
 |     UChar32       ch; | 
 |     UnicodeString &pattern = fRXPat->fPattern; | 
 |  | 
 |     if (fPeekChar != -1) { | 
 |         ch = fPeekChar; | 
 |         fPeekChar = -1; | 
 |         return ch; | 
 |     } | 
 |     if (fPatternLength==0 || fNextIndex >= fPatternLength) { | 
 |         return (UChar32)-1; | 
 |     } | 
 |     ch         = pattern.char32At(fNextIndex); | 
 |     fNextIndex = pattern.moveIndex32(fNextIndex, 1); | 
 |  | 
 |     if (ch == chCR || | 
 |         ch == chNEL || | 
 |         ch == chLS   || | 
 |         ch == chLF && fLastChar != chCR) { | 
 |         // Character is starting a new line.  Bump up the line number, and | 
 |         //  reset the column to 0. | 
 |         fLineNum++; | 
 |         fCharNum=0; | 
 |         if (fQuoteMode) { | 
 |             error(U_REGEX_RULE_SYNTAX); | 
 |             fQuoteMode = FALSE; | 
 |         } | 
 |     } | 
 |     else { | 
 |         // Character is not starting a new line.  Except in the case of a | 
 |         //   LF following a CR, increment the column position. | 
 |         if (ch != chLF) { | 
 |             fCharNum++; | 
 |         } | 
 |     } | 
 |     fLastChar = ch; | 
 |     return ch; | 
 | } | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   peekCharLL    Low Level Character Scanning, sneak a peek at the next | 
 | //                 character without actually getting it. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | UChar32  RegexCompile::peekCharLL() { | 
 |     if (fPeekChar == -1) { | 
 |         fPeekChar = nextCharLL(); | 
 |     } | 
 |     return fPeekChar; | 
 | } | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //   nextChar     for pattern scanning.  At this level, we handle stripping | 
 | //                out comments and processing some backslash character escapes. | 
 | //                The rest of the pattern grammar is handled at the next level up. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | void RegexCompile::nextChar(RegexPatternChar &c) { | 
 |  | 
 |     fScanIndex = fNextIndex; | 
 |     c.fChar    = nextCharLL(); | 
 |     c.fQuoted  = FALSE; | 
 |  | 
 |     if (fQuoteMode) { | 
 |         c.fQuoted = TRUE; | 
 |         if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) { | 
 |             fQuoteMode = FALSE;  //  Exit quote mode, | 
 |             nextCharLL();       // discard the E | 
 |             nextChar(c);        // recurse to get the real next char | 
 |         } | 
 |     } | 
 |     else if (fInBackslashQuote) { | 
 |         // The current character immediately follows a '\' | 
 |         // Don't check for any further escapes, just return it as-is. | 
 |         // Don't set c.fQuoted, because that would prevent the state machine from | 
 |         //    dispatching on the character. | 
 |         fInBackslashQuote = FALSE; | 
 |     } | 
 |     else | 
 |     { | 
 |         // We are not in a \Q quoted region \E of the source. | 
 |         // | 
 |         if (fModeFlags & UREGEX_COMMENTS) { | 
 |             // | 
 |             // We are in free-spacing and comments mode. | 
 |             //  Scan through any white space and comments, until we  | 
 |             //  reach a significant character or the end of inut. | 
 |             for (;;) { | 
 |                 if (c.fChar == (UChar32)-1) { | 
 |                     break;     // End of Input | 
 |                 } | 
 |                 if  (c.fChar == chPound && fEOLComments == TRUE) { | 
 |                     // Start of a comment.  Consume the rest of it, until EOF or a new line | 
 |                     for (;;) { | 
 |                         c.fChar = nextCharLL(); | 
 |                         if (c.fChar == (UChar32)-1 ||  // EOF | 
 |                             c.fChar == chCR        || | 
 |                             c.fChar == chLF        || | 
 |                             c.fChar == chNEL       || | 
 |                             c.fChar == chLS)       { | 
 |                             break; | 
 |                         } | 
 |                     } | 
 |                 } | 
 |                 if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) { | 
 |                     break; | 
 |                 } | 
 |                 c.fChar = nextCharLL(); | 
 |             } | 
 |         } | 
 |  | 
 |         // | 
 |         //  check for backslash escaped characters. | 
 |         // | 
 |                 int32_t startX = fNextIndex;  // start and end positions of the | 
 |                 int32_t endX   = fNextIndex;  //   sequence following the '\' | 
 |         if (c.fChar == chBackSlash) { | 
 |             if (RegexStaticSets::gStaticSets->fUnescapeCharSet->contains(peekCharLL())) { | 
 |                 // | 
 |                 // A '\' sequence that is handled by ICU's standard unescapeAt function. | 
 |                 //   Includes \uxxxx, \n, \r, many others. | 
 |                 //   Return the single equivalent character. | 
 |                 // | 
 |                 nextCharLL();                 // get & discard the peeked char. | 
 |                 c.fQuoted = TRUE; | 
 |                 c.fChar = fRXPat->fPattern.unescapeAt(endX); | 
 |                 if (startX == endX) { | 
 |                     error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 
 |                 } | 
 |                 fCharNum += endX - startX; | 
 |                 fNextIndex = endX; | 
 |             } | 
 |             else | 
 |             { | 
 |                 // We are in a '\' escape that will be handled by the state table scanner. | 
 |                 // Just return the backslash, but remember that the following char is to | 
 |                 //  be taken literally.  TODO:  this is awkward, think about alternatives. | 
 |                 fInBackslashQuote = TRUE; | 
 |             } | 
 |         } | 
 |     } | 
 |  | 
 |     // re-enable # to end-of-line comments, in case they were disabled. | 
 |     // They are disabled by the parser upon seeing '(?', but this lasts for | 
 |     //  the fetching of the next character only. | 
 |     fEOLComments = TRUE; | 
 |  | 
 |     // putc(c.fChar, stdout); | 
 | } | 
 |  | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //  scanSet    Construct a UnicodeSet from the text at the current scan | 
 | //             position.  Advance the scan position to the first character | 
 | //             after the set. | 
 | // | 
 | //             The scan position is normally under the control of the state machine | 
 | //             that controls pattern parsing.  UnicodeSets, however, are parsed by | 
 | //             the UnicodeSet constructor, not by the Regex pattern parser. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | UnicodeSet *RegexCompile::scanSet() { | 
 |     UnicodeSet    *uset = NULL; | 
 |     ParsePosition  pos; | 
 |     int            i; | 
 |  | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         return NULL; | 
 |     } | 
 |  | 
 |     pos.setIndex(fScanIndex); | 
 |     UErrorCode localStatus = U_ZERO_ERROR; | 
 |     uint32_t   usetFlags = 0; | 
 |     if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 
 |         usetFlags |= USET_CASE_INSENSITIVE; | 
 |     } | 
 |     if (fModeFlags & UREGEX_COMMENTS) { | 
 |         usetFlags |= USET_IGNORE_SPACE; | 
 |     } | 
 |  | 
 |     uset = new UnicodeSet(fRXPat->fPattern, pos, | 
 |                          usetFlags, NULL, localStatus); | 
 |     if (U_FAILURE(localStatus)) { | 
 |         //  TODO:  Get more accurate position of the error from UnicodeSet's return info. | 
 |         //         UnicodeSet appears to not be reporting correctly at this time. | 
 |         REGEX_SCAN_DEBUG_PRINTF(("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex())); | 
 |         error(localStatus); | 
 |         delete uset; | 
 |         return NULL; | 
 |     } | 
 |  | 
 |     // Advance the current scan postion over the UnicodeSet. | 
 |     //   Don't just set fScanIndex because the line/char positions maintained | 
 |     //   for error reporting would be thrown off. | 
 |     i = pos.getIndex(); | 
 |     for (;;) { | 
 |         if (fNextIndex >= i) { | 
 |             break; | 
 |         } | 
 |         nextCharLL(); | 
 |     } | 
 |  | 
 |     return uset; | 
 | } | 
 |  | 
 |  | 
 | //------------------------------------------------------------------------------ | 
 | // | 
 | //  scanProp   Construct a UnicodeSet from the text at the current scan | 
 | //             position, which will be of the form \p{whaterver} | 
 | // | 
 | //             The scan position will be at the 'p' or 'P'.  On return | 
 | //             the scan position should be just after the '}' | 
 | // | 
 | //             Return a UnicodeSet, constructed from the \P pattern, | 
 | //             or NULL if the pattern is invalid. | 
 | // | 
 | //------------------------------------------------------------------------------ | 
 | UnicodeSet *RegexCompile::scanProp() { | 
 |     UnicodeSet    *uset = NULL; | 
 |  | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         return NULL; | 
 |     } | 
 |  | 
 |     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP || fC.fChar == chUpperN); | 
 |  | 
 |     // enclose the \p{property} from the regex pattern source in  [brackets] | 
 |     UnicodeString setPattern; | 
 |     setPattern.append(chLBracket); | 
 |     setPattern.append(chBackSlash); | 
 |     for (;;) { | 
 |         setPattern.append(fC.fChar); | 
 |         if (fC.fChar == chRBrace) { | 
 |             break; | 
 |         } | 
 |         nextChar(fC); | 
 |         if (fC.fChar == -1) { | 
 |             // Hit the end of the input string without finding the closing '}' | 
 |             error(U_REGEX_PROPERTY_SYNTAX); | 
 |             return NULL; | 
 |         } | 
 |     } | 
 |     setPattern.append(chRBracket); | 
 |  | 
 |     uint32_t   usetFlags = 0; | 
 |     if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 
 |         usetFlags |= USET_CASE_INSENSITIVE; | 
 |     } | 
 |     if (fModeFlags & UREGEX_COMMENTS) { | 
 |         usetFlags |= USET_IGNORE_SPACE; | 
 |     } | 
 |  | 
 |     // Build the UnicodeSet from the set pattern we just built up in a string. | 
 |     uset = new UnicodeSet(setPattern, usetFlags, NULL, *fStatus); | 
 |     if (U_FAILURE(*fStatus)) { | 
 |         delete uset; | 
 |         uset =  NULL; | 
 |     } | 
 |  | 
 |     nextChar(fC);      // Continue overall regex pattern processing with char after the '}' | 
 |     return uset; | 
 | } | 
 |  | 
 | U_NAMESPACE_END | 
 | #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS | 
 |  |