// source/i18n/regexcmp.cpp - external/github.com/unicode-org/icu - Git at Google


 //
 //  file:  regexcmp.cpp
 //
 //  Copyright (C) 2002, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains the ICU regular expression compiler, which is responsible
 //  for processing a regular expression pattern into the compiled form that
 //  is used by the match finding engine.
 //

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

 #include "unicode/unistr.h"
 #include "unicode/uniset.h"
 #include "unicode/uchar.h"
 #include "unicode/uchriter.h"
 #include "unicode/parsepos.h"
 #include "unicode/parseerr.h"
 #include "unicode/regex.h"
 #include "uprops.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "uassert.h"
 #include "ucln_in.h"
 #include "mutex.h"

 #include "regeximp.h"
 #include "regexcst.h"   // Contains state table for the regex pattern parser.
                         //   generated by a Perl script.
 #include "regexcmp.h"


 U_NAMESPACE_BEGIN

 //----------------------------------------------------------------------------------------
 //
 // Unicode Sets for each of the character classes needed for parsing a regex pattern.
 //               (Initialized with hex values for portability to EBCDIC based machines.
 //                Really ugly, but there's no good way to avoid it.)
 //
 //              The sets are referred to by name in the regexcst.txt, which is the
 //              source form of the state transition table.  These names are converted
 //              to indices in regexcst.h by the perl state table building script regexcst.pl.
 //              The indices are used to access the array gRuleSets.
 //
 //----------------------------------------------------------------------------------------

 // "Rule Char" Characters are those with no special meaning, and therefore do not
 //    need to be escaped to appear as literals in a regexp.  Expressed
 //    as the inverse of those needing escaping --  [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
 // Zero-terminated pattern text for the "rule char" set.  The RegexCompile constructor
 //   builds gRuleSets[kRuleSet_rule_char-128] from it.  The line above each row of hex
 //   values shows the character that each value stands for.
 static const UChar gRuleSet_rule_char_pattern[]       = {
  //   [    ^      \     *     \     ?     \     +     \     [     \     (     \     )
     0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
  //   \     {    \     }     \     ^     \     $     \     |     \     \     \     .     ]
     0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};


 // Zero-terminated pattern text "[0-9]".  The RegexCompile constructor builds
 //   gRuleSets[kRuleSet_digit_char-128] from it.
 static const UChar gRuleSet_digit_char_pattern[] = {
 //    [    0      -    9     ]
     0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};


 // Shared sets used while parsing a pattern.  Slots are addressed as (kRuleSet_xxx - 128).
 //   The RegexCompile constructor creates the rule_char, white_space and digit_char
 //   entries on first use; RegexCompile::cleanup() deletes them and resets them to NULL.
 static UnicodeSet  *gRuleSets[10];         // Array of ptrs to the actual UnicodeSet objects.
 // Set built from gUnescapeCharPattern by the RegexCompile constructor; freed by cleanup().
 static UnicodeSet  *gUnescapeCharSet;

 //
 //   Here are the backslash escape characters that ICU's unescape() function
 //    will handle.
 //
 // Zero-terminated pattern text "[acefnrtuU]", the source for gUnescapeCharSet.
 static const UChar gUnescapeCharPattern[] = {
 //    [     a     c     e     f     n     r     t     u     U     ]
     0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};


 //
 //  White space characters that may appear within a pattern in free-form mode
 //
 // Zero-terminated pattern text, written as decimal character codes rather than hex.
 //   The RegexCompile constructor builds gRuleSets[kRuleSet_white_space-128] from it.
 static const UChar gRuleWhiteSpacePattern[] = {
     /* "[[:Cf:][:WSpace:]]" */
     91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
         83, 112, 97, 99, 101, 58, 93, 93, 0 };


 //
 //  Unicode Set Definitions for Regular Expression  \w
 //
 // Zero-terminated pattern text "[\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}]".  The RegexCompile
 //   constructor builds gPropSets[URX_ISWORD_SET] from it.
 static const UChar gIsWordPattern[] = {
 //    [     \     p     {     L     l     }     \     p     {     L     u     }
     0x5b, 0x5c, 0x70, 0x7b, 0x4c, 0x6c, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x75, 0x7d,
 //          \     p     {     L     t     }     \     p     {     L     o     }
           0x5c, 0x70, 0x7b, 0x4c, 0x74, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x6f, 0x7d,
 //          \     p     {     N     d     }     ]
           0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5d, 0};


 //
 //  Unicode Set Definitions for Regular Expression  \s
 //
     // Zero-terminated pattern text "[\t\n\f\r\p{Z}]".  The RegexCompile constructor
     //   builds gPropSets[URX_ISSPACE_SET] from it.
     static const UChar gIsSpacePattern[] = {
 //    [     \     t     \     n     \     f     \     r     \     p     {     Z     }     ]
     0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d,  0};

 // Shared sets indexed by URX_xxx_SET value.  The RegexCompile constructor creates the
 //   URX_ISWORD_SET and URX_ISSPACE_SET entries; compile() hands the array to each
 //   RegexPattern as fStaticSets; cleanup() deletes every slot below URX_LAST_SET.
 static UnicodeSet *gPropSets[URX_LAST_SET];


 //----------------------------------------------------------------------------------------
 //
 //   ThreadSafeUnicodeSetInit   Thread safe creation of a shared UnicodeSet.
 //
 //----------------------------------------------------------------------------------------
 // Create the shared set that *pSet refers to, if it does not exist yet.
 //   pSet     - address of the shared pointer to fill in; left alone when already non-NULL.
 //   pattern  - zero-terminated set pattern text to build the set from.
 //   status   - receives any failure from building the set, or
 //              U_MEMORY_ALLOCATION_ERROR when the allocation itself yields NULL.
 static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UErrorCode &status) {
     // Nothing to do when the slot has already been filled in.
     if (*pSet != NULL) {
         return;
     }

     // Build a candidate set before taking the lock.
     UnicodeSet *candidate = new UnicodeSet(pattern, status);
     if (U_FAILURE(status)) {
         delete candidate;
         return;
     }
     if (candidate == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }

     // Look at the slot again with the mutex held.  If it was filled in the meantime,
     //   keep the existing set and throw the candidate away.
     Mutex  guard;
     if (*pSet != NULL) {
         delete candidate;
         return;
     }
     *pSet = candidate;
 }


 //----------------------------------------------------------------------------------------
 //
 //  Constructor.
 //
 //----------------------------------------------------------------------------------------
 // Construct a pattern compiler.
 //   status - in/out error code.  It is also passed to the fParenStack constructor and
 //            remembered in fStatus.  If it already indicates failure, the scanner fields
 //            are still reset but the shared static sets are not created.
 RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
 {
     fStatus             = &status;

     // Scanner position and mode start out at the beginning of an (as yet unknown) pattern.
     fScanIndex = 0;
     fNextIndex = 0;
     fPeekChar  = -1;
     fLineNum    = 1;
     fCharNum    = 0;
     fQuoteMode  = FALSE;
     fFreeForm   = FALSE;

     fMatchOpenParen  = -1;
     fMatchCloseParen = -1;

     // No literal string is in progress yet.  literalChar() compares this member
     //   against -1 to decide whether to start a new string, so it must not be
     //   left holding an indeterminate value.
     fStringOpStart   = -1;

     if (U_FAILURE(status)) {
         return;
     }

     //
     //  Register the I18n library for cleanup,
     //     but only if we haven't initialized our globals yet.
     if (gRuleSets[kRuleSet_rule_char-128] == NULL) {
         ucln_i18n_registerCleanup();
     }

     //
     //  Set up the constant (static) Unicode Sets.
     //    Each call is a no-op when its set already exists.
     //    TODO:  something cleaner for that -128 constant.
     //
     ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128],   gRuleSet_rule_char_pattern,  status);
     ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern,      status);
     ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_digit_char-128],  gRuleSet_digit_char_pattern, status);
     ThreadSafeUnicodeSetInit(&gUnescapeCharSet,                    gUnescapeCharPattern,        status);
     ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET],           gIsWordPattern,              status);
     ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET],          gIsSpacePattern,             status);
 }


 //----------------------------------------------------------------------------------------
 //
 //  Destructor
 //
 //----------------------------------------------------------------------------------------
 RegexCompile::~RegexCompile() {
     // Intentionally empty.  The shared static UnicodeSets created by the constructor
     //   are released by cleanup(), not when an individual compiler object goes away.
 }


 //----------------------------------------------------------------------------------------
 //
 //   cleanup.    Called (indirectly) by u_cleanup to free all cached memory
 //
 //----------------------------------------------------------------------------------------
 void RegexCompile::cleanup() {
     delete gRuleSets[kRuleSet_rule_char-128];
     delete gRuleSets[kRuleSet_white_space-128];
     delete gRuleSets[kRuleSet_digit_char-128];
     delete gUnescapeCharSet;
     gRuleSets[kRuleSet_rule_char-128]   = NULL;
     gRuleSets[kRuleSet_white_space-128] = NULL;
     gRuleSets[kRuleSet_digit_char-128]  = NULL;
     gUnescapeCharSet = NULL;
     int i;
     for (i=0; i<URX_LAST_SET; i++) {
         delete (UnicodeSet *)gPropSets[i];
         gPropSets[i] = NULL;
     }
     return;
 }


 //---------------------------------------------------------------------------------
 //
 //  Compile regex pattern.   The state machine for regexp pattern parsing is here.
 //                           The state tables are hand-written in the file regexcst.txt,
 //                           and converted to the form used here by a perl
 //                           script regexcst.pl
 //
 //---------------------------------------------------------------------------------
 // Run the pattern-parsing state machine over `pat`, filling in `rxp`.
 //   rxp - receives a copy of the pattern text, the pointer to the shared static
 //         sets, and (through doParseActions) the compiled operations.
 //   pat - the pattern source text.
 //   pp  - remembered in fParseErr.
 //   e   - in/out error code, remembered in fStatus.  If it already indicates a
 //         failure on entry, the function returns without touching rxp.
 void    RegexCompile::compile(
                          RegexPattern &rxp,          // User level pattern object to receive
                                                      //    the compiled pattern.
                          const UnicodeString &pat,   // Source pat to be compiled.
                          UParseError &pp,            // Error position info
                          UErrorCode &e)              // Error Code
 {
     fStatus             = &e;
     fRXPat              = &rxp;
     fParseErr           = &pp;
     fStackPtr           = 0;
     fStack[fStackPtr]   = 0;          // Slot 0 holds state 0, which is never a valid state.

     if (U_FAILURE(*fStatus)) {
         return;
     }

     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
     U_ASSERT(fRXPat->fPattern.length() == 0);

     // Prepare the RegexPattern object to receive the compiled pattern.
     fRXPat->fPattern        = pat;
     fRXPat->fStaticSets     = gPropSets;


     // Initialize the pattern scanning state machine
     fPatternLength = pat.length();
     uint16_t                state = 1;
     const RegexTableEl      *tableEl;
     nextChar(fC);                        // Fetch the first char from the pattern string.

     //
     // Main loop for the regex pattern parsing state machine.
     //   Runs once per state transition.
     //   Each time through optionally performs, depending on the state table,
     //      - an advance to the next pattern char
     //      - an action to be performed.
     //      - pushing or popping a state to/from the local state return stack.
     //   file regexcst.txt is the source for the state table.  The logic behind
     //     recognizing the pattern syntax is there, not here.
     //
     for (;;) {
         //  Bail out if anything has gone wrong.
         //  Regex pattern parsing stops on the first error encountered.
         if (U_FAILURE(*fStatus)) {
             break;
         }

         U_ASSERT(state != 0);

         // Find the state table element that matches the input char from the rule, or the
         //    class of the input character.  Start with the first table row for this
         //    state, then linearly scan forward until we find a row that matches the
         //    character.  The last row for each state always matches all characters, so
         //    the search will stop there, if not before.
         //
         tableEl = &gRuleParseStateTable[state];
         REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d)    state=%s ",
             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);

         for (;;) {    // loop through table rows belonging to this state, looking for one
                       //   that matches the current input char.
             REGEX_SCAN_DEBUG_PRINTF( ".");
             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
                 // Table row specified an individual character, not a set, and
                 //   the input character is not quoted, and
                 //   the input character matched it.
                 break;
             }
             if (tableEl->fCharClass == 255) {
                 // Table row specified default, match anything character class.
                 break;
             }
             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
                 // Table row specified "quoted" and the char was quoted.
                 break;
             }
             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
                 // Table row specified eof and we hit eof on the input.
                 break;
             }

             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
                 fC.fQuoted == FALSE &&                                      //   char is not escaped &&
                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
                 // NOTE(review): gRuleSets has 10 slots, of which the constructor fills in
                 //   three.  This lookup relies on the state table only naming those
                 //   classes - confirm against regexcst.h.
                 UnicodeSet *uniset = gRuleSets[tableEl->fCharClass-128];
                 if (uniset->contains(fC.fChar)) {
                     // Table row specified a character class, or set of characters,
                     //   and the current char matches it.
                     break;
                 }
             }

             // No match on this row, advance to the next  row for this state,
             tableEl++;
         }
         REGEX_SCAN_DEBUG_PRINTF("\n");

         //
         // We've found the row of the state table that matches the current input
         //   character from the rules string.
         // Perform any action specified  by this row in the state table.
         if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
             // Break out of the state machine loop if
             //   the action signalled some kind of error, or
             //   the action was to exit, which occurs on normal end-of-rules-input.
             break;
         }

         if (tableEl->fPushState != 0) {
             fStackPtr++;
             if (fStackPtr >= kStackSize) {
                 error(U_REGEX_INTERNAL_ERROR);
                 REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
                 // Undo the increment so that the store below stays inside fStack;
                 //   it then overwrites the current top entry.
                 fStackPtr--;
             }
             fStack[fStackPtr] = tableEl->fPushState;
         }

         if (tableEl->fNextChar) {
             nextChar(fC);
         }

         // Get the next state from the table entry, or from the
         //   state stack if the next state was specified as "pop".
         if (tableEl->fNextState != 255) {
             state = tableEl->fNextState;
         } else {
             state = fStack[fStackPtr];
             fStackPtr--;
             if (fStackPtr < 0) {
                 // state stack underflow
                 // This will occur if the user pattern has mis-matched parentheses,
                 //   with extra close parens.
                 //
                 fStackPtr++;
                 error(U_REGEX_MISMATCHED_PAREN);
             }
         }

     }

     //
     // The pattern has now been read and processed, and the compiled code generated.
     //   (Control also arrives here when the loop above stopped because of an error.)
     //

     //
     // Compute the number of digits required for the largest capture group number.
     //
     fRXPat->fMaxCaptureDigits = 1;
     int32_t  n = 10;
     for (;;) {
         if (n > fRXPat->fNumCaptureGroups) {
             break;
         }
         fRXPat->fMaxCaptureDigits++;
         n *= 10;
     }

     //
     // A stupid bit of non-sense to prevent code coverage testing from complaining
     //   about the pattern.dump() debug function.  Go through the motions of dumping,
     //   even though, without the #define set, it will do nothing.
     //   The function-local static flag limits this to one call.
     //
 #ifndef REGEX_DUMP_DEBUG
     static UBool phonyDumpDone = FALSE;
     if (phonyDumpDone==FALSE) {
         fRXPat->dump();
         phonyDumpDone = TRUE;
     }
 #endif

 }


 //----------------------------------------------------------------------------------------
 //
 //  doParseActions       Do some action during regex pattern parsing.
 //                       Called by the parse state machine.
 //
 //
 //----------------------------------------------------------------------------------------
 // Perform one action requested by a row of the parse state table.
 //   action  - the action code taken from the matching state table row.
 //   returns - TRUE when the state machine in compile() should keep running,
 //             FALSE when it should stop (end of pattern, doExit, and those error
 //             actions that clear returnVal below).
 // Actions that only call error() leave the return value TRUE.  compile() checks
 //   the status at the top of its loop and stops there.
 UBool RegexCompile::doParseActions(EParseAction action)
 {
     UBool   returnVal = TRUE;

     switch ((Regex_PatternParseAction)action) {

     case doPatStart:
         // Start of pattern compiles to:
         //0   SAVE   2        Fall back to position of FAIL
         //1   jmp    3
         //2   FAIL            Stop if we ever reach here.
         //3   NOP             Dummy, so start of pattern looks the same as
         //                    the start of an ( grouping.
         //4   NOP             Reserved, will be replaced by a save if there are
         //                    OR | operators at the top level
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP,  3), *fStatus);
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP,  0), *fStatus);
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP,  0), *fStatus);

         fParenStack.push(-1, *fStatus);     // Begin a Paren Stack Frame
         fParenStack.push( 3, *fStatus);     // Push location of first NOP
         break;

     case doPatFinish:
         // We've scanned to the end of the pattern
         //  The end of pattern compiles to:
         //        URX_END
         //    which will stop the runtime match engine.
         //  Encountering end of pattern also behaves like a close paren,
         //   and forces fixups of the State Save at the beginning of the compiled pattern
         //   and of any OR operations at the top level.
         //
         handleCloseParen();
         if (fParenStack.size() > 0) {
             // Missing close paren in pattern.
             error(U_REGEX_MISMATCHED_PAREN);
         }

         // add the END operation to the compiled pattern.
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);

         // Terminate the pattern compilation state machine.
         returnVal = FALSE;
         break;


     case doOrOperator:
         // Scanning a '|', as in (A|B)
         {
             // Insert a SAVE operation at the start of the pattern section preceding
             //   this OR at this level.  This SAVE will branch the match forward
             //   to the right hand side of the OR in the event that the left hand
             //   side fails to match and backtracks.  Locate the position for the
             //   save from the location on the top of the parentheses stack.
             int32_t savePosition = fParenStack.popi();
             int32_t op = fRXPat->fCompiledPat->elementAti(savePosition);
             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
             // The target is size()+1 because the JMP appended just below occupies size().
             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
             fRXPat->fCompiledPat->setElementAt(op, savePosition);

             // Append a JMP operation into the compiled pattern.  The operand for
             //  the JMP will eventually be the location following the ')' for the
             //  group.  This will be patched in later, when the ')' is encountered.
             op = URX_BUILD(URX_JMP, 0);
             fRXPat->fCompiledPat->addElement(op, *fStatus);

             // Push the position of the newly added JMP op onto the parentheses stack.
             // This registers it for fixup when this block's close paren is encountered.
             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);

             // Append a NOP to the compiled pattern.  This is the slot reserved
             //   for a SAVE in the event that there is yet another '|' following
             //   this one.
             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
         }
         break;


     case doOpenCaptureParen:
         // Open Paren.
         //   Compile to a
         //      - NOP, which later may be replaced by a save-state if the
         //         parenthesized group gets a * quantifier, followed by
         //      - START_CAPTURE
         //      - NOP, which may later be replaced by a save-state if there
         //             is an '|' alternation within the parens.
         {
             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
             // Capture groups are numbered from 1, in the order their '(' is seen.
             fRXPat->fNumCaptureGroups++;
             int32_t  cop = URX_BUILD(URX_START_CAPTURE, fRXPat->fNumCaptureGroups);
             fRXPat->fCompiledPat->addElement(cop, *fStatus);
             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

             // On the Parentheses stack, start a new frame and add the positions
             //   of the two NOPs.  Depending on what follows in the pattern, the
             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
             //   address of the end of the parenthesized group.
             fParenStack.push(-2, *fStatus);           // Begin a new frame.
             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
         }
          break;

     case doOpenNonCaptureParen:
         // Open non-capturing (grouping only) Paren.
         //   Compile to a
         //      - NOP, which later may be replaced by a save-state if the
         //         parenthesized group gets a * quantifier, followed by
         //      - NOP, which may later be replaced by a save-state if there
         //             is an '|' alternation within the parens.
         {
             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

             // On the Parentheses stack, start a new frame and add the positions
             //   of the two NOPs.
             fParenStack.push(-1, *fStatus);                               // Begin a new frame.
             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
         }
          break;


     case doOpenAtomicParen:
         // Open Atomic Paren.  Not implemented yet.
         error(U_REGEX_UNIMPLEMENTED);
         break;

     case doOpenLookAhead:
         // Open look-ahead Paren.  Not implemented yet.
         error(U_REGEX_UNIMPLEMENTED);
         break;

     case doOpenLookAheadNeg:
         // Open negative look-ahead Paren.  Not implemented yet.
         error(U_REGEX_UNIMPLEMENTED);
         break;

     case doOpenLookBehind:
         // Open look-behind Paren.  Not implemented yet.
         error(U_REGEX_UNIMPLEMENTED);
         break;

     case doOpenLookBehindNeg:
         // Open negative look-behind Paren.  Not implemented yet.
         error(U_REGEX_UNIMPLEMENTED);
         break;

     case doCloseParen:
         handleCloseParen();
         if (fParenStack.size() <= 0) {
             //  Extra close paren, or missing open paren.
             error(U_REGEX_MISMATCHED_PAREN);
         }
         break;

     case doNOP:
         break;


     case doBadOpenParenType:
     case doRuleError:
         error(U_REGEX_RULE_SYNTAX);
         returnVal = FALSE;
         break;


     case doMismatchedParenErr:
         error(U_REGEX_MISMATCHED_PAREN);
         returnVal = FALSE;
         break;

     case doPlus:
         //  Normal '+'  compiles to
         //     1.   stuff to be repeated  (already built)
         //     2.   state-save  4
         //     3.   jmp 1
         //     4.   ...
         {
             int32_t   topLoc = blockTopLoc(FALSE);        // location of item #1

             // Locate the position in the compiled pattern where the match will continue
             //   after completing the +   (4 in the comment above)
             int32_t continueLoc = fRXPat->fCompiledPat->size()+2;

             // Emit the STATE_SAVE
             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
             fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);

             // Emit the JMP
             int32_t jmpOp = URX_BUILD(URX_JMP, topLoc);
             fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
         }
         break;

     case doNGPlus:
         //  Non-greedy '+?'  compiles to
         //     1.   stuff to be repeated  (already built)
         //     2.   state-save  1
         //     3.   ...
         {
             int32_t topLoc      = blockTopLoc(FALSE);
             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
             fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
         }
         break;


     case doOpt:
         // Normal (greedy) ? quantifier.
         //  Compiles to
         //     1. state save 3
         //     2.    body of optional block
         //     3. ...
         // Insert the state save into the compiled pattern, and we're done.
         {
             int32_t   saveStateLoc = blockTopLoc(TRUE);
             int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
         }
         break;

     case doNGOpt:
         // Non-greedy ?? quantifier
         //   compiles to
         //    1.  jmp   4
         //    2.     body of optional block
         //    3   jmp   5
         //    4.  state save 2
         //    5    ...
         //  This code is less than ideal, with two jmps instead of one, because we can only
         //  insert one instruction at the top of the block being iterated.
         {
             int32_t  jmp1_loc = blockTopLoc(TRUE);                 // loc 1
             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();      // loc 3

             int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1);
             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);

             int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2);
             fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);

             int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
             fRXPat->fCompiledPat->addElement(save_op, *fStatus);
         }
         break;


     case doStar:
         // Normal (greedy) * quantifier.
         // Compiles to
         //       1.   STATE_SAVE   4
         //       2.      body of stuff being iterated over
         //       3.   JMP  1
         //       4.   ...
         //
         {
             // location of item #1, the STATE_SAVE
             int32_t   saveStateLoc = blockTopLoc(TRUE);

             // Locate the position in the compiled pattern where the match will continue
             //   after completing the *.   (4 in the comment above)
             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;

             // Put together the save state op and store it into the compiled code.
             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);

             // Append the URX_JMP operation to the compiled pattern.  Its target
             // is the location of the state-save, above.
             int32_t jmpOp = URX_BUILD(URX_JMP, saveStateLoc);
             fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
         }
         break;

     case doNGStar:
         // Non-greedy *? quantifier
         // compiles to
         //     1.   JMP    3
         //     2.      body of stuff being iterated over
         //     3.   STATE_SAVE  2
         //     4    ...
         {
             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
             int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc);
             int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
             fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
         }
         break;


     case doLiteralChar:
         // We've just scanned a "normal" character from the pattern,
         literalChar();
         break;


     case doDotAny:
         // scanned a ".",  match any single character.
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
         break;

     case doCaret:       // TODO:  multi-line mode flag.
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
         break;


     case doDollar:       // TODO:  multi-line mode flag.
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
         break;

     case doBackslashA:
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
         break;

     // \B and \b share one op and differ only in the operand (1 and 0); the same
     //   holds for \D and \d below.
     // NOTE(review): presumably operand 1 selects the negated test - confirm in the
     //   match engine.
     case doBackslashB:
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus);
         break;

     case doBackslashb:
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
         break;

     case doBackslashD:
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
         break;

     case doBackslashd:
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
         break;

     case doBackslashG:
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
         break;

     // \S, \s, \W and \w compile to a reference to one of the shared static sets.
     //   The upper case forms add the URX_NEG_SET flag to the set index.
     case doBackslashS:
         fRXPat->fCompiledPat->addElement(
             URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus);
         break;

     case doBackslashs:
         fRXPat->fCompiledPat->addElement(
             URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
         break;

     case doBackslashW:
         fRXPat->fCompiledPat->addElement(
             URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
         break;

     case doBackslashw:
         fRXPat->fCompiledPat->addElement(
             URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
         break;

     case doBackslashX:
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
         break;

     case doBackslashx:              // \x{abcd}   alternate hex format
         //  TODO:  implement
         error(U_REGEX_UNIMPLEMENTED);
         break;


     case doBackslashZ:
         // \Z emits the same op as '$' (see doDollar); only \z gets URX_BACKSLASH_Z.
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
         break;

     case doBackslashz:
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
         break;

     case doExit:
         // Stop the state machine without reporting an error.
         returnVal = FALSE;
         break;

     case doProperty:
         {
             UnicodeSet *theSet = scanProp();
             compileSet(theSet);
         }
         break;


     case doScanUnicodeSet:
         {
             UnicodeSet *theSet = scanSet();
             compileSet(theSet);
         }
         break;

     case doEnterQuoteMode:
         // Just scanned a \Q.  Put character scanner into quote mode.
         fQuoteMode = TRUE;
         break;

     case doBackRef:
         //  TODO:  implement back references.
         error(U_REGEX_UNIMPLEMENTED);
         break;

     case doNamedChar:            // \N{NAMED_CHAR}
         //  TODO:  implement
         error(U_REGEX_UNIMPLEMENTED);
         break;

     case doMatchMode:   //  (?i)    and similar
         // TODO:  implement
         error(U_REGEX_UNIMPLEMENTED);
         break;

     case doNotImplementedError:
         // TODO:  get rid of this once everything is implemented.
         error(U_REGEX_UNIMPLEMENTED);
         break;


     default:
         // An action code that this switch does not handle.
         error(U_REGEX_INTERNAL_ERROR);
         returnVal = FALSE;
         break;
     }
     return returnVal;
 };


 //------------------------------------------------------------------------------
 //
 //   literalChar           We've encountered a literal character from the pattern,
 //                             or an escape sequence that reduces to a character.
 //                         Add it to the string containing all literal chars/strings from
 //                             the pattern.
 //                         If we are in a pattern string already, add the new char to it.
 //                         If we aren't in a pattern string, begin one now.
 //
 //------------------------------------------------------------------------------
 void RegexCompile::literalChar()  {
     int32_t           op;            // An operation in the compiled pattern.
     int32_t           opType;
     int32_t           patternLoc;   // A position in the compiled pattern.
     int32_t           stringLen;


     // If the last thing compiled into the pattern was not a literal char,
     //   force this new literal char to begin a new string, and not append to the previous.
     op     = fRXPat->fCompiledPat->lastElementi();
     opType = URX_TYPE(op);
     if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR)) {
         fixLiterals();
     }

     if (fStringOpStart == -1) {
         // First char of a string in the pattern.
         // Emit a OneChar op into the compiled pattern.
         op = URX_BUILD(URX_ONECHAR, fC.fChar);
         fRXPat->fCompiledPat->addElement(op, *fStatus);

         // Also add it to the string pool, in case we get a second adjacent literal
         //   and want to change form ONE_CHAR to STRING
         fStringOpStart = fRXPat->fLiteralText.length();
         fRXPat->fLiteralText.append(fC.fChar);
         return;
     }

     // We are adding onto an existing string
     fRXPat->fLiteralText.append(fC.fChar);

     // If the most recently emitted op is a URX_ONECHAR, change it to a string op.
     op     = fRXPat->fCompiledPat->lastElementi();
     opType = URX_TYPE(op);
     U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN);
     if (opType == URX_ONECHAR) {
         op         = URX_BUILD(URX_STRING, fStringOpStart);
         patternLoc = fRXPat->fCompiledPat->size() - 1;
         fRXPat->fCompiledPat->setElementAt(op, patternLoc);
         op         = URX_BUILD(URX_STRING_LEN, 0);
         fRXPat->fCompiledPat->addElement(op, *fStatus);
     }

     // The pattern contains a URX_SRING / URX_STRING_LEN.  Update the
     //  string length to reflect the new char we just added to the string.
     stringLen  = fRXPat->fLiteralText.length() - fStringOpStart;
     op         = URX_BUILD(URX_STRING_LEN, stringLen);
     patternLoc = fRXPat->fCompiledPat->size() - 1;
     fRXPat->fCompiledPat->setElementAt(op, patternLoc);
 }


 //------------------------------------------------------------------------------
 //
 //    fixLiterals           When compiling something that can follow a literal
 //                          string in a pattern, we need to "fix" any preceding
 //                          string, which will cause any subsequent literals to
 //                          begin a new string, rather than appending to the
 //                          old one.
 //
 //                          Optionally, split the last char of the string off into
 //                          a single "ONE_CHAR" operation, so that quantifiers can
 //                          apply to that char alone.  Example:   abc*
 //                          The * needs to apply to the 'c' only.
 //
 //------------------------------------------------------------------------------
 void    RegexCompile::fixLiterals(UBool split) {
     // Close the currently open literal string (if any), keeping its start index
     //   around for the split logic below.
     const int32_t  literalStart = fStringOpStart;
     fStringOpStart = -1;
     if (!split) {
         return;
     }

     // Splitting only matters when the pattern currently ends with a string,
     //   that is, when the last op is the string's length word.
     const int32_t  lastOp = fRXPat->fCompiledPat->lastElementi();
     if (URX_TYPE(lastOp) != URX_STRING_LEN) {
         return;
     }
     int32_t  literalLen = URX_VAL(lastOp);

     // Locate the final code point of the literal text.  Stepping back with
     //   moveIndex32() keeps a surrogate pair together.
     const UnicodeString &literals    = fRXPat->fLiteralText;
     const int32_t        textEnd     = literals.length();
     const int32_t        lastCharIdx = literals.moveIndex32(textEnd, -1);
     const UChar32        lastChar    = literals.char32At(lastCharIdx);

     // A compiled string always holds at least two code points, so there must
     //   be another code point in front of the one just found.
     U_ASSERT(lastCharIdx > literalStart);
     const int32_t        prevCharIdx = literals.moveIndex32(lastCharIdx, -1);
     U_ASSERT(prevCharIdx >= literalStart);
     const UChar32        prevChar    = literals.char32At(prevCharIdx);

     if (prevCharIdx <= literalStart) {
         // The string was exactly two code points long.  Overwrite the
         //   URX_STRING / URX_STRING_LEN pair in place with two URX_ONECHAR ops.
         int32_t  firstOp = URX_BUILD(URX_ONECHAR, prevChar);
         fRXPat->fCompiledPat->setElementAt(firstOp, fRXPat->fCompiledPat->size() -2);
         int32_t  secondOp = URX_BUILD(URX_ONECHAR, lastChar);
         fRXPat->fCompiledPat->setElementAt(secondOp, fRXPat->fCompiledPat->size() -1);
         return;
     }

     // Two or more code points remain after removing the last one.  Keep the
     //   string op, shorten its length by the code units of the removed char,
     //   and append that char as its own URX_ONECHAR op.
     literalLen -= (textEnd - lastCharIdx);
     int32_t  shortenedLenOp = URX_BUILD(URX_STRING_LEN, literalLen);
     fRXPat->fCompiledPat->setElementAt(shortenedLenOp, fRXPat->fCompiledPat->size() -1);
     int32_t  tailCharOp = URX_BUILD(URX_ONECHAR, lastChar);
     fRXPat->fCompiledPat->addElement(tailCharOp, *fStatus);
 }


 //------------------------------------------------------------------------------
 //
 //   blockTopLoc()          Find or create a location in the compiled pattern
 //                          at the start of the operation or block that has
 //                          just been compiled.  Needed when a quantifier (* or
 //                          whatever) appears, and we need to add an operation
 //                          at the start of the thing being quantified.
 //
 //                          (Parenthesized Blocks) have a slot with a NOP that
 //                          is reserved for this purpose.  .* or similar don't
 //                          and a slot needs to be added.
 //
 //       parameter reserveLoc   :  TRUE - ensure that there is space to add an opcode
 //                                        at the returned location.
 //                                 FALSE - just return the address, do not reserve a location there.
 //
 //------------------------------------------------------------------------------
 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
     // Returns the index in the compiled pattern of the start of the item that
     //   was just compiled.
     //   reserveLoc TRUE:  for a non-parenthesized item, a NOP is inserted at the
     //                     returned index so the caller has a slot to overwrite.
     //   reserveLoc FALSE: the index is returned and nothing is inserted.
     //   For a parenthesized block the already-reserved NOP slot is returned
     //   regardless of reserveLoc.
     int32_t   theLoc;
     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
     {
         // The item just processed is a parenthesized block.
         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
         U_ASSERT(theLoc > 0);
         U_ASSERT(URX_TYPE(fRXPat->fCompiledPat->elementAti(theLoc)) == URX_NOP);
     }
     else {
         // Item just compiled is a single thing, a ".", or a single char, or a set reference.
         // No slot for STATE_SAVE was pre-reserved in the compiled code.
         // We need to make space now.
         fixLiterals(TRUE);  // If last item was a string, separate the last char.
         theLoc = fRXPat->fCompiledPat->size()-1;
         if (reserveLoc) {
             // Push the item down by one and put a NOP in front of it.
             int32_t  nop = URX_BUILD(URX_NOP, 0);
             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
         }
     }
     return theLoc;
 }


 //------------------------------------------------------------------------------
 //
 //    handleCloseParen      When compiling a close paren, we need to go back
 //                          and fix up any JMP or SAVE operations within the
 //                          parenthesized block that need to target the end
 //                          of the block.  The locations of these are kept on
 //                          the parentheses stack.
 //
 //                          This function is called both when encountering a
 //                          real ) and at the end of the pattern.
 //
 //-------------------------------------------------------------------------------
 void  RegexCompile::handleCloseParen() {
     int32_t   patIdx;
     int32_t   patOp;
     if (fParenStack.size() <= 0) {
         error(U_REGEX_MISMATCHED_PAREN);
         return;
     }

     // Force any literal chars that may follow the close paren to start a new string,
     //   and not attach to any preceding it.
     fixLiterals(FALSE);

     // Fixup any operations within the just-closed parenthesized group
     //    that need to reference the end of the (block).
     //    (The first one on popped from the stack is an unused slot for
     //     alternation (OR) state save, but applying the fixup to it does no harm.)
     for (;;) {
         patIdx = fParenStack.popi();
         if (patIdx < 0) {
             break;
         }
         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
         patOp = fRXPat->fCompiledPat->elementAti(patIdx);
         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
         fMatchOpenParen     = patIdx;
     }

     // DO any additional fixups, depending on the specific kind of
     // parentesized grouping this is

     switch (patIdx) {
     case -1:
         // No additional fixups required.
         //   This is the case with most kinds of groupings.
         break;
     case -2:
         // Capturing Parentheses.
         //   Insert a End Capture op into the pattern.
         //   Grab the group number from the start capture op
         //      and put it into the end-capture op.
         {
             int32_t   captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
             int32_t   captureGroupNumber = URX_VAL(captureOp);
             U_ASSERT(captureGroupNumber > 0);
             int32_t   endCaptureOp = URX_BUILD(URX_END_CAPTURE, captureGroupNumber);
             fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
         }
         break;
     default:
         U_ASSERT(FALSE);
     }

     // remember the next location in the compiled pattern.
     // The compilation of Quantifiers will look at this to see whether its looping
     //   over a parenthesized block or a single item
     fMatchCloseParen = fRXPat->fCompiledPat->size();
 }


 //----------------------------------------------------------------------------------------
 //
 //   compileSet       Compile the pattern operations for a reference to a
 //                    UnicodeSet.
 //
 //----------------------------------------------------------------------------------------
 void        RegexCompile::compileSet(UnicodeSet *theSet)
 {
     if (theSet == NULL) {
         return;
     }

     // A set whose charAt(0) is -1 holds no individual chars (it may contain
     //   only strings).  Treat it as empty.
     //   TODO:  figure out what to do with sets containing strings.
     const UChar32  firstSetChar  = theSet->charAt(0);
     const int32_t  effectiveSize = (firstSetChar == -1) ? 0 : theSet->size();

     if (effectiveSize != 0 && effectiveSize != 1) {
         //  The normal case, a set of two or more elements.
         //  Store the set with the pattern and emit a reference to its index.
         int32_t  setNumber = fRXPat->fSets->size();
         fRXPat->fSets->addElement(theSet, *fStatus);
         int32_t  setOp = URX_BUILD(URX_SETREF, setNumber);
         fRXPat->fCompiledPat->addElement(setOp, *fStatus);
         return;
     }

     // Degenerate sets are not kept.
     //   Empty set:       emit a BACKTRACK op, which always fails to match.
     //   Single element:  emit a plain single char op for that code point.
     int32_t  replacementOp;
     if (effectiveSize == 0) {
         replacementOp = URX_BUILD(URX_BACKTRACK, 0);
     } else {
         replacementOp = URX_BUILD(URX_ONECHAR, firstSetChar);
     }
     fRXPat->fCompiledPat->addElement(replacementOp, *fStatus);
     delete theSet;
 }


 //----------------------------------------------------------------------------------------
 //
 //  Error         Report a rule parse error.
 //                Only report it if no previous error has been recorded.
 //
 //----------------------------------------------------------------------------------------
 void RegexCompile::error(UErrorCode e) {
     // Only the first error is recorded; once *fStatus holds a failure,
     //   later calls leave both the status and the parse error info untouched.
     if (U_SUCCESS(*fStatus)) {
         *fStatus = e;
         fParseErr->line   = fLineNum;
         fParseErr->offset = fCharNum;
         // No pattern text is copied into the context fields yet, so both
         //   are set to empty strings.
         fParseErr->preContext[0]  = 0;    // TODO:  copy in some input pattern text
         fParseErr->postContext[0] = 0;
     }
 }


 //
 //  Assorted Unicode character constants.
 //     Numeric because there is no portable way to enter them as literals.
 //     (Think EBCDIC).
 //
 // New line characters, for terminating comments and for tracking line numbers.
 static const UChar      chCR        = 0x0d;      //    carriage return
 static const UChar      chLF        = 0x0a;      //    line feed
 static const UChar      chNEL       = 0x85;      //    NEL newline variant
 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator

 // Characters that introduce or delimit pattern constructs.
 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
 static const UChar      chBackSlash = 0x5c;      // backslash, introduces a char escape
 static const UChar      chApos      = 0x27;      // single quote, for quoted chars.
 static const UChar      chLParen    = 0x28;      // '('
 static const UChar      chRParen    = 0x29;      // ')'
 static const UChar      chLBracket  = 0x5b;      // '['
 static const UChar      chRBracket  = 0x5d;      // ']'
 static const UChar      chRBrace    = 0x7d;      // '}'

 // Letters that are tested for after a backslash.
 static const UChar      chE         = 0x45;      // 'E'
 static const UChar      chLowerP    = 0x70;      // 'p'
 static const UChar      chUpperP    = 0x50;      // 'P'


 //----------------------------------------------------------------------------------------
 //
 //  nextCharLL    Low Level Next Char from the regex pattern.
 //                Get a char from the string,
 //                keep track of input position for error reporting.
 //
 //----------------------------------------------------------------------------------------
 UChar32  RegexCompile::nextCharLL() {
     // A char previously fetched by peekCharLL() is handed out first.  Its
     //   position bookkeeping was already done when it was fetched.
     if (fPeekChar != -1) {
         const UChar32  peeked = fPeekChar;
         fPeekChar = -1;
         return peeked;
     }

     // End of the pattern is reported as -1.
     if (fPatternLength==0 || fNextIndex >= fPatternLength) {
         return (UChar32)-1;
     }

     UnicodeString &pattern = fRXPat->fPattern;
     const UChar32  ch      = pattern.char32At(fNextIndex);
     fNextIndex             = pattern.moveIndex32(fNextIndex, 1);

     // CR, NEL and LS always begin a new line.  LF does too, unless it directly
     //   follows a CR, in which case the CR has already been counted.
     const UBool  startsNewLine = (ch == chCR  ||
                                   ch == chNEL ||
                                   ch == chLS  ||
                                   (ch == chLF && fLastChar != chCR));
     if (startsNewLine) {
         fLineNum++;
         fCharNum=0;
         if (fQuoteMode) {
             // A new line inside a quoted region is reported as a syntax
             //   error, and quote mode is switched off.
             error(U_REGEX_RULE_SYNTAX);
             fQuoteMode = FALSE;
         }
     }
     else if (ch != chLF) {
         // An ordinary char advances the column.  The remaining case, an LF
         //   right after a CR, leaves the position unchanged.
         fCharNum++;
     }

     fLastChar = ch;
     return ch;
 }

 //---------------------------------------------------------------------------------
 //
 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
 //                 character without actually getting it.
 //
 //---------------------------------------------------------------------------------
 UChar32  RegexCompile::peekCharLL() {
     // A char that has already been peeked is returned again unchanged.
     if (fPeekChar != -1) {
         return fPeekChar;
     }

     // Otherwise fetch the next char and hold on to it; the following call
     //   to nextCharLL() will return it.
     fPeekChar = nextCharLL();
     return fPeekChar;
 }


 //---------------------------------------------------------------------------------
 //
 //   nextChar     for pattern scanning.  At this level, we handle stripping
 //                out comments and processing some backslash character escapes.
 //                The rest of the pattern grammar is handled at the next level up.
 //
 //---------------------------------------------------------------------------------
 void RegexCompile::nextChar(RegexPatternChar &c) {
     // Remember where this char begins, then fetch it.
     fScanIndex = fNextIndex;
     c.fChar    = nextCharLL();
     c.fQuoted  = FALSE;

     if (fQuoteMode) {
         // Inside a quoted region every char is returned as quoted.  The region
         //   ends at a backslash followed by 'E', or at the end of the pattern.
         c.fQuoted = TRUE;
         const UBool  sawEndQuote = (c.fChar == chBackSlash && peekCharLL() == chE);
         if (sawEndQuote || c.fChar == (UChar32)-1) {
             fQuoteMode = FALSE;  //  Exit quote mode,
             nextCharLL();       // discard the E
             nextChar(c);        // recurse to get the real next char
         }
         return;
     }

     // Not in a quoted region.
     if (fFreeForm && c.fChar == chPound) {
         // Start of a comment in free-form mode.  Skip to its end.
         //  The new-line char that terminates the comment (or the end of
         //  input marker) is what gets returned, so text on either side of
         //  the comment cannot run together.
         do {
             c.fChar = nextCharLL();
         } while (c.fChar != (UChar32)-1 &&   // EOF
                  c.fChar != chCR        &&
                  c.fChar != chLF        &&
                  c.fChar != chNEL       &&
                  c.fChar != chLS);
     }

     // Only a backslash needs further work.  This also covers end of input.
     if (c.fChar != chBackSlash) {
         return;
     }

     //  Backslash escapes.  When the char after the backslash is in
     //  gUnescapeCharSet, UnicodeString::unescapeAt() converts the sequence.
     //  Otherwise the bare backslash is returned for the pattern parser to
     //  deal with.
     //  The start index must be captured before peeking, because peeking
     //  advances fNextIndex.
     const int32_t  startX = fNextIndex;   // start and end positions of the
     int32_t        endX   = startX;       //   sequence following the backslash
     if (!gUnescapeCharSet->contains(peekCharLL())) {
         return;
     }
     nextCharLL();     // get & discard the peeked char.
     c.fQuoted = TRUE;
     c.fChar   = fRXPat->fPattern.unescapeAt(endX);
     if (startX == endX) {
         // unescapeAt() did not advance, so nothing was converted.
         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
     }
     fCharNum  += endX - startX;
     fNextIndex = endX;
 }


 //---------------------------------------------------------------------------------
 //
 //  scanSet    Construct a UnicodeSet from the text at the current scan
 //             position.  Advance the scan position to the first character
 //             after the set.
 //
 //             The scan position is normally under the control of the state machine
 //             that controls pattern parsing.  UnicodeSets, however, are parsed by
 //             the UnicodeSet constructor, not by the Regex pattern parser.
 //
 //---------------------------------------------------------------------------------
 UnicodeSet *RegexCompile::scanSet() {
     // Returns a heap-allocated UnicodeSet parsed from the pattern text that
     //   begins at fScanIndex, or NULL when an error has been recorded, either
     //   before the call or by this function.
     if (U_FAILURE(*fStatus)) {
         return NULL;
     }

     // Let the UnicodeSet constructor parse the set, starting at the position
     //   of the char most recently returned by nextChar().
     ParsePosition  pos;
     pos.setIndex(fScanIndex);
     UErrorCode localStatus = U_ZERO_ERROR;
     UnicodeSet *uset = new UnicodeSet(fRXPat->fPattern, pos,
                                       localStatus);
     if (uset == NULL) {
         // Report a failed allocation instead of silently returning NULL with
         //   no error recorded.
         error(U_MEMORY_ALLOCATION_ERROR);
         return NULL;
     }
     if (U_FAILURE(localStatus)) {
         //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
         //         UnicodeSet appears to not be reporting correctly at this time.
         REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
         error(localStatus);
         delete uset;
         return NULL;
     }

     // Advance the current scan position over the UnicodeSet.
     //   The chars are consumed one at a time through nextCharLL(), rather than
     //   by assigning the index directly, so that the line/char positions
     //   maintained for error reporting stay correct.
     const int32_t  setEnd = pos.getIndex();
     while (fNextIndex < setEnd) {
         nextCharLL();
     }

     return uset;
 }


 //---------------------------------------------------------------------------------
 //
 //  scanProp   Construct a UnicodeSet from the text at the current scan
 //             position, which will be of the form \p{whatever}
 //
 //             The scan position will be at the 'p' or 'P'.  On return
 //             the scan position should be just after the '}'
 //
 //             Return a UnicodeSet, constructed from the \P pattern,
 //             or NULL if the pattern is invalid.
 //
 //---------------------------------------------------------------------------------
 UnicodeSet *RegexCompile::scanProp() {
     // Returns a heap-allocated UnicodeSet built from the \p{...} or \P{...}
     //   expression at the current scan position, or NULL on failure.
     UnicodeSet    *uset = NULL;

     if (U_FAILURE(*fStatus)) {
         return NULL;
     }

     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP);

     // enclose the \p{property} from the regex pattern source in  [brackets]
     //   The current char is the 'p' or 'P'; copy it and everything up to and
     //   including the closing '}'.
     UnicodeString setPattern;
     setPattern.append(chLBracket);
     setPattern.append(chBackSlash);
     for (;;) {
         setPattern.append(fC.fChar);
         if (fC.fChar == chRBrace) {
             break;
         }
         nextChar(fC);
         if (fC.fChar == -1) {
             // Hit the end of the input string without finding the closing '}'
             //   Go through error() so that the parse position is recorded and
             //   an error reported earlier is not overwritten.
             error(U_REGEX_PROPERTY_SYNTAX);
             return NULL;
         }
     }
     setPattern.append(chRBracket);

     // Build the UnicodeSet from the set pattern we just built up in a string.
     uset = new UnicodeSet(setPattern, *fStatus);
     if (uset == NULL) {
         // Report a failed allocation instead of silently returning NULL with
         //   no error recorded.
         error(U_MEMORY_ALLOCATION_ERROR);
     }
     if (U_FAILURE(*fStatus)) {
         delete uset;
         uset =  NULL;
     }

     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
     return uset;
 }

 U_NAMESPACE_END
 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS