icu4c/source/i18n/messageformat2_parser.cpp - external/github.com/unicode-org/icu - Git at Google

 // © 2024 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_NORMALIZATION

 #if !UCONFIG_NO_FORMATTING

 #if !UCONFIG_NO_MF2

 #include "unicode/uniset.h"
 #include "messageformat2_errors.h"
 #include "messageformat2_macros.h"
 #include "messageformat2_parser.h"
 #include "ucln_in.h"
 #include "umutex.h"
 #include "uvector.h" // U_ASSERT

 U_NAMESPACE_BEGIN

 namespace message2 {

 using namespace pluralimpl;

 using namespace data_model;

 /*
     The `ERROR()` macro records a syntax error in `errors` and sets the
     offset in `parseError` from the current `index`, unless a syntax error
     has already been recorded (so only the first error is reported).
     It does not alter control flow.

     Both macros expand to a bare `if` statement and refer to `errors`,
     `parseError` and `index` by name, so they can only be used as a complete
     statement in a scope where those names are visible.
 */
 #define ERROR(errorCode)                                                                                \
     if (!errors.hasSyntaxError()) {                                                                     \
         setParseError(parseError, index);                                                               \
         errors.addSyntaxError(errorCode);                                                               \
     }

 // Same as `ERROR()`, except that the reported offset is computed from `i`
 // rather than from the current `index`.
 #define ERROR_AT(errorCode, i)                                                                          \
     if (!errors.hasSyntaxError()) {                                                                     \
         setParseError(parseError, i);                                                                   \
         errors.addSyntaxError(errorCode);                                                               \
     }

 // Line bookkeeping for error reporting: if the character at `peek()` is a
 // newline, bump the line counter in `parseError` and record how many
 // characters precede the line that starts after it. Does nothing otherwise,
 // and never consumes input.
 void Parser::maybeAdvanceLine() {
     if (peek() != LF) {
         return;
     }
     // `index` still points at the newline, so `index + 1` is the number of
     // characters seen so far, newline included.
     parseError.lengthBeforeCurrentLine = index + 1;
     ++parseError.line;
 }

 /*
     `CHECK_BOUNDS()` signals a syntax error (via `ERROR()`) and returns from
     the enclosing function if `inBounds()` is false, i.e. `index` is out of
     bounds for the source string.

     `CHECK_BOUNDS_1()` does the same using `inBounds(1)`, and reports the
     error at `index + 1` (via `ERROR_AT()`).

     Both macros contain a bare `return;`, so they can only be used inside
     functions returning `void`.
 */
 #define CHECK_BOUNDS(errorCode)                                                            \
     if (!inBounds()) {                                                                     \
         ERROR(errorCode);                                                                  \
         return;                                                                            \
     }
 #define CHECK_BOUNDS_1(errorCode)                                                          \
     if (!inBounds(1)) {                                                                    \
         ERROR_AT(errorCode, index + 1);                                                    \
         return;                                                                            \
     }

 // -------------------------------------
 // Helper functions

 // Copies code units from `in` to `out`, stopping after the first NUL has
 // been copied or after U_PARSE_CONTEXT_LEN units, whichever comes first.
 static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) {
     int32_t pos = 0;
     while (pos < U_PARSE_CONTEXT_LEN) {
         const UChar unit = in[pos];
         out[pos] = unit;
         if (unit == '\0') {
             return;
         }
         ++pos;
     }
 }

 // Fills in the public `UParseError` from the parser's internal
 // `MessageParseError`: both context buffers, the line and the offset.
 /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) {
     copyContext(messageParseError.postContext, parseError.postContext);
     copyContext(messageParseError.preContext, parseError.preContext);
     parseError.offset = messageParseError.offset;
     parseError.line = messageParseError.line;
 }

 // Records the position `index` in `parseError` as an offset relative to the
 // start of the current line, and clears both context strings.
 /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) {
     // TODO: Fill this in with actual pre and post-context
     parseError.postContext[0] = 0;
     parseError.preContext[0] = 0;

     // Translate absolute to relative offset: take the total number of
     // characters seen and subtract those that precede the current line.
     const auto charsBeforeLine = parseError.lengthBeforeCurrentLine;
     parseError.offset = index - charsBeforeLine;
 }

 // -------------------------------------
 // Initialization of UnicodeSets

 namespace unisets {

 // Guards the one-time construction of the sets stored in `gUnicodeSets`.
 icu::UInitOnce gMF2ParseUniSetsInitOnce {};

 // Table of the parser's character sets, indexed by `Key`.
 // Every slot starts out null.
 UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};

 // Returns the table entry for `key` as-is (possibly null); performs no
 // initialization.
 inline UnicodeSet* getImpl(Key key) {
     UnicodeSet* const entry = gUnicodeSets[key];
     return entry;
 }

 }

 // Builds the frozen set of code points treated as content characters.
 // Returns nullptr if `status` is already a failure, or (setting
 // U_MEMORY_ALLOCATION_ERROR) if the set can't be allocated.
 // The caller owns the returned set.
 UnicodeSet* initContentChars(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     // Inclusive ranges added after the initial one; each gap between
     // consecutive ranges is a deliberately omitted character.
     static const UChar32 laterRanges[][2] = {
         {0x000B, 0x000C},   // Omit CR
         {0x000E, 0x001F},   // Omit SP
         {0x0021, 0x002D},   // Omit '.'
         {0x002F, 0x003F},   // Omit '@'
         {0x0041, 0x005B},   // Omit '\'
         {0x005D, 0x007A},   // Omit { | }
         {0x007E, 0x2FFF},   // Omit IDEOGRAPHIC_SPACE
         {0x3001, 0x10FFFF}, // Allowing surrogates is intentional
     };

     // First range: omit NULL, HTAB and LF
     UnicodeSet* contentSet = new UnicodeSet(0x0001, 0x0008);
     if (contentSet == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     for (const auto& range : laterRanges) {
         contentSet->add(range[0], range[1]);
     }
     contentSet->freeze();
     return contentSet;
 }

 // Builds the frozen set of whitespace characters: SPACE, HTAB, CR, LF and
 // IDEOGRAPHIC_SPACE. Returns nullptr if `status` is already a failure, or
 // (setting U_MEMORY_ALLOCATION_ERROR) if the set can't be allocated.
 // The caller owns the returned set.
 UnicodeSet* initWhitespace(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* wsSet = new UnicodeSet();
     if (wsSet == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     const UChar32 members[] = {SPACE, HTAB, CR, LF, IDEOGRAPHIC_SPACE};
     for (const UChar32 member : members) {
         wsSet->add(member);
     }
     wsSet->freeze();
     return wsSet;
 }

 // Builds the frozen set of bidi control characters:
 // U+061C, U+200E..U+200F and U+2066..U+2069.
 //
 // Returns nullptr if `status` is already a failure on entry, if the set
 // can't be allocated (setting U_MEMORY_ALLOCATION_ERROR), or if constructing
 // it from the pattern fails (in which case the partially built set is
 // deleted rather than leaked). The caller owns the returned set.
 UnicodeSet* initBidiControls(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status);
     if (result == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     if (U_FAILURE(status)) {
         // The allocation succeeded but the pattern could not be applied
         delete result;
         return nullptr;
     }
     result->add(0x200E, 0x200F);
     result->add(0x2066, 0x2069);
     result->freeze();
     return result;
 }

 // Builds the frozen set described by the UnicodeSet pattern `[:letter:]`.
 //
 // Returns nullptr if `status` is already a failure on entry, if the set
 // can't be allocated (setting U_MEMORY_ALLOCATION_ERROR), or if constructing
 // it from the pattern fails (in which case the partially built set is
 // deleted rather than leaked). The caller owns the returned set.
 UnicodeSet* initAlpha(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status);
     if (result == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     if (U_FAILURE(status)) {
         // The allocation succeeded but the pattern could not be applied
         delete result;
         return nullptr;
     }
     result->freeze();
     return result;
 }

 // Builds the frozen set described by the UnicodeSet pattern `[:number:]`.
 //
 // Returns nullptr if `status` is already a failure on entry, if the set
 // can't be allocated (setting U_MEMORY_ALLOCATION_ERROR), or if constructing
 // it from the pattern fails (in which case the partially built set is
 // deleted rather than leaked). The caller owns the returned set.
 UnicodeSet* initDigits(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status);
     if (result == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     if (U_FAILURE(status)) {
         // The allocation succeeded but the pattern could not be applied
         delete result;
         return nullptr;
     }
     result->freeze();
     return result;
 }

 // Builds the frozen set of characters that may begin a name: everything in
 // the ALPHA set, '+' (U+002B), '_' (U+005F), and the ranges listed below.
 //
 // Side effect: creates the ALPHA set and stores it in the global table.
 // Returns nullptr if `status` is (or becomes) a failure. The caller owns
 // the returned set.
 UnicodeSet* initNameStartChars(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* alphaSet = initAlpha(status);
     unisets::gUnicodeSets[unisets::ALPHA] = alphaSet;
     if (U_FAILURE(status)) {
         return nullptr;
     }
     UnicodeSet* nameStartSet = new UnicodeSet();
     if (nameStartSet == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }

     // Inclusive ranges within the Basic Multilingual Plane
     static const UChar32 bmpRanges[][2] = {
         {0x00A1, 0x061B},
         {0x061D, 0x167F},
         {0x1681, 0x1FFF},
         {0x200B, 0x200D},
         {0x2010, 0x2027},
         {0x2030, 0x205E},
         {0x2060, 0x2065},
         {0x206A, 0x2FFF},
         {0x3001, 0xD7FF},
         {0xE000, 0xFDCF},
         {0xFDF0, 0xFFFD},
     };

     nameStartSet->addAll(*alphaSet);
     nameStartSet->add(0x002B);
     nameStartSet->add(0x005F);
     for (const auto& range : bmpRanges) {
         nameStartSet->add(range[0], range[1]);
     }
     // Planes 1 through 16: U+x0000..U+xFFFD, i.e. each whole plane
     // except its last two code points
     for (UChar32 planeStart = 0x10000; planeStart <= 0x100000; planeStart += 0x10000) {
         nameStartSet->add(planeStart, planeStart + 0xFFFD);
     }
     nameStartSet->freeze();
     return nameStartSet;
 }

 // Builds the frozen set of characters that may appear in a name: every
 // name-start character, every digit, HYPHEN and PERIOD.
 //
 // Side effect: creates the NAME_START and DIGIT sets and stores them in the
 // global table. Returns nullptr if `status` is (or becomes) a failure.
 // The caller owns the returned set.
 UnicodeSet* initNameChars(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* nameStartSet = initNameStartChars(status);
     unisets::gUnicodeSets[unisets::NAME_START] = nameStartSet;
     UnicodeSet* digitSet = initDigits(status);
     unisets::gUnicodeSets[unisets::DIGIT] = digitSet;
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* nameCharSet = new UnicodeSet();
     if (nameCharSet == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     nameCharSet->addAll(*nameStartSet);
     nameCharSet->addAll(*digitSet);
     nameCharSet->add(HYPHEN);
     nameCharSet->add(PERIOD);
     nameCharSet->freeze();
     return nameCharSet;
 }

 // Builds the frozen set of characters allowed in text: every content
 // character, every whitespace character, PERIOD, AT and PIPE.
 //
 // Side effect: creates the CONTENT and WHITESPACE sets and stores them in
 // the global table. Returns nullptr if `status` is (or becomes) a failure.
 // The caller owns the returned set.
 UnicodeSet* initTextChars(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* contentSet = initContentChars(status);
     unisets::gUnicodeSets[unisets::CONTENT] = contentSet;
     UnicodeSet* whitespaceSet = initWhitespace(status);
     unisets::gUnicodeSets[unisets::WHITESPACE] = whitespaceSet;
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* textSet = new UnicodeSet();
     if (textSet == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     textSet->addAll(*contentSet);
     textSet->addAll(*whitespaceSet);
     textSet->add(PERIOD);
     textSet->add(AT);
     textSet->add(PIPE);
     textSet->freeze();
     return textSet;
 }

 // Builds the frozen set of characters allowed inside a quoted literal:
 // every content character, every whitespace character, PERIOD, AT,
 // LEFT_CURLY_BRACE and RIGHT_CURLY_BRACE.
 //
 // Side effect: creates the TEXT set (and, through `initTextChars()`, the
 // CONTENT and WHITESPACE sets) and stores it in the global table.
 // Returns nullptr if `status` is (or becomes) a failure. The caller owns
 // the returned set.
 UnicodeSet* initQuotedChars(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status);
     if (U_FAILURE(status)) {
         return nullptr;
     }

     // content and whitespace were initialized by `initTextChars()`.
     // Validate them before allocating the result, so that nothing has to
     // be released on these defensive error paths.
     const UnicodeSet* content = unisets::getImpl(unisets::CONTENT);
     const UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE);
     if (content == nullptr || whitespace == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }

     UnicodeSet* result = new UnicodeSet();
     if (result == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     result->addAll(*content);
     result->addAll(*whitespace);
     result->add(PERIOD);
     result->add(AT);
     result->add(LEFT_CURLY_BRACE);
     result->add(RIGHT_CURLY_BRACE);
     result->freeze();
     return result;
 }

 // Builds the frozen set of characters that can follow a backslash in an
 // escape sequence: PIPE, BACKSLASH, LEFT_CURLY_BRACE and RIGHT_CURLY_BRACE.
 // Returns nullptr if `status` is already a failure, or (setting
 // U_MEMORY_ALLOCATION_ERROR) if the set can't be allocated.
 // The caller owns the returned set.
 UnicodeSet* initEscapableChars(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return nullptr;
     }

     UnicodeSet* escapableSet = new UnicodeSet();
     if (escapableSet == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return nullptr;
     }
     const UChar32 members[] = {PIPE, BACKSLASH, LEFT_CURLY_BRACE, RIGHT_CURLY_BRACE};
     for (const UChar32 member : members) {
         escapableSet->add(member);
     }
     escapableSet->freeze();
     return escapableSet;
 }

 namespace unisets {

 // Library cleanup hook: deletes every set in the global table, nulls the
 // slots, and resets the init-once guard. Always returns true.
 UBool U_CALLCONV cleanupMF2ParseUniSets() {
     for (UnicodeSet*& slot : gUnicodeSets) {
         delete slot;
         slot = nullptr;
     }
     gMF2ParseUniSetsInitOnce.reset();
     return true;
 }

 // Init-once callback: registers the cleanup hook, then populates the
 // global table. If anything fails, everything built so far is torn down.
 void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) {
     ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets);
     /*
       Only four sets are created directly here; each init function also
       creates (and stores in the table) the sets it depends on:

       initBidiControls     -- no dependencies
       initNameChars        -- initNameStartChars (-- initAlpha), initDigits
       initQuotedChars      -- initTextChars (-- initContentChars, initWhitespace)
       initEscapableChars   -- no dependencies
      */
     gUnicodeSets[unisets::BIDI] = initBidiControls(status);
     gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status);
     gUnicodeSets[unisets::QUOTED] = initQuotedChars(status);
     gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status);

     if (U_SUCCESS(status)) {
         return;
     }
     cleanupMF2ParseUniSets();
 }

 // Returns the set for `key`, running the one-time initialization first if
 // needed. Returns nullptr if `status` is a failure after initialization;
 // if the table slot is unexpectedly empty, sets U_MEMORY_ALLOCATION_ERROR
 // and returns nullptr.
 const UnicodeSet* get(Key key, UErrorCode& status) {
     umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status);
     if (U_FAILURE(status)) {
         return nullptr;
     }
     UnicodeSet* const found = getImpl(key);
     if (found != nullptr) {
         return found;
     }
     status = U_MEMORY_ALLOCATION_ERROR;
     return nullptr;
 }

 }

 // -------------------------------------
 // Predicates

 /*
   The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar:

   `isContentChar()`   : `content-char`
   `isTextChar()`      : `text-char`
   `isAlpha()`         : `ALPHA`
   `isDigit()`         : `DIGIT`
   `isNameStart()`     : `name-start`
   `isNameChar()`      : `name-char`
   `isUnquotedStart()` : `unquoted-start`
   `isQuotedChar()`    : `quoted-char`
   `isWhitespace()`    : `ws`
 */

 // Returns true iff `c` is a member of the `contentChars` set.
 bool Parser::isContentChar(UChar32 c) const {
     const bool isMember = contentChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` is a member of the `bidiControlChars` set.
 // See `bidi` in the MF2 grammar.
 bool Parser::isBidiControl(UChar32 c) const {
     const bool isMember = bidiControlChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` is a member of the `whitespaceChars` set.
 // See `ws` in the MessageFormat 2 grammar.
 bool Parser::isWhitespace(UChar32 c) const {
     const bool isMember = whitespaceChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` is a member of the `textChars` set.
 bool Parser::isTextChar(UChar32 c) const {
     const bool isMember = textChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` is a member of the `alphaChars` set.
 bool Parser::isAlpha(UChar32 c) const {
     const bool isMember = alphaChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` is a member of the `digitChars` set.
 bool Parser::isDigit(UChar32 c) const {
     const bool isMember = digitChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` is a member of the `nameStartChars` set.
 bool Parser::isNameStart(UChar32 c) const {
     const bool isMember = nameStartChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` is a member of the `nameChars` set.
 bool Parser::isNameChar(UChar32 c) const {
     const bool isMember = nameChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` can begin an unquoted literal, which is the case
 // exactly when `c` is a name character.
 bool Parser::isUnquotedStart(UChar32 c) const {
     const bool startsUnquoted = isNameChar(c);
     return startsUnquoted;
 }

 // Returns true iff `c` is a member of the `quotedChars` set.
 bool Parser::isQuotedChar(UChar32 c) const {
     const bool isMember = quotedChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` is a member of the `escapableChars` set.
 bool Parser::isEscapableChar(UChar32 c) const {
     const bool isMember = escapableChars->contains(c);
     return isMember;
 }

 // Returns true iff `c` can begin a `function` nonterminal.
 // COLON is the only such character.
 static bool isFunctionStart(UChar32 c) {
     return c == COLON;
 }

 // Returns true iff `c` can begin an `annotation` nonterminal, which is the
 // case exactly when it can begin a `function`.
 static bool isAnnotationStart(UChar32 c) {
     const bool startsFunction = isFunctionStart(c);
     return startsFunction;
 }

 // Returns true iff `c` can begin a `literal` nonterminal: a PIPE (opening a
 // quoted literal), a name-start character, a HYPHEN, or a digit.
 bool Parser::isLiteralStart(UChar32 c) const {
     if (c == PIPE) {
         return true;
     }
     if (isNameStart(c)) {
         return true;
     }
     if (c == HYPHEN) {
         return true;
     }
     return isDigit(c);
 }

 // Returns true iff `c` can begin a `key` nonterminal: either an ASTERISK
 // or anything that can begin a literal.
 bool Parser::isKeyStart(UChar32 c) const {
     if (c == ASTERISK) {
         return true;
     }
     return isLiteralStart(c);
 }

 // Returns true iff the next two characters of the input match the first
 // two characters of either `ID_LOCAL` or `ID_INPUT`. Consumes nothing.
 bool Parser::isDeclarationStart() {
     const auto first = peek();
     const bool maybeLocal = (first == ID_LOCAL[0]);
     const bool maybeInput = (first == ID_INPUT[0]);
     if (!maybeLocal && !maybeInput) {
         return false;
     }
     // A second character is needed to decide
     if (!inBounds(1)) {
         return false;
     }
     const auto second = peek(1);
     return (maybeLocal && second == ID_LOCAL[1])
         || (maybeInput && second == ID_INPUT[1]);
 }

 // -------------------------------------
 // Parsing functions


 /*
   TODO: Since handling the whitespace ambiguities needs to be repeated
   in several different places and is hard to factor out,
   it probably would be better to replace the parser with a lexer + parser
   to separate tokenizing from parsing, which would simplify the code significantly.
   This has the disadvantage that there is no token grammar for MessageFormat,
   so one would have to be invented that isn't a component of the spec.
  */

 /*
     This is a recursive-descent scannerless parser that,
     with a few exceptions, uses 1 character of lookahead.

     This may not be an exhaustive list, as the additions of attributes and reserved
     statements introduced several new ambiguities.

 All but three of the exceptions involve ambiguities about the meaning of whitespace.
 One ambiguity not involving whitespace is:
 identifier -> namespace ":" name
 vs.
 identifier -> name

 `namespace` and `name` can't be distinguished without arbitrary lookahead.
 (For how this is handled, see parseIdentifier())

 The second ambiguity not involving whitespace is:
 complex-message -> *(declaration[s]) complex-body
                 -> declaration *(declaration[s]) complex-body
                 -> declaration complex-body
                 -> reserved-statement complex-body
                 -> .foo {$x} .match // ...
 When processing the '.', arbitrary lookahead is required to distinguish the
 arbitrary-length unsupported keyword from `.match`.
 (For how this is handled, see parseDeclarations()).

 The third ambiguity not involving whitespace is:
 complex-message -> *(declaration [s]) complex-body
                 -> reserved-statement *(declaration [s]) complex-body
                 -> reserved-statement complex-body
                 -> reserved-statement quotedPattern
                 -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern
                 -> reserved-keyword expression quoted-pattern
  Example: .foo {1} {{1}}

  Without lookahead, the opening '{' of the quoted pattern can't be distinguished
  from the opening '{' of another expression in the unsupported statement.
  (Though this only requires 1 character of lookahead.)

  Otherwise:

 There are at least seven ambiguities in the grammar that can't be resolved with finite
 lookahead (since whitespace sequences can be arbitrarily long). They are resolved
 with a form of backtracking (early exit). No state needs to be saved/restored
 since whitespace doesn't affect the shape of the resulting parse tree, so it's
 not true backtracking.

 In addition, the grammar has been refactored
 in a semantics-preserving way in some cases to make the code easier to structure.

 First: variant = when 1*(s key) [s] pattern
    Example: when k     {a}
    When reading the first space after 'k', it's ambiguous whether it's the
    required space before another key, or the optional space before `pattern`.
  (See comments in parseNonEmptyKeys())

 Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
         annotation = (function *(s option)) / reserved
    Example: {:f    }
    When reading the first space after 'f', it's ambiguous whether it's the
    required space before an option, or the optional trailing space after an options list
    (in this case, the options list is empty).
  (See comments in parseOptions() -- handling this case also meant it was easier to base
   the code on a slightly refactored grammar, which should be semantically equivalent.)

 Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
         annotation = (function *(s option)) / reserved
    Example: {@a }
    Similar to the previous case; see comments in parseReserved()

 Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
    Example: {|foo|   }
    When reading the first space after the '|', it's ambiguous whether it's the required
    space before an annotation, or the optional trailing space before the '}'.
   (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on
   the same grammar refactoring as the second exception.)

     Most functions match a non-terminal in the grammar, except as explained
     in comments.

 Fifth: matcher = match-statement 1*([s] variant)
                -> match 1 *([s] selector) 1*([s] variant)
     Example: match {42} * {{_}}
  When reading the space after the first '}', it's unclear whether
  it's the optional space before another selector, or the optional space
  before a variant.

 Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}"
        -> "{" [s] function *(s attribute) [s] "}"
        -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}"
        -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}"

      Example: {:func @foo}
 (Note: the same ambiguity is present with variable-expression and literal-expression)

 Seventh: (TODO: this case has not been written down yet)

 For the sixth case: when parsing the space, it's unclear whether it's the optional
 space before an option, or the optional space before an attribute.

  Unless otherwise noted in a comment, all helper functions that take
     a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode`
     have the precondition:
       `index` < `len()`
     and the postcondition:
        `U_FAILURE(errorCode)` || `index` < `len()`
 */

 /*
   Consumes a run of one or more whitespace characters, signaling a syntax
   error if there isn't at least one (either because the next character is
   not whitespace, or because the input is exhausted).

   No pre, no post.
   A message may end with whitespace, so `index` may equal `len()` on exit;
   if whitespace was seen, running out of input is not an error here, and
   the caller is responsible for checking for it if necessary.
 */
 void Parser::parseRequiredWS(UErrorCode& errorCode) {
     bool sawWhitespace = false;

     // Stop at the end of the input or at the first non-whitespace character
     while (inBounds() && isWhitespace(peek())) {
         sawWhitespace = true;
         // Increment line number in parse error if we consume a newline
         maybeAdvanceLine();
         next();
     }

     // Whitespace is required here, so an empty run is an error regardless
     // of why the loop stopped
     if (!sawWhitespace) {
         ERROR(errorCode);
     }
 }

 // Consumes a possibly-empty run of bidi control characters.
 // No pre, no post: stops at the end of the input without signaling an error.
 void Parser::parseOptionalBidi() {
     while (inBounds() && isBidiControl(peek())) {
         next();
     }
 }

 /*
   Consumes required whitespace, which may be preceded by bidi controls and
   followed by any mix of further whitespace and bidi controls. Signals a
   syntax error (through `parseRequiredWS()`) if no whitespace character is
   present.

   However much input is consumed, exactly one SPACE is appended to
   `normalizedInput` -- including when an error was signaled.

   No pre, no post, because a message may end with whitespace
   Matches `s` in the MF2 grammar
 */
 void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
     // Leading bidi controls
     parseOptionalBidi();
     // At least one whitespace character
     parseRequiredWS(errorCode);
     // Any remaining whitespace and bidi controls
     parseOptionalWhitespace();
     normalizedInput += SPACE;
 }

 /*
   Consumes a possibly-empty run of whitespace and bidi control characters,
   updating the line bookkeeping in `parseError` for each character consumed.

   No pre, no post: a message may end with whitespace, so `index` may equal
   `len()` on exit, and that is not treated as an error here.
 */
 void Parser::parseOptionalWhitespace() {
     while (inBounds()) {
         const auto cp = peek();
         const bool skippable = isWhitespace(cp) || isBidiControl(cp);
         if (!skippable) {
             return;
         }
         maybeAdvanceLine();
         next();
     }
 }

 // Consumes the single character `c` and appends it to `normalizedInput`.
 // Signals a syntax error, consuming nothing, if the input is exhausted or
 // `peek()` != `c`.
 // No postcondition -- a message can end with a '}' token
 void Parser::parseToken(UChar32 c, UErrorCode& errorCode) {
     CHECK_BOUNDS(errorCode);

     if (peek() != c) {
         // Next character didn't match -- error out
         ERROR(errorCode);
         return;
     }
     next();
     normalizedInput += c;
 }

 /*
    Consumes a fixed-length token, signaling an error if the token isn't a prefix of
    the string beginning at `peek()`. Each matched code unit is appended to
    `normalizedInput`; on a mismatch, the code units matched so far remain
    consumed.

    If the input ends in the middle of the token, that is reported as a
    syntax error as well; bounds are re-checked before every `peek()`.

    Precondition: `inBounds()`
    No postcondition -- a message can end with a '}' token
 */
 void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) {
     U_ASSERT(inBounds());

     for (const char16_t expected : token) {
         // `next()` may have moved past the end of the input on the
         // previous iteration, so don't peek without checking
         if (!inBounds() || peek() != expected) {
             ERROR(errorCode);
             return;
         }
         normalizedInput += expected;
         next();
     }
 }

 /*
    Consumes optional whitespace, possibly advancing `index` to `index'`,
    then consumes a fixed-length token (signaling an error if the token isn't a prefix of
    the string beginning at `source[index']`),
    then consumes optional whitespace again
 */
 void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
     // No need for error check or bounds check before parseOptionalWhitespace
     parseOptionalWhitespace();
     // Establish precondition
     CHECK_BOUNDS(errorCode);
     parseToken(token, errorCode);
     parseOptionalWhitespace();
     // Guarantee postcondition
     CHECK_BOUNDS(errorCode);
 }

 /*
    Consumes a single character surrounded by optional whitespace:
    skips optional whitespace, consumes `c` (signaling an error if the next
    character doesn't match it), then skips optional whitespace again.

    Signals an error and returns early if the input is exhausted before the
    character, or after the trailing whitespace.
 */
 void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
     // No need for error check or bounds check before parseOptionalWhitespace()
     parseOptionalWhitespace();

     // Establish the precondition of parseToken()
     if (!inBounds()) {
         ERROR(errorCode);
         return;
     }
     parseToken(c, errorCode);
     parseOptionalWhitespace();

     // Guarantee postcondition
     if (!inBounds()) {
         ERROR(errorCode);
         return;
     }
 }

 /*
   Consumes a possibly-empty sequence of name-chars, appending each one to
   both `str` and `normalizedInput`. Returns (a copy of) `str`.

   Signals a syntax error if the input is exhausted after consuming a
   name-char. Returns an empty string without consuming anything if
   `errorCode` is already a failure.
 */
 UnicodeString Parser::parseNameChars(UnicodeString& str, UErrorCode& errorCode) {
     if (U_FAILURE(errorCode)) {
         return {};
     }

     for (UChar32 c = peek(); isNameChar(c); c = peek()) {
         str += c;
         normalizedInput += c;
         next();
         // A name can't be the last thing in a message
         if (!inBounds()) {
             ERROR(errorCode);
             break;
         }
     }

     return str;
 }

 /*
   Consumes a name: a non-empty sequence of `name-char`s, the first of which
   is also a `name-start`, optionally preceded and followed by bidi control
   characters. The bidi controls are consumed but are not part of the result.

   Returns the sequence of name characters. Signals a syntax error and
   returns an empty string if no `name-start` is found where one is
   required -- either at the current position, or after the leading bidi
   controls.

   Precondition: `inBounds()`

   (Matches the `name` nonterminal in the grammar.)
 */
 UnicodeString Parser::parseName(UErrorCode& errorCode) {
     UnicodeString name;

     U_ASSERT(inBounds());

     if (!(isNameStart(peek()) || isBidiControl(peek()))) {
         ERROR(errorCode);
         return name;
     }

     // name       = [bidi] name-start *name-char [bidi]

     // [bidi]
     parseOptionalBidi();

     // name-start
     // The check above also lets a leading bidi control through, so once
     // those are skipped, a name-start character must still follow (and
     // the input may have been exhausted in the meantime).
     if (!inBounds() || !isNameStart(peek())) {
         ERROR(errorCode);
         return name;
     }

     // name-start *name-char
     parseNameChars(name, errorCode);

     // [bidi]
     parseOptionalBidi();

     return name;
 }

 /*
   Consumes a '$' followed by a `name`, returning a VariableName
   with `name` as its name. If the input ends right after the '$',
   signals a syntax error and returns a default-constructed VariableName.

   Precondition: `inBounds()`

   (Matches the `variable` nonterminal in the grammar.)
 */
 VariableName Parser::parseVariableName(UErrorCode& errorCode) {
     U_ASSERT(inBounds());

     parseToken(DOLLAR, errorCode);
     if (inBounds()) {
         return VariableName(parseName(errorCode));
     }

     // Nothing follows the '$'
     ERROR(errorCode);
     VariableName empty;
     return empty;
 }

 /*
   Corresponds to the `identifier` nonterminal in the grammar.

   Returns the identifier, including the ':' separator if there is a
   namespace. Signals a syntax error if the input ends right after a ':'.
   If the identifier contains more than one ':', signals a syntax error at
   the position of the second ':' and returns an empty string.

   Precondition: `inBounds()`
 */
 UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) {
     U_ASSERT(inBounds());

     UnicodeString result;
     // The following is a hack to get around ambiguity in the grammar:
     // identifier -> namespace ":" name
     // vs.
     // identifier -> name
     // can't be distinguished without arbitrary lookahead.
     // Instead, we treat the production as:
     // identifier -> namespace *(":"name)
     // and then check for multiple colons.

     // Parse namespace
     result += parseName(errorCode);

     int32_t colonsSeen = 0;
     // Position in the *source* of the second ':' (if any). Positions within
     // `result` can't be used for error reporting, since `result` starts at
     // the beginning of the identifier rather than of the message.
     int32_t extraColonIndex = -1;
     while (inBounds() && peek() == COLON) {
         if (++colonsSeen == 2) {
             extraColonIndex = static_cast<int32_t>(index);
         }
         // Parse ':' separator
         parseToken(COLON, errorCode);
         result += COLON;
         // Check for message ending with something like "foo:"
         if (!inBounds()) {
             ERROR(errorCode);
         } else {
             // Parse name part
             result += parseName(errorCode);
         }
     }

     // An identifier has at most one ':'
     if (extraColonIndex != -1) {
         ERROR_AT(errorCode, extraColonIndex);
         return {};
     }

     return result;
 }

 /*
   Consumes a reference to a function, matching the ": identifier"
   in the `function` nonterminal in the grammar. The function start
   character is appended to `normalizedInput`.

   Returns the function name. Signals a syntax error and returns an empty
   FunctionName if the next character can't start a function, or if the
   input ends right after it.

   Precondition: `inBounds()`
 */
 FunctionName Parser::parseFunction(UErrorCode& errorCode) {
     U_ASSERT(inBounds());

     const auto sigil = peek();
     if (!isFunctionStart(sigil)) {
         ERROR(errorCode);
         return FunctionName();
     }

     normalizedInput += sigil;
     next(); // Consume the function start character
     if (inBounds()) {
         return parseIdentifier(errorCode);
     }

     // Nothing follows the function start character
     ERROR(errorCode);
     return FunctionName();
 }


 /*
   Precondition: peek() == BACKSLASH

   Consume an escaped character: a backslash followed by one of
   '{', '}', '|' or '\'. Both characters are appended to `normalizedInput`.
   Corresponds to `escaped-char` in the grammar.

   Returns the escaped character (without the backslash). If the backslash
   is followed by anything else, or by the end of the input, signals a
   syntax error and returns an empty string; the backslash has been
   consumed in that case, but not the character after it.

   No postcondition (a message can end with an escaped char)
 */
 UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) {
     U_ASSERT(inBounds());
     U_ASSERT(peek() == BACKSLASH);
     normalizedInput += BACKSLASH;
     next(); // Skip the initial backslash

     UnicodeString str;
     if (!inBounds()) {
         ERROR(errorCode);
         return str;
     }

     const auto escaped = peek();
     const bool allowed = escaped == LEFT_CURLY_BRACE
         || escaped == RIGHT_CURLY_BRACE
         || escaped == PIPE
         || escaped == BACKSLASH;
     if (!allowed) {
         // No other characters are allowed here
         ERROR(errorCode);
         return str;
     }

     /* Append to the output string */
     str += escaped;
     /* Update normalizedInput */
     normalizedInput += escaped;
     /* Consume the character */
     next();
     return str;
 }


 /*
   Consume and return a quoted literal, matching the `literal` nonterminal in the grammar.

   The literal's contents are returned with escape sequences resolved.
   In `normalizedInput`, every escapable character in the contents is
   written in escaped form, whether or not it was escaped in the input.

   Signals a syntax error and returns a default-constructed Literal if the
   input ends before the closing '|' is reached.
 */
 Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) {
     bool error = false;

     UnicodeString contents;
     if (U_SUCCESS(errorCode)) {
         // Parse the opening '|'
         parseToken(PIPE, errorCode);
         if (!inBounds()) {
             ERROR(errorCode);
             error = true;
         } else {
             // Parse the contents
             bool done = false;
             while (!done) {
                 if (peek() == BACKSLASH) {
                     contents += parseEscapeSequence(errorCode);
                 } else if (isQuotedChar(peek())) {
                     contents += peek();
                     // Handle cases like:
                     // |}{| -- we want to escape everywhere that
                     // can be escaped, to make round-trip checking
                     // easier -- so this case normalizes to
                     // |\}\{|
                     if (isEscapableChar(peek())) {
                         normalizedInput += BACKSLASH;
                     }
                     normalizedInput += peek();
                     // Update the line count *before* consuming, while
                     // `peek()` is still the character being consumed
                     // (as the whitespace-parsing functions do). Checking
                     // afterwards would look at the following character
                     // instead, which might not exist.
                     maybeAdvanceLine();
                     next(); // Consume this character
                 } else {
                     // Assume the sequence of literal characters ends here
                     done = true;
                 }
                 // The closing '|' must still follow
                 if (!inBounds()) {
                     ERROR(errorCode);
                     error = true;
                     break;
                 }
             }
         }
     }

     if (error) {
         return {};
     }

     // Parse the closing '|'
     parseToken(PIPE, errorCode);

     return Literal(true, contents);
 }

 // Parse (1*DIGIT)
 // Consumes a non-empty run of digits, appending each to `normalizedInput`,
 // and returns the run. Signals a syntax error and returns an empty string
 // if the input ends right after a digit. Returns an empty string without
 // consuming anything if `errorCode` is already a failure.
 // Precondition: `isDigit(peek())`
 UnicodeString Parser::parseDigits(UErrorCode& errorCode) {
     if (U_FAILURE(errorCode)) {
         return {};
     }

     U_ASSERT(isDigit(peek()));

     UnicodeString contents;
     while (true) {
         const auto digit = peek();
         contents += digit;
         normalizedInput += digit;
         next();
         if (!inBounds()) {
             ERROR(errorCode);
             return {};
         }
         if (!isDigit(peek())) {
             break;
         }
     }

     return contents;
 }
 /*
   Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar.

   Signals a syntax error and returns a default-constructed Literal if the
   next character is not a name-char. Returns a default-constructed Literal
   without consuming anything if `errorCode` is already a failure.
 */
 Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) {
     if (U_FAILURE(errorCode)) {
         return {};
     }

     // unquoted-literal = 1*name-char
     if (isNameChar(peek())) {
         UnicodeString contents;
         parseNameChars(contents, errorCode);
         return Literal(false, contents);
     }

     // Not even one name-char
     ERROR(errorCode);
     return {};
 }

 /*
   Consume and return a literal, matching the `literal` nonterminal in the grammar.

   A leading '|' selects the quoted form; anything else is parsed as an
   unquoted literal. ERROR() is invoked if the input is exhausted either
   before or after the literal.
 */
 Literal Parser::parseLiteral(UErrorCode& errorCode) {
     if (!inBounds()) {
         ERROR(errorCode);
         return {};
     }

     Literal parsed = (peek() == PIPE) ? parseQuotedLiteral(errorCode)
                                       : parseUnquotedLiteral(errorCode);
     // Guarantee postcondition
     if (!inBounds()) {
         ERROR(errorCode);
     }
     return parsed;
 }

 /*
   Consume an attribute ('@' identifier, optionally followed by '=' and a literal),
   matching the `attribute` nonterminal in the grammar.

   Adds the attribute to `attrAdder`. If no '=' follows the identifier,
   the attribute is added with a null operand and any whitespace consumed
   while looking for the '=' is given back by restoring `index`.

   Preconditions (asserted): input is in bounds and the current character is '@'.
 */
 template<class T>
 void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
     U_ASSERT(inBounds());

     U_ASSERT(peek() == AT);
     // Consume the '@'
     parseToken(AT, errorCode);

     // Parse LHS
     UnicodeString lhs = parseIdentifier(errorCode);

     // Prepare to "backtrack" to resolve ambiguity
     // about whether whitespace precedes another
     // attribute, or the '=' sign
     int32_t savedIndex = index;
     parseOptionalWhitespace();

     Operand rand;
     if (peek() == EQUALS) {
         // Parse '='
         parseTokenWithWhitespace(EQUALS, errorCode);

         // Parse RHS, which must be a literal
         // attribute = "@" identifier [o "=" o literal]
         rand = Operand(parseLiteral(errorCode));
     } else {
         // attribute -> "@" identifier [[s] "=" [s]]
         // Use null operand, which `rand` is already set to
         // "Backtrack" by restoring the whitespace (if there was any)
         index = savedIndex;
     }

     // `rand` is not used again, so move it directly instead of
     // copying it into a temporary first
     attrAdder.addAttribute(lhs, std::move(rand), errorCode);
 }

 /*
   Consume a name-value pair, matching the `option` nonterminal in the grammar.

   Adds the option to `addOption`. The right-hand side is parsed as a
   variable if it starts with '$', and as a literal otherwise.
   If adding the option fails, a duplicate-option-name error is recorded
   in `errors`; the failure code itself is kept in a local status and is
   not written to `errorCode` by this function directly.

   Precondition (asserted): input is in bounds.
 */
 template<class T>
 void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
     U_ASSERT(inBounds());

     // Parse LHS
     UnicodeString lhs = parseIdentifier(errorCode);

     // Parse '='
     parseTokenWithWhitespace(EQUALS, errorCode);

     Operand rand;
     // Parse RHS, which is either a literal or variable
     switch (peek()) {
     case DOLLAR: {
         rand = Operand(parseVariableName(errorCode));
         break;
     }
     default: {
         // Must be a literal
         rand = Operand(parseLiteral(errorCode));
         break;
     }
     }
     U_ASSERT(!rand.isNull());

     // Finally, add the key=value mapping
     // Use a local error code, check for duplicate option error and
     // record it as with other errors
     UErrorCode status = U_ZERO_ERROR;
     addOption.addOption(lhs, std::move(rand), status);
     if (U_FAILURE(status)) {
         U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
         errors.setDuplicateOptionName(errorCode);
     }
 }

 /*
   Note: there are multiple overloads of parseOptions() for parsing
   options within markup, vs. within an expression, vs. parsing
   attributes. This should be refactored. TODO
  */

 /*
   Consume optional whitespace followed by a sequence of options
   (possibly empty), separated by whitespace.

   Each option is parsed by parseOption() and handed to `addOption`.
   If whitespace is consumed that turns out not to precede an option,
   `index` is reset to the start of that whitespace and the last
   character is removed from `normalizedInput`, so the whitespace is
   left for the caller.
 */
 template <class T>
 void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
     // Early exit if out of bounds -- no more work is possible
     CHECK_BOUNDS(errorCode);

 /*
 Arbitrary lookahead is required to parse option lists. To see why, consider
 these rules from the grammar:

 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
 annotation = (function *(s option)) / reserved

 And this example:
 {:foo  }

 Derivation:
 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
            -> "{" [s] annotation [s] "}"
            -> "{" [s] ((function *(s option)) / reserved) [s] "}"
            -> "{" [s] function *(s option) [s] "}"

 In this example, knowing whether to expect a '}' or the start of another option
 after the whitespace would require arbitrary lookahead -- in other words, which
 rule should we apply?
     *(s option) -> s option *(s option)
   or
     *(s option) ->

 The same would apply to the example {:foo k=v } (note the trailing space after "v").

 This is addressed using a form of backtracking and (to make the backtracking easier
 to apply) a slight refactoring to the grammar.

 This code is written as if the grammar is:
   expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
   annotation = (function *(s option) [s]) / (reserved [s])

 Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
 that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.

 Note that "backtracking" here really just means an early exit: only whitespace
 is involved, so the only state to undo is `index` and the whitespace character
 that was appended to `normalizedInput`.

 There is a separate but similar ambiguity as to whether the space precedes
 an option or an attribute.
 */

     while(true) {
         // If the next character is not whitespace, that means we've already
         // parsed the entire options list (which may have been empty) and there's
         // no trailing whitespace. In that case, exit.
         if (!isWhitespace(peek())) {
             break;
         }
         // Remember where the whitespace starts so that it can be given back
         int32_t firstWhitespace = index;

         // In any case other than an empty options list, there must be at least
         // one whitespace character.
         parseRequiredWhitespace(errorCode);
         // Restore precondition
         CHECK_BOUNDS(errorCode);

         // If a name character follows, then at least one more option remains
         // in the list.
         // Otherwise, we've consumed all the options and any trailing whitespace,
         // and can exit.
         // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
         // so we back out to [s].
         if (!isNameStart(peek())) {
             // We've consumed all the options (meaning that either we consumed non-empty
             // whitespace, or consumed at least one option.)
             // Done.
             // Remove the required whitespace from normalizedInput
             normalizedInput.truncate(normalizedInput.length() - 1);
             // "Backtrack" so as to leave the optional whitespace there
             // when parsing attributes
             index = firstWhitespace;
             break;
         }
         parseOption(addOption, errorCode);
     }
 }

 /*
   Consume optional whitespace followed by a sequence of attributes
   (possibly empty), separated by whitespace.

   Arbitrary lookahead is required to parse attribute lists, similarly to
   option lists (see the comment in parseOptions()).
 */
 template<class T>
 void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
     // Nothing more can be parsed once the input is exhausted
     if (!inBounds()) {
         ERROR(errorCode);
         return;
     }

     // Each iteration handles one run of whitespace. If the next character
     // is not whitespace, the whole (possibly empty) attribute list has
     // already been consumed and there is no trailing whitespace.
     while (isWhitespace(peek())) {
         // Any non-empty attribute list starts with at least one
         // whitespace character
         parseRequiredWhitespace(errorCode);
         // Restore precondition
         if (!inBounds()) {
             ERROR(errorCode);
             return;
         }

         // Only an '@' means that another attribute follows. Otherwise the
         // whitespace just consumed was trailing whitespace: "(s attribute)"
         // doesn't apply, so back out to [s] by removing the whitespace
         // from normalizedInput and stopping.
         if (peek() != AT) {
             normalizedInput.truncate(normalizedInput.length() - 1);
             return;
         }
         parseAttribute(attrAdder, errorCode);
     }
 }

 /*
   Consume a function call, matching the `annotation`
   nonterminal in the grammar

   Returns an `Operator` representing this (a reserved is a parse error)

   Precondition (asserted): input is in bounds.
 */
 Operator Parser::parseAnnotation(UErrorCode& status) {
     U_ASSERT(inBounds());
     Operator::Builder builder(status);
     if (U_FAILURE(status)) {
         return {};
     }
     if (!isFunctionStart(peek())) {
         // Anything that doesn't start a function is an error here
         ERROR(status);
     } else {
         // Consume the function name
         builder.setFunctionName(parseFunction(status));

         // Consume the options (which may be empty)
         OptionAdder<Operator::Builder> optionSink(builder);
         parseOptions(optionSink, status);
     }
     return builder.build(status);
 }

 /*
   Consume a literal or variable (depending on `isVariable`),
   followed by either required whitespace followed by an annotation,
   or optional whitespace.
 */
 void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable,
                                                   Expression::Builder& builder,
                                                   UErrorCode& status) {
     CHECK_ERROR(status);

     U_ASSERT(inBounds());

     Operand rand;
     if (isVariable) {
         rand = Operand(parseVariableName(status));
     } else {
         rand = Operand(parseLiteral(status));
     }

     builder.setOperand(std::move(rand));

 /*
 Parsing a literal or variable with an optional annotation requires arbitrary lookahead.
 To see why, consider this rule from the grammar:

 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"

 And this example:

 {|foo|   }

 Derivation:
 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
            -> "{" [s] ((literal / variable) [s annotation]) [s] "}"
            -> "{" [s] (literal [s annotation]) [s] "}"

 When reading the ' ' after the second '|', it's ambiguous whether that's the required
 space before an annotation, or the optional space before the '}'.

 To make this ambiguity easier to handle, this code is based on the same grammar
 refactoring for the `expression` nonterminal that `parseOptions()` relies on. See
 the comment in `parseOptions()` for details.
 */

     if (isWhitespace(peek())) {
       int32_t firstWhitespace = index;

       // If the next character is whitespace, either [s annotation] or [s] applies
       // (the character is either the required space before an annotation, or optional
       // trailing space after the literal or variable). It's still ambiguous which
       // one does apply.
       parseOptionalWhitespace();
       // Restore precondition
       CHECK_BOUNDS(status);

       // This next check resolves the ambiguity between [s annotation] and [s]
       bool isSAnnotation = isAnnotationStart(peek());

       if (isSAnnotation) {
         normalizedInput += SPACE;
       }

       if (isSAnnotation) {
         // The previously consumed whitespace precedes an annotation
         builder.setOperator(parseAnnotation(status));
       } else {
           // Either there's a right curly brace (will be consumed by the caller),
           // or there's an error and the trailing whitespace should be
           // handled by the caller. However, this is not an error
           // here because we're just parsing `literal [s annotation]`.
           index = firstWhitespace;
       }
     } else {
       // Either there was never whitespace, or
       // the previously consumed whitespace is the optional trailing whitespace;
       // either the next character is '}' or the error will be handled by parseExpression.
       // Do nothing, since the operand was already set
     }

     // At the end of this code, the next character should either be '}',
     // whitespace followed by a '}',
     // or end-of-input
 }

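 /*
   Consume an expression, matching the `expression` nonterminal in the grammar
   (this describes Parser::parseExpression(), which is defined after the
   exprFallback() helpers that follow)
 */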

 // Sets the operand of `exprBuilder` to the fallback literal.
 static void exprFallback(Expression::Builder& exprBuilder) {
     // The fallback is an unquoted literal consisting just of the U+FFFD REPLACEMENT CHARACTER
     // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
     Literal replacement(false, UnicodeString(REPLACEMENT));
     exprBuilder.setOperand(Operand(std::move(replacement)));
 }

 // Builds and returns a fallback expression whose operand is the
 // U+FFFD REPLACEMENT CHARACTER literal.
 // Returns a default-constructed Expression if `status` is already a
 // failure, or if constructing the builder sets a failure in `status`.
 static Expression exprFallback(UErrorCode& status) {
     Expression result;
     if (U_SUCCESS(status)) {
         Expression::Builder exprBuilder(status);
         if (U_SUCCESS(status)) {
             // Reuse the builder overload so that the fallback operand
             // is defined in exactly one place
             exprFallback(exprBuilder);
             // Use a separate local status (named so that it doesn't shadow
             // the `status` parameter) for the build step
             UErrorCode localStatus = U_ZERO_ERROR;
             result = exprBuilder.build(localStatus);
             // An operand was set, so there can't be an error
             U_ASSERT(U_SUCCESS(localStatus));
         }
     }
     return result;
 }

 // Consumes '{', optional whitespace, an operand and/or annotation,
 // attributes, optional whitespace and the closing '}'.
 // If the input ends after the opening brace, or no operand or annotation
 // can start at the current character, the fallback operand is used.
 // Precondition (asserted): input is in bounds.
 Expression Parser::parseExpression(UErrorCode& status) {
     if (U_FAILURE(status)) {
         return {};
     }

     // Callers must guarantee that there is input left to read
     U_ASSERT(inBounds());

     // Opening brace, then optional whitespace
     parseToken(LEFT_CURLY_BRACE, status);
     parseOptionalWhitespace();

     Expression::Builder exprBuilder(status);
     if (!inBounds()) {
         // Ran out of input right after the '{'
         exprFallback(exprBuilder);
     } else if (peek() == PIPE) {
         // Quoted literal
         parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
     } else if (peek() == DOLLAR) {
         // Variable
         parseLiteralOrVariableWithAnnotation(true, exprBuilder, status);
     } else if (isAnnotationStart(peek())) {
         // Annotation without an operand
         exprBuilder.setOperator(parseAnnotation(status));
     } else if (isUnquotedStart(peek())) {
         // Unquoted literal
         parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
     } else {
         // Not a literal, variable or annotation -- error out
         ERROR(status);
         exprFallback(exprBuilder);
     }

     // Parse attributes
     AttributeAdder<Expression::Builder> attrSink(exprBuilder);
     parseAttributes(attrSink, status);

     // Parse optional space
     // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
     parseOptionalWhitespace();

     // Either an operand or operator (or both) must have been set already,
     // so building can't fail; a local status keeps `status` untouched
     UErrorCode buildStatus = U_ZERO_ERROR;
     Expression built = exprBuilder.build(buildStatus);
     U_ASSERT(U_SUCCESS(buildStatus));

     // End-of-input here means the closing '}' is missing;
     // otherwise it's safe to check for the '}'
     if (inBounds()) {
         parseToken(RIGHT_CURLY_BRACE, status);
     } else {
         ERROR(status);
     }
     return built;
 }

 /*
   Parse a .local declaration, matching the `local-declaration`
   production in the grammar.

   On success, adds a binding from the declared variable to the parsed
   expression to `dataModel`. No binding is added if `status` is a failure
   or a syntax error has been recorded.
 */
 void Parser::parseLocalDeclaration(UErrorCode& status) {
     // End-of-input here would be an error; even empty
     // declarations must be followed by a body
     CHECK_BOUNDS(status);

     parseToken(ID_LOCAL, status);
     parseRequiredWhitespace(status);

     // Restore precondition
     CHECK_BOUNDS(status);
     VariableName declared = parseVariableName(status);
     parseTokenWithWhitespace(EQUALS, status);
     // Restore precondition before calling parseExpression()
     CHECK_BOUNDS(status);

     Expression value = parseExpression(status);

     // Add the binding unless there was an error.
     // (This ensures that if there was a correct lhs but a
     // parse error in rhs, the fallback for uses of the
     // lhs will be its own name rather than the rhs)
     /* This affects the behavior of this test case, which the spec
        is ambiguous about:

        .local $bar {|foo|} {{{$bar}}}

        Should `$bar` still be bound to a value although
        its declaration is syntactically incorrect (missing the '=')?
        This code says no, but it needs to change if
        https://github.com/unicode-org/message-format-wg/issues/703
        is resolved differently.
     */
     CHECK_ERROR(status);
     if (errors.hasSyntaxError()) {
         return;
     }

     dataModel.addBinding(Binding(std::move(declared), std::move(value)), status);
     // A U_MF_DUPLICATE_DECLARATION_ERROR is cleared from `status`
     // and recorded in `errors` instead
     if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
         status = U_ZERO_ERROR;
         errors.addError(StaticErrorType::DuplicateDeclarationError, status);
     }
 }

 /*
   Parse an .input declaration, matching the `input-declaration`
   production in the grammar.

   The expression after the keyword must have a variable as its operand;
   otherwise a syntax error is reported at the start of the expression.
   On success, adds an input binding for that variable to `dataModel`.
 */
 void Parser::parseInputDeclaration(UErrorCode& status) {
     // End-of-input here would be an error; even empty
     // declarations must be followed by a body
     CHECK_BOUNDS(status);

     parseToken(ID_INPUT, status);
     parseOptionalWhitespace();

     // Restore precondition before calling parseExpression()
     CHECK_BOUNDS(status);

     // Remember where the expression starts, for error diagnostics
     const int32_t exprStart = index;
     Expression declared = parseExpression(status);

     // The expression has to be a variable-expression
     if (!declared.getOperand().isVariable()) {
         // This case is a syntax error; report it at the beginning
         // of the expression
         ERROR_AT(status, exprStart);
         return;
     }

     VariableName name = declared.getOperand().asVariable();

     // Add a binding from the variable to the expression.
     // This just adds a new local variable that shadows the message
     // argument referred to, which is harmless.
     // When evaluating the RHS, the new local is not in scope
     // and the message argument will be correctly referred to.
     CHECK_ERROR(status);
     if (errors.hasSyntaxError()) {
         return;
     }

     dataModel.addBinding(Binding::input(std::move(name), std::move(declared), status), status);
     // A U_MF_DUPLICATE_DECLARATION_ERROR is cleared from `status`
     // and recorded in `errors` instead
     if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
         status = U_ZERO_ERROR;
         errors.addError(StaticErrorType::DuplicateDeclarationError, status);
     }
 }

 /*
   Consume a possibly-empty sequence of declarations separated by whitespace;
   each declaration matches the `declaration` nonterminal in the grammar

   Builds up an environment representing those declarations
 */
 void Parser::parseDeclarations(UErrorCode& status) {
     // End-of-input here would be an error; even empty
     // declarations must be followed by a body
     CHECK_BOUNDS(status);

     while (peek() == PERIOD) {
         CHECK_BOUNDS_1(status);
         // The character after the '.' decides which kind of declaration
         // (if any) this is
         const bool isLocal = (peek(1) == ID_LOCAL[1]);
         if (!isLocal && peek(1) != ID_INPUT[1]) {
             // Done parsing declarations
             break;
         }
         if (isLocal) {
             parseLocalDeclaration(status);
         } else {
             parseInputDeclaration(status);
         }

         // Avoid looping infinitely
         CHECK_ERROR(status);

         parseOptionalWhitespace();
         // Restore precondition
         CHECK_BOUNDS(status);
     }
 }

 /*
   Consume a text character
   matching the `text-char` nonterminal in the grammar

   Returns the consumed character as a string, or an empty string
   (after invoking ERROR()) if no text-char is available.

   No postcondition (a message can end with a text-char)
 */
 UnicodeString Parser::parseTextChar(UErrorCode& status) {
     UnicodeString textChar;
     if (inBounds() && isTextChar(peek())) {
         // Escapable characters are written to the normalized input
         // in escaped form; see comment in parseQuotedLiteral()
         if (isEscapableChar(peek())) {
             normalizedInput += BACKSLASH;
         }
         normalizedInput += peek();
         textChar += peek();
         next();
         maybeAdvanceLine();
     } else {
         // Error -- text-char is expected here
         ERROR(status);
     }
     return textChar;
 }

 /*
   Consume a `literal` or the string "*", matching
   the `key` nonterminal in the grammar

   Precondition (asserted): input is in bounds.
 */
 Key Parser::parseKey(UErrorCode& status) {
     U_ASSERT(inBounds());

     if (peek() != ASTERISK) {
         // Anything other than '*' is parsed as a literal key
         return Key(parseLiteral(status));
     }

     // '*': a default-constructed Key is the wildcard
     Key wildcard;
     next();
     normalizedInput += ASTERISK;
     // Guarantee postcondition
     if (!inBounds()) {
         ERROR(status);
     }
     return wildcard;
 }

 /*
   Consume a non-empty sequence of `key`s separated by whitespace,
   plus any whitespace that precedes the following '{'.

   Returns the keys built by a SelectorKeys::Builder. Returns a
   default-constructed SelectorKeys if `status` is a failure on entry,
   if constructing the builder fails, or if the input runs out.

   Precondition (asserted): input is in bounds.
 */
 SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) {
     SelectorKeys result;

     if (U_FAILURE(status)) {
         return result;
     }

     U_ASSERT(inBounds());

 /*
 Arbitrary lookahead is required to parse key lists. To see why, consider
 this rule from the grammar:

 variant = key *(s key) [s] quoted-pattern

 And this example:
 when k1 k2   {a}

 Derivation:
    variant -> key *(s key) [s] quoted-pattern
            -> key s key *(s key) quoted-pattern

 After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead
 to know whether to expect the start of a pattern or the start of another key.
 In other words: is the second whitespace sequence the required space in *(s key),
 or the optional space in [s] quoted-pattern?

 This is addressed using "backtracking" (similarly to `parseOptions()`).
 */

     SelectorKeys::Builder keysBuilder(status);
     if (U_FAILURE(status)) {
         return result;
     }

     // Since the first key is required, it's simplest to parse it separately.
     keysBuilder.add(parseKey(status), status);

     // Restore precondition
     if (!inBounds()) {
         ERROR(status);
         return result;
     }

     // The first key has been parsed, so now we can parse
     // *(s key) [s]
     // NOTE(review): the whitespace / bidi-control tests in the loop condition
     // look redundant with `peek() != LEFT_CURLY_BRACE`, assuming neither
     // predicate accepts '{' -- confirm before simplifying.
     while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidiControl(peek())) {
         // Record whether whitespace is actually present before trying to
         // consume it
         bool wasWhitespace = isWhitespace(peek()) || isBidiControl(peek());
         parseRequiredWhitespace(status);
         if (!wasWhitespace) {
             // Avoid infinite loop when parsing something like:
             // when * @{!...
             next();
         }

         // Restore precondition
         if (!inBounds()) {
             ERROR(status);
             return result;
         }

         // At this point, it's ambiguous whether we are inside (s key) or [s].
         // This check resolves that ambiguity.
         if (peek() == LEFT_CURLY_BRACE) {
             // A pattern follows, so what we just parsed was the optional
             // trailing whitespace. All the keys have been parsed.

             // Unpush the whitespace from `normalizedInput`
             normalizedInput.truncate(normalizedInput.length() - 1);
             break;
         }
         keysBuilder.add(parseKey(status), status);
     }

     return keysBuilder.build(status);
 }

 // Consumes "{{", a simple message, and "}}", and returns the parsed pattern.
 // Precondition (asserted): input is in bounds.
 Pattern Parser::parseQuotedPattern(UErrorCode& status) {
     U_ASSERT(inBounds());

     // Opening "{{"
     for (int32_t i = 0; i < 2; ++i) {
         parseToken(LEFT_CURLY_BRACE, status);
     }
     Pattern body = parseSimpleMessage(status);
     // Closing "}}"
     for (int32_t i = 0; i < 2; ++i) {
         parseToken(RIGHT_CURLY_BRACE, status);
     }
     return body;
 }

 /*
   Consume a markup placeholder: '{', optional whitespace, '#' or '/',
   an identifier, optional options and attributes, optional whitespace,
   an optional '/' (only when the markup started with '#'), and '}'.

   The result is marked standalone, close or open accordingly.
   No postcondition (a markup can end a message)

   Preconditions (asserted): `inBounds(1)` holds and the current character is '{'.
 */
 Markup Parser::parseMarkup(UErrorCode& status) {
     U_ASSERT(inBounds(1));

     U_ASSERT(peek() == LEFT_CURLY_BRACE);

     Markup::Builder builder(status);
     if (U_FAILURE(status)) {
         return {};
     }

     // Consume the '{'
     next();
     normalizedInput += LEFT_CURLY_BRACE;
     parseOptionalWhitespace();

     // '/' introduces closing markup; '#' introduces open or standalone
     // markup. Anything else is an error.
     const bool closing = (peek() == SLASH);
     if (!closing && peek() != NUMBER_SIGN) {
         ERROR(status);
         return {};
     }
     // Consume the '#' or '/'
     normalizedInput += peek();
     next();

     // Parse the markup identifier
     builder.setName(parseIdentifier(status));

     // Parse the options, which must begin with a ' '
     // if present
     if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
         OptionAdder<Markup::Builder> optionSink(builder);
         parseOptions(optionSink, status);
     }

     // Parse the attributes, which also must begin
     // with a ' '
     if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
         AttributeAdder<Markup::Builder> attrSink(builder);
         parseAttributes(attrSink, status);
     }

     parseOptionalWhitespace();

     // Only markup that didn't start with '/' can be standalone,
     // which is signalled by a '/' just before the closing brace
     const bool standalone = !closing && inBounds() && peek() == SLASH;
     if (standalone) {
         normalizedInput += SLASH;
         next();
     }

     parseToken(RIGHT_CURLY_BRACE, status);

     if (standalone) {
         builder.setStandalone();
     } else if (closing) {
         builder.setClose();
     } else {
         builder.setOpen();
     }

     return builder.build(status);
 }

 /*
   Consume a `placeholder`, matching the nonterminal in the grammar:
   either markup (parsed by parseMarkup()) or an expression
   (parsed by parseExpression()).

   If the input is out of bounds, ERROR() is invoked and the result of
   exprFallback() is returned.

   No postcondition (a placeholder can end a message)
 */
 std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
     U_ASSERT(peek() == LEFT_CURLY_BRACE);

     if (!inBounds()) {
         ERROR(status);
         return exprFallback(status);
     }

     // Need to look ahead arbitrarily since whitespace
     // can appear between the '{' and the '#' or '/'
     // in markup
     int32_t tempIndex = 1;
     bool isMarkup = false;
     // The bounds check must track the lookahead offset: checking a fixed
     // offset of 1 would let peek(tempIndex) look past the end of the input
     // when only whitespace follows the '{'
     while (inBounds(tempIndex)) {
         UChar32 c = peek(tempIndex);
         if (c == NUMBER_SIGN || c == SLASH) {
             isMarkup = true;
             break;
         }
         // Anything other than whitespace or a bidi control ends the lookahead
         if (!(isWhitespace(c) || isBidiControl(c))) {
             break;
         }
         tempIndex++;
     }

     if (isMarkup) {
         return parseMarkup(status);
     }
     return parseExpression(status);
 }

 /*
   Consume a `simple-message`, matching the nonterminal in the grammar.

   Parts are added to the pattern until one of the following holds:
   the input is exhausted, the current character is an unescaped '}'
   (the end of a quoted pattern), a syntax error has been recorded,
   or `status` is a failure.
   For a syntactically correct message that is not inside a quoted
   pattern, this will consume the entire input.
 */
 Pattern Parser::parseSimpleMessage(UErrorCode& status) {
     Pattern::Builder result(status);

     if (U_SUCCESS(status)) {
         while (inBounds()) {
             switch (peek()) {
             case LEFT_CURLY_BRACE: {
                 // Must be placeholder
                 std::variant<Expression, Markup> piece = parsePlaceholder(status);
                 // Move the parsed part out of the variant rather than
                 // copying it first; `piece` is not used afterwards
                 if (Expression* expr = std::get_if<Expression>(&piece)) {
                     result.add(std::move(*expr), status);
                 } else if (Markup* markup = std::get_if<Markup>(&piece)) {
                     result.add(std::move(*markup), status);
                 }
                 break;
             }
             case BACKSLASH: {
                 // Must be escaped-char
                 result.add(parseEscapeSequence(status), status);
                 break;
             }
             case RIGHT_CURLY_BRACE: {
                 // Distinguish unescaped '}' from end of quoted pattern
                 break;
             }
             default: {
                 // Must be text-char
                 result.add(parseTextChar(status), status);
                 break;
             }
             }
             if (peek() == RIGHT_CURLY_BRACE) {
                 // End of quoted pattern
                 break;
             }
             // Don't loop infinitely
             if (errors.hasSyntaxError() || U_FAILURE(status)) {
                 break;
             }
         }
     }
     return result.build(status);
 }

 // Parses one variant (a non-empty key list followed by a quoted pattern)
 // and adds it to `dataModel`.
 void Parser::parseVariant(UErrorCode& status) {
     CHECK_ERROR(status);

     // At least one key is required.
     // parseNonEmptyKeys() also consumes any trailing whitespace,
     // so the pattern can be consumed next.
     SelectorKeys keys = parseNonEmptyKeys(status);

     // Restore precondition before calling parseQuotedPattern()
     // (which asserts that the input is in bounds)
     CHECK_BOUNDS(status);
     Pattern pattern = parseQuotedPattern(status);

     dataModel.addVariant(std::move(keys), std::move(pattern), status);
 }

 /*
   Consume a `selectors` (matching the nonterminal in the grammar),
   followed by a non-empty sequence of `variant`s (matching the nonterminal
   in the grammar) preceded by whitespace.

   Each selector variable is added to `dataModel` via addSelector();
   each variant is parsed by parseVariant(). ERROR() is invoked if no
   selector was found.

   No postcondition (on return, `index` might equal `len()` with no syntax error
   because a message can end with a variant)

   Precondition (asserted): input is in bounds.
 */
 void Parser::parseSelectors(UErrorCode& status) {
     CHECK_ERROR(status);

     U_ASSERT(inBounds());

     // Consume the ID_MATCH keyword
     parseToken(ID_MATCH, status);

     // Becomes false once at least one selector has been parsed
     bool empty = true;
     // Parse selectors
     // "Backtracking" is required here. It's not clear if whitespace is
     // (`[s]` selector) or (`[s]` variant)
     while (isWhitespace(peek()) || isBidiControl(peek()) || peek() == DOLLAR) {
         // Remember where the whitespace starts in case it has to be pushed back
         int32_t whitespaceStart = index;
         parseRequiredWhitespace(status);
         // Restore precondition
         CHECK_BOUNDS(status);
         if (peek() != DOLLAR) {
             // This is not necessarily an error, but rather,
             // means the whitespace we parsed was the optional
             // whitespace preceding the first variant, not the
             // required whitespace preceding a subsequent variable.
             // In that case, "push back" the whitespace.
             normalizedInput.truncate(normalizedInput.length() - 1);
             index = whitespaceStart;
             break;
         }
         VariableName var = parseVariableName(status);
         empty = false;

         dataModel.addSelector(std::move(var), status);
         CHECK_ERROR(status);
     }

     // At least one selector is required
     if (empty) {
         ERROR(status);
         return;
     }

     // Helper macro for the variant loop below: leaves the loop (without
     // reporting an error) once the input is exhausted.
     // The blank line after the macro body terminates the trailing
     // line continuation and must be kept.
     #define CHECK_END_OF_INPUT                     \
         if (!inBounds()) {                         \
             break;                                 \
         }                                          \

     // Parse required whitespace before first variant
     parseRequiredWhitespace(status);

     // Parse variants

     while (isWhitespace(peek()) || isBidiControl(peek()) || isKeyStart(peek())) {
         // Trailing whitespace is allowed.
         parseOptionalWhitespace();
         if (!inBounds()) {
             return;
         }

         parseVariant(status);

         // Restore the precondition, *without* erroring out if we've
         // reached the end of input. That's because it's valid for the
         // message to end with a variant that has no trailing whitespace.
         // Why do we need to check this condition twice inside the loop?
         // Because if we don't check it here, the `isWhitespace()` call in
         // the loop head will read off the end of the input string.
         CHECK_END_OF_INPUT

         // Stop after the first error rather than trying to parse more variants
         if (errors.hasSyntaxError() || U_FAILURE(status)) {
             break;
         }
     }
 }

 /*
   Note on `parseBody()` (defined further below):
   it consumes a `body` (matching the nonterminal in the grammar).
   It has no postcondition: on return, `index` might equal `len()` with no
   syntax error, because a message can end with a body (trailing whitespace
   is optional).
 */

 /*
   Records a syntax error and installs a fallback pattern in the data model:
   a single text part made of all the input that has not been consumed yet,
   wrapped in one pair of curly braces. Consumes the rest of the input.
 */
 void Parser::errorPattern(UErrorCode& status) {
     errors.addSyntaxError(status);
     // Start from an empty pattern
     Pattern::Builder fallback(status);
     CHECK_ERROR(status);

     /*
       TODO: wrapping the remaining input in braces and using it as a single
       text part isn't documented in the spec, but it comes from
       https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236
       and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify
       whether this is the intent behind the spec
      */
     UnicodeString wrapped(LEFT_CURLY_BRACE);
     // Copy whatever input is left, advancing to the end as we go
     for (; inBounds(); next()) {
         wrapped += peek();
     }
     wrapped += RIGHT_CURLY_BRACE;

     fallback.add(std::move(wrapped), status);
     dataModel.setPattern(fallback.build(status));
 }

 /*
   Parses the body of a message and dispatches on its first character:
   `{` starts a quoted pattern, the first character of `ID_MATCH` starts
   selectors; anything else (or no input at all) yields the error pattern.
 */
 void Parser::parseBody(UErrorCode& status) {
     CHECK_ERROR(status);

     // Out-of-input is a syntax warning
     if (!inBounds()) {
         errorPattern(status);
         return;
     }

     // Body must be either a pattern or selectors
     const auto firstChar = peek();
     if (firstChar == LEFT_CURLY_BRACE) {
         // Pattern
         dataModel.setPattern(parseQuotedPattern(status));
     } else if (firstChar == ID_MATCH[0]) {
         // Selectors
         parseSelectors(status);
     } else {
         // Neither: report the error at the current position, then fall
         // back to the error pattern
         ERROR(status);
         errorPattern(status);
     }
 }

 // -------------------------------------
 // Parses the source pattern.

 /*
   Parses the entire input into `dataModel`.

   The message is first classified as simple or complex by looking at the
   first character after any leading whitespace / bidi controls; `index` is
   then reset to 0 and the matching parsing routine is run from the start.

   parseErrorResult: receives the relevant fields of the internal parse
                     error via translateParseError(). It is not written if
                     `status` is a failure on entry or after parsing.
   status:           ICU error code; nothing is done if it is already a
                     failure on entry.
 */
 void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
     CHECK_ERROR(status);

     bool complex = false;
     // First, "look ahead" to determine if this is a simple or complex
     // message. To do that, check the first non-whitespace character.
     // The bounds test must be the argument-less `inBounds()`, which checks
     // the current position. The one-argument form is used below as a
     // look-ahead offset (`inBounds(1)` guards `peek(1)`), so passing `index`
     // to it would end the scan early whenever the leading whitespace makes
     // up more than about half of the input.
     while (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
         next();
     }

     // Message can be empty, so we need to only look ahead
     // if we know it's non-empty
     if (inBounds()) {
         if (peek() == PERIOD
             || (inBounds(1)
                 && peek() == LEFT_CURLY_BRACE
                 && peek(1) == LEFT_CURLY_BRACE)) {
             complex = true;
         }
     }
     // Reset index: the look-ahead above must not consume any input
     index = 0;

     // Now parse for real, from the beginning of the input
     if (complex) {
         parseOptionalWhitespace();
         parseDeclarations(status);
         parseBody(status);
         parseOptionalWhitespace();
     } else {
         // Simple message
         // For normalization, quote the pattern
         normalizedInput += LEFT_CURLY_BRACE;
         normalizedInput += LEFT_CURLY_BRACE;
         dataModel.setPattern(parseSimpleMessage(status));
         normalizedInput += RIGHT_CURLY_BRACE;
         normalizedInput += RIGHT_CURLY_BRACE;
     }

     CHECK_ERROR(status);

     // `status` is not a failure; finally, check that the entire input was
     // consumed (leftover input is a syntax error)
     if (!allConsumed()) {
         ERROR(status);
     }

     // Finally, copy the relevant fields of the internal `MessageParseError`
     // into the `UParseError` argument
     translateParseError(parseError, parseErrorResult);
 }

 // Out-of-line destructor; nothing to release explicitly here.
 Parser::~Parser() = default;

 } // namespace message2
 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_MF2 */

 #endif /* #if !UCONFIG_NO_FORMATTING */

 #endif /* #if !UCONFIG_NO_NORMALIZATION */