| // © 2024 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_NORMALIZATION |
| |
| #if !UCONFIG_NO_FORMATTING |
| |
| #if !UCONFIG_NO_MF2 |
| |
| #include "unicode/uniset.h" |
| #include "messageformat2_errors.h" |
| #include "messageformat2_macros.h" |
| #include "messageformat2_parser.h" |
| #include "ucln_in.h" |
| #include "umutex.h" |
| #include "uvector.h" // U_ASSERT |
| |
| U_NAMESPACE_BEGIN |
| |
| namespace message2 { |
| |
| using namespace pluralimpl; |
| |
| using namespace data_model; |
| |
/*
    The `ERROR()` macro records a syntax error in `errors` and sets the offset
    in `parseError` from the current `index`. `ERROR_AT()` does the same, but
    uses the caller-supplied position `i` instead of `index`.

    Neither macro alters control flow, and neither does anything if `errors`
    already holds a syntax error (only the first syntax error is recorded).

    Both macros expand to a single statement (`do { ... } while (false)`), so
    they must be followed by a semicolon and can be used safely as the body
    of an unbraced `if`/`else`.
*/
#define ERROR(errorCode)                                                                                \
    do {                                                                                                \
        if (!errors.hasSyntaxError()) {                                                                 \
            setParseError(parseError, index);                                                           \
            errors.addSyntaxError(errorCode);                                                           \
        }                                                                                               \
    } while (false)

#define ERROR_AT(errorCode, i)                                                                          \
    do {                                                                                                \
        if (!errors.hasSyntaxError()) {                                                                 \
            setParseError(parseError, i);                                                               \
            errors.addSyntaxError(errorCode);                                                           \
        }                                                                                               \
    } while (false)
| |
| // Increments the line number and updates the "characters seen before |
| // current line" count in `parseError`, iff `peek()` is a newline |
| void Parser::maybeAdvanceLine() { |
| if (peek() == LF) { |
| parseError.line++; |
| // add 1 to index to get the number of characters seen so far |
| // (including the newline) |
| parseError.lengthBeforeCurrentLine = index + 1; |
| } |
| } |
| |
/*
    `CHECK_BOUNDS()` signals a syntax error (via `ERROR()`) and returns from
    the enclosing function if `index` is out of bounds, i.e. if `inBounds()`
    is false.

    `CHECK_BOUNDS_1()` does the same for the position one past `index`
    (`inBounds(1)`), reporting the error at `index + 1`.

    Both macros contain a bare `return;`, so they can only be used in
    functions returning `void`. They expand to a single statement
    (`do { ... } while (false)`) and must be followed by a semicolon.
*/
#define CHECK_BOUNDS(errorCode)                                                            \
    do {                                                                                   \
        if (!inBounds()) {                                                                 \
            ERROR(errorCode);                                                              \
            return;                                                                        \
        }                                                                                  \
    } while (false)
#define CHECK_BOUNDS_1(errorCode)                                                          \
    do {                                                                                   \
        if (!inBounds(1)) {                                                                \
            ERROR_AT(errorCode, index + 1);                                                \
            return;                                                                        \
        }                                                                                  \
    } while (false)
| |
| // ------------------------------------- |
| // Helper functions |
| |
| static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) { |
| for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) { |
| out[i] = in[i]; |
| if (in[i] == '\0') { |
| break; |
| } |
| } |
| } |
| |
| /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) { |
| parseError.line = messageParseError.line; |
| parseError.offset = messageParseError.offset; |
| copyContext(messageParseError.preContext, parseError.preContext); |
| copyContext(messageParseError.postContext, parseError.postContext); |
| } |
| |
| /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) { |
| // Translate absolute to relative offset |
| parseError.offset = index // Start with total number of characters seen |
| - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line |
| // TODO: Fill this in with actual pre and post-context |
| parseError.preContext[0] = 0; |
| parseError.postContext[0] = 0; |
| } |
| |
| // ------------------------------------- |
| // Initialization of UnicodeSets |
| |
| namespace unisets { |
| |
| UnicodeSet* gUnicodeSets[unisets::UNISETS_KEY_COUNT] = {}; |
| |
| inline UnicodeSet* getImpl(Key key) { |
| return gUnicodeSets[key]; |
| } |
| |
| icu::UInitOnce gMF2ParseUniSetsInitOnce {}; |
| } |
| |
| UnicodeSet* initContentChars(UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| |
| UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| result->add(0x000B, 0x000C); // Omit CR |
| result->add(0x000E, 0x001F); // Omit SP |
| result->add(0x0021, 0x002D); // Omit '.' |
| result->add(0x002F, 0x003F); // Omit '@' |
| result->add(0x0041, 0x005B); // Omit '\' |
| result->add(0x005D, 0x007A); // Omit { | } |
| result->add(0x007E, 0x2FFF); // Omit IDEOGRAPHIC_SPACE |
| result->add(0x3001, 0x10FFFF); // Allowing surrogates is intentional |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initWhitespace(UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| |
| UnicodeSet* result = new UnicodeSet(); |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| result->add(SPACE); |
| result->add(HTAB); |
| result->add(CR); |
| result->add(LF); |
| result->add(IDEOGRAPHIC_SPACE); |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initBidiControls(UErrorCode& status) { |
| UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status); |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| result->add(0x200E, 0x200F); |
| result->add(0x2066, 0x2069); |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initAlpha(UErrorCode& status) { |
| UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status); |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initDigits(UErrorCode& status) { |
| UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status); |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initNameStartChars(UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| |
| UnicodeSet* isAlpha = unisets::gUnicodeSets[unisets::ALPHA] = initAlpha(status); |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| UnicodeSet* result = new UnicodeSet(); |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| }; |
| |
| result->addAll(*isAlpha); |
| result->add(0x002B); |
| result->add(0x005F); |
| result->add(0x00A1, 0x061B); |
| result->add(0x061D, 0x167F); |
| result->add(0x1681, 0x1FFF); |
| result->add(0x200B, 0x200D); |
| result->add(0x2010, 0x2027); |
| result->add(0x2030, 0x205E); |
| result->add(0x2060, 0x2065); |
| result->add(0x206A, 0x2FFF); |
| result->add(0x3001, 0xD7FF); |
| result->add(0xE000, 0xFDCF); |
| result->add(0xFDF0, 0xFFFD); |
| result->add(0x10000, 0x1FFFD); |
| result->add(0x20000, 0x2FFFD); |
| result->add(0x30000, 0x3FFFD); |
| result->add(0x40000, 0x4FFFD); |
| result->add(0x50000, 0x5FFFD); |
| result->add(0x60000, 0x6FFFD); |
| result->add(0x70000, 0x7FFFD); |
| result->add(0x80000, 0x8FFFD); |
| result->add(0x90000, 0x9FFFD); |
| result->add(0xA0000, 0xAFFFD); |
| result->add(0xB0000, 0xBFFFD); |
| result->add(0xC0000, 0xCFFFD); |
| result->add(0xD0000, 0xDFFFD); |
| result->add(0xE0000, 0xEFFFD); |
| result->add(0xF0000, 0xFFFFD); |
| result->add(0x100000, 0x10FFFD); |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initNameChars(UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| |
| UnicodeSet* nameStart = unisets::gUnicodeSets[unisets::NAME_START] = initNameStartChars(status); |
| UnicodeSet* digit = unisets::gUnicodeSets[unisets::DIGIT] = initDigits(status); |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| UnicodeSet* result = new UnicodeSet(); |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| }; |
| result->addAll(*nameStart); |
| result->addAll(*digit); |
| result->add(HYPHEN); |
| result->add(PERIOD); |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initTextChars(UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| |
| UnicodeSet* content = unisets::gUnicodeSets[unisets::CONTENT] = initContentChars(status); |
| UnicodeSet* whitespace = unisets::gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status); |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| UnicodeSet* result = new UnicodeSet(); |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| }; |
| result->addAll(*content); |
| result->addAll(*whitespace); |
| result->add(PERIOD); |
| result->add(AT); |
| result->add(PIPE); |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initQuotedChars(UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| |
| unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status); |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| UnicodeSet* result = new UnicodeSet(); |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| }; |
| // content and whitespace were initialized by `initTextChars()` |
| UnicodeSet* content = unisets::getImpl(unisets::CONTENT); |
| if (content == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| result->addAll(*content); |
| UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE); |
| if (whitespace == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| result->addAll(*whitespace); |
| result->add(PERIOD); |
| result->add(AT); |
| result->add(LEFT_CURLY_BRACE); |
| result->add(RIGHT_CURLY_BRACE); |
| result->freeze(); |
| return result; |
| } |
| |
| UnicodeSet* initEscapableChars(UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| |
| UnicodeSet* result = new UnicodeSet(); |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| result->add(PIPE); |
| result->add(BACKSLASH); |
| result->add(LEFT_CURLY_BRACE); |
| result->add(RIGHT_CURLY_BRACE); |
| result->freeze(); |
| return result; |
| } |
| |
| namespace unisets { |
| |
| UBool U_CALLCONV cleanupMF2ParseUniSets() { |
| for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { |
| delete gUnicodeSets[i]; |
| gUnicodeSets[i] = nullptr; |
| } |
| gMF2ParseUniSetsInitOnce.reset(); |
| return true; |
| } |
| |
| void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) { |
| ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets); |
| /* |
| Each of the init functions initializes the UnicodeSets |
| that it depends on. |
| |
| initBidiControls (no dependencies) |
| |
| initEscapableChars (no dependencies) |
| |
| initNameChars depends on |
| initDigits |
| initNameStartChars depends on |
| initAlpha |
| |
| initQuotedChars depends on |
| initTextChars depends on |
| initContentChars |
| initWhitespace |
| */ |
| gUnicodeSets[unisets::BIDI] = initBidiControls(status); |
| gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status); |
| gUnicodeSets[unisets::QUOTED] = initQuotedChars(status); |
| gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status); |
| |
| if (U_FAILURE(status)) { |
| cleanupMF2ParseUniSets(); |
| } |
| } |
| |
| const UnicodeSet* get(Key key, UErrorCode& status) { |
| umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status); |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| UnicodeSet* result = getImpl(key); |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| return result; |
| } |
| |
| } |
| |
| // ------------------------------------- |
| // Predicates |
| |
| /* |
| The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar: |
| |
| `isContentChar()` : `content-char` |
| `isTextChar()` : `text-char` |
| `isAlpha()` : `ALPHA` |
| `isDigit()` : `DIGIT` |
| `isNameStart()` : `name-start` |
| `isNameChar()` : `name-char` |
| `isUnquotedStart()` : `unquoted-start` |
| `isQuotedChar()` : `quoted-char` |
| `isWhitespace()` : `s` |
| */ |
| |
| bool Parser::isContentChar(UChar32 c) const { |
| return contentChars->contains(c); |
| } |
| |
| // See `bidi` in the MF2 grammar |
| bool Parser::isBidiControl(UChar32 c) const { |
| return bidiControlChars->contains(c); |
| } |
| |
| // See `ws` in the MessageFormat 2 grammar |
| bool Parser::isWhitespace(UChar32 c) const { |
| return whitespaceChars->contains(c); |
| } |
| |
| bool Parser::isTextChar(UChar32 c) const { |
| return textChars->contains(c); |
| } |
| |
| bool Parser::isAlpha(UChar32 c) const { |
| return alphaChars->contains(c); |
| } |
| |
| bool Parser::isDigit(UChar32 c) const { |
| return digitChars->contains(c); |
| } |
| |
| bool Parser::isNameStart(UChar32 c) const { |
| return nameStartChars->contains(c); |
| } |
| |
| bool Parser::isNameChar(UChar32 c) const { |
| return nameChars->contains(c); |
| } |
| |
| bool Parser::isUnquotedStart(UChar32 c) const { |
| return isNameChar(c); |
| } |
| |
| bool Parser::isQuotedChar(UChar32 c) const { |
| return quotedChars->contains(c); |
| } |
| |
| bool Parser::isEscapableChar(UChar32 c) const { |
| return escapableChars->contains(c); |
| } |
| |
| // Returns true iff `c` can begin a `function` nonterminal |
| static bool isFunctionStart(UChar32 c) { |
| switch (c) { |
| case COLON: { |
| return true; |
| } |
| default: { |
| return false; |
| } |
| } |
| } |
| |
| // Returns true iff `c` can begin an `annotation` nonterminal |
| static bool isAnnotationStart(UChar32 c) { |
| return isFunctionStart(c); |
| } |
| |
| // Returns true iff `c` can begin a `literal` nonterminal |
| bool Parser::isLiteralStart(UChar32 c) const { |
| return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c)); |
| } |
| |
| // Returns true iff `c` can begin a `key` nonterminal |
| bool Parser::isKeyStart(UChar32 c) const { |
| return (c == ASTERISK || isLiteralStart(c)); |
| } |
| |
| bool Parser::isDeclarationStart() { |
| return (peek() == ID_LOCAL[0] |
| && inBounds(1) |
| && peek(1) == ID_LOCAL[1]) |
| || (peek() == ID_INPUT[0] |
| && inBounds(1) |
| && peek(1) == ID_INPUT[1]); |
| } |
| |
| // ------------------------------------- |
| // Parsing functions |
| |
| |
| /* |
| TODO: Since handling the whitespace ambiguities needs to be repeated |
| in several different places and is hard to factor out, |
| it probably would be better to replace the parser with a lexer + parser |
| to separate tokenizing from parsing, which would simplify the code significantly. |
| This has the disadvantage that there is no token grammar for MessageFormat, |
| so one would have to be invented that isn't a component of the spec. |
| */ |
| |
| /* |
| This is a recursive-descent scannerless parser that, |
| with a few exceptions, uses 1 character of lookahead. |
| |
| This may not be an exhaustive list, as the additions of attributes and reserved |
| statements introduced several new ambiguities. |
| |
| All but three of the exceptions involve ambiguities about the meaning of whitespace. |
| One ambiguity not involving whitespace is: |
| identifier -> namespace ":" name |
| vs. |
| identifier -> name |
| |
| `namespace` and `name` can't be distinguished without arbitrary lookahead. |
| (For how this is handled, see parseIdentifier()) |
| |
| The second ambiguity not involving whitespace is: |
| complex-message -> *(declaration[s]) complex-body |
| -> declaration *(declaration[s]) complex-body |
| -> declaration complex-body |
| -> reserved-statement complex-body |
| -> .foo {$x} .match // ... |
| When processing the '.', arbitrary lookahead is required to distinguish the |
| arbitrary-length unsupported keyword from `.match`. |
| (For how this is handled, see parseDeclarations()). |
| |
| The third ambiguity not involving whitespace is: |
| complex-message -> *(declaration [s]) complex-body |
| -> reserved-statement *(declaration [s]) complex-body |
| -> reserved-statement complex-body |
| -> reserved-statement quotedPattern |
| -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern |
| -> reserved-keyword expression quoted-pattern |
| Example: .foo {1} {{1}} |
| |
| Without lookahead, the opening '{' of the quoted pattern can't be distinguished |
| from the opening '{' of another expression in the unsupported statement. |
| (Though this only requires 1 character of lookahead.) |
| |
| Otherwise: |
| |
| There are at least seven ambiguities in the grammar that can't be resolved with finite |
| lookahead (since whitespace sequences can be arbitrarily long). They are resolved |
| with a form of backtracking (early exit). No state needs to be saved/restored |
| since whitespace doesn't affect the shape of the resulting parse tree, so it's |
| not true backtracking. |
| |
| In addition, the grammar has been refactored |
| in a semantics-preserving way in some cases to make the code easier to structure. |
| |
| First: variant = when 1*(s key) [s] pattern |
| Example: when k {a} |
| When reading the first space after 'k', it's ambiguous whether it's the |
| required space before another key, or the optional space before `pattern`. |
| (See comments in parseNonEmptyKeys()) |
| |
| Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
| annotation = (function *(s option)) / reserved |
| Example: {:f } |
| When reading the first space after 'f', it's ambiguous whether it's the |
| required space before an option, or the optional trailing space after an options list |
| (in this case, the options list is empty). |
| (See comments in parseOptions() -- handling this case also meant it was easier to base |
| the code on a slightly refactored grammar, which should be semantically equivalent.) |
| |
| Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
| annotation = (function *(s option)) / reserved |
| Example: {@a } |
| Similar to the previous case; see comments in parseReserved() |
| |
| Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
| Example: {|foo| } |
| When reading the first space after the '|', it's ambiguous whether it's the required |
| space before an annotation, or the optional trailing space before the '}'. |
| (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on |
| the same grammar refactoring as the second exception.) |
| |
| Most functions match a non-terminal in the grammar, except as explained |
| in comments. |
| |
| Fifth: matcher = match-statement 1*([s] variant) |
| -> match 1 *([s] selector) 1*([s] variant) |
| Example: match {42} * {{_}} |
| When reading the space after the first '}', it's unclear whether |
| it's the optional space before another selector, or the optional space |
| before a variant. |
| |
| Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}" |
| -> "{" [s] function *(s attribute) [s] "}" |
| -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}" |
| -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}" |
| |
| Example: {:func @foo} |
| (Note: the same ambiguity is present with variable-expression and literal-expression) |
| |
| Seventh: |
| |
| |
| When parsing the space, it's unclear whether it's the optional space before an |
| option, or the optional space before an attribute. |
| |
| Unless otherwise noted in a comment, all helper functions that take |
| a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode` |
| have the precondition: |
| `index` < `len()` |
| and the postcondition: |
| `U_FAILURE(errorCode)` || `index` < `len()` |
| */ |
| |
| /* |
| No pre, no post. |
| A message may end with whitespace, so `index` may equal `len()` on exit. |
| */ |
| void Parser::parseRequiredWS(UErrorCode& errorCode) { |
| bool sawWhitespace = false; |
| |
| // The loop exits either when we consume all the input, |
| // or when we see a non-whitespace character. |
| while (true) { |
| // Check if all input has been consumed |
| if (!inBounds()) { |
| // If whitespace isn't required -- or if we saw it already -- |
| // then the caller is responsible for checking this case and |
| // setting an error if necessary. |
| if (sawWhitespace) { |
| // Not an error. |
| return; |
| } |
| // Otherwise, whitespace is required; the end of the input has |
| // been reached without whitespace. This is an error. |
| ERROR(errorCode); |
| return; |
| } |
| |
| // Input remains; process the next character if it's whitespace, |
| // exit the loop otherwise |
| if (isWhitespace(peek())) { |
| sawWhitespace = true; |
| // Increment line number in parse error if we consume a newline |
| maybeAdvanceLine(); |
| next(); |
| } else { |
| break; |
| } |
| } |
| |
| if (!sawWhitespace) { |
| ERROR(errorCode); |
| } |
| } |
| |
| void Parser::parseOptionalBidi() { |
| while (true) { |
| if (!inBounds()) { |
| return; |
| } |
| if (isBidiControl(peek())) { |
| next(); |
| } else { |
| break; |
| } |
| } |
| } |
| |
| /* |
| No pre, no post, because a message may end with whitespace |
| Matches `s` in the MF2 grammar |
| */ |
| void Parser::parseRequiredWhitespace(UErrorCode& errorCode) { |
| parseOptionalBidi(); |
| parseRequiredWS(errorCode); |
| parseOptionalWhitespace(); |
| normalizedInput += SPACE; |
| } |
| |
| /* |
| No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`. |
| */ |
| void Parser::parseOptionalWhitespace() { |
| while (true) { |
| if (!inBounds()) { |
| return; |
| } |
| auto cp = peek(); |
| if (isWhitespace(cp) || isBidiControl(cp)) { |
| maybeAdvanceLine(); |
| next(); |
| } else { |
| break; |
| } |
| } |
| } |
| |
| // Consumes a single character, signaling an error if `peek()` != `c` |
| // No postcondition -- a message can end with a '}' token |
| void Parser::parseToken(UChar32 c, UErrorCode& errorCode) { |
| CHECK_BOUNDS(errorCode); |
| |
| if (peek() == c) { |
| next(); |
| normalizedInput += c; |
| return; |
| } |
| // Next character didn't match -- error out |
| ERROR(errorCode); |
| } |
| |
| /* |
| Consumes a fixed-length token, signaling an error if the token isn't a prefix of |
| the string beginning at `peek()` |
| No postcondition -- a message can end with a '}' token |
| */ |
| void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) { |
| U_ASSERT(inBounds()); |
| |
| int32_t tokenPos = 0; |
| while (tokenPos < static_cast<int32_t>(token.length())) { |
| if (peek() != token[tokenPos]) { |
| ERROR(errorCode); |
| return; |
| } |
| normalizedInput += token[tokenPos]; |
| next(); |
| tokenPos++; |
| } |
| } |
| |
| /* |
| Consumes optional whitespace, possibly advancing `index` to `index'`, |
| then consumes a fixed-length token (signaling an error if the token isn't a prefix of |
| the string beginning at `source[index']`), |
| then consumes optional whitespace again |
| */ |
| void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) { |
| // No need for error check or bounds check before parseOptionalWhitespace |
| parseOptionalWhitespace(); |
| // Establish precondition |
| CHECK_BOUNDS(errorCode); |
| parseToken(token, errorCode); |
| parseOptionalWhitespace(); |
| // Guarantee postcondition |
| CHECK_BOUNDS(errorCode); |
| } |
| |
| /* |
| Consumes optional whitespace, possibly advancing `index` to `index'`, |
| then consumes a single character (signaling an error if it doesn't match |
| `source[index']`), |
| then consumes optional whitespace again |
| */ |
| void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) { |
| // No need for error check or bounds check before parseOptionalWhitespace() |
| parseOptionalWhitespace(); |
| // Establish precondition |
| CHECK_BOUNDS(errorCode); |
| parseToken(c, errorCode); |
| parseOptionalWhitespace(); |
| // Guarantee postcondition |
| CHECK_BOUNDS(errorCode); |
| } |
| |
| /* |
| Consumes a possibly-empty sequence of name-chars. Appends to `str` |
| and returns `str`. |
| */ |
| UnicodeString Parser::parseNameChars(UnicodeString& str, UErrorCode& errorCode) { |
| if (U_FAILURE(errorCode)) { |
| return {}; |
| } |
| |
| while (isNameChar(peek())) { |
| UChar32 c = peek(); |
| str += c; |
| normalizedInput += c; |
| next(); |
| if (!inBounds()) { |
| ERROR(errorCode); |
| break; |
| } |
| } |
| |
| return str; |
| } |
| |
| /* |
| Consumes a non-empty sequence of `name-char`s, the first of which is |
| also a `name-start`. |
| that begins with a character `start` such that `isNameStart(start)`. |
| |
| Returns this sequence. |
| |
| (Matches the `name` nonterminal in the grammar.) |
| */ |
| UnicodeString Parser::parseName(UErrorCode& errorCode) { |
| UnicodeString name; |
| |
| U_ASSERT(inBounds()); |
| |
| if (!(isNameStart(peek()) || isBidiControl(peek()))) { |
| ERROR(errorCode); |
| return name; |
| } |
| |
| // name = [bidi] name-start *name-char [bidi] |
| |
| // [bidi] |
| parseOptionalBidi(); |
| |
| // name-start *name-char |
| parseNameChars(name, errorCode); |
| |
| // [bidi] |
| parseOptionalBidi(); |
| |
| return name; |
| } |
| |
| /* |
| Consumes a '$' followed by a `name`, returning a VariableName |
| with `name` as its name |
| |
| (Matches the `variable` nonterminal in the grammar.) |
| */ |
| VariableName Parser::parseVariableName(UErrorCode& errorCode) { |
| VariableName result; |
| |
| U_ASSERT(inBounds()); |
| |
| parseToken(DOLLAR, errorCode); |
| if (!inBounds()) { |
| ERROR(errorCode); |
| return result; |
| } |
| return VariableName(parseName(errorCode)); |
| } |
| |
| /* |
| Corresponds to the `identifier` nonterminal in the grammar |
| */ |
| UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { |
| U_ASSERT(inBounds()); |
| |
| UnicodeString result; |
| // The following is a hack to get around ambiguity in the grammar: |
| // identifier -> namespace ":" name |
| // vs. |
| // identifier -> name |
| // can't be distinguished without arbitrary lookahead. |
| // Instead, we treat the production as: |
| // identifier -> namespace *(":"name) |
| // and then check for multiple colons. |
| |
| // Parse namespace |
| result += parseName(errorCode); |
| int32_t firstColon = -1; |
| while (inBounds() && peek() == COLON) { |
| // Parse ':' separator |
| if (firstColon == -1) { |
| firstColon = index; |
| } |
| parseToken(COLON, errorCode); |
| result += COLON; |
| // Check for message ending with something like "foo:" |
| if (!inBounds()) { |
| ERROR(errorCode); |
| } else { |
| // Parse name part |
| result += parseName(errorCode); |
| } |
| } |
| |
| // If there's at least one ':', scan from the first ':' |
| // to the end of the name to check for multiple ':'s |
| if (firstColon != -1) { |
| for (int32_t i = firstColon + 1; i < result.length(); i++) { |
| if (result[i] == COLON) { |
| ERROR_AT(errorCode, i); |
| return {}; |
| } |
| } |
| } |
| |
| return result; |
| } |
| |
| /* |
| Consumes a reference to a function, matching the ": identifier" |
| in the `function` nonterminal in the grammar. |
| |
| Returns the function name. |
| */ |
| FunctionName Parser::parseFunction(UErrorCode& errorCode) { |
| U_ASSERT(inBounds()); |
| if (!isFunctionStart(peek())) { |
| ERROR(errorCode); |
| return FunctionName(); |
| } |
| |
| normalizedInput += peek(); |
| next(); // Consume the function start character |
| if (!inBounds()) { |
| ERROR(errorCode); |
| return FunctionName(); |
| } |
| return parseIdentifier(errorCode); |
| } |
| |
| |
| /* |
| Precondition: peek() == BACKSLASH |
| |
| Consume an escaped character. |
| Corresponds to `escaped-char` in the grammar. |
| |
| No postcondition (a message can end with an escaped char) |
| */ |
| UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) { |
| U_ASSERT(inBounds()); |
| U_ASSERT(peek() == BACKSLASH); |
| normalizedInput += BACKSLASH; |
| next(); // Skip the initial backslash |
| UnicodeString str; |
| if (inBounds()) { |
| // Expect a '{', '|' or '}' |
| switch (peek()) { |
| case LEFT_CURLY_BRACE: |
| case RIGHT_CURLY_BRACE: |
| case PIPE: |
| case BACKSLASH: { |
| /* Append to the output string */ |
| str += peek(); |
| /* Update normalizedInput */ |
| normalizedInput += peek(); |
| /* Consume the character */ |
| next(); |
| return str; |
| } |
| default: { |
| // No other characters are allowed here |
| break; |
| } |
| } |
| } |
| // If control reaches here, there was an error |
| ERROR(errorCode); |
| return str; |
| } |
| |
| |
| /* |
| Consume and return a quoted literal, matching the `literal` nonterminal in the grammar. |
| */ |
| Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) { |
| bool error = false; |
| |
| UnicodeString contents; |
| if (U_SUCCESS(errorCode)) { |
| // Parse the opening '|' |
| parseToken(PIPE, errorCode); |
| if (!inBounds()) { |
| ERROR(errorCode); |
| error = true; |
| } else { |
| // Parse the contents |
| bool done = false; |
| while (!done) { |
| if (peek() == BACKSLASH) { |
| contents += parseEscapeSequence(errorCode); |
| } else if (isQuotedChar(peek())) { |
| contents += peek(); |
| // Handle cases like: |
| // |}{| -- we want to escape everywhere that |
| // can be escaped, to make round-trip checking |
| // easier -- so this case normalizes to |
| // |\}\{| |
| if (isEscapableChar(peek())) { |
| normalizedInput += BACKSLASH; |
| } |
| normalizedInput += peek(); |
| next(); // Consume this character |
| maybeAdvanceLine(); |
| } else { |
| // Assume the sequence of literal characters ends here |
| done = true; |
| } |
| if (!inBounds()) { |
| ERROR(errorCode); |
| error = true; |
| break; |
| } |
| } |
| } |
| } |
| |
| if (error) { |
| return {}; |
| } |
| |
| // Parse the closing '|' |
| parseToken(PIPE, errorCode); |
| |
| return Literal(true, contents); |
| } |
| |
| // Parse (1*DIGIT) |
| UnicodeString Parser::parseDigits(UErrorCode& errorCode) { |
| if (U_FAILURE(errorCode)) { |
| return {}; |
| } |
| |
| U_ASSERT(isDigit(peek())); |
| |
| UnicodeString contents; |
| do { |
| contents += peek(); |
| normalizedInput += peek(); |
| next(); |
| if (!inBounds()) { |
| ERROR(errorCode); |
| return {}; |
| } |
| } while (isDigit(peek())); |
| |
| return contents; |
| } |
| /* |
| Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar. |
| */ |
| Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { |
| if (U_FAILURE(errorCode)) { |
| return {}; |
| } |
| // unquoted-literal = 1*name-char |
| |
| if (!(isNameChar(peek()))) { |
| ERROR(errorCode); |
| return {}; |
| } |
| |
| UnicodeString contents; |
| parseNameChars(contents, errorCode); |
| return Literal(false, contents); |
| } |
| |
| /* |
| Consume and return a literal, matching the `literal` nonterminal in the grammar. |
| */ |
| Literal Parser::parseLiteral(UErrorCode& errorCode) { |
| Literal result; |
| if (!inBounds()) { |
| ERROR(errorCode); |
| } else { |
| if (peek() == PIPE) { |
| result = parseQuotedLiteral(errorCode); |
| } else { |
| result = parseUnquotedLiteral(errorCode); |
| } |
| // Guarantee postcondition |
| if (!inBounds()) { |
| ERROR(errorCode); |
| } |
| } |
| |
| return result; |
| } |
| |
| /* |
| Consume a @name-value pair, matching the `attribute` nonterminal in the grammar. |
| |
| Adds the option to `options` |
| */ |
| template<class T> |
| void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { |
| U_ASSERT(inBounds()); |
| |
| U_ASSERT(peek() == AT); |
| // Consume the '@' |
| parseToken(AT, errorCode); |
| |
| // Parse LHS |
| UnicodeString lhs = parseIdentifier(errorCode); |
| |
| // Prepare to "backtrack" to resolve ambiguity |
| // about whether whitespace precedes another |
| // attribute, or the '=' sign |
| int32_t savedIndex = index; |
| parseOptionalWhitespace(); |
| |
| Operand rand; |
| if (peek() == EQUALS) { |
| // Parse '=' |
| parseTokenWithWhitespace(EQUALS, errorCode); |
| |
| UnicodeString rhsStr; |
| // Parse RHS, which must be a literal |
| // attribute = "@" identifier [o "=" o literal] |
| rand = Operand(parseLiteral(errorCode)); |
| } else { |
| // attribute -> "@" identifier [[s] "=" [s]] |
| // Use null operand, which `rand` is already set to |
| // "Backtrack" by restoring the whitespace (if there was any) |
| index = savedIndex; |
| } |
| |
| attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode); |
| } |
| |
| /* |
| Consume a name-value pair, matching the `option` nonterminal in the grammar. |
| |
| Adds the option to `optionList` |
| */ |
| template<class T> |
| void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) { |
| U_ASSERT(inBounds()); |
| |
| // Parse LHS |
| UnicodeString lhs = parseIdentifier(errorCode); |
| |
| // Parse '=' |
| parseTokenWithWhitespace(EQUALS, errorCode); |
| |
| UnicodeString rhsStr; |
| Operand rand; |
| // Parse RHS, which is either a literal or variable |
| switch (peek()) { |
| case DOLLAR: { |
| rand = Operand(parseVariableName(errorCode)); |
| break; |
| } |
| default: { |
| // Must be a literal |
| rand = Operand(parseLiteral(errorCode)); |
| break; |
| } |
| } |
| U_ASSERT(!rand.isNull()); |
| |
| // Finally, add the key=value mapping |
| // Use a local error code, check for duplicate option error and |
| // record it as with other errors |
| UErrorCode status = U_ZERO_ERROR; |
| addOption.addOption(lhs, std::move(rand), status); |
| if (U_FAILURE(status)) { |
| U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR); |
| errors.setDuplicateOptionName(errorCode); |
| } |
| } |
| |
| /* |
| Note: there are multiple overloads of parseOptions() for parsing |
| options within markup, vs. within an expression, vs. parsing |
| attributes. This should be refactored. TODO |
| */ |
| |
| /* |
| Consume optional whitespace followed by a sequence of options |
| (possibly empty), separated by whitespace |
| */ |
| template <class T> |
| void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) { |
| // Early exit if out of bounds -- no more work is possible |
| CHECK_BOUNDS(errorCode); |
| |
| /* |
| Arbitrary lookahead is required to parse option lists. To see why, consider |
| these rules from the grammar: |
| |
| expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
| annotation = (function *(s option)) / reserved |
| |
| And this example: |
| {:foo } |
| |
| Derivation: |
| expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
| -> "{" [s] annotation [s] "}" |
| -> "{" [s] ((function *(s option)) / reserved) [s] "}" |
| -> "{" [s] function *(s option) [s] "}" |
| |
| In this example, knowing whether to expect a '}' or the start of another option |
| after the whitespace would require arbitrary lookahead -- in other words, which |
| rule should we apply? |
| *(s option) -> s option *(s option) |
| or |
| *(s option) -> |
| |
| The same would apply to the example {:foo k=v } (note the trailing space after "v"). |
| |
| This is addressed using a form of backtracking and (to make the backtracking easier |
| to apply) a slight refactoring to the grammar. |
| |
| This code is written as if the grammar is: |
| expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}" |
| annotation = (function *(s option) [s]) / (reserved [s]) |
| |
| Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning |
| that `parseExpression()` can safely require a '}' after `parseOptions()` finishes. |
| |
| Note that when "backtracking" really just means early exit, since only whitespace |
| is involved and there's no state to save. |
| |
| There is a separate but similar ambiguity as to whether the space precedes |
| an option or an attribute. |
| */ |
| |
| while(true) { |
| // If the next character is not whitespace, that means we've already |
| // parsed the entire options list (which may have been empty) and there's |
| // no trailing whitespace. In that case, exit. |
| if (!isWhitespace(peek())) { |
| break; |
| } |
| int32_t firstWhitespace = index; |
| |
| // In any case other than an empty options list, there must be at least |
| // one whitespace character. |
| parseRequiredWhitespace(errorCode); |
| // Restore precondition |
| CHECK_BOUNDS(errorCode); |
| |
| // If a name character follows, then at least one more option remains |
| // in the list. |
| // Otherwise, we've consumed all the options and any trailing whitespace, |
| // and can exit. |
| // Note that exiting is sort of like backtracking: "(s option)" doesn't apply, |
| // so we back out to [s]. |
| if (!isNameStart(peek())) { |
| // We've consumed all the options (meaning that either we consumed non-empty |
| // whitespace, or consumed at least one option.) |
| // Done. |
| // Remove the required whitespace from normalizedInput |
| normalizedInput.truncate(normalizedInput.length() - 1); |
| // "Backtrack" so as to leave the optional whitespace there |
| // when parsing attributes |
| index = firstWhitespace; |
| break; |
| } |
| parseOption(addOption, errorCode); |
| } |
| } |
| |
| /* |
| Consume optional whitespace followed by a sequence of attributes |
| (possibly empty), separated by whitespace |
| */ |
| template<class T> |
| void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { |
| |
| // Early exit if out of bounds -- no more work is possible |
| if (!inBounds()) { |
| ERROR(errorCode); |
| return; |
| } |
| |
| /* |
| Arbitrary lookahead is required to parse attribute lists, similarly to option lists. |
| (See comment in parseOptions()). |
| */ |
| |
| while(true) { |
| // If the next character is not whitespace, that means we've already |
| // parsed the entire attributes list (which may have been empty) and there's |
| // no trailing whitespace. In that case, exit. |
| if (!isWhitespace(peek())) { |
| break; |
| } |
| |
| // In any case other than an empty attributes list, there must be at least |
| // one whitespace character. |
| parseRequiredWhitespace(errorCode); |
| // Restore precondition |
| if (!inBounds()) { |
| ERROR(errorCode); |
| break; |
| } |
| |
| // If an '@' follows, then at least one more attribute remains |
| // in the list. |
| // Otherwise, we've consumed all the attributes and any trailing whitespace, |
| // and can exit. |
| // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply, |
| // so we back out to [s]. |
| if (peek() != AT) { |
| // We've consumed all the attributes (meaning that either we consumed non-empty |
| // whitespace, or consumed at least one attribute.) |
| // Done. |
| // Remove the whitespace from normalizedInput |
| normalizedInput.truncate(normalizedInput.length() - 1); |
| break; |
| } |
| parseAttribute(attrAdder, errorCode); |
| } |
| } |
| |
| /* |
| Consume a function call, matching the `annotation` |
| nonterminal in the grammar |
| |
| Returns an `Operator` representing this (a reserved is a parse error) |
| */ |
| Operator Parser::parseAnnotation(UErrorCode& status) { |
| U_ASSERT(inBounds()); |
| Operator::Builder ratorBuilder(status); |
| if (U_FAILURE(status)) { |
| return {}; |
| } |
| if (isFunctionStart(peek())) { |
| // Consume the function name |
| FunctionName func = parseFunction(status); |
| ratorBuilder.setFunctionName(std::move(func)); |
| |
| OptionAdder<Operator::Builder> addOptions(ratorBuilder); |
| // Consume the options (which may be empty) |
| parseOptions(addOptions, status); |
| } else { |
| ERROR(status); |
| } |
| return ratorBuilder.build(status); |
| } |
| |
| /* |
| Consume a literal or variable (depending on `isVariable`), |
| followed by either required whitespace followed by an annotation, |
| or optional whitespace. |
| */ |
| void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable, |
| Expression::Builder& builder, |
| UErrorCode& status) { |
| CHECK_ERROR(status); |
| |
| U_ASSERT(inBounds()); |
| |
| Operand rand; |
| if (isVariable) { |
| rand = Operand(parseVariableName(status)); |
| } else { |
| rand = Operand(parseLiteral(status)); |
| } |
| |
| builder.setOperand(std::move(rand)); |
| |
| /* |
| Parsing a literal or variable with an optional annotation requires arbitrary lookahead. |
| To see why, consider this rule from the grammar: |
| |
| expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
| |
| And this example: |
| |
| {|foo| } |
| |
| Derivation: |
| expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
| -> "{" [s] ((literal / variable) [s annotation]) [s] "}" |
| -> "{" [s] (literal [s annotation]) [s] "}" |
| |
| When reading the ' ' after the second '|', it's ambiguous whether that's the required |
| space before an annotation, or the optional space before the '}'. |
| |
| To make this ambiguity easier to handle, this code is based on the same grammar |
| refactoring for the `expression` nonterminal that `parseOptions()` relies on. See |
| the comment in `parseOptions()` for details. |
| */ |
| |
| if (isWhitespace(peek())) { |
| int32_t firstWhitespace = index; |
| |
| // If the next character is whitespace, either [s annotation] or [s] applies |
| // (the character is either the required space before an annotation, or optional |
| // trailing space after the literal or variable). It's still ambiguous which |
| // one does apply. |
| parseOptionalWhitespace(); |
| // Restore precondition |
| CHECK_BOUNDS(status); |
| |
| // This next check resolves the ambiguity between [s annotation] and [s] |
| bool isSAnnotation = isAnnotationStart(peek()); |
| |
| if (isSAnnotation) { |
| normalizedInput += SPACE; |
| } |
| |
| if (isSAnnotation) { |
| // The previously consumed whitespace precedes an annotation |
| builder.setOperator(parseAnnotation(status)); |
| } else { |
| // Either there's a right curly brace (will be consumed by the caller), |
| // or there's an error and the trailing whitespace should be |
| // handled by the caller. However, this is not an error |
| // here because we're just parsing `literal [s annotation]`. |
| index = firstWhitespace; |
| } |
| } else { |
| // Either there was never whitespace, or |
| // the previously consumed whitespace is the optional trailing whitespace; |
| // either the next character is '}' or the error will be handled by parseExpression. |
| // Do nothing, since the operand was already set |
| } |
| |
| // At the end of this code, the next character should either be '}', |
| // whitespace followed by a '}', |
| // or end-of-input |
| } |
| |
/*
  Consume an expression, matching the `expression` nonterminal in the grammar
  (implemented by `Parser::parseExpression()`, which follows the two
  `exprFallback()` helpers below)
*/
| |
| static void exprFallback(Expression::Builder& exprBuilder) { |
| // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER |
| // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution |
| exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT)))); |
| } |
| |
| static Expression exprFallback(UErrorCode& status) { |
| Expression result; |
| if (U_SUCCESS(status)) { |
| Expression::Builder exprBuilder(status); |
| if (U_SUCCESS(status)) { |
| // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER |
| // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution |
| exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT)))); |
| UErrorCode status = U_ZERO_ERROR; |
| result = exprBuilder.build(status); |
| // An operand was set, so there can't be an error |
| U_ASSERT(U_SUCCESS(status)); |
| } |
| } |
| return result; |
| } |
| |
| Expression Parser::parseExpression(UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return {}; |
| } |
| |
| // Early return if out of input -- no more work is possible |
| U_ASSERT(inBounds()); |
| |
| // Parse opening brace |
| parseToken(LEFT_CURLY_BRACE, status); |
| // Optional whitespace after opening brace |
| parseOptionalWhitespace(); |
| |
| Expression::Builder exprBuilder(status); |
| // Restore precondition |
| if (!inBounds()) { |
| exprFallback(exprBuilder); |
| } else { |
| // literal '|', variable '$' or annotation |
| switch (peek()) { |
| case PIPE: { |
| // Quoted literal |
| parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); |
| break; |
| } |
| case DOLLAR: { |
| // Variable |
| parseLiteralOrVariableWithAnnotation(true, exprBuilder, status); |
| break; |
| } |
| default: { |
| if (isAnnotationStart(peek())) { |
| Operator rator = parseAnnotation(status); |
| exprBuilder.setOperator(std::move(rator)); |
| } else if (isUnquotedStart(peek())) { |
| // Unquoted literal |
| parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); |
| } else { |
| // Not a literal, variable or annotation -- error out |
| ERROR(status); |
| exprFallback(exprBuilder); |
| break; |
| } |
| break; |
| } |
| } |
| } |
| |
| // Parse attributes |
| AttributeAdder<Expression::Builder> attrAdder(exprBuilder); |
| parseAttributes(attrAdder, status); |
| |
| // Parse optional space |
| // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}") |
| parseOptionalWhitespace(); |
| |
| // Either an operand or operator (or both) must have been set already, |
| // so there can't be an error |
| UErrorCode localStatus = U_ZERO_ERROR; |
| Expression result = exprBuilder.build(localStatus); |
| U_ASSERT(U_SUCCESS(localStatus)); |
| |
| // Check for end-of-input and missing '}' |
| if (!inBounds()) { |
| ERROR(status); |
| } else { |
| // Otherwise, it's safe to check for the '}' |
| parseToken(RIGHT_CURLY_BRACE, status); |
| } |
| return result; |
| } |
| |
| /* |
| Parse a .local declaration, matching the `local-declaration` |
| production in the grammar |
| */ |
| void Parser::parseLocalDeclaration(UErrorCode& status) { |
| // End-of-input here would be an error; even empty |
| // declarations must be followed by a body |
| CHECK_BOUNDS(status); |
| |
| parseToken(ID_LOCAL, status); |
| parseRequiredWhitespace(status); |
| |
| // Restore precondition |
| CHECK_BOUNDS(status); |
| VariableName lhs = parseVariableName(status); |
| parseTokenWithWhitespace(EQUALS, status); |
| // Restore precondition before calling parseExpression() |
| CHECK_BOUNDS(status); |
| |
| Expression rhs = parseExpression(status); |
| |
| // Add binding from lhs to rhs, unless there was an error |
| // (This ensures that if there was a correct lhs but a |
| // parse error in rhs, the fallback for uses of the |
| // lhs will be its own name rather than the rhs) |
| /* This affects the behavior of this test case, which the spec |
| is ambiguous about: |
| |
| .local $bar {|foo|} {{{$bar}}} |
| |
| Should `$bar` still be bound to a value although |
| its declaration is syntactically incorrect (missing the '=')? |
| This code says no, but it needs to change if |
| https://github.com/unicode-org/message-format-wg/issues/703 |
| is resolved differently. |
| */ |
| CHECK_ERROR(status); |
| if (!errors.hasSyntaxError()) { |
| dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status); |
| // Check if status is U_DUPLICATE_DECLARATION_ERROR |
| // and add that as an internal error if so |
| if (status == U_MF_DUPLICATE_DECLARATION_ERROR) { |
| status = U_ZERO_ERROR; |
| errors.addError(StaticErrorType::DuplicateDeclarationError, status); |
| } |
| } |
| } |
| |
| /* |
| Parse an .input declaration, matching the `local-declaration` |
| production in the grammar |
| */ |
| void Parser::parseInputDeclaration(UErrorCode& status) { |
| // End-of-input here would be an error; even empty |
| // declarations must be followed by a body |
| CHECK_BOUNDS(status); |
| |
| parseToken(ID_INPUT, status); |
| parseOptionalWhitespace(); |
| |
| // Restore precondition before calling parseExpression() |
| CHECK_BOUNDS(status); |
| |
| // Save the index for error diagnostics |
| int32_t exprIndex = index; |
| Expression rhs = parseExpression(status); |
| |
| // Here we have to check that the rhs is a variable-expression |
| if (!rhs.getOperand().isVariable()) { |
| // This case is a syntax error; report it at the beginning |
| // of the expression |
| ERROR_AT(status, exprIndex); |
| return; |
| } |
| |
| VariableName lhs = rhs.getOperand().asVariable(); |
| |
| // Add binding from lhs to rhs |
| // This just adds a new local variable that shadows the message |
| // argument referred to, which is harmless. |
| // When evaluating the RHS, the new local is not in scope |
| // and the message argument will be correctly referred to. |
| CHECK_ERROR(status); |
| if (!errors.hasSyntaxError()) { |
| dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status); |
| // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR |
| // and add that as an internal error if so |
| if (status == U_MF_DUPLICATE_DECLARATION_ERROR) { |
| status = U_ZERO_ERROR; |
| errors.addError(StaticErrorType::DuplicateDeclarationError, status); |
| } |
| } |
| } |
| |
| /* |
| Consume a possibly-empty sequence of declarations separated by whitespace; |
| each declaration matches the `declaration` nonterminal in the grammar |
| |
| Builds up an environment representing those declarations |
| */ |
| void Parser::parseDeclarations(UErrorCode& status) { |
| // End-of-input here would be an error; even empty |
| // declarations must be followed by a body |
| CHECK_BOUNDS(status); |
| |
| while (peek() == PERIOD) { |
| CHECK_BOUNDS_1(status); |
| if (peek(1) == ID_LOCAL[1]) { |
| parseLocalDeclaration(status); |
| } else if (peek(1) == ID_INPUT[1]) { |
| parseInputDeclaration(status); |
| } else { |
| // Done parsing declarations |
| break; |
| } |
| |
| // Avoid looping infinitely |
| CHECK_ERROR(status); |
| |
| parseOptionalWhitespace(); |
| // Restore precondition |
| CHECK_BOUNDS(status); |
| } |
| } |
| |
| /* |
| Consume a text character |
| matching the `text-char` nonterminal in the grammar |
| |
| No postcondition (a message can end with a text-char) |
| */ |
| UnicodeString Parser::parseTextChar(UErrorCode& status) { |
| UnicodeString str; |
| if (!inBounds() || !(isTextChar(peek()))) { |
| // Error -- text-char is expected here |
| ERROR(status); |
| } else { |
| // See comment in parseQuotedLiteral() |
| if (isEscapableChar(peek())) { |
| normalizedInput += BACKSLASH; |
| } |
| normalizedInput += peek(); |
| str += peek(); |
| next(); |
| maybeAdvanceLine(); |
| } |
| return str; |
| } |
| |
| /* |
| Consume an `nmtoken`, `literal`, or the string "*", matching |
| the `key` nonterminal in the grammar |
| */ |
| Key Parser::parseKey(UErrorCode& status) { |
| U_ASSERT(inBounds()); |
| |
| Key k; // wildcard by default |
| // Literal | '*' |
| switch (peek()) { |
| case ASTERISK: { |
| next(); |
| normalizedInput += ASTERISK; |
| // Guarantee postcondition |
| if (!inBounds()) { |
| ERROR(status); |
| return k; |
| } |
| break; |
| } |
| default: { |
| // Literal |
| k = Key(parseLiteral(status)); |
| break; |
| } |
| } |
| return k; |
| } |
| |
| /* |
| Consume a non-empty sequence of `key`s separated by whitespace |
| |
| Takes ownership of `keys` |
| */ |
| SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) { |
| SelectorKeys result; |
| |
| if (U_FAILURE(status)) { |
| return result; |
| } |
| |
| U_ASSERT(inBounds()); |
| |
| /* |
| Arbitrary lookahead is required to parse key lists. To see why, consider |
| this rule from the grammar: |
| |
| variant = key *(s key) [s] quoted-pattern |
| |
| And this example: |
| when k1 k2 {a} |
| |
| Derivation: |
| variant -> key *(s key) [s] quoted-pattern |
| -> key s key *(s key) quoted-pattern |
| |
| After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead |
| to know whether to expect the start of a pattern or the start of another key. |
| In other words: is the second whitespace sequence the required space in *(s key), |
| or the optional space in [s] quoted-pattern? |
| |
| This is addressed using "backtracking" (similarly to `parseOptions()`). |
| */ |
| |
| SelectorKeys::Builder keysBuilder(status); |
| if (U_FAILURE(status)) { |
| return result; |
| } |
| |
| // Since the first key is required, it's simplest to parse it separately. |
| keysBuilder.add(parseKey(status), status); |
| |
| // Restore precondition |
| if (!inBounds()) { |
| ERROR(status); |
| return result; |
| } |
| |
| // We've seen at least one whitespace-key pair, so now we can parse |
| // *(s key) [s] |
| while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidiControl(peek())) { |
| bool wasWhitespace = isWhitespace(peek()) || isBidiControl(peek()); |
| parseRequiredWhitespace(status); |
| if (!wasWhitespace) { |
| // Avoid infinite loop when parsing something like: |
| // when * @{!... |
| next(); |
| } |
| |
| // Restore precondition |
| if (!inBounds()) { |
| ERROR(status); |
| return result; |
| } |
| |
| // At this point, it's ambiguous whether we are inside (s key) or [s]. |
| // This check resolves that ambiguity. |
| if (peek() == LEFT_CURLY_BRACE) { |
| // A pattern follows, so what we just parsed was the optional |
| // trailing whitespace. All the keys have been parsed. |
| |
| // Unpush the whitespace from `normalizedInput` |
| normalizedInput.truncate(normalizedInput.length() - 1); |
| break; |
| } |
| keysBuilder.add(parseKey(status), status); |
| } |
| |
| return keysBuilder.build(status); |
| } |
| |
| Pattern Parser::parseQuotedPattern(UErrorCode& status) { |
| U_ASSERT(inBounds()); |
| |
| parseToken(LEFT_CURLY_BRACE, status); |
| parseToken(LEFT_CURLY_BRACE, status); |
| Pattern p = parseSimpleMessage(status); |
| parseToken(RIGHT_CURLY_BRACE, status); |
| parseToken(RIGHT_CURLY_BRACE, status); |
| return p; |
| } |
| |
| /* |
| Consume a `placeholder`, matching the nonterminal in the grammar |
| No postcondition (a markup can end a message) |
| */ |
| Markup Parser::parseMarkup(UErrorCode& status) { |
| U_ASSERT(inBounds(1)); |
| |
| U_ASSERT(peek() == LEFT_CURLY_BRACE); |
| |
| Markup::Builder builder(status); |
| if (U_FAILURE(status)) { |
| return {}; |
| } |
| |
| // Consume the '{' |
| next(); |
| normalizedInput += LEFT_CURLY_BRACE; |
| parseOptionalWhitespace(); |
| bool closing = false; |
| switch (peek()) { |
| case NUMBER_SIGN: { |
| // Open or standalone; consume the '#' |
| normalizedInput += peek(); |
| next(); |
| break; |
| } |
| case SLASH: { |
| // Closing |
| normalizedInput += peek(); |
| closing = true; |
| next(); |
| break; |
| } |
| default: { |
| ERROR(status); |
| return {}; |
| } |
| } |
| |
| // Parse the markup identifier |
| builder.setName(parseIdentifier(status)); |
| |
| // Parse the options, which must begin with a ' ' |
| // if present |
| if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) { |
| OptionAdder<Markup::Builder> optionAdder(builder); |
| parseOptions(optionAdder, status); |
| } |
| |
| // Parse the attributes, which also must begin |
| // with a ' ' |
| if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) { |
| AttributeAdder<Markup::Builder> attrAdder(builder); |
| parseAttributes(attrAdder, status); |
| } |
| |
| parseOptionalWhitespace(); |
| |
| bool standalone = false; |
| // Check if this is a standalone or not |
| if (!closing) { |
| if (inBounds() && peek() == SLASH) { |
| standalone = true; |
| normalizedInput += SLASH; |
| next(); |
| } |
| } |
| |
| parseToken(RIGHT_CURLY_BRACE, status); |
| |
| if (standalone) { |
| builder.setStandalone(); |
| } else if (closing) { |
| builder.setClose(); |
| } else { |
| builder.setOpen(); |
| } |
| |
| return builder.build(status); |
| } |
| |
| /* |
| Consume a `placeholder`, matching the nonterminal in the grammar |
| No postcondition (a placeholder can end a message) |
| */ |
| std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) { |
| U_ASSERT(peek() == LEFT_CURLY_BRACE); |
| |
| if (!inBounds()) { |
| ERROR(status); |
| return exprFallback(status); |
| } |
| |
| // Need to look ahead arbitrarily since whitespace |
| // can appear before the '{' and '#' |
| // in markup |
| int32_t tempIndex = 1; |
| bool isMarkup = false; |
| while (inBounds(1)) { |
| UChar32 c = peek(tempIndex); |
| if (c == NUMBER_SIGN || c == SLASH) { |
| isMarkup = true; |
| break; |
| } |
| if (!(isWhitespace(c) || isBidiControl(c))) { |
| break; |
| } |
| tempIndex++; |
| } |
| |
| if (isMarkup) { |
| return parseMarkup(status); |
| } |
| return parseExpression(status); |
| } |
| |
| /* |
| Consume a `simple-message`, matching the nonterminal in the grammar |
| Postcondition: `index == len()` or U_FAILURE(status); |
| for a syntactically correct message, this will consume the entire input |
| */ |
| Pattern Parser::parseSimpleMessage(UErrorCode& status) { |
| Pattern::Builder result(status); |
| |
| if (U_SUCCESS(status)) { |
| Expression expression; |
| while (inBounds()) { |
| switch (peek()) { |
| case LEFT_CURLY_BRACE: { |
| // Must be placeholder |
| std::variant<Expression, Markup> piece = parsePlaceholder(status); |
| if (std::holds_alternative<Expression>(piece)) { |
| Expression expr = *std::get_if<Expression>(&piece); |
| result.add(std::move(expr), status); |
| } else { |
| Markup markup = *std::get_if<Markup>(&piece); |
| result.add(std::move(markup), status); |
| } |
| break; |
| } |
| case BACKSLASH: { |
| // Must be escaped-char |
| result.add(parseEscapeSequence(status), status); |
| break; |
| } |
| case RIGHT_CURLY_BRACE: { |
| // Distinguish unescaped '}' from end of quoted pattern |
| break; |
| } |
| default: { |
| // Must be text-char |
| result.add(parseTextChar(status), status); |
| break; |
| } |
| } |
| if (peek() == RIGHT_CURLY_BRACE) { |
| // End of quoted pattern |
| break; |
| } |
| // Don't loop infinitely |
| if (errors.hasSyntaxError() || U_FAILURE(status)) { |
| break; |
| } |
| } |
| } |
| return result.build(status); |
| } |
| |
| void Parser::parseVariant(UErrorCode& status) { |
| CHECK_ERROR(status); |
| |
| // At least one key is required |
| SelectorKeys keyList(parseNonEmptyKeys(status)); |
| |
| // parseNonEmptyKeys() consumes any trailing whitespace, |
| // so the pattern can be consumed next. |
| |
| // Restore precondition before calling parsePattern() |
| // (which must return a non-null value) |
| CHECK_BOUNDS(status); |
| Pattern rhs = parseQuotedPattern(status); |
| |
| dataModel.addVariant(std::move(keyList), std::move(rhs), status); |
| } |
| |
| /* |
| Consume a `selectors` (matching the nonterminal in the grammar), |
| followed by a non-empty sequence of `variant`s (matching the nonterminal |
| in the grammar) preceded by whitespace |
| No postcondition (on return, `index` might equal `len()` with no syntax error |
| because a message can end with a variant) |
| */ |
| void Parser::parseSelectors(UErrorCode& status) { |
| CHECK_ERROR(status); |
| |
| U_ASSERT(inBounds()); |
| |
| parseToken(ID_MATCH, status); |
| |
| bool empty = true; |
| // Parse selectors |
| // "Backtracking" is required here. It's not clear if whitespace is |
| // (`[s]` selector) or (`[s]` variant) |
| while (isWhitespace(peek()) || isBidiControl(peek()) || peek() == DOLLAR) { |
| int32_t whitespaceStart = index; |
| parseRequiredWhitespace(status); |
| // Restore precondition |
| CHECK_BOUNDS(status); |
| if (peek() != DOLLAR) { |
| // This is not necessarily an error, but rather, |
| // means the whitespace we parsed was the optional |
| // whitespace preceding the first variant, not the |
| // required whitespace preceding a subsequent variable. |
| // In that case, "push back" the whitespace. |
| normalizedInput.truncate(normalizedInput.length() - 1); |
| index = whitespaceStart; |
| break; |
| } |
| VariableName var = parseVariableName(status); |
| empty = false; |
| |
| dataModel.addSelector(std::move(var), status); |
| CHECK_ERROR(status); |
| } |
| |
| // At least one selector is required |
| if (empty) { |
| ERROR(status); |
| return; |
| } |
| |
| #define CHECK_END_OF_INPUT \ |
| if (!inBounds()) { \ |
| break; \ |
| } \ |
| |
| // Parse required whitespace before first variant |
| parseRequiredWhitespace(status); |
| |
| // Parse variants |
| |
| while (isWhitespace(peek()) || isBidiControl(peek()) || isKeyStart(peek())) { |
| // Trailing whitespace is allowed. |
| parseOptionalWhitespace(); |
| if (!inBounds()) { |
| return; |
| } |
| |
| parseVariant(status); |
| |
| // Restore the precondition, *without* erroring out if we've |
| // reached the end of input. That's because it's valid for the |
| // message to end with a variant that has no trailing whitespace. |
| // Why do we need to check this condition twice inside the loop? |
| // Because if we don't check it here, the `isWhitespace()` call in |
| // the loop head will read off the end of the input string. |
| CHECK_END_OF_INPUT |
| |
| if (errors.hasSyntaxError() || U_FAILURE(status)) { |
| break; |
| } |
| } |
| } |
| |
/*
  Consume a `body` (matching the nonterminal in the grammar); implemented by
  `Parser::parseBody()`, which follows the `Parser::errorPattern()` helper below.
  No postcondition (on return, `index` might equal `len()` with no syntax error,
  because a message can end with a body; trailing whitespace is optional)
*/
| |
| void Parser::errorPattern(UErrorCode& status) { |
| errors.addSyntaxError(status); |
| // Set to empty pattern |
| Pattern::Builder result = Pattern::Builder(status); |
| CHECK_ERROR(status); |
| |
| // If still in bounds, then add the remaining input as a single text part |
| // to the pattern |
| /* |
| TODO: this behavior isn't documented in the spec, but it comes from |
| https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236 |
| and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify |
| whether this is the intent behind the spec |
| */ |
| UnicodeString partStr(LEFT_CURLY_BRACE); |
| while (inBounds()) { |
| partStr += peek(); |
| next(); |
| } |
| // Add curly braces around the entire output (same comment as above) |
| partStr += RIGHT_CURLY_BRACE; |
| result.add(std::move(partStr), status); |
| dataModel.setPattern(result.build(status)); |
| } |
| |
| void Parser::parseBody(UErrorCode& status) { |
| CHECK_ERROR(status); |
| |
| // Out-of-input is a syntax warning |
| if (!inBounds()) { |
| errorPattern(status); |
| return; |
| } |
| |
| // Body must be either a pattern or selectors |
| switch (peek()) { |
| case LEFT_CURLY_BRACE: { |
| // Pattern |
| dataModel.setPattern(parseQuotedPattern(status)); |
| break; |
| } |
| case ID_MATCH[0]: { |
| // Selectors |
| parseSelectors(status); |
| return; |
| } |
| default: { |
| ERROR(status); |
| errorPattern(status); |
| return; |
| } |
| } |
| } |
| |
| // ------------------------------------- |
| // Parses the source pattern. |
| |
| void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { |
| CHECK_ERROR(status); |
| |
| bool complex = false; |
| // First, "look ahead" to determine if this is a simple or complex |
| // message. To do that, check the first non-whitespace character. |
| while (inBounds(index) && (isWhitespace(peek()) || isBidiControl(peek()))) { |
| next(); |
| } |
| |
| // Message can be empty, so we need to only look ahead |
| // if we know it's non-empty |
| if (inBounds()) { |
| if (peek() == PERIOD |
| || (inBounds(1) |
| && peek() == LEFT_CURLY_BRACE |
| && peek(1) == LEFT_CURLY_BRACE)) { |
| complex = true; |
| } |
| } |
| // Reset index |
| index = 0; |
| |
| // Message can be empty, so we need to only look ahead |
| // if we know it's non-empty |
| if (complex) { |
| parseOptionalWhitespace(); |
| parseDeclarations(status); |
| parseBody(status); |
| parseOptionalWhitespace(); |
| } else { |
| // Simple message |
| // For normalization, quote the pattern |
| normalizedInput += LEFT_CURLY_BRACE; |
| normalizedInput += LEFT_CURLY_BRACE; |
| dataModel.setPattern(parseSimpleMessage(status)); |
| normalizedInput += RIGHT_CURLY_BRACE; |
| normalizedInput += RIGHT_CURLY_BRACE; |
| } |
| |
| CHECK_ERROR(status); |
| |
| // There are no errors; finally, check that the entire input was consumed |
| if (!allConsumed()) { |
| ERROR(status); |
| } |
| |
| // Finally, copy the relevant fields of the internal `MessageParseError` |
| // into the `UParseError` argument |
| translateParseError(parseError, parseErrorResult); |
| } |
| |
| Parser::~Parser() {} |
| |
| } // namespace message2 |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_MF2 */ |
| |
| #endif /* #if !UCONFIG_NO_FORMATTING */ |
| |
| #endif /* #if !UCONFIG_NO_NORMALIZATION */ |