| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2013-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * collationruleparser.cpp |
| * |
| * (replaced the former ucol_tok.cpp) |
| * |
| * created on: 2013apr10 |
| * created by: Markus W. Scherer |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/normalizer2.h" |
| #include "unicode/parseerr.h" |
| #include "unicode/uchar.h" |
| #include "unicode/ucol.h" |
| #include "unicode/uloc.h" |
| #include "unicode/unistr.h" |
| #include "unicode/utf16.h" |
| #include "charstr.h" |
| #include "cmemory.h" |
| #include "collation.h" |
| #include "collationdata.h" |
| #include "collationruleparser.h" |
| #include "collationsettings.h" |
| #include "collationtailoring.h" |
| #include "cstring.h" |
| #include "patternprops.h" |
| #include "uassert.h" |
| #include "uvectr32.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| namespace { |
| |
| static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before" |
| const int32_t BEFORE_LENGTH = 7; |
| |
| } // namespace |
| |
| CollationRuleParser::Sink::~Sink() {} |
| |
| void |
| CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {} |
| |
| void |
| CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {} |
| |
| CollationRuleParser::Importer::~Importer() {} |
| |
| CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode) |
| : nfd(*Normalizer2::getNFDInstance(errorCode)), |
| nfc(*Normalizer2::getNFCInstance(errorCode)), |
| rules(NULL), baseData(base), settings(NULL), |
| parseError(NULL), errorReason(NULL), |
| sink(NULL), importer(NULL), |
| ruleIndex(0) { |
| } |
| |
| CollationRuleParser::~CollationRuleParser() { |
| } |
| |
| void |
| CollationRuleParser::parse(const UnicodeString &ruleString, |
| CollationSettings &outSettings, |
| UParseError *outParseError, |
| UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return; } |
| settings = &outSettings; |
| parseError = outParseError; |
| if(parseError != NULL) { |
| parseError->line = 0; |
| parseError->offset = -1; |
| parseError->preContext[0] = 0; |
| parseError->postContext[0] = 0; |
| } |
| errorReason = NULL; |
| parse(ruleString, errorCode); |
| } |
| |
| void |
| CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return; } |
| rules = &ruleString; |
| ruleIndex = 0; |
| |
| while(ruleIndex < rules->length()) { |
| UChar c = rules->charAt(ruleIndex); |
| if(PatternProps::isWhiteSpace(c)) { |
| ++ruleIndex; |
| continue; |
| } |
| switch(c) { |
| case 0x26: // '&' |
| parseRuleChain(errorCode); |
| break; |
| case 0x5b: // '[' |
| parseSetting(errorCode); |
| break; |
| case 0x23: // '#' starts a comment, until the end of the line |
| ruleIndex = skipComment(ruleIndex + 1); |
| break; |
| case 0x40: // '@' is equivalent to [backwards 2] |
| settings->setFlag(CollationSettings::BACKWARD_SECONDARY, |
| UCOL_ON, 0, errorCode); |
| ++ruleIndex; |
| break; |
| case 0x21: // '!' used to turn on Thai/Lao character reversal |
| // Accept but ignore. The root collator has contractions |
| // that are equivalent to the character reversal, where appropriate. |
| ++ruleIndex; |
| break; |
| default: |
| setParseError("expected a reset or setting or comment", errorCode); |
| break; |
| } |
| if(U_FAILURE(errorCode)) { return; } |
| } |
| } |
| |
| void |
| CollationRuleParser::parseRuleChain(UErrorCode &errorCode) { |
| int32_t resetStrength = parseResetAndPosition(errorCode); |
| UBool isFirstRelation = TRUE; |
| for(;;) { |
| int32_t result = parseRelationOperator(errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| if(result < 0) { |
| if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) { |
| // '#' starts a comment, until the end of the line |
| ruleIndex = skipComment(ruleIndex + 1); |
| continue; |
| } |
| if(isFirstRelation) { |
| setParseError("reset not followed by a relation", errorCode); |
| } |
| return; |
| } |
| int32_t strength = result & STRENGTH_MASK; |
| if(resetStrength < UCOL_IDENTICAL) { |
| // reset-before rule chain |
| if(isFirstRelation) { |
| if(strength != resetStrength) { |
| setParseError("reset-before strength differs from its first relation", errorCode); |
| return; |
| } |
| } else { |
| if(strength < resetStrength) { |
| setParseError("reset-before strength followed by a stronger relation", errorCode); |
| return; |
| } |
| } |
| } |
| int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator |
| if((result & STARRED_FLAG) == 0) { |
| parseRelationStrings(strength, i, errorCode); |
| } else { |
| parseStarredCharacters(strength, i, errorCode); |
| } |
| if(U_FAILURE(errorCode)) { return; } |
| isFirstRelation = FALSE; |
| } |
| } |
| |
| int32_t |
| CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } |
| int32_t i = skipWhiteSpace(ruleIndex + 1); |
| int32_t j; |
| UChar c; |
| int32_t resetStrength; |
| if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 && |
| (j = i + BEFORE_LENGTH) < rules->length() && |
| PatternProps::isWhiteSpace(rules->charAt(j)) && |
| ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() && |
| 0x31 <= (c = rules->charAt(j)) && c <= 0x33 && |
| rules->charAt(j + 1) == 0x5d) { |
| // &[before n] with n=1 or 2 or 3 |
| resetStrength = UCOL_PRIMARY + (c - 0x31); |
| i = skipWhiteSpace(j + 2); |
| } else { |
| resetStrength = UCOL_IDENTICAL; |
| } |
| if(i >= rules->length()) { |
| setParseError("reset without position", errorCode); |
| return UCOL_DEFAULT; |
| } |
| UnicodeString str; |
| if(rules->charAt(i) == 0x5b) { // '[' |
| i = parseSpecialPosition(i, str, errorCode); |
| } else { |
| i = parseTailoringString(i, str, errorCode); |
| } |
| sink->addReset(resetStrength, str, errorReason, errorCode); |
| if(U_FAILURE(errorCode)) { setErrorContext(); } |
| ruleIndex = i; |
| return resetStrength; |
| } |
| |
| int32_t |
| CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } |
| ruleIndex = skipWhiteSpace(ruleIndex); |
| if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; } |
| int32_t strength; |
| int32_t i = ruleIndex; |
| UChar c = rules->charAt(i++); |
| switch(c) { |
| case 0x3c: // '<' |
| if(i < rules->length() && rules->charAt(i) == 0x3c) { // << |
| ++i; |
| if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<< |
| ++i; |
| if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<< |
| ++i; |
| strength = UCOL_QUATERNARY; |
| } else { |
| strength = UCOL_TERTIARY; |
| } |
| } else { |
| strength = UCOL_SECONDARY; |
| } |
| } else { |
| strength = UCOL_PRIMARY; |
| } |
| if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*' |
| ++i; |
| strength |= STARRED_FLAG; |
| } |
| break; |
| case 0x3b: // ';' same as << |
| strength = UCOL_SECONDARY; |
| break; |
| case 0x2c: // ',' same as <<< |
| strength = UCOL_TERTIARY; |
| break; |
| case 0x3d: // '=' |
| strength = UCOL_IDENTICAL; |
| if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*' |
| ++i; |
| strength |= STARRED_FLAG; |
| } |
| break; |
| default: |
| return UCOL_DEFAULT; |
| } |
| return ((i - ruleIndex) << OFFSET_SHIFT) | strength; |
| } |
| |
| void |
| CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) { |
| // Parse |
| // prefix | str / extension |
| // where prefix and extension are optional. |
| UnicodeString prefix, str, extension; |
| i = parseTailoringString(i, str, errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| UChar next = (i < rules->length()) ? rules->charAt(i) : 0; |
| if(next == 0x7c) { // '|' separates the context prefix from the string. |
| prefix = str; |
| i = parseTailoringString(i + 1, str, errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| next = (i < rules->length()) ? rules->charAt(i) : 0; |
| } |
| if(next == 0x2f) { // '/' separates the string from the extension. |
| i = parseTailoringString(i + 1, extension, errorCode); |
| } |
| if(!prefix.isEmpty()) { |
| UChar32 prefix0 = prefix.char32At(0); |
| UChar32 c = str.char32At(0); |
| if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) { |
| setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary", |
| errorCode); |
| return; |
| } |
| } |
| sink->addRelation(strength, prefix, str, extension, errorReason, errorCode); |
| if(U_FAILURE(errorCode)) { setErrorContext(); } |
| ruleIndex = i; |
| } |
| |
| void |
| CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) { |
| UnicodeString empty, raw; |
| i = parseString(skipWhiteSpace(i), raw, errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| if(raw.isEmpty()) { |
| setParseError("missing starred-relation string", errorCode); |
| return; |
| } |
| UChar32 prev = -1; |
| int32_t j = 0; |
| for(;;) { |
| while(j < raw.length()) { |
| UChar32 c = raw.char32At(j); |
| if(!nfd.isInert(c)) { |
| setParseError("starred-relation string is not all NFD-inert", errorCode); |
| return; |
| } |
| sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode); |
| if(U_FAILURE(errorCode)) { |
| setErrorContext(); |
| return; |
| } |
| j += U16_LENGTH(c); |
| prev = c; |
| } |
| if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-' |
| break; |
| } |
| if(prev < 0) { |
| setParseError("range without start in starred-relation string", errorCode); |
| return; |
| } |
| i = parseString(i + 1, raw, errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| if(raw.isEmpty()) { |
| setParseError("range without end in starred-relation string", errorCode); |
| return; |
| } |
| UChar32 c = raw.char32At(0); |
| if(c < prev) { |
| setParseError("range start greater than end in starred-relation string", errorCode); |
| return; |
| } |
| // range prev-c |
| UnicodeString s; |
| while(++prev <= c) { |
| if(!nfd.isInert(prev)) { |
| setParseError("starred-relation string range is not all NFD-inert", errorCode); |
| return; |
| } |
| if(U_IS_SURROGATE(prev)) { |
| setParseError("starred-relation string range contains a surrogate", errorCode); |
| return; |
| } |
| if(0xfffd <= prev && prev <= 0xffff) { |
| setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode); |
| return; |
| } |
| s.setTo(prev); |
| sink->addRelation(strength, empty, s, empty, errorReason, errorCode); |
| if(U_FAILURE(errorCode)) { |
| setErrorContext(); |
| return; |
| } |
| } |
| prev = -1; |
| j = U16_LENGTH(c); |
| } |
| ruleIndex = skipWhiteSpace(i); |
| } |
| |
| int32_t |
| CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) { |
| i = parseString(skipWhiteSpace(i), raw, errorCode); |
| if(U_SUCCESS(errorCode) && raw.isEmpty()) { |
| setParseError("missing relation string", errorCode); |
| } |
| return skipWhiteSpace(i); |
| } |
| |
| int32_t |
| CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return i; } |
| raw.remove(); |
| while(i < rules->length()) { |
| UChar32 c = rules->charAt(i++); |
| if(isSyntaxChar(c)) { |
| if(c == 0x27) { // apostrophe |
| if(i < rules->length() && rules->charAt(i) == 0x27) { |
| // Double apostrophe, encodes a single one. |
| raw.append((UChar)0x27); |
| ++i; |
| continue; |
| } |
| // Quote literal text until the next single apostrophe. |
| for(;;) { |
| if(i == rules->length()) { |
| setParseError("quoted literal text missing terminating apostrophe", errorCode); |
| return i; |
| } |
| c = rules->charAt(i++); |
| if(c == 0x27) { |
| if(i < rules->length() && rules->charAt(i) == 0x27) { |
| // Double apostrophe inside quoted literal text, |
| // still encodes a single apostrophe. |
| ++i; |
| } else { |
| break; |
| } |
| } |
| raw.append((UChar)c); |
| } |
| } else if(c == 0x5c) { // backslash |
| if(i == rules->length()) { |
| setParseError("backslash escape at the end of the rule string", errorCode); |
| return i; |
| } |
| c = rules->char32At(i); |
| raw.append(c); |
| i += U16_LENGTH(c); |
| } else { |
| // Any other syntax character terminates a string. |
| --i; |
| break; |
| } |
| } else if(PatternProps::isWhiteSpace(c)) { |
| // Unquoted white space terminates a string. |
| --i; |
| break; |
| } else { |
| raw.append((UChar)c); |
| } |
| } |
| for(int32_t j = 0; j < raw.length();) { |
| UChar32 c = raw.char32At(j); |
| if(U_IS_SURROGATE(c)) { |
| setParseError("string contains an unpaired surrogate", errorCode); |
| return i; |
| } |
| if(0xfffd <= c && c <= 0xffff) { |
| setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode); |
| return i; |
| } |
| j += U16_LENGTH(c); |
| } |
| return i; |
| } |
| |
namespace {

// Names of the special reset positions.
// parseSpecialPosition() encodes a match as POS_BASE plus the array index,
// so the order of the entries matters.
const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
| |
| int32_t |
| CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return 0; } |
| UnicodeString raw; |
| int32_t j = readWords(i + 1, raw); |
| if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ] |
| ++j; |
| for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) { |
| if(raw == UnicodeString(positions[pos], -1, US_INV)) { |
| str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos)); |
| return j; |
| } |
| } |
| if(raw == UNICODE_STRING_SIMPLE("top")) { |
| str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR)); |
| return j; |
| } |
| if(raw == UNICODE_STRING_SIMPLE("variable top")) { |
| str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE)); |
| return j; |
| } |
| } |
| setParseError("not a valid special reset position", errorCode); |
| return i; |
| } |
| |
| void |
| CollationRuleParser::parseSetting(UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return; } |
| UnicodeString raw; |
| int32_t i = ruleIndex + 1; |
| int32_t j = readWords(i, raw); |
| if(j <= i || raw.isEmpty()) { |
| setParseError("expected a setting/option at '['", errorCode); |
| } |
| if(rules->charAt(j) == 0x5d) { // words end with ] |
| ++j; |
| if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) && |
| (raw.length() == 7 || raw.charAt(7) == 0x20)) { |
| parseReordering(raw, errorCode); |
| ruleIndex = j; |
| return; |
| } |
| if(raw == UNICODE_STRING_SIMPLE("backwards 2")) { |
| settings->setFlag(CollationSettings::BACKWARD_SECONDARY, |
| UCOL_ON, 0, errorCode); |
| ruleIndex = j; |
| return; |
| } |
| UnicodeString v; |
| int32_t valueIndex = raw.lastIndexOf((UChar)0x20); |
| if(valueIndex >= 0) { |
| v.setTo(raw, valueIndex + 1); |
| raw.truncate(valueIndex); |
| } |
| if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) { |
| int32_t value = UCOL_DEFAULT; |
| UChar c = v.charAt(0); |
| if(0x31 <= c && c <= 0x34) { // 1..4 |
| value = UCOL_PRIMARY + (c - 0x31); |
| } else if(c == 0x49) { // 'I' |
| value = UCOL_IDENTICAL; |
| } |
| if(value != UCOL_DEFAULT) { |
| settings->setStrength(value, 0, errorCode); |
| ruleIndex = j; |
| return; |
| } |
| } else if(raw == UNICODE_STRING_SIMPLE("alternate")) { |
| UColAttributeValue value = UCOL_DEFAULT; |
| if(v == UNICODE_STRING_SIMPLE("non-ignorable")) { |
| value = UCOL_NON_IGNORABLE; |
| } else if(v == UNICODE_STRING_SIMPLE("shifted")) { |
| value = UCOL_SHIFTED; |
| } |
| if(value != UCOL_DEFAULT) { |
| settings->setAlternateHandling(value, 0, errorCode); |
| ruleIndex = j; |
| return; |
| } |
| } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) { |
| int32_t value = UCOL_DEFAULT; |
| if(v == UNICODE_STRING_SIMPLE("space")) { |
| value = CollationSettings::MAX_VAR_SPACE; |
| } else if(v == UNICODE_STRING_SIMPLE("punct")) { |
| value = CollationSettings::MAX_VAR_PUNCT; |
| } else if(v == UNICODE_STRING_SIMPLE("symbol")) { |
| value = CollationSettings::MAX_VAR_SYMBOL; |
| } else if(v == UNICODE_STRING_SIMPLE("currency")) { |
| value = CollationSettings::MAX_VAR_CURRENCY; |
| } |
| if(value != UCOL_DEFAULT) { |
| settings->setMaxVariable(value, 0, errorCode); |
| settings->variableTop = baseData->getLastPrimaryForGroup( |
| UCOL_REORDER_CODE_FIRST + value); |
| U_ASSERT(settings->variableTop != 0); |
| ruleIndex = j; |
| return; |
| } |
| } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) { |
| UColAttributeValue value = UCOL_DEFAULT; |
| if(v == UNICODE_STRING_SIMPLE("off")) { |
| value = UCOL_OFF; |
| } else if(v == UNICODE_STRING_SIMPLE("lower")) { |
| value = UCOL_LOWER_FIRST; |
| } else if(v == UNICODE_STRING_SIMPLE("upper")) { |
| value = UCOL_UPPER_FIRST; |
| } |
| if(value != UCOL_DEFAULT) { |
| settings->setCaseFirst(value, 0, errorCode); |
| ruleIndex = j; |
| return; |
| } |
| } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) { |
| UColAttributeValue value = getOnOffValue(v); |
| if(value != UCOL_DEFAULT) { |
| settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode); |
| ruleIndex = j; |
| return; |
| } |
| } else if(raw == UNICODE_STRING_SIMPLE("normalization")) { |
| UColAttributeValue value = getOnOffValue(v); |
| if(value != UCOL_DEFAULT) { |
| settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode); |
| ruleIndex = j; |
| return; |
| } |
| } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) { |
| UColAttributeValue value = getOnOffValue(v); |
| if(value != UCOL_DEFAULT) { |
| settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode); |
| ruleIndex = j; |
| return; |
| } |
| } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) { |
| UColAttributeValue value = getOnOffValue(v); |
| if(value != UCOL_DEFAULT) { |
| if(value == UCOL_ON) { |
| setParseError("[hiraganaQ on] is not supported", errorCode); |
| } |
| ruleIndex = j; |
| return; |
| } |
| } else if(raw == UNICODE_STRING_SIMPLE("import")) { |
| CharString lang; |
| lang.appendInvariantChars(v, errorCode); |
| if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; } |
| // BCP 47 language tag -> ICU locale ID |
| char localeID[ULOC_FULLNAME_CAPACITY]; |
| int32_t parsedLength; |
| int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY, |
| &parsedLength, &errorCode); |
| if(U_FAILURE(errorCode) || |
| parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) { |
| errorCode = U_ZERO_ERROR; |
| setParseError("expected language tag in [import langTag]", errorCode); |
| return; |
| } |
| // localeID minus all keywords |
| char baseID[ULOC_FULLNAME_CAPACITY]; |
| length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode); |
| if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) { |
| errorCode = U_ZERO_ERROR; |
| setParseError("expected language tag in [import langTag]", errorCode); |
| return; |
| } |
| if(length == 0) { |
| uprv_strcpy(baseID, "root"); |
| } else if(*baseID == '_') { |
| uprv_memmove(baseID + 3, baseID, length + 1); |
| uprv_memcpy(baseID, "und", 3); |
| } |
| // @collation=type, or length=0 if not specified |
| char collationType[ULOC_KEYWORDS_CAPACITY]; |
| length = uloc_getKeywordValue(localeID, "collation", |
| collationType, ULOC_KEYWORDS_CAPACITY, |
| &errorCode); |
| if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) { |
| errorCode = U_ZERO_ERROR; |
| setParseError("expected language tag in [import langTag]", errorCode); |
| return; |
| } |
| if(importer == NULL) { |
| setParseError("[import langTag] is not supported", errorCode); |
| } else { |
| UnicodeString importedRules; |
| importer->getRules(baseID, length > 0 ? collationType : "standard", |
| importedRules, errorReason, errorCode); |
| if(U_FAILURE(errorCode)) { |
| if(errorReason == NULL) { |
| errorReason = "[import langTag] failed"; |
| } |
| setErrorContext(); |
| return; |
| } |
| const UnicodeString *outerRules = rules; |
| int32_t outerRuleIndex = ruleIndex; |
| parse(importedRules, errorCode); |
| if(U_FAILURE(errorCode)) { |
| if(parseError != NULL) { |
| parseError->offset = outerRuleIndex; |
| } |
| } |
| rules = outerRules; |
| ruleIndex = j; |
| } |
| return; |
| } |
| } else if(rules->charAt(j) == 0x5b) { // words end with [ |
| UnicodeSet set; |
| j = parseUnicodeSet(j, set, errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| if(raw == UNICODE_STRING_SIMPLE("optimize")) { |
| sink->optimize(set, errorReason, errorCode); |
| if(U_FAILURE(errorCode)) { setErrorContext(); } |
| ruleIndex = j; |
| return; |
| } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) { |
| sink->suppressContractions(set, errorReason, errorCode); |
| if(U_FAILURE(errorCode)) { setErrorContext(); } |
| ruleIndex = j; |
| return; |
| } |
| } |
| setParseError("not a valid setting/option", errorCode); |
| } |
| |
| void |
| CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return; } |
| int32_t i = 7; // after "reorder" |
| if(i == raw.length()) { |
| // empty [reorder] with no codes |
| settings->resetReordering(); |
| return; |
| } |
| // Parse the codes in [reorder aa bb cc]. |
| UVector32 reorderCodes(errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| CharString word; |
| while(i < raw.length()) { |
| ++i; // skip the word-separating space |
| int32_t limit = raw.indexOf((UChar)0x20, i); |
| if(limit < 0) { limit = raw.length(); } |
| word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| int32_t code = getReorderCode(word.data()); |
| if(code < 0) { |
| setParseError("unknown script or reorder code", errorCode); |
| return; |
| } |
| reorderCodes.addElement(code, errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| i = limit; |
| } |
| settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode); |
| } |
| |
// Names of the special reorder groups.
// getReorderCode() maps a name to UCOL_REORDER_CODE_FIRST plus its index here,
// so the order of the entries matters.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
| |
| int32_t |
| CollationRuleParser::getReorderCode(const char *word) { |
| for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) { |
| if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) { |
| return UCOL_REORDER_CODE_FIRST + i; |
| } |
| } |
| int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word); |
| if(script >= 0) { |
| return script; |
| } |
| if(uprv_stricmp(word, "others") == 0) { |
| return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN |
| } |
| return -1; |
| } |
| |
| UColAttributeValue |
| CollationRuleParser::getOnOffValue(const UnicodeString &s) { |
| if(s == UNICODE_STRING_SIMPLE("on")) { |
| return UCOL_ON; |
| } else if(s == UNICODE_STRING_SIMPLE("off")) { |
| return UCOL_OFF; |
| } else { |
| return UCOL_DEFAULT; |
| } |
| } |
| |
| int32_t |
| CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) { |
| // Collect a UnicodeSet pattern between a balanced pair of [brackets]. |
| int32_t level = 0; |
| int32_t j = i; |
| for(;;) { |
| if(j == rules->length()) { |
| setParseError("unbalanced UnicodeSet pattern brackets", errorCode); |
| return j; |
| } |
| UChar c = rules->charAt(j++); |
| if(c == 0x5b) { // '[' |
| ++level; |
| } else if(c == 0x5d) { // ']' |
| if(--level == 0) { break; } |
| } |
| } |
| set.applyPattern(rules->tempSubStringBetween(i, j), errorCode); |
| if(U_FAILURE(errorCode)) { |
| errorCode = U_ZERO_ERROR; |
| setParseError("not a valid UnicodeSet pattern", errorCode); |
| return j; |
| } |
| j = skipWhiteSpace(j); |
| if(j == rules->length() || rules->charAt(j) != 0x5d) { |
| setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode); |
| return j; |
| } |
| return ++j; |
| } |
| |
| int32_t |
| CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const { |
| static const UChar sp = 0x20; |
| raw.remove(); |
| i = skipWhiteSpace(i); |
| for(;;) { |
| if(i >= rules->length()) { return 0; } |
| UChar c = rules->charAt(i); |
| if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_ |
| if(raw.isEmpty()) { return i; } |
| if(raw.endsWith(&sp, 1)) { // remove trailing space |
| raw.truncate(raw.length() - 1); |
| } |
| return i; |
| } |
| if(PatternProps::isWhiteSpace(c)) { |
| raw.append(sp); |
| i = skipWhiteSpace(i + 1); |
| } else { |
| raw.append(c); |
| ++i; |
| } |
| } |
| } |
| |
| int32_t |
| CollationRuleParser::skipComment(int32_t i) const { |
| // skip to past the newline |
| while(i < rules->length()) { |
| UChar c = rules->charAt(i++); |
| // LF or FF or CR or NEL or LS or PS |
| if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) { |
| // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS." |
| // NLF (new line function) = CR or LF or CR+LF or NEL. |
| // No need to collect all of CR+LF because a following LF will be ignored anyway. |
| break; |
| } |
| } |
| return i; |
| } |
| |
| void |
| CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return; } |
| // Error code consistent with the old parser (from ca. 2001), |
| // rather than U_PARSE_ERROR; |
| errorCode = U_INVALID_FORMAT_ERROR; |
| errorReason = reason; |
| if(parseError != NULL) { setErrorContext(); } |
| } |
| |
| void |
| CollationRuleParser::setErrorContext() { |
| if(parseError == NULL) { return; } |
| |
| // Note: This relies on the calling code maintaining the ruleIndex |
| // at a position that is useful for debugging. |
| // For example, at the beginning of a reset or relation etc. |
| parseError->offset = ruleIndex; |
| parseError->line = 0; // We are not counting line numbers. |
| |
| // before ruleIndex |
| int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1); |
| if(start < 0) { |
| start = 0; |
| } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) { |
| ++start; |
| } |
| int32_t length = ruleIndex - start; |
| rules->extract(start, length, parseError->preContext); |
| parseError->preContext[length] = 0; |
| |
| // starting from ruleIndex |
| length = rules->length() - ruleIndex; |
| if(length >= U_PARSE_CONTEXT_LEN) { |
| length = U_PARSE_CONTEXT_LEN - 1; |
| if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) { |
| --length; |
| } |
| } |
| rules->extract(ruleIndex, length, parseError->postContext); |
| parseError->postContext[length] = 0; |
| } |
| |
| UBool |
| CollationRuleParser::isSyntaxChar(UChar32 c) { |
| return 0x21 <= c && c <= 0x7e && |
| (c <= 0x2f || (0x3a <= c && c <= 0x40) || |
| (0x5b <= c && c <= 0x60) || (0x7b <= c)); |
| } |
| |
| int32_t |
| CollationRuleParser::skipWhiteSpace(int32_t i) const { |
| while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) { |
| ++i; |
| } |
| return i; |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif // !UCONFIG_NO_COLLATION |