| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ********************************************************************** |
| * Copyright (C) 2009-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| |
| #include "unicode/bytestream.h" |
| #include "unicode/utypes.h" |
| #include "unicode/ures.h" |
| #include "unicode/localpointer.h" |
| #include "unicode/putil.h" |
| #include "unicode/uenum.h" |
| #include "unicode/uloc.h" |
| #include "ustr_imp.h" |
| #include "bytesinkutil.h" |
| #include "charstr.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "putilimp.h" |
| #include "uinvchar.h" |
| #include "ulocimp.h" |
| #include "uassert.h" |
| |
| |
/* Node of a singly linked list; each node records one variant subtag. */
struct VariantListEntry {
    const char *variant;        /* text of the variant subtag */
    VariantListEntry *next;     /* following node, or nullptr at the tail */
};
| |
| /* struct holding a single attribute value */ |
| struct AttributeListEntry : public icu::UMemory { |
| const char *attribute; |
| struct AttributeListEntry *next; |
| }; |
| |
| /* struct holding a single extension */ |
| struct ExtensionListEntry : public icu::UMemory { |
| const char *key; |
| const char *value; |
| struct ExtensionListEntry *next; |
| }; |
| |
| #define MAXEXTLANG 3 |
| typedef struct ULanguageTag { |
| char *buf; /* holding parsed subtags */ |
| const char *language; |
| const char *extlang[MAXEXTLANG]; |
| const char *script; |
| const char *region; |
| VariantListEntry *variants; |
| ExtensionListEntry *extensions; |
| const char *privateuse; |
| const char *legacy; |
| } ULanguageTag; |
| |
/* Characters with a special role inside a BCP 47 language tag. */
#define MINLEN 2
#define SEP '-'            /* separates subtags */
#define PRIVATEUSE 'x'     /* private-use singleton; not accepted as an extension singleton */
#define LDMLEXT 'u'        /* singleton under which multi-letter (Unicode locale) keys are ordered */

/* Characters with a special role inside an ICU locale ID. */
#define LOCALE_SEP '_'
#define LOCALE_EXT_SEP '@'
#define LOCALE_KEYWORD_SEP ';'
#define LOCALE_KEY_TYPE_SEP '='

/* ASCII-only character classification used by the subtag validators. */
#define ISALPHA(c) uprv_isASCIILetter(c)
#define ISNUMERIC(c) ((c)>='0' && (c)<='9')
| |
/* Fixed strings shared by the conversion routines in this file. */
static const char EMPTY[] = "";                 /* initial value of the string fields of a ULanguageTag */
static const char LANG_UND[] = "und";           /* emitted when the language is empty or invalid */
static const char PRIVATEUSE_KEY[] = "x";
static const char _POSIX[] = "_POSIX";
static const char POSIX_KEY[] = "va";
static const char POSIX_VALUE[] = "posix";      /* lowercased variant that gets special POSIX handling */
static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";   /* keyword name carrying Unicode locale attributes */
static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
static const char LOCALE_TYPE_YES[] = "yes";

/* Length of LANG_UND, not counting the terminating NUL. */
#define LANG_UND_LEN 3
| |
| /* |
| Updated on 2018-09-12 from |
| https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . |
| |
| This table has 2 parts. The part for |
| legacy language tags (marked as “Type: grandfathered” in BCP 47) |
| is generated by the following scripts from the IANA language tag registry. |
| |
| curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\ |
| egrep -A 7 'Type: grandfathered' | \ |
| egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \ |
| awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\ |
| tr 'A-Z' 'a-z' |
| |
| |
| The 2nd part is made of four ICU-specific entries. They are kept for |
| backward compatibility for now, even though they have no preferred |
| values. They may have to be removed for strict BCP 47 compliance. |
| |
| */ |
/*
 * Flat list of string pairs: the entry at an even index is a legacy tag and
 * the entry immediately after it is the tag it is mapped to.
 */
static const char* const LEGACY[] = {
    /* legacy preferred */
    "art-lojban", "jbo",
    "en-gb-oed", "en-gb-oxendict",
    "i-ami", "ami",
    "i-bnn", "bnn",
    "i-hak", "hak",
    "i-klingon", "tlh",
    "i-lux", "lb",
    "i-navajo", "nv",
    "i-pwn", "pwn",
    "i-tao", "tao",
    "i-tay", "tay",
    "i-tsu", "tsu",
    "no-bok", "nb",
    "no-nyn", "nn",
    "sgn-be-fr", "sfb",
    "sgn-be-nl", "vgt",
    "sgn-ch-de", "sgg",
    "zh-guoyu", "cmn",
    "zh-hakka", "hak",
    "zh-min-nan", "nan",
    "zh-xiang", "hsn",

    // Legacy tags with no preferred value in the IANA
    // registry. Kept for now for the backward compatibility
    // because ICU has mapped them this way.
    "i-default", "en-x-i-default",
    "i-enochian", "und-x-i-enochian",
    "i-mingo", "see-x-i-mingo",
    "zh-min", "nan-x-zh-min",
};
| |
| /* |
| Updated on 2018-09-12 from |
| https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . |
| |
| The table lists redundant tags with preferred value in the IANA language tag registry. |
| It's generated with the following command: |
| |
| curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\ |
| grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \ |
| awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \ |
| tr 'A-Z' 'a-z' |
| |
| In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because |
| a variant tag 'hepburn-heploc' has the preferred subtag, 'alalc97'. |
| */ |
| |
/*
 * Flat list of string pairs: the entry at an even index is a redundant tag and
 * the entry immediately after it is its preferred replacement.
 */
static const char* const REDUNDANT[] = {
    // redundant preferred
    "sgn-br", "bzs",
    "sgn-co", "csn",
    "sgn-de", "gsg",
    "sgn-dk", "dsl",
    "sgn-es", "ssp",
    "sgn-fr", "fsl",
    "sgn-gb", "bfi",
    "sgn-gr", "gss",
    "sgn-ie", "isg",
    "sgn-it", "ise",
    "sgn-jp", "jsl",
    "sgn-mx", "mfs",
    "sgn-ni", "ncs",
    "sgn-nl", "dse",
    "sgn-no", "nsl",
    "sgn-pt", "psr",
    "sgn-se", "swl",
    "sgn-us", "ase",
    "sgn-za", "sfs",
    "zh-cmn", "cmn",
    "zh-cmn-hans", "cmn-hans",
    "zh-cmn-hant", "cmn-hant",
    "zh-gan", "gan",
    "zh-wuu", "wuu",
    "zh-yue", "yue",

    // variant tag with preferred value
    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
};
| |
| /* |
| Updated on 2018-09-12 from |
| https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . |
| |
| grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \ |
| grep -B1 'Preferred' | grep -v '^--' | \ |
| awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |
| |
| Make sure that 2-letter language subtags come before 3-letter subtags. |
| */ |
/*
 * Flat list of pairs: the entry at an even index is a deprecated language
 * subtag and the entry immediately after it is its replacement. Each entry
 * is stored in a 4-byte slot. The lookup in _appendLanguageToLanguageTag stops
 * at the first entry longer than the input, which is why the 2-letter
 * subtags must stay ahead of the 3-letter ones.
 */
static const char DEPRECATEDLANGS[][4] = {
    /* deprecated new */
    "in", "id",
    "iw", "he",
    "ji", "yi",
    "jw", "jv",
    "mo", "ro",
    "aam", "aas",
    "adp", "dz",
    "aue", "ktz",
    "ayx", "nun",
    "bgm", "bcg",
    "bjd", "drl",
    "ccq", "rki",
    "cjr", "mom",
    "cka", "cmr",
    "cmk", "xch",
    "coy", "pij",
    "cqu", "quh",
    "drh", "khk",
    "drw", "prs",
    "gav", "dev",
    "gfx", "vaj",
    "ggn", "gvr",
    "gti", "nyc",
    "guv", "duz",
    "hrr", "jal",
    "ibi", "opa",
    "ilw", "gal",
    "jeg", "oyb",
    "kgc", "tdf",
    "kgh", "kml",
    "koj", "kwv",
    "krm", "bmf",
    "ktr", "dtp",
    "kvs", "gdj",
    "kwq", "yam",
    "kxe", "tvd",
    "kzj", "dtp",
    "kzt", "dtp",
    "lii", "raq",
    "lmm", "rmx",
    "meg", "cir",
    "mst", "mry",
    "mwj", "vaj",
    "myt", "mry",
    "nad", "xny",
    "ncp", "kdz",
    "nnx", "ngv",
    "nts", "pij",
    "oun", "vaj",
    "pcr", "adx",
    "pmc", "huw",
    "pmu", "phr",
    "ppa", "bfy",
    "ppr", "lcq",
    "pry", "prt",
    "puz", "pub",
    "sca", "hle",
    "skk", "oyb",
    "tdu", "dtp",
    "thc", "tpo",
    "thx", "oyb",
    "tie", "ras",
    "tkk", "twm",
    "tlw", "weo",
    "tmp", "tyj",
    "tne", "kak",
    "tnf", "prs",
    "tsf", "taj",
    "uok", "ema",
    "xba", "cax",
    "xia", "acn",
    "xkh", "waw",
    "xsj", "suj",
    "ybd", "rki",
    "yma", "lrr",
    "ymt", "mtm",
    "yos", "zom",
    "yuu", "yug",
};
| |
| /* |
| Updated on 2018-04-24 from |
| |
| curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \ |
| grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \ |
| grep -B1 'Preferred' | \ |
| awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |
| */ |
/*
 * Flat list of pairs: the entry at an even index is a deprecated region
 * subtag and the entry immediately after it is its replacement. Each entry
 * is stored in a 3-byte slot.
 */
static const char DEPRECATEDREGIONS[][3] = {
    /* deprecated new */
    "BU", "MM",
    "DD", "DE",
    "FX", "FR",
    "TP", "TL",
    "YD", "YE",
    "ZR", "CD",
};
| |
| /* |
| * ------------------------------------------------- |
| * |
| * These ultag_ functions may be exposed as APIs later |
| * |
| * ------------------------------------------------- |
| */ |
| |
/*
 * Forward declarations of the file-local ultag_ functions. The declarations
 * wrapped in "#if 0" are compiled out.
 */

/* Produces a ULanguageTag from a tag string of tagLen characters. */
static ULanguageTag*
ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);

/* Closing function for a ULanguageTag; used by LocalULanguageTagPointer. */
static void
ultag_close(ULanguageTag* langtag);

/* Accessors for the individual components of a ULanguageTag. */
static const char*
ultag_getLanguage(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag);
#endif

static const char*
ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtlangSize(const ULanguageTag* langtag);

static const char*
ultag_getScript(const ULanguageTag* langtag);

static const char*
ultag_getRegion(const ULanguageTag* langtag);

static const char*
ultag_getVariant(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getVariantsSize(const ULanguageTag* langtag);

static const char*
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);

static const char*
ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtensionsSize(const ULanguageTag* langtag);

static const char*
ultag_getPrivateUse(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getLegacy(const ULanguageTag* langtag);
#endif
| |
U_NAMESPACE_BEGIN

/**
 * \class LocalULanguageTagPointer
 * "Smart pointer" class, closes a ULanguageTag via ultag_close().
 * For most methods see the LocalPointerBase base class.
 *
 * The class is produced by the U_DEFINE_LOCAL_OPEN_POINTER macro and is
 * declared between U_NAMESPACE_BEGIN and U_NAMESPACE_END.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @internal
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close);

U_NAMESPACE_END
| |
| /* |
| * ------------------------------------------------- |
| * |
| * Language subtag syntax validation functions |
| * |
| * ------------------------------------------------- |
| */ |
| |
| static UBool |
| _isAlphaString(const char* s, int32_t len) { |
| int32_t i; |
| for (i = 0; i < len; i++) { |
| if (!ISALPHA(*(s + i))) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| static UBool |
| _isNumericString(const char* s, int32_t len) { |
| int32_t i; |
| for (i = 0; i < len; i++) { |
| if (!ISNUMERIC(*(s + i))) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| static UBool |
| _isAlphaNumericString(const char* s, int32_t len) { |
| int32_t i; |
| for (i = 0; i < len; i++) { |
| if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| static UBool |
| _isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) { |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (len >= min && len <= max && _isAlphaNumericString(s, len)) { |
| return true; |
| } |
| return false; |
| } |
| |
| U_CFUNC UBool |
| ultag_isLanguageSubtag(const char* s, int32_t len) { |
| /* |
| * unicode_language_subtag = alpha{2,3} | alpha{5,8}; |
| * NOTE: Per ICUTC 2019/01/23- accepting alpha 4 |
| * See ICU-20372 |
| */ |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (len >= 2 && len <= 8 && _isAlphaString(s, len)) { |
| return true; |
| } |
| return false; |
| } |
| |
| static UBool |
| _isExtlangSubtag(const char* s, int32_t len) { |
| /* |
| * extlang = 3ALPHA ; selected ISO 639 codes |
| * *2("-" 3ALPHA) ; permanently reserved |
| */ |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (len == 3 && _isAlphaString(s, len)) { |
| return true; |
| } |
| return false; |
| } |
| |
| U_CFUNC UBool |
| ultag_isScriptSubtag(const char* s, int32_t len) { |
| /* |
| * script = 4ALPHA ; ISO 15924 code |
| */ |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (len == 4 && _isAlphaString(s, len)) { |
| return true; |
| } |
| return false; |
| } |
| |
| U_CFUNC UBool |
| ultag_isRegionSubtag(const char* s, int32_t len) { |
| /* |
| * region = 2ALPHA ; ISO 3166-1 code |
| * / 3DIGIT ; UN M.49 code |
| */ |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (len == 2 && _isAlphaString(s, len)) { |
| return true; |
| } |
| if (len == 3 && _isNumericString(s, len)) { |
| return true; |
| } |
| return false; |
| } |
| |
| static UBool |
| _isVariantSubtag(const char* s, int32_t len) { |
| /* |
| * variant = 5*8alphanum ; registered variants |
| * / (DIGIT 3alphanum) |
| */ |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) { |
| return true; |
| } |
| if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) { |
| return true; |
| } |
| return false; |
| } |
| |
| static UBool |
| _isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) { |
| const char *p = s; |
| const char *pSubtag = nullptr; |
| |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| |
| while ((p - s) < len) { |
| if (*p == SEP) { |
| if (pSubtag == nullptr) { |
| return false; |
| } |
| if (!test(pSubtag, (int32_t)(p - pSubtag))) { |
| return false; |
| } |
| pSubtag = nullptr; |
| } else if (pSubtag == nullptr) { |
| pSubtag = p; |
| } |
| p++; |
| } |
| if (pSubtag == nullptr) { |
| return false; |
| } |
| return test(pSubtag, (int32_t)(p - pSubtag)); |
| } |
| |
| U_CFUNC UBool |
| ultag_isVariantSubtags(const char* s, int32_t len) { |
| return _isSepListOf(&_isVariantSubtag, s, len); |
| } |
| |
| // This is for the ICU-specific "lvariant" handling. |
| static UBool |
| _isPrivateuseVariantSubtag(const char* s, int32_t len) { |
| /* |
| * variant = 1*8alphanum ; registered variants |
| * / (DIGIT 3alphanum) |
| */ |
| return _isAlphaNumericStringLimitedLength(s, len , 1, 8); |
| } |
| |
| static UBool |
| _isExtensionSingleton(const char* s, int32_t len) { |
| /* |
| * extension = singleton 1*("-" (2*8alphanum)) |
| * |
| * singleton = DIGIT ; 0 - 9 |
| * / %x41-57 ; A - W |
| * / %x59-5A ; Y - Z |
| * / %x61-77 ; a - w |
| * / %x79-7A ; y - z |
| */ |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (len == 1 && (ISALPHA(*s) || ISNUMERIC(*s)) && (uprv_tolower(*s) != PRIVATEUSE)) { |
| return true; |
| } |
| return false; |
| } |
| |
| static UBool |
| _isExtensionSubtag(const char* s, int32_t len) { |
| /* |
| * extension = singleton 1*("-" (2*8alphanum)) |
| */ |
| return _isAlphaNumericStringLimitedLength(s, len, 2, 8); |
| } |
| |
| U_CFUNC UBool |
| ultag_isExtensionSubtags(const char* s, int32_t len) { |
| return _isSepListOf(&_isExtensionSubtag, s, len); |
| } |
| |
| static UBool |
| _isPrivateuseValueSubtag(const char* s, int32_t len) { |
| /* |
| * privateuse = "x" 1*("-" (1*8alphanum)) |
| */ |
| return _isAlphaNumericStringLimitedLength(s, len, 1, 8); |
| } |
| |
| U_CFUNC UBool |
| ultag_isPrivateuseValueSubtags(const char* s, int32_t len) { |
| return _isSepListOf(&_isPrivateuseValueSubtag, s, len); |
| } |
| |
| U_CFUNC UBool |
| ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) { |
| /* |
| * attribute = alphanum{3,8} ; |
| */ |
| return _isAlphaNumericStringLimitedLength(s, len , 3, 8); |
| } |
| |
| U_CFUNC UBool |
| ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) { |
| return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len); |
| } |
| |
| U_CFUNC UBool |
| ultag_isUnicodeLocaleKey(const char* s, int32_t len) { |
| /* |
| * key = alphanum alpha ; |
| */ |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) { |
| return true; |
| } |
| return false; |
| } |
| |
| U_CFUNC UBool |
| _isUnicodeLocaleTypeSubtag(const char*s, int32_t len) { |
| /* |
| * alphanum{3,8} |
| */ |
| return _isAlphaNumericStringLimitedLength(s, len , 3, 8); |
| } |
| |
| U_CFUNC UBool |
| ultag_isUnicodeLocaleType(const char*s, int32_t len) { |
| /* |
| * type = alphanum{3,8} (sep alphanum{3,8})* ; |
| */ |
| return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len); |
| } |
| |
| static UBool |
| _isTKey(const char* s, int32_t len) |
| { |
| /* |
| * tkey = alpha digit ; |
| */ |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) { |
| return true; |
| } |
| return false; |
| } |
| |
| U_CAPI const char * U_EXPORT2 |
| ultag_getTKeyStart(const char *localeID) { |
| const char *result = localeID; |
| const char *sep; |
| while((sep = uprv_strchr(result, SEP)) != nullptr) { |
| if (_isTKey(result, static_cast<int32_t>(sep - result))) { |
| return result; |
| } |
| result = ++sep; |
| } |
| if (_isTKey(result, -1)) { |
| return result; |
| } |
| return nullptr; |
| } |
| |
| static UBool |
| _isTValue(const char* s, int32_t len) |
| { |
| /* |
| * tvalue = (sep alphanum{3,8})+ ; |
| */ |
| return _isAlphaNumericStringLimitedLength(s, len , 3, 8); |
| } |
| |
| static UBool |
| _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len) |
| { |
| const int32_t kStart = 0; // Start, wait for unicode_language_subtag, tkey or end |
| const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag, |
| // unicode_region_subtag, unicode_variant_subtag, tkey or end |
| const int32_t kGotScript = 2; // Got unicode_script_subtag, wait for unicode_region_subtag, |
| // unicode_variant_subtag, tkey, or end |
| const int32_t kGotRegion = 3; // Got unicode_region_subtag, wait for unicode_variant_subtag, |
| // tkey, or end. |
| const int32_t kGotVariant = 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag |
| // tkey or end. |
| const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here. |
| const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end |
| |
| |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| switch (state) { |
| case kStart: |
| if (ultag_isLanguageSubtag(s, len) && len != 4) { |
| state = kGotLanguage; |
| return true; |
| } |
| if (_isTKey(s, len)) { |
| state = kGotTKey; |
| return true; |
| } |
| return false; |
| case kGotLanguage: |
| if (ultag_isScriptSubtag(s, len)) { |
| state = kGotScript; |
| return true; |
| } |
| U_FALLTHROUGH; |
| case kGotScript: |
| if (ultag_isRegionSubtag(s, len)) { |
| state = kGotRegion; |
| return true; |
| } |
| U_FALLTHROUGH; |
| case kGotRegion: |
| U_FALLTHROUGH; |
| case kGotVariant: |
| if (_isVariantSubtag(s, len)) { |
| state = kGotVariant; |
| return true; |
| } |
| if (_isTKey(s, len)) { |
| state = kGotTKey; |
| return true; |
| } |
| return false; |
| case kGotTKey: |
| if (_isTValue(s, len)) { |
| state = kGotTValue; |
| return true; |
| } |
| return false; |
| case kGotTValue: |
| if (_isTKey(s, len)) { |
| state = kGotTKey; |
| return true; |
| } |
| if (_isTValue(s, len)) { |
| return true; |
| } |
| return false; |
| } |
| return false; |
| } |
| |
| static UBool |
| _isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len) |
| { |
| const int32_t kStart = 0; // Start, wait for a key or attribute or end |
| const int32_t kGotKey = 1; // Got a key, wait for type or key or end |
| const int32_t kGotType = 2; // Got a type, wait for key or end |
| |
| switch (state) { |
| case kStart: |
| if (ultag_isUnicodeLocaleKey(s, len)) { |
| state = kGotKey; |
| return true; |
| } |
| if (ultag_isUnicodeLocaleAttribute(s, len)) { |
| return true; |
| } |
| return false; |
| case kGotKey: |
| if (ultag_isUnicodeLocaleKey(s, len)) { |
| return true; |
| } |
| if (_isUnicodeLocaleTypeSubtag(s, len)) { |
| state = kGotType; |
| return true; |
| } |
| return false; |
| case kGotType: |
| if (ultag_isUnicodeLocaleKey(s, len)) { |
| state = kGotKey; |
| return true; |
| } |
| if (_isUnicodeLocaleTypeSubtag(s, len)) { |
| return true; |
| } |
| return false; |
| } |
| return false; |
| } |
| |
| static UBool |
| _isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len) |
| { |
| int32_t state = 0; |
| const char* p; |
| const char* start = s; |
| int32_t subtagLen = 0; |
| |
| if (len < 0) { |
| len = (int32_t)uprv_strlen(s); |
| } |
| |
| for (p = s; len > 0; p++, len--) { |
| if (*p == SEP) { |
| if (!test(state, start, subtagLen)) { |
| return false; |
| } |
| subtagLen = 0; |
| start = p + 1; |
| } else { |
| subtagLen++; |
| } |
| } |
| |
| if (test(state, start, subtagLen) && state >= 0) { |
| return true; |
| } |
| return false; |
| } |
| |
| U_CFUNC UBool |
| ultag_isTransformedExtensionSubtags(const char* s, int32_t len) |
| { |
| return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len); |
| } |
| |
| U_CFUNC UBool |
| ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) { |
| return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len); |
| } |
| |
| |
| /* |
| * ------------------------------------------------- |
| * |
| * Helper functions |
| * |
| * ------------------------------------------------- |
| */ |
| |
| static UBool |
| _addVariantToList(VariantListEntry **first, VariantListEntry *var) { |
| UBool bAdded = true; |
| |
| if (*first == nullptr) { |
| var->next = nullptr; |
| *first = var; |
| } else { |
| VariantListEntry *prev, *cur; |
| int32_t cmp; |
| |
| /* variants order should be preserved */ |
| prev = nullptr; |
| cur = *first; |
| while (true) { |
| if (cur == nullptr) { |
| prev->next = var; |
| var->next = nullptr; |
| break; |
| } |
| |
| /* Checking for duplicate variant */ |
| cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant); |
| if (cmp == 0) { |
| /* duplicated variant */ |
| bAdded = false; |
| break; |
| } |
| prev = cur; |
| cur = cur->next; |
| } |
| } |
| |
| return bAdded; |
| } |
| |
| static UBool |
| _addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) { |
| UBool bAdded = true; |
| |
| if (*first == nullptr) { |
| attr->next = nullptr; |
| *first = attr; |
| } else { |
| AttributeListEntry *prev, *cur; |
| int32_t cmp; |
| |
| /* reorder variants in alphabetical order */ |
| prev = nullptr; |
| cur = *first; |
| while (true) { |
| if (cur == nullptr) { |
| prev->next = attr; |
| attr->next = nullptr; |
| break; |
| } |
| cmp = uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute); |
| if (cmp < 0) { |
| if (prev == nullptr) { |
| *first = attr; |
| } else { |
| prev->next = attr; |
| } |
| attr->next = cur; |
| break; |
| } |
| if (cmp == 0) { |
| /* duplicated variant */ |
| bAdded = false; |
| break; |
| } |
| prev = cur; |
| cur = cur->next; |
| } |
| } |
| |
| return bAdded; |
| } |
| |
| |
| static UBool |
| _addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) { |
| UBool bAdded = true; |
| |
| if (*first == nullptr) { |
| ext->next = nullptr; |
| *first = ext; |
| } else { |
| ExtensionListEntry *prev, *cur; |
| int32_t cmp; |
| |
| /* reorder variants in alphabetical order */ |
| prev = nullptr; |
| cur = *first; |
| while (true) { |
| if (cur == nullptr) { |
| prev->next = ext; |
| ext->next = nullptr; |
| break; |
| } |
| if (localeToBCP) { |
| /* special handling for locale to bcp conversion */ |
| int32_t len, curlen; |
| |
| len = (int32_t)uprv_strlen(ext->key); |
| curlen = (int32_t)uprv_strlen(cur->key); |
| |
| if (len == 1 && curlen == 1) { |
| if (*(ext->key) == *(cur->key)) { |
| cmp = 0; |
| } else if (*(ext->key) == PRIVATEUSE) { |
| cmp = 1; |
| } else if (*(cur->key) == PRIVATEUSE) { |
| cmp = -1; |
| } else { |
| cmp = *(ext->key) - *(cur->key); |
| } |
| } else if (len == 1) { |
| cmp = *(ext->key) - LDMLEXT; |
| } else if (curlen == 1) { |
| cmp = LDMLEXT - *(cur->key); |
| } else { |
| cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key); |
| /* Both are u extension keys - we need special handling for 'attribute' */ |
| if (cmp != 0) { |
| if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) { |
| cmp = 1; |
| } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) { |
| cmp = -1; |
| } |
| } |
| } |
| } else { |
| cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key); |
| } |
| if (cmp < 0) { |
| if (prev == nullptr) { |
| *first = ext; |
| } else { |
| prev->next = ext; |
| } |
| ext->next = cur; |
| break; |
| } |
| if (cmp == 0) { |
| /* duplicated extension key */ |
| bAdded = false; |
| break; |
| } |
| prev = cur; |
| cur = cur->next; |
| } |
| } |
| |
| return bAdded; |
| } |
| |
| static void |
| _initializeULanguageTag(ULanguageTag* langtag) { |
| int32_t i; |
| |
| langtag->buf = nullptr; |
| |
| langtag->language = EMPTY; |
| for (i = 0; i < MAXEXTLANG; i++) { |
| langtag->extlang[i] = nullptr; |
| } |
| |
| langtag->script = EMPTY; |
| langtag->region = EMPTY; |
| |
| langtag->variants = nullptr; |
| langtag->extensions = nullptr; |
| |
| langtag->legacy = EMPTY; |
| langtag->privateuse = EMPTY; |
| } |
| |
| static void |
| _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { |
| char buf[ULOC_LANG_CAPACITY]; |
| UErrorCode tmpStatus = U_ZERO_ERROR; |
| int32_t len, i; |
| |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| |
| len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus); |
| if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| len = 0; |
| } |
| |
| /* Note: returned language code is in lower case letters */ |
| |
| if (len == 0) { |
| sink.Append(LANG_UND, LANG_UND_LEN); |
| } else if (!ultag_isLanguageSubtag(buf, len)) { |
| /* invalid language code */ |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| sink.Append(LANG_UND, LANG_UND_LEN); |
| } else { |
| /* resolve deprecated */ |
| for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) { |
| // 2-letter deprecated subtags are listede before 3-letter |
| // ones in DEPRECATEDLANGS[]. Get out of loop on coming |
| // across the 1st 3-letter subtag, if the input is a 2-letter code. |
| // to avoid continuing to try when there's no match. |
| if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break; |
| if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) { |
| uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]); |
| len = (int32_t)uprv_strlen(buf); |
| break; |
| } |
| } |
| sink.Append(buf, len); |
| } |
| } |
| |
| static void |
| _appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { |
| char buf[ULOC_SCRIPT_CAPACITY]; |
| UErrorCode tmpStatus = U_ZERO_ERROR; |
| int32_t len; |
| |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| |
| len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus); |
| if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| return; |
| } |
| |
| if (len > 0) { |
| if (!ultag_isScriptSubtag(buf, len)) { |
| /* invalid script code */ |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| return; |
| } else { |
| sink.Append("-", 1); |
| sink.Append(buf, len); |
| } |
| } |
| } |
| |
| static void |
| _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { |
| char buf[ULOC_COUNTRY_CAPACITY]; |
| UErrorCode tmpStatus = U_ZERO_ERROR; |
| int32_t len; |
| |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| |
| len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus); |
| if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| return; |
| } |
| |
| if (len > 0) { |
| if (!ultag_isRegionSubtag(buf, len)) { |
| /* invalid region code */ |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| return; |
| } else { |
| sink.Append("-", 1); |
| /* resolve deprecated */ |
| for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) { |
| if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) { |
| uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]); |
| len = (int32_t)uprv_strlen(buf); |
| break; |
| } |
| } |
| sink.Append(buf, len); |
| } |
| } |
| } |
| |
| static void _sortVariants(VariantListEntry* first) { |
| for (VariantListEntry* var1 = first; var1 != nullptr; var1 = var1->next) { |
| for (VariantListEntry* var2 = var1->next; var2 != nullptr; var2 = var2->next) { |
| // Swap var1->variant and var2->variant. |
| if (uprv_compareInvCharsAsAscii(var1->variant, var2->variant) > 0) { |
| const char* temp = var1->variant; |
| var1->variant = var2->variant; |
| var2->variant = temp; |
| } |
| } |
| } |
| } |
| |
/*
 * Writes the variant subtags of localeID to sink, each one preceded by "-".
 *
 * The variant part of the locale is copied into a local buffer, split in
 * place at '-' and '_', lowercased and validated piece by piece. Accepted
 * pieces are collected in a temporary list, sorted, written out and freed.
 *
 * - Does nothing when *status already reports a failure.
 * - When the variant part cannot be extracted, nothing is written; strict
 *   additionally sets U_ILLEGAL_ARGUMENT_ERROR.
 * - When the whole variant part is exactly "posix" (after lowercasing), it is
 *   not written; *hadPosix is set to true instead.
 * - Empty pieces, duplicated pieces and malformed pieces set
 *   U_ILLEGAL_ARGUMENT_ERROR when strict is set. Otherwise empty and
 *   duplicated pieces are dropped, and a malformed piece is either skipped
 *   or, when it has the form of a private-use subtag, ends the scan.
 * - A failed node allocation sets U_MEMORY_ALLOCATION_ERROR.
 * - Nothing is written to sink when *status reports a failure at the end of
 *   the scan.
 */
static void
_appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) {
    char buf[ULOC_FULLNAME_CAPACITY];
    UErrorCode tmpStatus = U_ZERO_ERROR;
    int32_t len, i;

    if (U_FAILURE(*status)) {
        return;
    }

    len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
    if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
        if (strict) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return;
    }

    if (len > 0) {
        char *p, *pVar;
        UBool bNext = true;
        VariantListEntry *var;
        VariantListEntry *varFirst = nullptr;

        /* pVar marks the start of the piece being scanned, nullptr between pieces */
        pVar = nullptr;
        p = buf;
        while (bNext) {
            if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
                if (*p == 0) {
                    /* end of the buffer: this is the last iteration */
                    bNext = false;
                } else {
                    *p = 0; /* terminate */
                }
                if (pVar == nullptr) {
                    if (strict) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        break;
                    }
                    /* ignore empty variant */
                } else {
                    /* ICU uses upper case letters for variants, but
                       the canonical format is lowercase in BCP47 */
                    for (i = 0; *(pVar + i) != 0; i++) {
                        *(pVar + i) = uprv_tolower(*(pVar + i));
                    }

                    /* validate */
                    if (_isVariantSubtag(pVar, -1)) {
                        /* len is the length of the whole variant part, so the else branch */
                        /* is taken only when "posix" is the entire variant part            */
                        if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) {
                            /* emit the variant to the list */
                            var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
                            if (var == nullptr) {
                                *status = U_MEMORY_ALLOCATION_ERROR;
                                break;
                            }
                            /* the node refers to the text inside buf; no copy is made */
                            var->variant = pVar;
                            if (!_addVariantToList(&varFirst, var)) {
                                /* duplicated variant */
                                uprv_free(var);
                                if (strict) {
                                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                                    break;
                                }
                            }
                        } else {
                            /* Special handling for POSIX variant, need to remember that we had it and then */
                            /* treat it like an extension later. */
                            *hadPosix = true;
                        }
                    } else if (strict) {
                        *status = U_ILLEGAL_ARGUMENT_ERROR;
                        break;
                    } else if (_isPrivateuseValueSubtag(pVar, -1)) {
                        /* Handle private use subtags separately */
                        break;
                    }
                }
                /* reset variant starting position */
                pVar = nullptr;
            } else if (pVar == nullptr) {
                pVar = p;
            }
            p++;
        }

        if (U_SUCCESS(*status)) {
            if (varFirst != nullptr) {
                int32_t varLen;

                /* per UTS35, we should sort the variants */
                _sortVariants(varFirst);

                /* write out validated/normalized variants to the target */
                var = varFirst;
                while (var != nullptr) {
                    sink.Append("-", 1);
                    varLen = (int32_t)uprv_strlen(var->variant);
                    sink.Append(var->variant, varLen);
                    var = var->next;
                }
            }
        }

        /* clean up: the list nodes are released on success and on failure alike */
        var = varFirst;
        while (var != nullptr) {
            VariantListEntry *tmpVar = var->next;
            uprv_free(var);
            var = tmpVar;
        }

        if (U_FAILURE(*status)) {
            return;
        }
    }
}
| /* |
| * Appends the keywords of localeID to sink as BCP 47 extension subtags. |
| * Each keyword value is read with ulocimp_getKeywordValue(). The special |
| * LOCALE_ATTRIBUTE_KEY keyword is split on '-' into attribute entries, keys |
| * longer than one character are mapped with uloc_toUnicodeLocaleKey() and |
| * uloc_toUnicodeLocaleType(), and one-character keys are validated as private |
| * use or as extension singletons. When hadPosix is true an additional |
| * POSIX_KEY, POSIX_VALUE entry is added to the list. |
| * In strict mode a rejected or duplicated keyword sets *status to |
| * U_ILLEGAL_ARGUMENT_ERROR; otherwise it is skipped. Nothing is written to |
| * sink unless *status is a success code after all keywords were collected. |
| */ |
| static void |
| _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) { |
| char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 }; /* scratch space for one attribute subtag */ |
| int32_t attrBufLength = 0; |
| |
| icu::MemoryPool<AttributeListEntry> attrPool; |
| icu::MemoryPool<ExtensionListEntry> extPool; |
| icu::MemoryPool<icu::CharString> strPool; |
| |
| icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, status)); |
| if (U_FAILURE(*status) && !hadPosix) { /* with hadPosix set, a failure does not return here */ |
| return; |
| } |
| if (keywordEnum.isValid() || hadPosix) { |
| /* collect the keywords into lists first, then write the lists out */ |
| int32_t len; |
| const char *key; |
| ExtensionListEntry *firstExt = nullptr; |
| ExtensionListEntry *ext; |
| AttributeListEntry *firstAttr = nullptr; |
| AttributeListEntry *attr; |
| icu::MemoryPool<icu::CharString> extBufPool; |
| const char *bcpKey=nullptr, *bcpValue=nullptr; |
| UErrorCode tmpStatus = U_ZERO_ERROR; |
| int32_t keylen; |
| UBool isBcpUExt; |
| |
| while (true) { |
| key = uenum_next(keywordEnum.getAlias(), nullptr, status); |
| if (key == nullptr) { |
| break; |
| } |
| |
| icu::CharString buf; |
| { |
| icu::CharStringByteSink sink(&buf); /* local sink shadowing the parameter; collects the value into buf */ |
| ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus); |
| } |
| len = buf.length(); |
| |
| if (U_FAILURE(tmpStatus)) { |
| if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| break; |
| } |
| /* ignore this keyword and reset the scratch status for the next one */ |
| tmpStatus = U_ZERO_ERROR; |
| continue; |
| } |
| |
| keylen = (int32_t)uprv_strlen(key); |
| isBcpUExt = (keylen > 1); /* keys longer than one character go through the Unicode locale key mapping below */ |
| |
| /* special keyword used for representing Unicode locale attributes */ |
| if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) { |
| if (len > 0) { |
| int32_t i = 0; |
| while (true) { |
| attrBufLength = 0; |
| for (; i < len; i++) { |
| if (buf[i] != '-') { |
| if (static_cast<size_t>(attrBufLength) < sizeof(attrBuf)) { |
| attrBuf[attrBufLength++] = buf[i]; |
| } else { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| } else { |
| i++; /* step over the '-' separator */ |
| break; |
| } |
| } |
| if (attrBufLength > 0) { |
| if (static_cast<size_t>(attrBufLength) < sizeof(attrBuf)) { |
| attrBuf[attrBufLength] = 0; |
| } else { |
| *status = U_STRING_NOT_TERMINATED_WARNING; /* attrBuf is completely full: no room for a terminator */ |
| } |
| |
| } else if (i >= len){ /* nothing collected and no input left */ |
| break; |
| } |
| |
| /* create an AttributeListEntry whose text is a pooled copy of attrBuf */ |
| attr = attrPool.create(); |
| if (attr == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| icu::CharString* attrValue = |
| strPool.create(attrBuf, attrBufLength, *status); |
| if (attrValue == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| if (U_FAILURE(*status)) { |
| break; |
| } |
| attr->attribute = attrValue->data(); |
| |
| if (!_addAttributeToList(&firstAttr, attr)) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| break; |
| } |
| } |
| } |
| /* key for a placeholder ExtensionListEntry; the output loop below |
| writes the attribute list when it reaches this entry */ |
| bcpKey = LOCALE_ATTRIBUTE_KEY; |
| bcpValue = nullptr; |
| } |
| } else if (isBcpUExt) { |
| bcpKey = uloc_toUnicodeLocaleKey(key); |
| if (bcpKey == nullptr) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| break; |
| } |
| continue; |
| } |
| |
| /* NOTE(review): relies on buf.data() being NUL-terminated - confirm against CharString */ |
| bcpValue = uloc_toUnicodeLocaleType(key, buf.data()); |
| if (bcpValue == nullptr) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| break; |
| } |
| continue; |
| } |
| if (bcpValue == buf.data()) { |
| /* |
| When uloc_toUnicodeLocaleType(key, buf) returns the |
| input value as is, the value is well-formed, but has |
| no known mapping. This implementation normalizes the |
| value to lower case, using a copy held by extBufPool. |
| */ |
| icu::CharString* extBuf = extBufPool.create(buf, tmpStatus); |
| |
| if (extBuf == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| if (U_FAILURE(tmpStatus)) { |
| *status = tmpStatus; |
| break; |
| } |
| |
| T_CString_toLowerCase(extBuf->data()); |
| bcpValue = extBuf->data(); |
| } |
| } else { |
| if (*key == PRIVATEUSE) { |
| if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| break; |
| } |
| continue; |
| } |
| } else { |
| if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| break; |
| } |
| continue; |
| } |
| } |
| bcpKey = key; |
| icu::CharString* extBuf = |
| extBufPool.create(buf.data(), len, tmpStatus); |
| if (extBuf == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| if (U_FAILURE(tmpStatus)) { |
| *status = tmpStatus; |
| break; |
| } |
| bcpValue = extBuf->data(); |
| } |
| |
| /* create an ExtensionListEntry for the key and value resolved above */ |
| ext = extPool.create(); |
| if (ext == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| ext->key = bcpKey; |
| ext->value = bcpValue; |
| |
| if (!_addExtensionToList(&firstExt, ext, true)) { |
| if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| break; |
| } |
| } |
| } |
| |
| /* Special handling for POSIX variant - add the keywords for POSIX */ |
| if (hadPosix) { |
| /* create ExtensionListEntry for POSIX */ |
| ext = extPool.create(); |
| if (ext == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| ext->key = POSIX_KEY; |
| ext->value = POSIX_VALUE; |
| |
| if (!_addExtensionToList(&firstExt, ext, true)) { |
| // A rejected POSIX entry is silently ignored, even in strict mode. |
| } |
| } |
| |
| if (U_SUCCESS(*status) && (firstExt != nullptr || firstAttr != nullptr)) { |
| UBool startLDMLExtension = false; |
| for (ext = firstExt; ext; ext = ext->next) { |
| if (!startLDMLExtension && uprv_strlen(ext->key) > 1) { |
| /* first LDML u singleton extension */ |
| sink.Append("-u", 2); |
| startLDMLExtension = true; |
| } |
| |
| /* write out the sorted BCP47 attributes, extensions and private use */ |
| if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) { |
| /* write the value for the attributes */ |
| for (attr = firstAttr; attr; attr = attr->next) { |
| sink.Append("-", 1); |
| sink.Append( |
| attr->attribute, static_cast<int32_t>(uprv_strlen(attr->attribute))); |
| } |
| } else { |
| sink.Append("-", 1); |
| sink.Append(ext->key, static_cast<int32_t>(uprv_strlen(ext->key))); |
| if (uprv_strcmp(ext->value, "true") != 0 && |
| uprv_strcmp(ext->value, "yes") != 0) { /* the values "true" and "yes" are left out; only the key is written */ |
| sink.Append("-", 1); |
| sink.Append(ext->value, static_cast<int32_t>(uprv_strlen(ext->value))); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Append keywords parsed from LDML extension value |
| * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional} |
| * |
| * ldmlext is scanned as '-'-separated subtags. Subtags in front of the first |
| * Unicode locale key are collected as attributes and emitted as a single |
| * LOCALE_ATTRIBUTE_KEY keyword; the remaining subtags are read as key/type |
| * pairs and converted with uloc_toLegacyKey() and uloc_toLegacyType(). |
| * List entries are allocated from extPool, strings created here are stored |
| * in kwdBuf, and the collected keywords are finally merged into *appendTo. |
| * |
| * On input *posixVariant tells whether variants already exist. On output it |
| * is true only when no variants existed and a POSIX_KEY, POSIX_VALUE pair was |
| * found; that pair is then not added as a keyword. |
| * On failure *status is set and the function returns before anything is |
| * merged into *appendTo. |
| */ |
| static void |
| _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool<ExtensionListEntry>& extPool, icu::MemoryPool<icu::CharString>& kwdBuf, UBool *posixVariant, UErrorCode *status) { |
| const char *pTag; /* beginning of current subtag */ |
| const char *pKwds; /* beginning of key-type pairs */ |
| UBool variantExists = *posixVariant; /* input meaning of the flag, saved before it is reset below */ |
| |
| ExtensionListEntry *kwdFirst = nullptr; /* first LDML keyword */ |
| ExtensionListEntry *kwd, *nextKwd; |
| |
| int32_t len; |
| |
| /* Reset the posixVariant value */ |
| *posixVariant = false; |
| |
| pTag = ldmlext; |
| pKwds = nullptr; |
| |
| { |
| AttributeListEntry *attrFirst = nullptr; /* first attribute */ |
| AttributeListEntry *attr, *nextAttr; |
| |
| char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; /* holds all attribute strings back to back, each NUL-terminated */ |
| int32_t attrBufIdx = 0; |
| |
| icu::MemoryPool<AttributeListEntry> attrPool; |
| |
| /* Iterate through u extension attributes */ |
| while (*pTag) { |
| /* locate next separator char */ |
| for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); |
| |
| if (ultag_isUnicodeLocaleKey(pTag, len)) { |
| pKwds = pTag; |
| break; |
| } |
| |
| /* add this attribute to the list */ |
| attr = attrPool.create(); |
| if (attr == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) { /* room for len characters plus the terminator */ |
| uprv_memcpy(&attrBuf[attrBufIdx], pTag, len); |
| attrBuf[attrBufIdx + len] = 0; |
| attr->attribute = &attrBuf[attrBufIdx]; |
| attrBufIdx += (len + 1); |
| } else { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| // duplicate attribute is ignored, causes no error. |
| _addAttributeToList(&attrFirst, attr); |
| |
| /* next tag */ |
| pTag += len; |
| if (*pTag) { |
| /* next to the separator */ |
| pTag++; |
| } |
| } |
| |
| if (attrFirst) { |
| /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */ |
| |
| kwd = extPool.create(); |
| if (kwd == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| icu::CharString* value = kwdBuf.create(); |
| if (value == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| /* join the attribute subtags with '-' in list order to form the keyword value */ |
| attr = attrFirst; |
| while (attr != nullptr) { |
| nextAttr = attr->next; |
| if (attr != attrFirst) { |
| value->append('-', *status); |
| } |
| value->append(attr->attribute, *status); |
| attr = nextAttr; |
| } |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| |
| kwd->key = LOCALE_ATTRIBUTE_KEY; |
| kwd->value = value->data(); |
| |
| if (!_addExtensionToList(&kwdFirst, kwd, false)) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| } |
| } |
| |
| if (pKwds) { |
| const char *pBcpKey = nullptr; /* u extension key subtag */ |
| const char *pBcpType = nullptr; /* beginning of u extension type subtag(s) */ |
| int32_t bcpKeyLen = 0; |
| int32_t bcpTypeLen = 0; |
| UBool isDone = false; |
| |
| pTag = pKwds; |
| /* BCP47 representation of LDML key/type pairs */ |
| while (!isDone) { |
| const char *pNextBcpKey = nullptr; |
| int32_t nextBcpKeyLen = 0; |
| UBool emitKeyword = false; |
| |
| if (*pTag) { |
| /* locate next separator char */ |
| for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); |
| |
| if (ultag_isUnicodeLocaleKey(pTag, len)) { |
| if (pBcpKey) { |
| emitKeyword = true; /* a new key ends the pending key/type pair */ |
| pNextBcpKey = pTag; |
| nextBcpKeyLen = len; |
| } else { |
| pBcpKey = pTag; |
| bcpKeyLen = len; |
| } |
| } else { |
| U_ASSERT(pBcpKey != nullptr); |
| /* within LDML type subtags */ |
| if (pBcpType) { |
| bcpTypeLen += (len + 1); /* +1 for the separator between type subtags */ |
| } else { |
| pBcpType = pTag; |
| bcpTypeLen = len; |
| } |
| } |
| |
| /* next tag */ |
| pTag += len; |
| if (*pTag) { |
| /* next to the separator */ |
| pTag++; |
| } |
| } else { |
| /* processing last one */ |
| emitKeyword = true; |
| isDone = true; |
| } |
| |
| if (emitKeyword) { |
| const char *pKey = nullptr; /* LDML key */ |
| const char *pType = nullptr; /* LDML type */ |
| |
| char bcpKeyBuf[3]; /* BCP key length is always 2 for now */ |
| |
| U_ASSERT(pBcpKey != nullptr); |
| |
| if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) { |
| /* the BCP key is invalid */ |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| U_ASSERT(bcpKeyLen <= 2); |
| |
| uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen); |
| bcpKeyBuf[bcpKeyLen] = 0; |
| |
| /* u extension key to LDML key */ |
| pKey = uloc_toLegacyKey(bcpKeyBuf); |
| if (pKey == nullptr) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| if (pKey == bcpKeyBuf) { |
| /* |
| The key returned by toLegacyKey points to the input buffer. |
| We normalize the result key to lower case and keep a copy |
| in kwdBuf, because bcpKeyBuf is a local array. |
| */ |
| T_CString_toLowerCase(bcpKeyBuf); |
| icu::CharString* key = kwdBuf.create(bcpKeyBuf, bcpKeyLen, *status); |
| if (key == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| pKey = key->data(); |
| } |
| |
| if (pBcpType) { |
| char bcpTypeBuf[128]; /* practically long enough even considering multiple subtag type */ |
| if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) { |
| /* the BCP type is too long */ |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen); |
| bcpTypeBuf[bcpTypeLen] = 0; |
| |
| /* BCP type to locale type */ |
| pType = uloc_toLegacyType(pKey, bcpTypeBuf); |
| if (pType == nullptr) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| if (pType == bcpTypeBuf) { |
| /* |
| The type returned by toLegacyType points to the input buffer. |
| We normalize the result type to lower case and keep a copy |
| in kwdBuf, because bcpTypeBuf is a local array. |
| */ |
| /* normalize to lower case */ |
| T_CString_toLowerCase(bcpTypeBuf); |
| icu::CharString* type = kwdBuf.create(bcpTypeBuf, bcpTypeLen, *status); |
| if (type == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| pType = type->data(); |
| } |
| } else { |
| /* typeless - the type defaults to LOCALE_TYPE_YES */ |
| pType = LOCALE_TYPE_YES; |
| } |
| |
| /* Special handling for u-va-posix, since we want to treat this as a variant, |
| not as a keyword */ |
| if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) { |
| *posixVariant = true; |
| } else { |
| /* create an ExtensionListEntry for this keyword */ |
| kwd = extPool.create(); |
| if (kwd == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| kwd->key = pKey; |
| kwd->value = pType; |
| |
| if (!_addExtensionToList(&kwdFirst, kwd, false)) { |
| // duplicate keyword is allowed, Only the first |
| // is honored. |
| } |
| } |
| |
| pBcpKey = pNextBcpKey; /* the key that triggered this emit starts the next pair */ |
| bcpKeyLen = pNextBcpKey != nullptr ? nextBcpKeyLen : 0; |
| pBcpType = nullptr; |
| bcpTypeLen = 0; |
| } |
| } |
| } |
| |
| kwd = kwdFirst; /* merge the keywords collected here into the caller's list */ |
| while (kwd != nullptr) { |
| nextKwd = kwd->next; /* saved first: the entry is handed to another list below */ |
| _addExtensionToList(appendTo, kwd, false); |
| kwd = nextKwd; |
| } |
| } |
| |
| |
| /* |
|  * Writes the extensions and the private use value of a parsed language tag |
|  * to sink in locale ID keyword syntax ("@key=value;key=value"). |
|  * The value of the LDMLEXT extension is expanded through |
|  * _appendLDMLExtensionAsKeywords(); every other extension and the private |
|  * use value become one keyword each. If that expansion reports a POSIX |
|  * variant, _POSIX is written in front of the keywords. |
|  * Nothing is written when *status is, or becomes, a failure code. |
|  */ |
| static void |
| _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) { |
|     ExtensionListEntry *sortedHead = nullptr; |
|     icu::MemoryPool<ExtensionListEntry> extPool; |
|     icu::MemoryPool<icu::CharString> kwdBuf; |
|     UBool posixVariant = false; |
| |
|     if (U_FAILURE(*status)) { |
|         return; |
|     } |
| |
|     /* resolve locale keywords and reordering keys */ |
|     const int32_t extCount = ultag_getExtensionsSize(langtag); |
|     for (int32_t idx = 0; idx < extCount; ++idx) { |
|         const char *extKey = ultag_getExtensionKey(langtag, idx); |
|         const char *extValue = ultag_getExtensionValue(langtag, idx); |
| |
|         if (*extKey != LDMLEXT) { |
|             /* any other extension is taken over unchanged */ |
|             ExtensionListEntry *entry = extPool.create(); |
|             if (entry == nullptr) { |
|                 *status = U_MEMORY_ALLOCATION_ERROR; |
|                 break; |
|             } |
|             entry->key = extKey; |
|             entry->value = extValue; |
|             if (!_addExtensionToList(&sortedHead, entry, false)) { |
|                 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|                 break; |
|             } |
|             continue; |
|         } |
| |
|         /* tell the LDML expansion whether variants already exist */ |
|         if (ultag_getVariantsSize(langtag)) { |
|             posixVariant = true; |
|         } |
|         _appendLDMLExtensionAsKeywords(extValue, &sortedHead, extPool, kwdBuf, &posixVariant, status); |
|         if (U_FAILURE(*status)) { |
|             break; |
|         } |
|     } |
| |
|     if (U_FAILURE(*status)) { |
|         return; |
|     } |
| |
|     /* add private use as a keyword */ |
|     const char *privateUse = ultag_getPrivateUse(langtag); |
|     if (static_cast<int32_t>(uprv_strlen(privateUse)) > 0) { |
|         ExtensionListEntry *entry = extPool.create(); |
|         if (entry == nullptr) { |
|             *status = U_MEMORY_ALLOCATION_ERROR; |
|         } else { |
|             entry->key = PRIVATEUSE_KEY; |
|             entry->value = privateUse; |
|             if (!_addExtensionToList(&sortedHead, entry, false)) { |
|                 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|             } |
|         } |
|     } |
| |
|     if (U_FAILURE(*status)) { |
|         return; |
|     } |
| |
|     /* If a POSIX variant was in the extensions, write it out before writing the keywords. */ |
|     if (posixVariant) { |
|         sink.Append(_POSIX, static_cast<int32_t>(uprv_strlen(_POSIX))); |
|     } |
| |
|     /* write out the sorted keywords: '@' before the first, ';' before the others */ |
|     for (const ExtensionListEntry *cur = sortedHead; cur != nullptr; cur = cur->next) { |
|         sink.Append(cur == sortedHead ? "@" : ";", 1); |
|         sink.Append(cur->key, static_cast<int32_t>(uprv_strlen(cur->key))); |
|         sink.Append("=", 1); |
|         sink.Append(cur->value, static_cast<int32_t>(uprv_strlen(cur->value))); |
|     } |
| } |
| |
| /* |
|  * Appends the parts of the locale variant that are not written as BCP 47 |
|  * variants as a private use sequence: |
|  * "-" PRIVATEUSE_KEY "-" PRIVUSE_VARIANT_PREFIX followed by "-" subtag for |
|  * each remaining subtag. |
|  * The variant is split on SEP and LOCALE_SEP and lower-cased. Subtags that |
|  * pass _isVariantSubtag() are skipped until the first one is written; after |
|  * that every subtag is written. A subtag that fails |
|  * _isPrivateuseValueSubtag() ends the scan and, in strict mode, sets *status |
|  * to U_ILLEGAL_ARGUMENT_ERROR. hadPosix is not used. |
|  */ |
| static void |
| _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) { |
|     (void)hadPosix; |
|     char buf[ULOC_FULLNAME_CAPACITY]; |
|     UErrorCode tmpStatus = U_ZERO_ERROR; |
| |
|     if (U_FAILURE(*status)) { |
|         return; |
|     } |
| |
|     int32_t len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus); |
|     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { |
|         if (strict) { |
|             *status = U_ILLEGAL_ARGUMENT_ERROR; |
|         } |
|         return; |
|     } |
|     if (len <= 0) { |
|         return; |
|     } |
| |
|     UBool prefixPending = true;   /* the private use prefix has not been written yet */ |
|     char *subtag = nullptr;       /* start of the subtag being scanned */ |
| |
|     for (char *cursor = buf; ; cursor++) { |
|         const char ch = *cursor; |
|         if (ch != SEP && ch != LOCALE_SEP && ch != 0) { |
|             if (subtag == nullptr) { |
|                 subtag = cursor; |
|             } |
|             continue; |
|         } |
| |
|         const UBool atEnd = (ch == 0); |
|         if (!atEnd) { |
|             *cursor = 0; /* terminate */ |
|         } |
| |
|         if (subtag != nullptr) { |
|             /* Private use in the canonical format is lowercase in BCP47 */ |
|             for (char *q = subtag; *q != 0; q++) { |
|                 *q = uprv_tolower(*q); |
|             } |
| |
|             /* validate */ |
|             if (!_isPrivateuseValueSubtag(subtag, -1)) { |
|                 if (strict) { |
|                     *status = U_ILLEGAL_ARGUMENT_ERROR; |
|                 } |
|                 break; |
|             } |
| |
|             if (!prefixPending || !_isVariantSubtag(subtag, -1)) { |
|                 sink.Append("-", 1); |
| |
|                 if (prefixPending) { |
|                     sink.Append(PRIVATEUSE_KEY, UPRV_LENGTHOF(PRIVATEUSE_KEY) - 1); |
|                     sink.Append("-", 1); |
|                     sink.Append(PRIVUSE_VARIANT_PREFIX, UPRV_LENGTHOF(PRIVUSE_VARIANT_PREFIX) - 1); |
|                     sink.Append("-", 1); |
|                     prefixPending = false; |
|                 } |
| |
|                 len = static_cast<int32_t>(uprv_strlen(subtag)); |
|                 sink.Append(subtag, len); |
|             } |
| |
|             /* reset private use starting position */ |
|             subtag = nullptr; |
|         } |
| |
|         if (atEnd) { |
|             break; |
|         } |
|     } |
| } |
| |
| /* |
| * ------------------------------------------------- |
| * |
| * ultag_ functions |
| * |
| * ------------------------------------------------- |
| */ |
| |
| /* Bit flags used by the parser. ultag_parse() keeps a mask of these in |
| * "next"; each set bit names a kind of subtag accepted at the current |
| * position. */ |
| #define LANG 0x0001 /* language subtag */ |
| #define EXTL 0x0002 /* extlang subtag */ |
| #define SCRT 0x0004 /* script subtag */ |
| #define REGN 0x0008 /* region subtag */ |
| #define VART 0x0010 /* variant subtag */ |
| #define EXTS 0x0020 /* extension singleton */ |
| #define EXTV 0x0040 /* extension value subtag */ |
| #define PRIV 0x0080 /* private use introducer (PRIVATEUSE) */ |
| |
| /** |
| * Ticket #12705 - The optimizer in Visual Studio 2015 Update 3 has problems optimizing this function. |
| * As a work-around, optimization is disabled for this function on VS2015 and VS2017. |
| * This work-around should be removed once the following versions of Visual Studio are no |
| * longer supported: All versions of VS2015/VS2017, and versions of VS2019 below 16.4. |
| */ |
| #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924) |
| #pragma optimize( "", off ) |
| #endif |
| /* |
| * Parses a BCP 47 language tag into a newly allocated ULanguageTag. |
| * The first tagLen bytes of tag (uprv_strlen(tag) when tagLen is negative) |
| * are copied into a buffer owned by the result; the parsed subtags point |
| * into that buffer. A leading LEGACY or REDUNDANT tag is replaced by its |
| * mapped form before parsing. Parsing stops at the first subtag that is not |
| * accepted. If parsedLen is not nullptr it receives the length of the |
| * accepted part, expressed in terms of the original input. |
| * Returns nullptr when *status is already a failure or when an allocation |
| * fails (then *status is U_MEMORY_ALLOCATION_ERROR). An input shorter than |
| * MINLEN yields an initialized but otherwise empty ULanguageTag. |
| */ |
| static ULanguageTag* |
| ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) { |
| char *tagBuf; |
| int16_t next; |
| char *pSubtag, *pNext, *pLastGoodPosition; |
| int32_t subtagLen; |
| int32_t extlangIdx; |
| ExtensionListEntry *pExtension; |
| char *pExtValueSubtag, *pExtValueSubtagEnd; |
| int32_t i; |
| UBool privateuseVar = false; |
| int32_t legacyLen = 0; |
| |
| if (parsedLen != nullptr) { |
| *parsedLen = 0; |
| } |
| |
| if (U_FAILURE(*status)) { |
| return nullptr; |
| } |
| |
| if (tagLen < 0) { |
| tagLen = (int32_t)uprv_strlen(tag); |
| } |
| |
| /* copy the entire string */ |
| tagBuf = (char*)uprv_malloc(tagLen + 1); /* +1 for the terminating NUL */ |
| if (tagBuf == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| |
| if (tagLen > 0) { |
| uprv_memcpy(tagBuf, tag, tagLen); |
| } |
| *(tagBuf + tagLen) = 0; |
| |
| /* create a ULanguageTag */ |
| icu::LocalULanguageTagPointer t( |
| (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag))); |
| if (t.isNull()) { |
| uprv_free(tagBuf); |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| _initializeULanguageTag(t.getAlias()); |
| t->buf = tagBuf; |
| |
| if (tagLen < MINLEN) { |
| /* the input tag is too short - return empty ULanguageTag */ |
| return t.orphan(); |
| } |
| |
| size_t parsedLenDelta = 0; /* length of a replaced prefix in the input minus the length of its replacement */ |
| // Legacy tags are matched as a whole. A legacy tag with an intervening |
| // script or region, such as art-DE-lojban or art-Latn-lojban, won't be |
| // matched. |
| /* check if the tag is legacy */ |
| for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) { /* LEGACY is read in pairs: [i] is matched, [i + 1] replaces it */ |
| int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i])); |
| if (tagLen < checkLegacyLen) { |
| continue; |
| } |
| if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') { |
| // make sure next char is '-'. |
| continue; |
| } |
| if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) { |
| int32_t newTagLength; |
| |
| legacyLen = checkLegacyLen; /* back up for output parsedLen */ |
| int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1])); |
| newTagLength = replacementLen + tagLen - checkLegacyLen; |
| int32_t oldTagLength = tagLen; |
| if (tagLen < newTagLength) { |
| uprv_free(tagBuf); |
| // Update t->buf right after the free and before any return, so that |
| // the destructor of t does not free the old buffer a second time. |
| t->buf = tagBuf = (char*)uprv_malloc(newTagLength + 1); |
| if (tagBuf == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| tagLen = newTagLength; |
| } |
| parsedLenDelta = checkLegacyLen - replacementLen; |
| uprv_strcpy(t->buf, LEGACY[i + 1]); |
| if (checkLegacyLen != tagLen) { |
| uprv_memcpy(t->buf + replacementLen, tag + checkLegacyLen, |
| oldTagLength - checkLegacyLen); /* the remainder is copied from the caller's tag, not from tagBuf */ |
| // NUL-terminate after memcpy(). |
| t->buf[replacementLen + oldTagLength - checkLegacyLen] = 0; |
| } |
| break; |
| } |
| } |
| |
| if (legacyLen == 0) { |
| for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) { |
| const char* redundantTag = REDUNDANT[i]; |
| size_t redundantTagLen = uprv_strlen(redundantTag); |
| // The preferred tag for a redundant tag is always shorter than redundant |
| // tag. A redundant tag may or may not be followed by other subtags. |
| // (i.e. "zh-yue" or "zh-yue-u-co-pinyin"). |
| if (uprv_strnicmp(redundantTag, tagBuf, static_cast<uint32_t>(redundantTagLen)) == 0) { |
| const char* redundantTagEnd = tagBuf + redundantTagLen; |
| if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) { |
| const char* preferredTag = REDUNDANT[i + 1]; |
| size_t preferredTagLen = uprv_strlen(preferredTag); |
| uprv_memcpy(t->buf, preferredTag, preferredTagLen); |
| if (*redundantTagEnd == SEP) { |
| uprv_memmove(tagBuf + preferredTagLen, |
| redundantTagEnd, |
| tagLen - redundantTagLen + 1); /* +1 moves the terminating NUL as well */ |
| } else { |
| tagBuf[preferredTagLen] = '\0'; |
| } |
| // parsedLen should be the length of the input |
| // before redundantTag is replaced by preferredTag. |
| // Save the delta to add it back later. |
| parsedLenDelta = redundantTagLen - preferredTagLen; |
| break; |
| } |
| } |
| } |
| } |
| |
| /* |
| * langtag = language |
| * ["-" script] |
| * ["-" region] |
| * *("-" variant) |
| * *("-" extension) |
| * ["-" privateuse] |
| */ |
| |
| next = LANG | PRIV; /* the first subtag must be a language or the private use introducer */ |
| pNext = pLastGoodPosition = tagBuf; |
| extlangIdx = 0; |
| pExtension = nullptr; |
| pExtValueSubtag = nullptr; |
| pExtValueSubtagEnd = nullptr; |
| |
| while (pNext) { |
| char *pSep; |
| |
| pSubtag = pNext; |
| |
| /* locate next separator char */ |
| pSep = pSubtag; |
| while (*pSep) { |
| if (*pSep == SEP) { |
| break; |
| } |
| pSep++; |
| } |
| if (*pSep == 0) { |
| /* last subtag */ |
| pNext = nullptr; |
| } else { |
| pNext = pSep + 1; |
| } |
| subtagLen = (int32_t)(pSep - pSubtag); |
| |
| if (next & LANG) { |
| if (ultag_isLanguageSubtag(pSubtag, subtagLen)) { |
| *pSep = 0; /* terminate */ |
| // TODO: move deprecated language code handling here. |
| t->language = T_CString_toLowerCase(pSubtag); |
| |
| pLastGoodPosition = pSep; |
| next = SCRT | REGN | VART | EXTS | PRIV; |
| if (subtagLen <= 3) |
| next |= EXTL; |
| continue; |
| } |
| } |
| if (next & EXTL) { |
| if (_isExtlangSubtag(pSubtag, subtagLen)) { |
| *pSep = 0; |
| t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag); |
| |
| pLastGoodPosition = pSep; |
| if (extlangIdx < 3) { /* 3 is the value of MAXEXTLANG, the size of t->extlang */ |
| next = EXTL | SCRT | REGN | VART | EXTS | PRIV; |
| } else { |
| next = SCRT | REGN | VART | EXTS | PRIV; |
| } |
| continue; |
| } |
| } |
| if (next & SCRT) { |
| if (ultag_isScriptSubtag(pSubtag, subtagLen)) { |
| char *p = pSubtag; |
| |
| *pSep = 0; |
| |
| /* to title case */ |
| *p = uprv_toupper(*p); |
| p++; |
| for (; *p; p++) { |
| *p = uprv_tolower(*p); |
| } |
| |
| t->script = pSubtag; |
| |
| pLastGoodPosition = pSep; |
| next = REGN | VART | EXTS | PRIV; |
| continue; |
| } |
| } |
| if (next & REGN) { |
| if (ultag_isRegionSubtag(pSubtag, subtagLen)) { |
| *pSep = 0; |
| // TODO: move deprecated region code handling here. |
| t->region = T_CString_toUpperCase(pSubtag); |
| |
| pLastGoodPosition = pSep; |
| next = VART | EXTS | PRIV; |
| continue; |
| } |
| } |
| if (next & VART) { |
| if (_isVariantSubtag(pSubtag, subtagLen) || |
| (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) { |
| VariantListEntry *var; |
| UBool isAdded; |
| |
| var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry)); |
| if (var == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| *pSep = 0; |
| var->variant = T_CString_toUpperCase(pSubtag); |
| isAdded = _addVariantToList(&(t->variants), var); |
| if (!isAdded) { |
| /* duplicated variant entry */ |
| uprv_free(var); |
| break; |
| } |
| pLastGoodPosition = pSep; |
| next = VART | EXTS | PRIV; |
| continue; |
| } |
| } |
| if (next & EXTS) { |
| if (_isExtensionSingleton(pSubtag, subtagLen)) { |
| if (pExtension != nullptr) { |
| if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) { |
| /* the previous extension is incomplete */ |
| uprv_free(pExtension); |
| pExtension = nullptr; |
| break; |
| } |
| |
| /* terminate the previous extension value */ |
| *pExtValueSubtagEnd = 0; |
| pExtension->value = T_CString_toLowerCase(pExtValueSubtag); |
| |
| /* insert the extension to the list */ |
| if (_addExtensionToList(&(t->extensions), pExtension, false)) { |
| pLastGoodPosition = pExtValueSubtagEnd; |
| } else { |
| /* stop parsing here */ |
| uprv_free(pExtension); |
| pExtension = nullptr; |
| break; |
| } |
| } |
| |
| /* create a new extension */ |
| pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry)); |
| if (pExtension == nullptr) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return nullptr; |
| } |
| *pSep = 0; |
| pExtension->key = T_CString_toLowerCase(pSubtag); |
| pExtension->value = nullptr; /* will be set later */ |
| |
| /* |
| * reset the start and the end location of extension value |
| * subtags for this extension |
| */ |
| pExtValueSubtag = nullptr; |
| pExtValueSubtagEnd = nullptr; |
| |
| next = EXTV; /* only an extension value subtag is accepted right after a singleton */ |
| continue; |
| } |
| } |
| if (next & EXTV) { |
| if (_isExtensionSubtag(pSubtag, subtagLen)) { |
| if (pExtValueSubtag == nullptr) { |
| /* if the start position of this extension's value is not set yet, |
| this one is the first value subtag */ |
| pExtValueSubtag = pSubtag; |
| } |
| |
| /* Mark the end of this subtag */ |
| pExtValueSubtagEnd = pSep; |
| next = EXTS | EXTV | PRIV; |
| |
| continue; |
| } |
| } |
| if (next & PRIV) { |
| if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) { |
| char *pPrivuseVal; |
| |
| if (pExtension != nullptr) { |
| /* Process the last extension */ |
| if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) { |
| /* the previous extension is incomplete */ |
| uprv_free(pExtension); |
| pExtension = nullptr; |
| break; |
| } else { |
| /* terminate the previous extension value */ |
| *pExtValueSubtagEnd = 0; |
| pExtension->value = T_CString_toLowerCase(pExtValueSubtag); |
| |
| /* insert the extension to the list */ |
| if (_addExtensionToList(&(t->extensions), pExtension, false)) { |
| pLastGoodPosition = pExtValueSubtagEnd; |
| pExtension = nullptr; |
| } else { |
| /* stop parsing here */ |
| uprv_free(pExtension); |
| pExtension = nullptr; |
| break; |
| } |
| } |
| } |
| |
| /* The rest of part will be private use value subtags */ |
| if (pNext == nullptr) { |
| /* empty private use subtag */ |
| break; |
| } |
| /* back up the private use value start position */ |
| pPrivuseVal = pNext; |
| |
| /* validate private use value subtags */ |
| while (pNext) { |
| pSubtag = pNext; |
| pSep = pSubtag; |
| while (*pSep) { |
| if (*pSep == SEP) { |
| break; |
| } |
| pSep++; |
| } |
| if (*pSep == 0) { |
| /* last subtag */ |
| pNext = nullptr; |
| } else { |
| pNext = pSep + 1; |
| } |
| subtagLen = (int32_t)(pSep - pSubtag); |
| |
| if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) { |
| *pSep = 0; |
| next = VART; /* the subtags after the prefix are handed back to the main loop as variants */ |
| privateuseVar = true; |
| break; |
| } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) { |
| pLastGoodPosition = pSep; |
| } else { |
| break; |
| } |
| } |
| |
| if (next == VART) { |
| continue; |
| } |
| |
| if (pLastGoodPosition - pPrivuseVal > 0) { |
| *pLastGoodPosition = 0; |
| t->privateuse = T_CString_toLowerCase(pPrivuseVal); |
| } |
| /* No more subtags, exiting the parse loop */ |
| break; |
| } |
| break; |
| } |
| |
| /* If we fell through here, it means this subtag is illegal - quit parsing */ |
| break; |
| } |
| |
| if (pExtension != nullptr) { |
| /* Process the last extension */ |
| if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) { |
| /* the previous extension is incomplete */ |
| uprv_free(pExtension); |
| } else { |
| /* terminate the previous extension value */ |
| *pExtValueSubtagEnd = 0; |
| pExtension->value = T_CString_toLowerCase(pExtValueSubtag); |
| /* insert the extension to the list */ |
| if (_addExtensionToList(&(t->extensions), pExtension, false)) { |
| pLastGoodPosition = pExtValueSubtagEnd; |
| } else { |
| uprv_free(pExtension); |
| } |
| } |
| } |
| |
| if (parsedLen != nullptr) { |
| *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta); /* the delta converts the offset back to the caller's input */ |
| } |
| |
| return t.orphan(); |
| } |
| |
| // Ticket #12705 - Turn optimization back on. |
| #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924) |
| #pragma optimize( "", on ) |
| #endif |
| |
| /* |
|  * Releases a ULanguageTag: its subtag buffer, every entry of the variant |
|  * list, every entry of the extension list, and the struct itself. |
|  * A nullptr argument is accepted and ignored. |
|  */ |
| static void |
| ultag_close(ULanguageTag* langtag) { |
|     if (langtag != nullptr) { |
|         uprv_free(langtag->buf); |
| |
|         /* free the variant list; read the link before freeing the node */ |
|         for (VariantListEntry *var = langtag->variants; var != nullptr; ) { |
|             VariantListEntry *following = var->next; |
|             uprv_free(var); |
|             var = following; |
|         } |
| |
|         /* free the extension list the same way */ |
|         for (ExtensionListEntry *ext = langtag->extensions; ext != nullptr; ) { |
|             ExtensionListEntry *following = ext->next; |
|             uprv_free(ext); |
|             ext = following; |
|         } |
| |
|         uprv_free(langtag); |
|     } |
| } |
| |
| /* Returns the language pointer stored in langtag. */ |
| static const char* |
| ultag_getLanguage(const ULanguageTag* langtag) { |
|     const char* const languageSubtag = langtag->language; |
|     return languageSubtag; |
| } |
| |
| #if 0 /* excluded from the build by this directive */ |
| static const char* |
| ultag_getJDKLanguage(const ULanguageTag* langtag) { |
| int32_t i; |
| for (i = 0; DEPRECATEDLANGS[i] != nullptr; i += 2) { /* walked in pairs up to a nullptr entry: [i] is compared, [i + 1] is returned */ |
| if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[i], langtag->language) == 0) { |
| return DEPRECATEDLANGS[i + 1]; |
| } |
| } |
| return langtag->language; /* no table entry matched: the stored language is returned unchanged */ |
| } |
| #endif |
| |
| /* |
|  * Returns the extlang pointer stored at position idx, or nullptr when idx |
|  * is outside 0 .. MAXEXTLANG - 1. |
|  */ |
| static const char* |
| ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) { |
|     const bool outOfRange = (idx < 0) || (idx >= MAXEXTLANG); |
|     return outOfRange ? nullptr : langtag->extlang[idx]; |
| } |
| |
| /* Returns how many of the MAXEXTLANG extlang slots of langtag are non-null. */ |
| static int32_t |
| ultag_getExtlangSize(const ULanguageTag* langtag) { |
|     int32_t filled = 0; |
|     for (const char* slot : langtag->extlang) { |
|         if (slot != nullptr) { |
|             ++filled; |
|         } |
|     } |
|     return filled; |
| } |
| |
| /* Returns the script pointer stored in langtag. */ |
| static const char* |
| ultag_getScript(const ULanguageTag* langtag) { |
|     const char* const scriptSubtag = langtag->script; |
|     return scriptSubtag; |
| } |
| |
| /* Returns the region pointer stored in langtag. */ |
| static const char* |
| ultag_getRegion(const ULanguageTag* langtag) { |
|     const char* const regionSubtag = langtag->region; |
|     return regionSubtag; |
| } |
| |
| /* |
|  * Returns the variant string of the entry at position idx of the variant |
|  * list (0 is the list head), or nullptr when the list has no such entry. |
|  */ |
| static const char* |
| ultag_getVariant(const ULanguageTag* langtag, int32_t idx) { |
|     int32_t position = 0; |
|     for (const VariantListEntry *entry = langtag->variants; |
|          entry != nullptr; |
|          entry = entry->next, ++position) { |
|         if (position == idx) { |
|             return entry->variant; |
|         } |
|     } |
|     return nullptr; |
| } |
| |
| static int32_t |
| ultag_getVariantsSize(const ULanguageTag* langtag) { |
| int32_t size = 0; |
| VariantListEntry *cur = langtag->variants; |
| while (true) { |
| if (cur == nullptr) { |
| break; |
| } |
| size++; |
| cur = cur->next; |
| } |
| return size; |
| } |
| |
| static const char* |
| ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) { |
| const char *key = nullptr; |
| ExtensionListEntry *cur = langtag->extensions; |
| int32_t i = 0; |
| while (cur) { |
| if (i == idx) { |
| key = cur->key; |
| break; |
| } |
| cur = cur->next; |
| i++; |
| } |
| return key; |
| } |
| |
| static const char* |
| ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) { |
| const char *val = nullptr; |
| ExtensionListEntry *cur = langtag->extensions; |
| int32_t i = 0; |
| while (cur) { |
| if (i == idx) { |
| val = cur->value; |
| break; |
| } |
| cur = cur->next; |
| i++; |
| } |
| return val; |
| } |
| |
| static int32_t |
| ultag_getExtensionsSize(const ULanguageTag* langtag) { |
| int32_t size = 0; |
| ExtensionListEntry *cur = langtag->extensions; |
| while (true) { |
| if (cur == nullptr) { |
| break; |
| } |
| size++; |
| cur = cur->next; |
| } |
| return size; |
| } |
| |
| static const char* |
| ultag_getPrivateUse(const ULanguageTag* langtag) { |
| return langtag->privateuse; |
| } |
| |
#if 0
/* Disabled code. Returns the legacy pointer stored in langtag. */
static const char*
ultag_getLegacy(const ULanguageTag* langtag) {
    const char *legacy = langtag->legacy;
    return legacy;
}
#endif
| |
| |
| /* |
| * ------------------------------------------------- |
| * |
| * Locale/BCP47 conversion APIs, exposed as uloc_* |
| * |
| * ------------------------------------------------- |
| */ |
| U_CAPI int32_t U_EXPORT2 |
| uloc_toLanguageTag(const char* localeID, |
| char* langtag, |
| int32_t langtagCapacity, |
| UBool strict, |
| UErrorCode* status) { |
| if (U_FAILURE(*status)) { |
| return 0; |
| } |
| |
| icu::CheckedArrayByteSink sink(langtag, langtagCapacity); |
| ulocimp_toLanguageTag(localeID, sink, strict, status); |
| |
| int32_t reslen = sink.NumberOfBytesAppended(); |
| |
| if (U_FAILURE(*status)) { |
| return reslen; |
| } |
| |
| if (sink.Overflowed()) { |
| *status = U_BUFFER_OVERFLOW_ERROR; |
| } else { |
| u_terminateChars(langtag, langtagCapacity, reslen, status); |
| } |
| |
| return reslen; |
| } |
| |
| |
| U_CAPI void U_EXPORT2 |
| ulocimp_toLanguageTag(const char* localeID, |
| icu::ByteSink& sink, |
| UBool strict, |
| UErrorCode* status) { |
| icu::CharString canonical; |
| int32_t reslen; |
| UErrorCode tmpStatus = U_ZERO_ERROR; |
| UBool hadPosix = false; |
| const char* pKeywordStart; |
| |
| /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */ |
| int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID)); |
| if (resultCapacity > 0) { |
| char* buffer; |
| |
| for (;;) { |
| buffer = canonical.getAppendBuffer( |
| /*minCapacity=*/resultCapacity, |
| /*desiredCapacityHint=*/resultCapacity, |
| resultCapacity, |
| tmpStatus); |
| |
| if (U_FAILURE(tmpStatus)) { |
| *status = tmpStatus; |
| return; |
| } |
| |
| reslen = |
| uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus); |
| |
| if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) { |
| break; |
| } |
| |
| resultCapacity = reslen; |
| tmpStatus = U_ZERO_ERROR; |
| } |
| |
| if (U_FAILURE(tmpStatus)) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| canonical.append(buffer, reslen, tmpStatus); |
| if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { |
| tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString. |
| } |
| |
| if (U_FAILURE(tmpStatus)) { |
| *status = tmpStatus; |
| return; |
| } |
| } |
| |
| /* For handling special case - private use only tag */ |
| pKeywordStart = locale_getKeywordsStart(canonical.data()); |
| if (pKeywordStart == canonical.data()) { |
| int kwdCnt = 0; |
| UBool done = false; |
| |
| icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus)); |
| if (U_SUCCESS(tmpStatus)) { |
| kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus); |
| if (kwdCnt == 1) { |
| const char *key; |
| int32_t len = 0; |
| |
| key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus); |
| if (len == 1 && *key == PRIVATEUSE) { |
| icu::CharString buf; |
| { |
| icu::CharStringByteSink sink(&buf); |
| ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus); |
| } |
| if (U_SUCCESS(tmpStatus)) { |
| if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) { |
| /* return private use only tag */ |
| sink.Append("und-x-", 6); |
| sink.Append(buf.data(), buf.length()); |
| done = true; |
| } else if (strict) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| done = true; |
| } |
| /* if not strict mode, then "und" will be returned */ |
| } else { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| done = true; |
| } |
| } |
| } |
| if (done) { |
| return; |
| } |
| } |
| } |
| |
| _appendLanguageToLanguageTag(canonical.data(), sink, strict, status); |
| _appendScriptToLanguageTag(canonical.data(), sink, strict, status); |
| _appendRegionToLanguageTag(canonical.data(), sink, strict, status); |
| _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status); |
| _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status); |
| _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status); |
| } |
| |
| |
| U_CAPI int32_t U_EXPORT2 |
| uloc_forLanguageTag(const char* langtag, |
| char* localeID, |
| int32_t localeIDCapacity, |
| int32_t* parsedLength, |
| UErrorCode* status) { |
| if (U_FAILURE(*status)) { |
| return 0; |
| } |
| |
| icu::CheckedArrayByteSink sink(localeID, localeIDCapacity); |
| ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status); |
| |
| int32_t reslen = sink.NumberOfBytesAppended(); |
| |
| if (U_FAILURE(*status)) { |
| return reslen; |
| } |
| |
| if (sink.Overflowed()) { |
| *status = U_BUFFER_OVERFLOW_ERROR; |
| } else { |
| u_terminateChars(localeID, localeIDCapacity, reslen, status); |
| } |
| |
| return reslen; |
| } |
| |
| |
| U_CAPI void U_EXPORT2 |
| ulocimp_forLanguageTag(const char* langtag, |
| int32_t tagLen, |
| icu::ByteSink& sink, |
| int32_t* parsedLength, |
| UErrorCode* status) { |
| UBool isEmpty = true; |
| const char *subtag, *p; |
| int32_t len; |
| int32_t i, n; |
| UBool noRegion = true; |
| |
| icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status)); |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| |
| /* language */ |
| subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias()); |
| if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) { |
| len = (int32_t)uprv_strlen(subtag); |
| if (len > 0) { |
| sink.Append(subtag, len); |
| isEmpty = false; |
| } |
| } |
| |
| /* script */ |
| subtag = ultag_getScript(lt.getAlias()); |
| len = (int32_t)uprv_strlen(subtag); |
| if (len > 0) { |
| sink.Append("_", 1); |
| isEmpty = false; |
| |
| /* write out the script in title case */ |
| char c = uprv_toupper(*subtag); |
| sink.Append(&c, 1); |
| sink.Append(subtag + 1, len - 1); |
| } |
| |
| /* region */ |
| subtag = ultag_getRegion(lt.getAlias()); |
| len = (int32_t)uprv_strlen(subtag); |
| if (len > 0) { |
| sink.Append("_", 1); |
| isEmpty = false; |
| |
| /* write out the region in upper case */ |
| p = subtag; |
| while (*p) { |
| char c = uprv_toupper(*p); |
| sink.Append(&c, 1); |
| p++; |
| } |
| noRegion = false; |
| } |
| |
| /* variants */ |
| _sortVariants(lt.getAlias()->variants); |
| n = ultag_getVariantsSize(lt.getAlias()); |
| if (n > 0) { |
| if (noRegion) { |
| sink.Append("_", 1); |
| isEmpty = false; |
| } |
| |
| for (i = 0; i < n; i++) { |
| subtag = ultag_getVariant(lt.getAlias(), i); |
| sink.Append("_", 1); |
| |
| /* write out the variant in upper case */ |
| p = subtag; |
| while (*p) { |
| char c = uprv_toupper(*p); |
| sink.Append(&c, 1); |
| p++; |
| } |
| } |
| } |
| |
| /* keywords */ |
| n = ultag_getExtensionsSize(lt.getAlias()); |
| subtag = ultag_getPrivateUse(lt.getAlias()); |
| if (n > 0 || uprv_strlen(subtag) > 0) { |
| if (isEmpty && n > 0) { |
| /* need a language */ |
| sink.Append(LANG_UND, LANG_UND_LEN); |
| } |
| _appendKeywords(lt.getAlias(), sink, status); |
| } |
| } |