| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: ucol_tok.cpp |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created 02/22/2001 |
| * created by: Vladimir Weinstein |
| * |
| * This module reads a tailoring rule string and produces a list of |
| * tokens that will be turned into collation elements |
| * |
| */ |
| |
#include <stddef.h>

#include "unicode/ustring.h"
#include "unicode/uchar.h"

#include "cmemory.h"
#include "ucol_tok.h"
#include "ucmp32.h"
| |
| static const UChar *rulesToParse = 0; |
| |
| /* will use a small structure, tokHash */ |
| |
| int32_t |
| uhash_hashTokens(const void *k) { |
| int32_t hash = 0; |
| if (k != NULL) { |
| const uint32_t key = (const uint32_t)k; |
| int32_t len = (key & 0xFF000000)>>24; |
| int32_t inc = ((len - 32) / 32) + 1; |
| |
| const UChar *p = (key & 0x00FFFFFF) + rulesToParse; |
| const UChar *limit = p + len; |
| |
| while (p<limit) { |
| hash = (hash * 37) + *p; |
| p += inc; |
| } |
| } |
| return hash; |
| } |
| |
| UBool uhash_compareTokens(const void *key1, const void *key2) { |
| const uint32_t p1 = (const uint32_t) key1; |
| const uint32_t p2 = (const uint32_t) key2; |
| const UChar *s1 = (p1 & 0x00FFFFFF) + rulesToParse; |
| const UChar *s2 = (p2 & 0x00FFFFFF) + rulesToParse; |
| uint32_t s1L = ((p1 & 0xFF000000) >> 24); |
| uint32_t s2L = ((p2 & 0xFF000000) >> 24); |
| |
| if (p1 == p2) { |
| return TRUE; |
| } |
| if (p1 == 0 || p2 == 0) { |
| return FALSE; |
| } |
| if(s1L != s2L) { |
| return FALSE; |
| } |
| if(p1 == p2) { |
| return TRUE; |
| } |
| const UChar *end = s1+s1L-1; |
| while((s1 < end) && *s1 == *s2) { |
| ++s1; |
| ++s2; |
| } |
| if(*s1 == *s2) { |
| return TRUE; |
| } else { |
| return FALSE; |
| } |
| } |
| |
| void deleteToken(void *token) { |
| UColToken *tok = (UColToken *)token; |
| uprv_free(tok); |
| } |
| |
| void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, UCollator *UCA, UErrorCode *status) { |
| uint32_t nSize = 0; |
| if(U_FAILURE(*status)) { |
| return; |
| } |
| |
| src->source = (UChar *)uprv_malloc((2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); |
| nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, 2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); |
| if(nSize > (uint32_t)(2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE) || *status == U_BUFFER_OVERFLOW_ERROR) { |
| *status = U_ZERO_ERROR; |
| src->source = (UChar *)realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); |
| nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); |
| } |
| src->current = src->source; |
| src->end = src->source+nSize; |
| src->sourceCurrent = src->source; |
| src->extraCurrent = src->end; |
| src->extraEnd = src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; |
| src->UCA = UCA; |
| src->invUCA = ucol_initInverseUCA(status); |
| src->resultLen = 0; |
| src->lh = 0; |
| src->varTop = NULL; |
| src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, status); |
| uhash_setValueDeleter(src->tailored, deleteToken); |
| |
| src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); |
| |
| uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); |
| |
| rulesToParse = src->source; |
| src->lh = (UColTokListHeader *)uprv_malloc(512*sizeof(UColTokListHeader)); |
| src->resultLen = 0; |
| } |
| |
| /* -1 off, 1 on, 0 neither */ |
| int32_t ucol_uprv_tok_isOnorOf(const UChar* onoff) { |
| if(onoff) { |
| if(u_tolower(*onoff) == 0x006F /*'o'*/) { |
| onoff++; |
| if(u_tolower(*onoff) == 0x006e /*'n'*/) { |
| return 1; |
| } else if(u_tolower(*onoff) == 0x0066 /*'f'*/) { |
| onoff++; |
| if(u_tolower(*onoff) == 0x0066 /*'f'*/) { |
| return -1; |
| } |
| } |
| } |
| } |
| return 0; |
| } |
| |
| void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { |
| switch(attrib) { |
| case UCOL_FRENCH_COLLATION: |
| opts->frenchCollation = value; |
| break; |
| case UCOL_ALTERNATE_HANDLING: |
| opts->alternateHandling = value; |
| break; |
| case UCOL_CASE_FIRST: |
| opts->caseFirst = value; |
| break; |
| case UCOL_CASE_LEVEL: |
| opts->caseLevel = value; |
| break; |
| case UCOL_NORMALIZATION_MODE: |
| opts->normalizationMode = value; |
| break; |
| case UCOL_STRENGTH: |
| opts->strength = value; |
| break; |
| case UCOL_ATTRIBUTE_COUNT: |
| default: |
| break; |
| } |
| } |
| |
| #define UTOK_OPTION_COUNT 13 |
| |
| static UBool didInit = FALSE; |
| /* we can be strict, or we can be lenient */ |
| /* I'd surely be lenient with the option arguments */ |
| /* maybe even with options */ |
| U_STRING_DECL(suboption_00, "non-ignorable", 13); |
| U_STRING_DECL(suboption_01, "shifted", 7); |
| |
| U_STRING_DECL(suboption_02, "lower", 5); |
| U_STRING_DECL(suboption_03, "upper", 5); |
| U_STRING_DECL(suboption_04, "off", 3); |
| U_STRING_DECL(suboption_05, "on", 2); |
| U_STRING_DECL(suboption_06, "1", 1); |
| U_STRING_DECL(suboption_07, "2", 1); |
| U_STRING_DECL(suboption_08, "3", 1); |
| |
| |
| |
| U_STRING_DECL(option_00, "undefined", 9); |
| U_STRING_DECL(option_01, "rearrange", 9); |
| U_STRING_DECL(option_02, "alternate", 9); |
| U_STRING_DECL(option_03, "backwards", 9); |
| U_STRING_DECL(option_04, "variable top", 12); |
| U_STRING_DECL(option_05, "top", 3); |
| U_STRING_DECL(option_06, "normalization", 13); |
| U_STRING_DECL(option_07, "caseLevel", 9); |
| U_STRING_DECL(option_08, "caseFirst", 9); |
| U_STRING_DECL(option_09, "scriptOrder", 11); |
| U_STRING_DECL(option_10, "charsetname", 11); |
| U_STRING_DECL(option_11, "charset", 7); |
| U_STRING_DECL(option_12, "before", 6); |
| |
| |
| ucolTokSuboption alternateSub[2] = { |
| {suboption_00, 13, UCOL_NON_IGNORABLE}, |
| {suboption_01, 7, UCOL_SHIFTED} |
| }; |
| |
| ucolTokSuboption caseFirstSub[3] = { |
| {suboption_02, 5, UCOL_LOWER_FIRST}, |
| {suboption_03, 5, UCOL_UPPER_FIRST}, |
| {suboption_04, 3, UCOL_OFF}, |
| }; |
| |
| ucolTokSuboption onOffSub[2] = { |
| {suboption_04, 3, UCOL_OFF}, |
| {suboption_05, 2, UCOL_ON} |
| }; |
| |
| ucolTokSuboption frenchSub[1] = { |
| {suboption_07, 1, UCOL_ON} |
| }; |
| |
| ucolTokSuboption beforeSub[3] = { |
| {suboption_06, 1, UCOL_PRIMARY}, |
| {suboption_07, 1, UCOL_SECONDARY}, |
| {suboption_08, 1, UCOL_TERTIARY} |
| }; |
| |
| |
| ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { |
| {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ |
| {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ |
| {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ |
| {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ |
| {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ |
| {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ |
| {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ |
| {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ |
| {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ |
| {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ |
| {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ |
| {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ |
| {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */ |
| }; |
| |
| int32_t |
| u_strncmpNoCase(const UChar *s1, |
| const UChar *s2, |
| int32_t n) |
| { |
| if(n > 0) { |
| int32_t rc; |
| for(;;) { |
| rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); |
| if(rc != 0 || *s1 == 0 || --n == 0) { |
| return rc; |
| } |
| ++s1; |
| ++s2; |
| } |
| } |
| return 0; |
| } |
| |
| uint8_t ucol_uprv_tok_readAndSetOption(UColOptionSet *opts, const UChar* start, const UChar *end, UErrorCode *status) { |
| uint32_t i = 0; |
| int32_t j=0; |
| UBool foundOption = FALSE; |
| const UChar *optionArg = NULL; |
| if(!didInit) { |
| U_STRING_INIT(suboption_00, "non-ignorable", 13); |
| U_STRING_INIT(suboption_01, "shifted", 7); |
| |
| U_STRING_INIT(suboption_02, "lower", 5); |
| U_STRING_INIT(suboption_03, "upper", 5); |
| U_STRING_INIT(suboption_04, "off", 3); |
| U_STRING_INIT(suboption_05, "on", 2); |
| |
| U_STRING_INIT(suboption_06, "1", 1); |
| U_STRING_INIT(suboption_07, "2", 1); |
| U_STRING_INIT(suboption_08, "3", 1); |
| |
| |
| U_STRING_INIT(option_00, "undefined", 9); |
| U_STRING_INIT(option_01, "rearrange", 9); |
| U_STRING_INIT(option_02, "alternate", 9); |
| U_STRING_INIT(option_03, "backwards", 9); |
| U_STRING_INIT(option_04, "variable top", 12); |
| U_STRING_INIT(option_05, "top", 3); |
| U_STRING_INIT(option_06, "normalization", 13); |
| U_STRING_INIT(option_07, "caseLevel", 9); |
| U_STRING_INIT(option_08, "caseFirst", 9); |
| U_STRING_INIT(option_09, "scriptOrder", 11); |
| U_STRING_INIT(option_10, "charsetname", 11); |
| U_STRING_INIT(option_11, "charset", 7); |
| U_STRING_INIT(option_12, "before", 6); |
| } |
| start++; /*skip opening '['*/ |
| while(i < UTOK_OPTION_COUNT) { |
| if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { |
| foundOption = TRUE; |
| if(end - start > rulesOptions[i].optionLen) { |
| optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ |
| while(u_isWhitespace(*optionArg)) { /* eat whitespace */ |
| optionArg++; |
| } |
| } |
| break; |
| } |
| i++; |
| } |
| |
| if(!foundOption) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return FALSE; |
| } |
| |
| if(i<5) { |
| if(optionArg) { |
| for(j = 0; j<rulesOptions[i].subSize; j++) { |
| if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
| ucol_uprv_tok_setOptionInImage(opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); |
| return UCOL_TOK_SUCCESS; |
| } |
| } |
| } |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return FALSE; |
| } else if(i == 5) { /* variable top */ |
| return UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; |
| } else if(i == 6) { /*rearange */ |
| return UCOL_TOK_SUCCESS; |
| } else if(i == 7) { /*top */ |
| return UCOL_TOK_SUCCESS | UCOL_TOK_TOP; |
| } else if(i == 8) { /*before*/ |
| if(optionArg) { |
| for(j = 0; j<rulesOptions[i].subSize; j++) { |
| if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
| return UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1; |
| } |
| } |
| } |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } else { |
| *status = U_UNSUPPORTED_ERROR; |
| return 0; |
| } |
| } |
| |
| const UChar *ucol_tok_parseNextToken(UColTokenParser *src, |
| uint32_t *strength, |
| uint32_t *chOffset, uint32_t *chLen, |
| uint32_t *exOffset, uint32_t *exLen, |
| uint8_t *specs, |
| UBool startOfRules, |
| UErrorCode *status) { |
| /* parsing part */ |
| |
| UBool variableTop = FALSE; |
| UBool top = FALSE; |
| UBool inChars = TRUE; |
| UBool inQuote = FALSE; |
| UBool wasInQuote = FALSE; |
| UChar *optionEnd = NULL; |
| uint8_t before = 0; |
| |
| uint32_t newCharsLen = 0, newExtensionLen = 0; |
| uint32_t charsOffset = 0, extensionOffset = 0; |
| uint32_t newStrength = UCOL_TOK_UNSET; |
| |
| while (src->current < src->end) { |
| UChar ch = *(src->current); |
| |
| if (inQuote) { |
| if (ch == 0x0027/*'\''*/) { |
| inQuote = FALSE; |
| } else { |
| if ((newCharsLen == 0) || inChars) { |
| if(newCharsLen == 0) { |
| charsOffset = src->extraCurrent - src->source; |
| } |
| newCharsLen++; |
| } else { |
| if(newExtensionLen == 0) { |
| extensionOffset = src->extraCurrent - src->source; |
| } |
| newExtensionLen++; |
| } |
| } |
| } else { |
| /* Sets the strength for this entry */ |
| switch (ch) { |
| case 0x003D/*'='*/ : |
| if (newStrength != UCOL_TOK_UNSET) { |
| goto EndOfLoop; |
| } |
| |
| /* if we start with strength, we'll reset to top */ |
| if(startOfRules == TRUE) { |
| top = TRUE; |
| newStrength = UCOL_TOK_RESET; |
| goto EndOfLoop; |
| } |
| newStrength = UCOL_IDENTICAL; |
| break; |
| |
| case 0x002C/*','*/: |
| if (newStrength != UCOL_TOK_UNSET) { |
| goto EndOfLoop; |
| } |
| |
| /* if we start with strength, we'll reset to top */ |
| if(startOfRules == TRUE) { |
| top = TRUE; |
| newStrength = UCOL_TOK_RESET; |
| goto EndOfLoop; |
| } |
| newStrength = UCOL_TERTIARY; |
| break; |
| |
| case 0x003B/*';'*/: |
| if (newStrength != UCOL_TOK_UNSET) { |
| goto EndOfLoop; |
| } |
| |
| /* if we start with strength, we'll reset to top */ |
| if(startOfRules == TRUE) { |
| top = TRUE; |
| newStrength = UCOL_TOK_RESET; |
| goto EndOfLoop; |
| } |
| newStrength = UCOL_SECONDARY; |
| break; |
| |
| case 0x003C/*'<'*/: |
| if (newStrength != UCOL_TOK_UNSET) { |
| goto EndOfLoop; |
| } |
| |
| /* if we start with strength, we'll reset to top */ |
| if(startOfRules == TRUE) { |
| top = TRUE; |
| newStrength = UCOL_TOK_RESET; |
| goto EndOfLoop; |
| } |
| /* before this, do a scan to verify whether this is */ |
| /* another strength */ |
| if(*(src->current+1) == 0x003C) { |
| src->current++; |
| if(*(src->current+1) == 0x003C) { |
| src->current++; /* three in a row! */ |
| newStrength = UCOL_TERTIARY; |
| } else { /* two in a row */ |
| newStrength = UCOL_SECONDARY; |
| } |
| } else { /* just one */ |
| newStrength = UCOL_PRIMARY; |
| } |
| break; |
| |
| case 0x0026/*'&'*/: |
| if (newStrength != UCOL_TOK_UNSET) { |
| goto EndOfLoop; |
| } |
| |
| newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ |
| break; |
| |
| case 0x005b/*'['*/: |
| /* options - read an option, analyze it */ |
| if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) { |
| uint8_t result = ucol_uprv_tok_readAndSetOption(src->opts, src->current, optionEnd, status); |
| src->current = optionEnd; |
| if(U_SUCCESS(*status)) { |
| if(result & UCOL_TOK_TOP) { |
| if(newStrength == UCOL_TOK_RESET) { |
| top = TRUE; |
| src->current++; |
| goto EndOfLoop; |
| } else { |
| *status = U_INVALID_FORMAT_ERROR; |
| } |
| } else if(result & UCOL_TOK_VARIABLE_TOP) { |
| if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { |
| variableTop = TRUE; |
| charsOffset = src->extraCurrent - src->source; |
| newCharsLen = 1; |
| *src->extraCurrent++ = 0xFFFF; |
| src->current++; |
| goto EndOfLoop; |
| } else { |
| *status = U_INVALID_FORMAT_ERROR; |
| } |
| } else if (result & UCOL_TOK_BEFORE){ |
| if(newStrength == UCOL_TOK_RESET) { |
| before = result & UCOL_TOK_BEFORE; |
| } else { |
| *status = U_INVALID_FORMAT_ERROR; |
| } |
| } |
| } else { |
| return NULL; |
| } |
| } |
| break; |
| /* Ignore the white spaces */ |
| case 0x0009/*'\t'*/: |
| case 0x000C/*'\f'*/: |
| case 0x000D/*'\r'*/: |
| case 0x000A/*'\n'*/: |
| case 0x0020/*' '*/: |
| break; /* skip whitespace TODO use Unicode */ |
| case 0x002F/*'/'*/: |
| wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ |
| inChars = FALSE; /* we're now processing expansion */ |
| break; |
| |
| /* found a quote, we're gonna start copying */ |
| case 0x0027/*'\''*/: |
| if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| |
| inQuote = TRUE; |
| |
| if(inChars) { /* we're doing characters */ |
| if(wasInQuote == FALSE) { |
| charsOffset = src->extraCurrent - src->source; |
| } |
| if (newCharsLen != 0) { |
| uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar)); |
| src->extraCurrent += newCharsLen; |
| } |
| newCharsLen++; |
| } else { /* we're doing an expansion */ |
| if(wasInQuote == FALSE) { |
| extensionOffset = src->extraCurrent - src->source; |
| } |
| if (newExtensionLen != 0) { |
| uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar)); |
| src->extraCurrent += newExtensionLen; |
| } |
| newExtensionLen++; |
| } |
| |
| wasInQuote = TRUE; |
| |
| ch = *(++(src->current)); |
| if(ch == 0x0027) { /* copy the double quote */ |
| *src->extraCurrent++ = ch; |
| inQuote = FALSE; |
| } |
| break; |
| |
| /* '@' is french only if the strength is not currently set */ |
| /* if it is, it's just a regular character in collation rules */ |
| case 0x0040/*'@'*/: |
| if (newStrength == UCOL_TOK_UNSET) { |
| src->opts->frenchCollation = UCOL_ON; |
| break; |
| } |
| |
| default: |
| if (newStrength == UCOL_TOK_UNSET) { |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| |
| if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| |
| if(ch == 0x0000 && src->current+1 == src->end) { |
| break; |
| } |
| |
| if (inChars) { |
| if(newCharsLen == 0) { |
| charsOffset = src->current - src->source; |
| } |
| newCharsLen++; |
| } else { |
| if(newExtensionLen == 0) { |
| extensionOffset = src->current - src->source; |
| } |
| newExtensionLen++; |
| } |
| |
| break; |
| } |
| } |
| |
| if(wasInQuote) { |
| if(ch != 0x27) { |
| *src->extraCurrent++ = ch; |
| } |
| if(src->extraCurrent == src->extraEnd) { |
| /* reallocate */ |
| UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar)); |
| if(newSrc != NULL) { |
| src->current = newSrc + (src->current - src->source); |
| src->extraCurrent = newSrc + (src->extraCurrent - src->source); |
| src->end = newSrc + (src->end - src->source); |
| src->extraEnd = newSrc + (src->extraEnd-src->source)*2; |
| src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); |
| src->source = newSrc; |
| } else { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| } |
| } |
| |
| src->current++; |
| } |
| |
| EndOfLoop: |
| wasInQuote = FALSE; |
| if (newStrength == UCOL_TOK_UNSET) { |
| return NULL; |
| } |
| |
| if (newCharsLen == 0 && top == FALSE) { |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| |
| *strength = newStrength; |
| |
| *chOffset = charsOffset; |
| *chLen = newCharsLen; |
| *exOffset = extensionOffset; |
| *exLen = newExtensionLen; |
| *specs = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; |
| |
| return src->current; |
| } |
| |
| inline void getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint32_t strength, uint32_t *charsOffset, uint32_t *newCharsLen, UErrorCode *status) { |
| if(U_FAILURE(*status)) { |
| return; |
| } |
| /* this is a virgin before - we need to fish the anchor from the UCA */ |
| collIterate s; |
| uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; |
| uint32_t CE, SecondCE; |
| uint32_t invPos; |
| if(sourceToken != NULL) { |
| init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s); |
| } else { |
| init_collIterate(src->UCA, src->source+*charsOffset, 1, &s); |
| } |
| |
| baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; |
| baseContCE = ucol_getNextCE(src->UCA, &s, status); |
| if(baseContCE == UCOL_NO_MORE_CES) { |
| baseContCE = 0; |
| } |
| |
| invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength); |
| |
| uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
| uint32_t ch = CETable[3*invPos+2]; |
| |
| if((ch & UCOL_INV_SIZEMASK) != 0) { |
| uint32_t *conts = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->conts); |
| uint32_t offset = (ch & UCOL_INV_OFFSETMASK); |
| ch = conts[offset]; |
| } |
| *src->extraCurrent++ = (UChar)ch; |
| *charsOffset = src->extraCurrent - src->source - 1; |
| *newCharsLen = 1; |
| } |
| |
| /* |
| Processing Description |
| 1 Build a ListList. Each list has a header, which contains two lists (positive |
| and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and |
| reset may be null. |
| 2 As you process, you keep a LAST pointer that points to the last token you |
| handled. |
| */ |
| |
| uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status) { |
| UColToken *lastToken = NULL; |
| const UChar *parseEnd = NULL; |
| uint32_t expandNext = 0; |
| UBool variableTop = FALSE; |
| UBool top = FALSE; |
| uint8_t specs = 0; |
| |
| UColTokListHeader *ListList = NULL; |
| |
| uint32_t newCharsLen = 0, newExtensionsLen = 0; |
| uint32_t charsOffset = 0, extensionOffset = 0; |
| uint32_t newStrength = UCOL_TOK_UNSET; |
| |
| UHashtable *uchars2tokens = src->tailored; |
| ListList = src->lh; |
| |
| while(src->current < src->end) { |
| |
| parseEnd = ucol_tok_parseNextToken(src, |
| &newStrength, |
| &charsOffset, &newCharsLen, |
| &extensionOffset, &newExtensionsLen, |
| &specs, |
| (UBool)(lastToken == NULL), |
| status); |
| |
| variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); |
| top = ((specs & UCOL_TOK_TOP) != 0); |
| |
| if(U_SUCCESS(*status) && parseEnd != NULL) { |
| UColToken *sourceToken = NULL; |
| uint32_t key = 0; |
| uint32_t lastStrength = UCOL_TOK_UNSET; |
| |
| if(lastToken != NULL ) { |
| lastStrength = lastToken->strength; |
| } |
| |
| key = newCharsLen << 24 | charsOffset; |
| |
| /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ |
| sourceToken = (UColToken *)uhash_get(uchars2tokens, (void *)key); |
| |
| if(newStrength != UCOL_TOK_RESET) { |
| if(lastToken == NULL) { /* this means that rules haven't started properly */ |
| *status = U_INVALID_FORMAT_ERROR; |
| return 0; |
| } |
| /* 6 Otherwise (when relation != reset) */ |
| if(sourceToken == NULL) { |
| /* If sourceToken is null, create new one, */ |
| sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
| sourceToken->source = newCharsLen << 24 | charsOffset; |
| |
| sourceToken->debugSource = *(src->source + charsOffset); |
| |
| sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |
| sourceToken->next = NULL; |
| sourceToken->previous = NULL; |
| sourceToken->noOfCEs = 0; |
| sourceToken->noOfExpCEs = 0; |
| uhash_put(uchars2tokens, (void *)sourceToken->source, sourceToken, status); |
| } else { |
| /* we could have fished out a reset here */ |
| if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { |
| /* otherwise remove sourceToken from where it was. */ |
| if(sourceToken->next != NULL) { |
| if(sourceToken->next->strength > sourceToken->strength) { |
| sourceToken->next->strength = sourceToken->strength; |
| } |
| sourceToken->next->previous = sourceToken->previous; |
| } else { |
| sourceToken->listHeader->last = sourceToken->previous; |
| } |
| |
| if(sourceToken->previous != NULL) { |
| sourceToken->previous->next = sourceToken->next; |
| } else { |
| sourceToken->listHeader->first = sourceToken->next; |
| } |
| sourceToken->next = NULL; |
| sourceToken->previous = NULL; |
| } |
| } |
| |
| sourceToken->strength = newStrength; |
| sourceToken->listHeader = lastToken->listHeader; |
| |
| /* |
| 1. Find the strongest strength in each list, and set strongestP and strongestN |
| accordingly in the headers. |
| */ |
| if(lastStrength == UCOL_TOK_RESET |
| || sourceToken->listHeader->first == 0) { |
| /* If LAST is a reset |
| insert sourceToken in the list. */ |
| if(sourceToken->listHeader->first == 0) { |
| sourceToken->listHeader->first = sourceToken; |
| sourceToken->listHeader->last = sourceToken; |
| } else { /* we need to find a place for us */ |
| /* and we'll get in front of the same strength */ |
| if(sourceToken->listHeader->first->strength <= sourceToken->strength) { |
| sourceToken->next = sourceToken->listHeader->first; |
| sourceToken->next->previous = sourceToken; |
| sourceToken->listHeader->first = sourceToken; |
| sourceToken->previous = NULL; |
| } else { |
| lastToken = sourceToken->listHeader->first; |
| while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { |
| lastToken = lastToken->next; |
| } |
| if(lastToken->next != NULL) { |
| lastToken->next->previous = sourceToken; |
| } else { |
| sourceToken->listHeader->last = sourceToken; |
| } |
| sourceToken->previous = lastToken; |
| sourceToken->next = lastToken->next; |
| lastToken->next = sourceToken; |
| } |
| } |
| } else { |
| /* Otherwise (when LAST is not a reset) |
| if polarity (LAST) == polarity(relation), insert sourceToken after LAST, |
| otherwise insert before. |
| when inserting after or before, search to the next position with the same |
| strength in that direction. (This is called postpone insertion). */ |
| if(sourceToken != lastToken) { |
| if(lastToken->polarity == sourceToken->polarity) { |
| while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { |
| lastToken = lastToken->next; |
| } |
| sourceToken->previous = lastToken; |
| if(lastToken->next != NULL) { |
| lastToken->next->previous = sourceToken; |
| } else { |
| sourceToken->listHeader->last = sourceToken; |
| } |
| |
| sourceToken->next = lastToken->next; |
| lastToken->next = sourceToken; |
| } else { |
| while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { |
| lastToken = lastToken->previous; |
| } |
| sourceToken->next = lastToken; |
| if(lastToken->previous != NULL) { |
| lastToken->previous->next = sourceToken; |
| } else { |
| sourceToken->listHeader->first = sourceToken; |
| } |
| sourceToken->previous = lastToken->previous; |
| lastToken->previous = sourceToken; |
| } |
| } else { /* repeated one thing twice in rules, stay with the stronger strength */ |
| if(lastStrength < sourceToken->strength) { |
| sourceToken->strength = lastStrength; |
| } |
| } |
| } |
| |
| /* if the token was a variable top, we're gonna put it in */ |
| if(variableTop == TRUE && src->varTop == NULL) { |
| variableTop = FALSE; |
| src->varTop = sourceToken; |
| } |
| |
| // Treat the expansions. |
| // There are two types of expansions: explicit (x / y) and reset based propagating expansions |
| // (&abc * d * e <=> &ab * d / c * e / c) |
| // if both of them are in effect for a token, they are combined. |
| |
| sourceToken->expansion = newExtensionsLen << 24 | extensionOffset; |
| |
| if(expandNext != 0) { |
| if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ |
| expandNext = 0; |
| } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ |
| sourceToken->expansion = expandNext; |
| } else { /* there is both explicit and implicit expansion. We need to make a combination */ |
| memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); |
| memcpy(src->extraCurrent+(expandNext >> 24), src->source + extensionOffset, newExtensionsLen*sizeof(UChar)); |
| sourceToken->expansion = ((expandNext >> 24) + newExtensionsLen)<<24 | (src->extraCurrent - src->source); |
| src->extraCurrent += (expandNext >> 24) + newExtensionsLen; |
| } |
| } |
| |
| // This is just for debugging purposes |
| if(sourceToken->expansion != 0) { |
| sourceToken->debugExpansion = *(src->source + extensionOffset); |
| } else { |
| sourceToken->debugExpansion = 0; |
| } |
| } else { |
| if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ |
| uint32_t searchCharsLen = newCharsLen; |
| while(searchCharsLen > 1 && sourceToken == NULL) { |
| searchCharsLen--; |
| key = searchCharsLen << 24 | charsOffset; |
| sourceToken = (UColToken *)uhash_get(uchars2tokens, (void *)key); |
| } |
| if(sourceToken != NULL) { |
| expandNext = (newCharsLen - searchCharsLen) << 24 | (charsOffset + searchCharsLen); |
| } |
| } |
| |
| uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |
| collIterate s; |
| |
| if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ |
| uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |
| if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |
| /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ |
| while(sourceToken->strength > strength && sourceToken->previous != NULL) { |
| sourceToken = sourceToken->previous; |
| } |
| /* here, either we hit the strength or NULL */ |
| if(sourceToken->strength == strength) { |
| if(sourceToken->previous != NULL) { |
| sourceToken = sourceToken->previous; |
| } else { /* start of list */ |
| sourceToken = sourceToken->listHeader->reset; |
| } |
| } else { /* we hit NULL */ |
| /* we should be doing the else part */ |
| sourceToken = sourceToken->listHeader->reset; |
| getVirginBefore(src, sourceToken, strength, &charsOffset, &newCharsLen, status); |
| sourceToken = NULL; |
| } |
| } else { |
| getVirginBefore(src, sourceToken, strength, &charsOffset, &newCharsLen, status); |
| sourceToken = NULL; |
| } |
| } |
| |
| |
| if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { |
| /* if the previous token was also a reset, */ |
| /*this means that we have two consecutive resets */ |
| /* and we want to remove the previous one if empty*/ |
| if(ListList[src->resultLen-1].first == NULL) { |
| src->resultLen--; |
| } |
| } |
| |
| /* 5 If the relation is a reset: |
| If sourceToken is null |
| Create new list, create new sourceToken, make the baseCE from source, put |
| the sourceToken in ListHeader of the new list */ |
| if(sourceToken == NULL) { |
| |
| /* do the reset thing */ |
| sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
| sourceToken->source = newCharsLen << 24 | charsOffset; |
| sourceToken->expansion = newExtensionsLen << 24 | extensionOffset; |
| |
| sourceToken->debugSource = *(src->source + charsOffset); |
| sourceToken->debugExpansion = *(src->source + extensionOffset); |
| |
| |
| sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |
| sourceToken->strength = UCOL_TOK_RESET; |
| sourceToken->next = NULL; |
| sourceToken->previous = NULL; |
| sourceToken->listHeader = &ListList[src->resultLen]; |
| /* |
| 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
| First convert all expansions into normal form. Examples: |
| If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * |
| d * ... into &x * c/y * d * ... |
| Note: reset values can never have expansions, although they can cause the |
| very next item to have one. They may be contractions, if they are found |
| earlier in the list. |
| */ |
| if(top == FALSE) { |
| uint32_t resetCharsOffset; |
| |
| init_collIterate(src->UCA, src->source+charsOffset, newCharsLen, &s); |
| |
| CE = ucol_getNextCE(src->UCA, &s, status); |
| resetCharsOffset = s.pos - src->source; |
| |
| SecondCE = ucol_getNextCE(src->UCA, &s, status); |
| |
| ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
| if(isContinuation(SecondCE)) { |
| ListList[src->resultLen].baseContCE = SecondCE; |
| } else { |
| ListList[src->resultLen].baseContCE = 0; |
| } |
| if(newCharsLen > 1) { |
| sourceToken->source = ((resetCharsOffset - charsOffset ) << 24) | charsOffset; |
| expandNext = ((newCharsLen + charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); |
| } else { |
| expandNext = 0; |
| } |
| } else { /* top == TRUE */ |
| top = FALSE; |
| ListList[src->resultLen].baseCE = UCOL_RESET_TOP_VALUE; |
| ListList[src->resultLen].baseContCE = 0; |
| } |
| |
| |
| ListList[src->resultLen].first = NULL; |
| ListList[src->resultLen].last = NULL; |
| ListList[src->resultLen].first = NULL; |
| ListList[src->resultLen].last = NULL; |
| |
| ListList[src->resultLen].reset = sourceToken; |
| |
| src->resultLen++; |
| uhash_put(uchars2tokens, (void *)sourceToken->source, sourceToken, status); |
| } else { /* reset to something already in rules */ |
| top = FALSE; |
| } |
| } |
| /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ |
| lastToken = sourceToken; |
| } else { |
| return 0; |
| } |
| } |
| |
| if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |
| src->resultLen--; |
| } |
| return src->resultLen; |
| } |
| |
| |
| void ucol_tok_closeTokenList(UColTokenParser *src) { |
| if(src->tailored != NULL) { |
| uhash_close(src->tailored); |
| } |
| if(src->lh != NULL) { |
| uprv_free(src->lh); |
| } |
| if(src->source != NULL) { |
| uprv_free(src->source); |
| } |
| if(src->opts != NULL) { |
| uprv_free(src->opts); |
| } |
| } |
| |