| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-1999, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| |
| #include "unicode/ucol.h" |
| |
| #include "unicode/uloc.h" |
| #include "unicode/coll.h" |
| #include "unicode/tblcoll.h" |
| #include "unicode/coleitr.h" |
| #include "unicode/ustring.h" |
| #include "unicode/normlzr.h" |
| #include "cpputils.h" |
| |
| #define UCOL_LEVELTERMINATOR 0 |
| #define UCOL_IGNORABLE 0x0000 |
| #define UCOL_CHARINDEX 0x70000000 // need look up in .commit() |
| #define UCOL_EXPANDCHARINDEX 0x7E000000 // Expand index follows |
| #define UCOL_CONTRACTCHARINDEX 0x7F000000 // contract indexes follows |
| #define UCOL_UNMAPPED 0xFFFFFFFF // unmapped character values |
| #define UCOL_PRIMARYORDERINCREMENT 0x00010000 // primary strength increment |
| #define UCOL_SECONDARYORDERINCREMENT 0x00000100 // secondary strength increment |
| #define UCOL_TERTIARYORDERINCREMENT 0x00000001 // tertiary strength increment |
| #define UCOL_MAXIGNORABLE 0x00010000 // maximum ignorable char order value |
| #define UCOL_PRIMARYORDERMASK 0xffff0000 // mask off anything but primary order |
| #define UCOL_SECONDARYORDERMASK 0x0000ff00 // mask off anything but secondary order |
| #define UCOL_TERTIARYORDERMASK 0x000000ff // mask off anything but tertiary order |
| #define UCOL_SECONDARYRESETMASK 0x0000ffff // mask off secondary and tertiary order |
| #define UCOL_IGNORABLEMASK 0x0000ffff // mask off ignorable char order |
| #define UCOL_PRIMARYDIFFERENCEONLY 0xffff0000 // use only the primary difference |
| #define UCOL_SECONDARYDIFFERENCEONLY 0xffffff00 // use only the primary and secondary difference |
| #define UCOL_PRIMARYORDERSHIFT 16 // primary order shift |
| #define UCOL_SECONDARYORDERSHIFT 8 // secondary order shift |
| #define UCOL_SORTKEYOFFSET 1 // minimum sort key offset |
| #define UCOL_CONTRACTCHAROVERFLOW 0x7FFFFFFF // Indicates the char is a contract char |
| |
| U_CAPI int32_t |
| u_normalize(const UChar* source, |
| int32_t sourceLength, |
| UNormalizationMode mode, |
| int32_t option, |
| UChar* result, |
| int32_t resultLength, |
| UErrorCode* status) |
| { |
| if(U_FAILURE(*status)) return -1; |
| |
| Normalizer::EMode normMode; |
| switch(mode) { |
| case UCOL_NO_NORMALIZATION: |
| normMode = Normalizer::NO_OP; |
| break; |
| case UCOL_DECOMP_CAN: |
| normMode = Normalizer::DECOMP; |
| break; |
| case UCOL_DECOMP_COMPAT: |
| normMode = Normalizer::DECOMP_COMPAT; |
| break; |
| case UCOL_DECOMP_CAN_COMP_COMPAT: |
| normMode = Normalizer::COMPOSE; |
| break; |
| case UCOL_DECOMP_COMPAT_COMP_CAN: |
| normMode = Normalizer::COMPOSE_COMPAT; |
| break; |
| default: |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return -1; |
| } |
| |
| int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); |
| const UnicodeString src((UChar*)source, len, len); |
| UnicodeString dst(result, 0, resultLength); |
| Normalizer::normalize(src, normMode, option, dst, *status); |
| int32_t actualLen; |
| T_fillOutputParams(&dst, result, resultLength, &actualLen, status); |
| return actualLen; |
| } |
| |
| U_CAPI UCollator* |
| ucol_open( const char *loc, |
| UErrorCode *status) |
| { |
| if(U_FAILURE(*status)) return 0; |
| |
| Collator *col = 0; |
| |
| if(loc == 0) |
| col = Collator::createInstance(*status); |
| else |
| col = Collator::createInstance(Locale(loc), *status); |
| |
| if(col == 0) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| |
| return (UCollator*)col; |
| } |
| |
| U_CAPI UCollator* |
| ucol_openRules( const UChar *rules, |
| int32_t rulesLength, |
| UNormalizationMode mode, |
| UCollationStrength strength, |
| UErrorCode *status) |
| { |
| if(U_FAILURE(*status)) return 0; |
| |
| int32_t len = (rulesLength == -1 ? u_strlen(rules) : rulesLength); |
| const UnicodeString ruleString((UChar*)rules, len, len); |
| |
| Normalizer::EMode normMode; |
| switch(mode) { |
| case UCOL_NO_NORMALIZATION: |
| normMode = Normalizer::NO_OP; |
| break; |
| case UCOL_DECOMP_CAN: |
| normMode = Normalizer::DECOMP; |
| break; |
| case UCOL_DECOMP_COMPAT: |
| normMode = Normalizer::DECOMP_COMPAT; |
| break; |
| case UCOL_DECOMP_CAN_COMP_COMPAT: |
| normMode = Normalizer::COMPOSE; |
| break; |
| case UCOL_DECOMP_COMPAT_COMP_CAN: |
| normMode = Normalizer::COMPOSE_COMPAT; |
| break; |
| default: |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| RuleBasedCollator *col = 0; |
| col = new RuleBasedCollator(ruleString, |
| (Collator::ECollationStrength) strength, |
| normMode, |
| *status); |
| |
| if(col == 0) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| |
| return (UCollator*) col; |
| } |
| |
| U_CAPI void |
| ucol_close(UCollator *coll) |
| { |
| delete (Collator*)coll; |
| } |
| |
| U_CAPI UBool |
| ucol_greater( const UCollator *coll, |
| const UChar *source, |
| int32_t sourceLength, |
| const UChar *target, |
| int32_t targetLength) |
| { |
| return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
| == UCOL_GREATER); |
| } |
| |
| U_CAPI UBool |
| ucol_greaterOrEqual( const UCollator *coll, |
| const UChar *source, |
| int32_t sourceLength, |
| const UChar *target, |
| int32_t targetLength) |
| { |
| return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
| != UCOL_LESS); |
| } |
| |
| U_CAPI UBool |
| ucol_equal( const UCollator *coll, |
| const UChar *source, |
| int32_t sourceLength, |
| const UChar *target, |
| int32_t targetLength) |
| { |
| return (ucol_strcoll(coll, source, sourceLength, target, targetLength) |
| == UCOL_EQUAL); |
| } |
| |
| U_CAPI UCollationStrength |
| ucol_getStrength(const UCollator *coll) |
| { |
| return (UCollationStrength) ((Collator*)coll)->getStrength(); |
| } |
| |
| |
| U_CAPI void |
| ucol_setStrength( UCollator *coll, |
| UCollationStrength strength) |
| { |
| ((Collator*)coll)->setStrength((Collator::ECollationStrength)strength); |
| } |
| |
| U_CAPI UNormalizationMode |
| ucol_getNormalization(const UCollator* coll) |
| { |
| switch(((Collator*)coll)->getDecomposition()) { |
| case Normalizer::NO_OP: |
| return UCOL_NO_NORMALIZATION; |
| |
| case Normalizer::COMPOSE: |
| return UCOL_DECOMP_COMPAT_COMP_CAN; |
| |
| case Normalizer::COMPOSE_COMPAT: |
| return UCOL_DECOMP_CAN_COMP_COMPAT; |
| |
| case Normalizer::DECOMP: |
| return UCOL_DECOMP_COMPAT; |
| |
| case Normalizer::DECOMP_COMPAT: |
| return UCOL_DECOMP_COMPAT; |
| |
| } |
| return UCOL_NO_NORMALIZATION; |
| } |
| |
| U_CAPI void |
| ucol_setNormalization( UCollator *coll, |
| UNormalizationMode mode) |
| { |
| Normalizer::EMode normMode; |
| switch(mode) { |
| case UCOL_NO_NORMALIZATION: |
| normMode = Normalizer::NO_OP; |
| break; |
| case UCOL_DECOMP_CAN: |
| normMode = Normalizer::DECOMP; |
| break; |
| case UCOL_DECOMP_COMPAT: |
| normMode = Normalizer::DECOMP_COMPAT; |
| break; |
| case UCOL_DECOMP_COMPAT_COMP_CAN: |
| normMode = Normalizer::COMPOSE; |
| break; |
| case UCOL_DECOMP_CAN_COMP_COMPAT: |
| normMode = Normalizer::COMPOSE_COMPAT; |
| break; |
| default: |
| /* Shouldn't get here. */ |
| /* *status = U_ILLEGAL_ARGUMENT_ERROR; */ |
| return; |
| } |
| |
| ((Collator*)coll)->setDecomposition(normMode); |
| } |
| |
| U_CAPI int32_t |
| ucol_getDisplayName( const char *objLoc, |
| const char *dispLoc, |
| UChar *result, |
| int32_t resultLength, |
| UErrorCode *status) |
| { |
| if(U_FAILURE(*status)) return -1; |
| |
| UnicodeString dst(result, resultLength, resultLength); |
| Collator::getDisplayName(Locale(objLoc), Locale(dispLoc), dst); |
| int32_t actLen; |
| T_fillOutputParams(&dst, result, resultLength, &actLen, status); |
| return actLen; |
| } |
| |
| U_CAPI const char* |
| ucol_getAvailable(int32_t index) |
| { |
| return uloc_getAvailable(index); |
| } |
| |
| U_CAPI int32_t |
| ucol_countAvailable() |
| { |
| return uloc_countAvailable(); |
| } |
| |
| U_CAPI const UChar* |
| ucol_getRules( const UCollator *coll, |
| int32_t *length) |
| { |
| const UnicodeString& rules = ((RuleBasedCollator*)coll)->getRules(); |
| *length = rules.length(); |
| return rules.getUChars(); |
| } |
| |
| static uint8_t utf16fixup[32] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0x20, 0xf8, 0xf8, 0xf8, 0xf8 |
| }; |
| |
| // This will get the next CE(s)? |
| // Should be part macro, part function |
| #include <stdio.h> |
| |
| #include "unicode/normlzr.h" |
| #include "ucmp32.h" |
| #include "tcoldata.h" |
| #include "tables.h" |
| #define UCOL_MAX_BUFFER 1000 |
| |
| struct collIterate { |
| UChar *string; // Original string |
| UChar *len; // Original string length |
| UChar *pos; // This is position in the string |
| uint32_t *toReturn; // This is the CE from CEs buffer that should be returned |
| uint32_t *CEpos; // This is the position to which we have stored processed CEs |
| uint32_t CEs[1024]; // This is where we store CEs |
| }; |
| |
| void init_collIterate(const UChar *string, int32_t len, collIterate *s) { |
| s->string = s->pos = (UChar *)string; |
| s->len = (UChar *)string+len; |
| s->CEpos = s->toReturn = s->CEs; |
| } |
| |
| int32_t ucol_getNextCE(const UCollator *coll, collIterate *source, UErrorCode *status) { |
| //printf("/Entry/ %x, %x, %x, %x, %x, %x\n", source->string, source->len, source->pos, source->CEs, source->CEpos, source->toReturn); |
| |
| if (U_FAILURE(*status) || (source->pos>=source->len && source->CEpos <= source->toReturn)) { |
| return CollationElementIterator::NULLORDER; |
| } |
| |
| if (source->CEpos > source->toReturn) { |
| //printf("/Expanded stuff/ %x, %x, %x, %x, %x, %x\n", source->string, source->len, source->pos, source->CEs, source->CEpos, source->toReturn); |
| return(*(source->toReturn++)); |
| } |
| |
| source->CEpos = source->toReturn = source->CEs; |
| |
| *(source->CEpos) = ucmp32_get(((RuleBasedCollator *)coll)->data->mapping, *(source->pos)); |
| |
| // this should benefit from reordering of the clauses, so that the cleanest case is returned the first. |
| |
| if(*(source->CEpos) < UCOL_EXPANDCHARINDEX) { |
| |
| source->pos++; |
| |
| //printf("/Normal Exit/ %x, %x, %x, %x, %x, %x\n", source->string, source->len, source->pos, source->CEs, source->CEpos, source->toReturn); |
| return (*(source->CEpos)); |
| } |
| |
| if (*(source->CEpos) == UCOL_UNMAPPED) { |
| // Returned an "unmapped" flag and save the character so it can be |
| // returned next time this method is called. |
| if (*(source->pos) == 0x0000) return *(source->pos++); // \u0000 is not valid in C++'s UnicodeString |
| *(source->CEpos++) = CollationElementIterator::UNMAPPEDCHARVALUE; |
| *(source->CEpos) = *(source->pos)<<16; |
| } else { |
| // Contraction sequence start... |
| if (*(source->CEpos) >= UCOL_CONTRACTCHARINDEX) { |
| // in place of: value = nextContractChar(cursor, ch, status); |
| VectorOfPToContractElement* list = ((RuleBasedCollator *)coll)->data->contractTable->at(*(source->CEpos)-UCOL_CONTRACTCHARINDEX); |
| // The upper line obtained a list of contracting sequences. |
| EntryPair *pair = (EntryPair *)list->at(0); // Taking out the first one. |
| int32_t order = pair->value; // This got us mapping for just the first element - the one that signalled a contraction. |
| |
| UChar key[1024]; |
| uint32_t posKey = 0; |
| |
| key[posKey++] = *(source->pos++); |
| int32_t getEntryValue = RuleBasedCollator::UNMAPPED; |
| |
| while(source->pos<source->len) { |
| |
| key[posKey++] = *(source->pos); |
| |
| // in place of: int32_t n = getEntry(list, key, TRUE); |
| { |
| int32_t i; |
| if (list != NULL) |
| { |
| for (i = 0; i < list->size(); i++) |
| { |
| EntryPair *pair = list->at(i); |
| |
| if ((pair != NULL) && (pair->fwd == TRUE /*fwd*/) && (pair->entryName == UnicodeString(key, posKey))) |
| { |
| getEntryValue = i; |
| goto done; |
| // break or something |
| } |
| } |
| } |
| getEntryValue = RuleBasedCollator::UNMAPPED; |
| } |
| done: |
| // end of getEntry |
| |
| if (getEntryValue == RuleBasedCollator::UNMAPPED) |
| { |
| break; |
| } |
| |
| source->pos++; |
| pair = (EntryPair *)list->at(getEntryValue); |
| order = pair->value; |
| } |
| *(source->CEpos++) = order; |
| //printf("/Contraction Exit/ %x, %x, %x, %x, %x, %x\n", source->string, source->len, source->pos, source->CEs, source->CEpos, source->toReturn); |
| |
| return (*(source->toReturn++)); |
| } |
| // Expansion sequence start... |
| if (*(source->CEs) >= UCOL_EXPANDCHARINDEX) { |
| VectorOfInt *v = ((RuleBasedCollator *)coll)->data->expandTable->at(*(source->CEpos)-UCOL_EXPANDCHARINDEX); |
| if(v != NULL) { |
| int32_t expandindex=0; |
| while(expandindex < v->size()) { |
| *(source->CEpos++) = v->at(expandindex++); |
| } |
| source->pos++; |
| //printf("/Expansion start Exit/ %x, %x, %x, %x, %x, %x\n", source->string, source->len, source->pos, source->CEs, source->CEpos, source->toReturn); |
| return (*(source->toReturn++)); |
| } |
| } |
| |
| // Thai/Lao reordering |
| if (CollationElementIterator::isThaiPreVowel(*(source->pos))) { |
| UChar consonant = *(source->pos+1); |
| if (CollationElementIterator::isThaiBaseConsonant(consonant)) { |
| source->pos++; |
| // find the element for consonant |
| // and reorder them |
| } |
| } |
| } |
| |
| source->CEpos++; |
| source->pos++; |
| |
| //printf("/Goofy Exit/ %x, %x, %x, %x, %x, %x\n", source->string, source->len, source->pos, source->CEs, source->CEpos, source->toReturn); |
| return (*(source->toReturn++)); |
| } |
| |
| U_CAPI UCollationResult |
| ucol_strcoll( const UCollator *coll, |
| const UChar *source, |
| int32_t sourceLength, |
| const UChar *target, |
| int32_t targetLength) |
| { |
| if (coll == NULL) return UCOL_EQUAL; |
| if (sourceLength == -1) sourceLength = u_strlen(source); |
| if (targetLength == -1) targetLength = u_strlen(target); |
| return (UCollationResult) ((Collator*)coll)->compare(source,sourceLength,target,targetLength); |
| } |
| |
| U_CAPI UCollationResult |
| ucol_strcollEx( const UCollator *coll, |
| const UChar *source, |
| int32_t sourceLength, |
| const UChar *target, |
| int32_t targetLength) |
| { |
| |
| // check if source and target are valid strings |
| if (((source == 0) && (target == 0)) || |
| ((sourceLength == 0) && (targetLength == 0))) |
| { |
| return UCOL_EQUAL; |
| } |
| |
| UCollationResult result = UCOL_EQUAL; |
| UErrorCode status = U_ZERO_ERROR; |
| |
| UChar normSource[UCOL_MAX_BUFFER], normTarget[UCOL_MAX_BUFFER]; |
| uint32_t normSourceLength = UCOL_MAX_BUFFER, normTargetLength = UCOL_MAX_BUFFER; |
| |
| #if 0 |
| if (cursor1 == NULL) |
| { |
| ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLength, getDecomposition()); |
| } |
| else |
| { |
| cursor1->setModeAndText(getDecomposition(), source, sourceLength, status); |
| } |
| |
| if ( /*cursor1->cursor == NULL ||*/ U_FAILURE(status)) |
| { |
| return Collator::EQUAL; |
| } |
| |
| if (cursor2 == NULL) |
| { |
| ((RuleBasedCollator *)this)->cursor2 = new NormalizerIterator(target, targetLength, getDecomposition()); |
| } |
| else |
| { |
| cursor2->setModeAndText(getDecomposition(), target, targetLength, status); |
| } |
| #endif |
| |
| collIterate sColl, tColl; |
| |
| |
| UNormalizationMode normMode = ucol_getNormalization(coll); |
| if(normMode == UCOL_NO_NORMALIZATION) { |
| init_collIterate(source, sourceLength, &sColl); |
| init_collIterate(target, targetLength, &tColl); |
| } else { |
| normSourceLength = u_normalize(source, sourceLength, normMode, 0, normSource, normSourceLength, &status); |
| normTargetLength = u_normalize(target, targetLength, normMode, 0, normTarget, normTargetLength, &status); |
| init_collIterate(normSource, normSourceLength, &sColl); |
| init_collIterate(normTarget, normTargetLength, &tColl); |
| } |
| |
| if (/*cursor2 == NULL ||*/ U_FAILURE(status)) |
| { |
| return UCOL_EQUAL; |
| } |
| |
| int32_t sOrder, tOrder; |
| // int32_t sOrder = CollationElementIterator::NULLORDER, tOrder = CollationElementIterator::NULLORDER; |
| UBool gets = TRUE, gett = TRUE; |
| UBool initialCheckSecTer = ucol_getStrength(coll) >= Collator::SECONDARY; |
| UBool checkSecTer = initialCheckSecTer; |
| UBool checkTertiary = ucol_getStrength(coll) >= Collator::TERTIARY; |
| UBool isFrenchSec = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, &status) == UCOL_ATTR_ON); |
| uint32_t pSOrder, pTOrder; |
| |
| for(;;) |
| { |
| // Get the next collation element in each of the strings, unless |
| // we've been requested to skip it. |
| if (gets) |
| { |
| //sOrder = getStrengthOrder((NormalizerIterator*)cursor1, status); |
| //printf("/Ex 1 Go/ %x, %x, %x, %x, %x, %x\n", sColl.string, sColl.len, sColl.pos, sColl.CEs, sColl.CEpos, sColl.toReturn); |
| sOrder = ucol_getNextCE(coll, &sColl, &status); |
| //printf("/Ex 1 End/ %x, %x, %x, %x, %x, %x\n", sColl.string, sColl.len, sColl.pos, sColl.CEs, sColl.CEpos, sColl.toReturn); |
| |
| if (U_FAILURE(status)) |
| { |
| return UCOL_EQUAL; |
| } |
| } |
| |
| gets = TRUE; |
| |
| if (gett) |
| { |
| //tOrder = getStrengthOrder((NormalizerIterator*)cursor2, status); |
| //printf("/Ex 2 Go/ %x, %x, %x, %x, %x, %x\n", tColl.string, tColl.len, tColl.pos, tColl.CEs, tColl.CEpos, tColl.toReturn); |
| tOrder = ucol_getNextCE(coll, &tColl, &status); |
| //printf("/Ex 2 End/ %x, %x, %x, %x, %x, %x\n", tColl.string, tColl.len, tColl.pos, tColl.CEs, tColl.CEpos, tColl.toReturn); |
| |
| if (U_FAILURE(status)) |
| { |
| return UCOL_EQUAL; |
| } |
| } |
| |
| gett = TRUE; |
| |
| // If we've hit the end of one of the strings, jump out of the loop |
| if ((sOrder == CollationElementIterator::NULLORDER)|| |
| (tOrder == CollationElementIterator::NULLORDER)) |
| { |
| break; |
| } |
| |
| // If there's no difference at this position, we can skip to the |
| // next one. |
| pSOrder = CollationElementIterator::primaryOrder(sOrder); |
| pTOrder = CollationElementIterator::primaryOrder(tOrder); |
| if (sOrder == tOrder) |
| { |
| if (isFrenchSec && pSOrder != 0) |
| { |
| if (!checkSecTer) |
| { |
| // in french, a secondary difference more to the right is stronger, |
| // so accents have to be checked with each base element |
| checkSecTer = initialCheckSecTer; |
| |
| // but tertiary differences are less important than the first |
| // secondary difference, so checking tertiary remains disabled |
| checkTertiary = FALSE; |
| } |
| } |
| |
| continue; |
| } |
| |
| // Compare primary differences first. |
| if (pSOrder != pTOrder) |
| { |
| if (sOrder == 0) |
| { |
| // The entire source element is ignorable. |
| // Skip to the next source element, but don't fetch another target element. |
| gett = FALSE; |
| continue; |
| } |
| |
| if (tOrder == 0) |
| { |
| gets = FALSE; |
| continue; |
| } |
| |
| // The source and target elements aren't ignorable, but it's still possible |
| // for the primary component of one of the elements to be ignorable.... |
| if (pSOrder == 0) // primary order in source is ignorable |
| { |
| // The source's primary is ignorable, but the target's isn't. We treat ignorables |
| // as a secondary difference, so remember that we found one. |
| if (checkSecTer) |
| { |
| result = UCOL_GREATER; // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| } |
| |
| // Skip to the next source element, but don't fetch another target element. |
| gett = FALSE; |
| } |
| else if (pTOrder == 0) |
| { |
| // record differences - see the comment above. |
| if (checkSecTer) |
| { |
| result = UCOL_LESS; // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| } |
| |
| // Skip to the next target element, but don't fetch another source element. |
| gets = FALSE; |
| } |
| else |
| { |
| // Neither of the orders is ignorable, and we already know that the primary |
| // orders are different because of the (pSOrder != pTOrder) test above. |
| // Record the difference and stop the comparison. |
| if (pSOrder < pTOrder) |
| { |
| return UCOL_LESS; // (strength is PRIMARY) |
| } |
| |
| return UCOL_GREATER; // (strength is PRIMARY) |
| } |
| } |
| else |
| { // else of if ( pSOrder != pTOrder ) |
| // primary order is the same, but complete order is different. So there |
| // are no base elements at this point, only ignorables (Since the strings are |
| // normalized) |
| |
| if (checkSecTer) |
| { |
| // a secondary or tertiary difference may still matter |
| uint32_t secSOrder = CollationElementIterator::secondaryOrder(sOrder); |
| uint32_t secTOrder = CollationElementIterator::secondaryOrder(tOrder); |
| |
| if (secSOrder != secTOrder) |
| { |
| // there is a secondary difference |
| result = (secSOrder < secTOrder) ? UCOL_LESS : UCOL_GREATER; |
| // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| // (even in french, only the first secondary difference within |
| // a base character matters) |
| } |
| else |
| { |
| if (checkTertiary) |
| { |
| // a tertiary difference may still matter |
| uint32_t terSOrder = CollationElementIterator::tertiaryOrder(sOrder); |
| uint32_t terTOrder = CollationElementIterator::tertiaryOrder(tOrder); |
| |
| if (terSOrder != terTOrder) |
| { |
| // there is a tertiary difference |
| result = (terSOrder < terTOrder) ? UCOL_LESS : UCOL_GREATER; |
| // (strength is TERTIARY) |
| checkTertiary = FALSE; |
| } |
| } |
| } |
| } // if (checkSecTer) |
| |
| } // if ( pSOrder != pTOrder ) |
| } // while() |
| |
| if (sOrder != CollationElementIterator::NULLORDER) |
| { |
| // (tOrder must be CollationElementIterator::NULLORDER, |
| // since this point is only reached when sOrder or tOrder is NULLORDER.) |
| // The source string has more elements, but the target string hasn't. |
| do |
| { |
| if (CollationElementIterator::primaryOrder(sOrder) != 0) |
| { |
| // We found an additional non-ignorable base character in the source string. |
| // This is a primary difference, so the source is greater |
| return UCOL_GREATER; // (strength is PRIMARY) |
| } |
| |
| if (CollationElementIterator::secondaryOrder(sOrder) != 0) |
| { |
| // Additional secondary elements mean the source string is greater |
| if (checkSecTer) |
| { |
| result = UCOL_GREATER; // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| } |
| } |
| } |
| while ((sOrder = ucol_getNextCE(coll, &sColl, &status)) != CollationElementIterator::NULLORDER); |
| //while ((sOrder = getStrengthOrder(cursor1, status)) != CollationElementIterator::NULLORDER); |
| } |
| else if (tOrder != CollationElementIterator::NULLORDER) |
| { |
| // The target string has more elements, but the source string hasn't. |
| do |
| { |
| if (CollationElementIterator::primaryOrder(tOrder) != 0) |
| { |
| // We found an additional non-ignorable base character in the target string. |
| // This is a primary difference, so the source is less |
| return UCOL_LESS; // (strength is PRIMARY) |
| } |
| |
| if (CollationElementIterator::secondaryOrder(tOrder) != 0) |
| { |
| // Additional secondary elements in the target mean the source string is less |
| if (checkSecTer) |
| { |
| result = UCOL_LESS; // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| } |
| } |
| } |
| while ((tOrder = ucol_getNextCE(coll, &tColl, &status)) != CollationElementIterator::NULLORDER); |
| //while ((tOrder = getStrengthOrder(cursor2, status)) != CollationElementIterator::NULLORDER); |
| } |
| |
| |
| // For IDENTICAL comparisons, we use a bitwise character comparison |
| // as a tiebreaker if all else is equal |
| // NOTE: The java code compares result with 0, and |
| // puts the result of the string comparison directly into result |
| if (result == Collator::EQUAL && ucol_getStrength(coll) == UCOL_IDENTICAL) |
| { |
| #if 0 |
| // ******** for the UChar normalization interface. |
| // It doesn't work much faster, and the code was broken |
| // so it's commented out. --srl |
| // UChar sourceDecomp[1024], targetDecomp[1024]; |
| // int32_t sourceDecompLength = 1024; |
| // int32_t targetDecompLength = 1024; |
| |
| // int8_t comparison; |
| // Normalizer::EMode decompMode = getDecomposition(); |
| |
| // if (decompMode != Normalizer::NO_OP) |
| // { |
| // Normalizer::normalize(source, sourceLength, decompMode, |
| // 0, sourceDecomp, sourceDecompLength, status); |
| |
| // Normalizer::normalize(target, targetLength, decompMode, |
| // 0, targetDecomp, targetDecompLength, status); |
| |
| // comparison = u_strcmp(sourceDecomp,targetDecomp); |
| // } |
| // else |
| // { |
| // comparison = u_strcmp(source, target); /* ! */ |
| // } |
| |
| #else |
| |
| UnicodeString sourceDecomp, targetDecomp; |
| |
| int8_t comparison; |
| |
| Normalizer::normalize(source, ((RuleBasedCollator *)coll)->getDecomposition(), |
| 0, sourceDecomp, status); |
| |
| Normalizer::normalize(target, ((RuleBasedCollator *)coll)->getDecomposition(), |
| 0, targetDecomp, status); |
| |
| comparison = sourceDecomp.compare(targetDecomp); |
| #endif |
| |
| if (comparison < 0) |
| { |
| result = UCOL_LESS; |
| } |
| else if (comparison == 0) |
| { |
| result = UCOL_EQUAL; |
| } |
| else |
| { |
| result = UCOL_GREATER; |
| } |
| } |
| |
| return result; |
| } |
| |
| U_CAPI int32_t |
| ucol_getSortKey(const UCollator *coll, |
| const UChar *source, |
| int32_t sourceLength, |
| uint8_t *result, |
| int32_t resultLength) |
| { |
| int32_t count; |
| const uint8_t* bytes = NULL; |
| CollationKey key; |
| int32_t copyLen; |
| int32_t len = (sourceLength == -1 ? u_strlen(source) |
| : sourceLength); |
| // UnicodeString string((UChar*)source, len, len); |
| UErrorCode status = U_ZERO_ERROR; |
| |
| ((Collator*)coll)->getCollationKey(source, len, key, status); |
| if(U_FAILURE(status)) |
| return 0; |
| |
| bytes = key.getByteArray(count); |
| |
| copyLen = uprv_min(count, resultLength); |
| uprv_arrayCopy((const int8_t*)bytes, (int8_t*)result, copyLen); |
| |
| // if(count > resultLength) { |
| // *status = U_BUFFER_OVERFLOW_ERROR; |
| // } |
| |
| return count; |
| } |
| |
| U_CAPI int32_t |
| ucol_getSortKeyEx(const UCollator *coll, |
| const UChar *source, |
| int32_t sourceLength, |
| uint8_t *result, |
| int32_t resultLength) |
| { |
| |
| //uprv_memset(result, 0xAA, resultLength); // for debug purposes |
| |
| |
| /* |
| Still problems in: |
| SUMMARY: |
| ******* [Total error count: 213] |
| Errors in |
| [tscoll/capitst/TestSortKey] // this is normal, since we are changing binary keys |
| [tscoll/cfrtst/TestSecondary] // this is also OK, ICU original implementation was messed up |
| [tscoll/cfrtst/TestTertiary] // probably the same as above |
| [tscoll/cjacoll/TestTertiary] // most probably due to normalization... |
| [tscoll/cg7coll/TestDemo4] // need to check |
| Total errors: 213 |
| */ |
| |
| uint32_t i = 0; // general purpose counter |
| |
| UErrorCode status = U_ZERO_ERROR; |
| |
| uint8_t prim[2*UCOL_MAX_BUFFER], second[UCOL_MAX_BUFFER], tert[UCOL_MAX_BUFFER]; |
| |
| uint8_t *primaries = prim, *secondaries = second, *tertiaries = tert; |
| |
| UChar normBuffer[2*UCOL_MAX_BUFFER]; |
| UChar *normSource = normBuffer; |
| int32_t normSourceLen = 2048; |
| |
| int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); |
| |
| UBool compareSec = (((RuleBasedCollator *)coll)->getStrength() >= Collator::SECONDARY); |
| UBool compareTer = (((RuleBasedCollator *)coll)->getStrength() >= Collator::TERTIARY); |
| UBool compareIdent = (((RuleBasedCollator *)coll)->getStrength() == Collator::IDENTICAL); |
| |
| if(len > UCOL_MAX_BUFFER) { |
| primaries = (uint8_t *)uprv_malloc(6*len*sizeof(uint8_t)); |
| if(compareSec) { |
| secondaries = (uint8_t *)uprv_malloc(2*len*sizeof(uint8_t)); |
| } |
| if(compareTer) { |
| tertiaries = (uint8_t *)uprv_malloc(2*len*sizeof(uint8_t)); |
| } |
| } |
| |
| uint8_t *primstart = primaries; |
| uint8_t *secstart = secondaries; |
| uint8_t *terstart = tertiaries; |
| |
| collIterate s; |
| init_collIterate((UChar *)source, len, &s); |
| |
| // If we need to normalize, we'll do it all at once at the beggining! |
| if(((RuleBasedCollator *)coll)->getDecomposition() != Normalizer::NO_OP) { |
| UnicodeString normalized; |
| Normalizer::normalize(UnicodeString(source, sourceLength), ((RuleBasedCollator *)coll)->getDecomposition(), |
| 0, normalized, status); |
| normSourceLen = normalized.length(); |
| |
| if(normSourceLen > UCOL_MAX_BUFFER) { |
| normSource = (UChar *) uprv_malloc(normSourceLen*sizeof(UChar)); |
| } |
| |
| normalized.extract(0, normSourceLen, normSource); |
| s.string = normSource; |
| s.pos = normSource; |
| s.len = normSource+normSourceLen; |
| } |
| |
| int32_t order = 0; |
| |
| uint16_t primary = 0; |
| uint8_t secondary = 0; |
| uint8_t tertiary = 0; |
| |
| while((order = ucol_getNextCE(coll, &s, &status)) != |
| CollationElementIterator::NULLORDER) { |
| primary = ((order & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT); |
| secondary = ((order & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT); |
| tertiary = (order & UCOL_TERTIARYORDERMASK); |
| |
| if(primary != UCOL_IGNORABLE) { |
| *(primaries++) = (primary+UCOL_SORTKEYOFFSET)>>8; |
| *(primaries++) = (primary+UCOL_SORTKEYOFFSET)&0xFF; |
| if(compareSec) { |
| *(secondaries++) = secondary+UCOL_SORTKEYOFFSET; |
| } |
| if(compareTer) { |
| *(tertiaries++) = tertiary+UCOL_SORTKEYOFFSET; |
| } |
| } else { |
| if(compareSec && secondary != 0) { |
| *(secondaries++) = secondary+UCOL_SORTKEYOFFSET; |
| } |
| if(compareTer && tertiary != 0) { |
| *(tertiaries++) = tertiary+UCOL_SORTKEYOFFSET; |
| } |
| } |
| } |
| |
| *(primaries++) = UCOL_LEVELTERMINATOR; |
| *(primaries++) = UCOL_LEVELTERMINATOR; |
| |
| |
| if(compareSec) { |
| uint32_t secsize = secondaries-secstart; |
| if(ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, &status) == UCOL_ATTR_ON) { // do the reverse copy |
| for(i = 0; i<secsize; i++) { |
| *(primaries++) = *(secondaries-i-1); |
| } |
| } else { |
| uprv_memcpy(primaries, secstart, secsize); |
| primaries += secsize; |
| } |
| |
| *(primaries++) = UCOL_LEVELTERMINATOR; |
| } |
| |
| if(compareTer) { |
| uint32_t tersize = tertiaries - terstart; |
| uprv_memcpy(primaries, terstart, tersize); |
| primaries += tersize; |
| *(primaries++) = UCOL_LEVELTERMINATOR; |
| } |
| |
| |
| if(compareIdent) { |
| for(i = 0; i<len; i++) { |
| *(primaries++) = (s.string[i] >> 8) + utf16fixup[s.string[i] >> 11]; |
| *(primaries++) = (s.string[i] & 0xFF); |
| } |
| *(primaries++) = UCOL_LEVELTERMINATOR; |
| } |
| |
| uprv_memcpy(result, primstart, uprv_min(resultLength, (primaries-primstart))); |
| |
| if(terstart != tert) { |
| uprv_free(terstart); |
| } |
| if(secstart != second) { |
| uprv_free(secstart); |
| } |
| if(primstart != prim) { |
| uprv_free(primstart); |
| } |
| if(normSource != normBuffer) { |
| uprv_free(normSource); |
| } |
| |
| return primaries-primstart; |
| } |
| |
| U_CAPI int32_t |
| ucol_keyHashCode( const uint8_t* key, |
| int32_t length) |
| { |
| CollationKey newKey(key, length); |
| return newKey.hashCode(); |
| } |
| |
| UCollationElements* |
| ucol_openElements( const UCollator *coll, |
| const UChar *text, |
| int32_t textLength, |
| UErrorCode *status) |
| { |
| int32_t len = (textLength == -1 ? u_strlen(text) : textLength); |
| const UnicodeString src((UChar*)text, len, len); |
| |
| CollationElementIterator *iter = 0; |
| iter = ((RuleBasedCollator*)coll)->createCollationElementIterator(src); |
| if(iter == 0) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| |
| return (UCollationElements*) iter; |
| } |
| |
| U_CAPI void |
| ucol_closeElements(UCollationElements *elems) |
| { |
| delete (CollationElementIterator*)elems; |
| } |
| |
| U_CAPI void |
| ucol_reset(UCollationElements *elems) |
| { |
| ((CollationElementIterator*)elems)->reset(); |
| } |
| |
| U_CAPI int32_t |
| ucol_next( UCollationElements *elems, |
| UErrorCode *status) |
| { |
| if(U_FAILURE(*status)) return UCOL_NULLORDER; |
| |
| return ((CollationElementIterator*)elems)->next(*status); |
| } |
| |
| U_CAPI int32_t |
| ucol_previous( UCollationElements *elems, |
| UErrorCode *status) |
| { |
| if(U_FAILURE(*status)) return UCOL_NULLORDER; |
| |
| return ((CollationElementIterator*)elems)->previous(*status); |
| } |
| |
| U_CAPI int32_t |
| ucol_getMaxExpansion( const UCollationElements *elems, |
| int32_t order) |
| { |
| return ((CollationElementIterator*)elems)->getMaxExpansion(order); |
| } |
| |
| U_CAPI void |
| ucol_setText(UCollationElements *elems, |
| const UChar *text, |
| int32_t textLength, |
| UErrorCode *status) |
| { |
| if(U_FAILURE(*status)) return; |
| |
| int32_t len = (textLength == -1 ? u_strlen(text) : textLength); |
| const UnicodeString src((UChar*)text, len, len); |
| |
| ((CollationElementIterator*)elems)->setText(src, *status); |
| } |
| |
| U_CAPI UTextOffset |
| ucol_getOffset(const UCollationElements *elems) |
| { |
| return ((CollationElementIterator*)elems)->getOffset(); |
| } |
| |
| U_CAPI void |
| ucol_setOffset( UCollationElements *elems, |
| UTextOffset offset, |
| UErrorCode *status) |
| { |
| if(U_FAILURE(*status)) return; |
| |
| ((CollationElementIterator*)elems)->setOffset(offset, *status); |
| } |
| |
| U_CAPI void |
| ucol_getVersion(const UCollator* coll, |
| UVersionInfo versionInfo) |
| { |
| ((Collator*)coll)->getVersion(versionInfo); |
| } |
| |
| U_CAPI uint8_t * |
| ucol_cloneRuleData(UCollator *coll, int32_t *length, UErrorCode *status) |
| { |
| return ((RuleBasedCollator*)coll)->cloneRuleData(*length,*status); |
| } |
| |
| U_CAPI void ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { |
| *status = U_UNSUPPORTED_ERROR; |
| return; |
| } |
| |
| U_CAPI UColAttributeValue ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { |
| *status = U_UNSUPPORTED_ERROR; |
| return UCOL_ATTR_DEFAULT; |
| } |
| |
| U_CAPI UCollator *ucol_safeClone(const UCollator *coll, void *stackBuffer, uint32_t bufferSize, UErrorCode *status) { |
| *status = U_UNSUPPORTED_ERROR; |
| return NULL; |
| } |
| |
| U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll, |
| UCharForwardIterator *source, void *sourceContext, |
| UCharForwardIterator *target, void *targetContext) { |
| return UCOL_EQUAL; |
| } |
| |
| U_CAPI int32_t ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) { |
| return 0; |
| } |