| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-2006, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * file name: ucol.cpp |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * Modification history |
| * Date Name Comments |
| * 1996-1999 various members of ICU team maintained C API for collation framework |
| * 02/16/2001 synwee Added internal method getPrevSpecialCE |
| * 03/01/2001 synwee Added maxexpansion functionality. |
| * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "uassert.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/coleitr.h" |
| #include "unicode/unorm.h" |
| #include "unicode/udata.h" |
| #include "unicode/ustring.h" |
| |
| #include "ucol_imp.h" |
| #include "ucol_elm.h" |
| #include "bocsu.h" |
| |
| #include "unormimp.h" |
| #include "unorm_it.h" |
| #include "umutex.h" |
| #include "cmemory.h" |
| #include "ucln_in.h" |
| #include "cstring.h" |
| #include "utracimp.h" |
| #include "putilimp.h" |
| |
| #ifdef UCOL_DEBUG |
| #include <stdio.h> |
| #endif |
| |
| U_NAMESPACE_USE |
| |
/* Shift and mask constants for trie manipulation (added by synwee). */
#define STAGE_1_SHIFT_            10
#define STAGE_2_SHIFT_            4
#define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
#define STAGE_3_MASK_             0xF
#define LAST_BYTE_MASK_           0xFF
#define SECOND_LAST_BYTE_SHIFT_   8

#define ZERO_CC_LIMIT_            0xC0
| |
| // static UCA. There is only one. Collators don't use it. |
| // It is referenced only in ucol_initUCA and ucol_cleanup |
| static UCollator* _staticUCA = NULL; |
| // static pointer to udata memory. Inited in ucol_initUCA |
| // used for cleanup in ucol_cleanup |
| static UDataMemory* UCA_DATA_MEM = NULL; |
| |
| // this is static pointer to the normalizer fcdTrieIndex |
| // it is always the same between calls to u_cleanup |
| // and therefore writing to it is not synchronized. |
| // It is cleaned in ucol_cleanup |
| static const uint16_t *fcdTrieIndex=NULL; |
| |
// Values from the UCA that are required for implicit generation and for
// suppressing sort key compression. Normally they live in the UCA data,
// but running without the UCA could be a problem, hence these defaults.
static int32_t maxRegularPrimary  = 0xA0;
static int32_t minImplicitPrimary = 0xE0;
static int32_t maxImplicitPrimary = 0xE4;
| |
| U_CDECL_BEGIN |
| static UBool U_CALLCONV |
| isAcceptableUCA(void * /*context*/, |
| const char * /*type*/, const char * /*name*/, |
| const UDataInfo *pInfo){ |
| /* context, type & name are intentionally not used */ |
| if( pInfo->size>=20 && |
| pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
| pInfo->charsetFamily==U_CHARSET_FAMILY && |
| pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 && /* dataFormat="UCol" */ |
| pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 && |
| pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 && |
| pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 && |
| pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 && |
| pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// && |
| //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 && |
| //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh |
| //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh |
| ) { |
| UVersionInfo UCDVersion; |
| u_getUnicodeVersion(UCDVersion); |
| if(pInfo->dataVersion[0]==UCDVersion[0] && |
| pInfo->dataVersion[1]==UCDVersion[1]) { // && |
| //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] && |
| //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) { |
| return TRUE; |
| } else { |
| return FALSE; |
| } |
| } else { |
| return FALSE; |
| } |
| } |
| |
| |
| static int32_t U_CALLCONV |
| _getFoldingOffset(uint32_t data) { |
| return (int32_t)(data&0xFFFFFF); |
| } |
| |
| U_CDECL_END |
| |
| static |
| inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, |
| int32_t sourceLen, collIterate *s) { |
| (s)->string = (s)->pos = (UChar *)(sourceString); |
| (s)->origFlags = 0; |
| (s)->flags = 0; |
| if (sourceLen >= 0) { |
| s->flags |= UCOL_ITER_HASLEN; |
| (s)->endp = (UChar *)sourceString+sourceLen; |
| } |
| else { |
| /* change to enable easier checking for end of string for fcdpositon */ |
| (s)->endp = NULL; |
| } |
| (s)->CEpos = (s)->toReturn = (s)->CEs; |
| (s)->writableBuffer = (s)->stackWritableBuffer; |
| (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; |
| (s)->coll = (collator); |
| (s)->fcdPosition = 0; |
| if(collator->normalizationMode == UCOL_ON) { |
| (s)->flags |= UCOL_ITER_NORM; |
| } |
| if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { |
| (s)->flags |= UCOL_HIRAGANA_Q; |
| } |
| (s)->iterator = NULL; |
| //(s)->iteratorIndex = 0; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, |
| int32_t sourceLen, collIterate *s){ |
| /* Out-of-line version for use from other files. */ |
| IInit_collIterate(collator, sourceString, sourceLen, s); |
| } |
| |
| |
| /** |
| * Backup the state of the collIterate struct data |
| * @param data collIterate to backup |
| * @param backup storage |
| */ |
| static |
| inline void backupState(const collIterate *data, collIterateState *backup) |
| { |
| backup->fcdPosition = data->fcdPosition; |
| backup->flags = data->flags; |
| backup->origFlags = data->origFlags; |
| backup->pos = data->pos; |
| backup->bufferaddress = data->writableBuffer; |
| backup->buffersize = data->writableBufSize; |
| backup->iteratorMove = 0; |
| backup->iteratorIndex = 0; |
| if(data->iterator != NULL) { |
| //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); |
| backup->iteratorIndex = data->iterator->getState(data->iterator); |
| // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE |
| if(backup->iteratorIndex == UITER_NO_STATE) { |
| while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { |
| backup->iteratorMove++; |
| data->iterator->move(data->iterator, -1, UITER_CURRENT); |
| } |
| data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); |
| } |
| } |
| } |
| |
| /** |
| * Loads the state into the collIterate struct data |
| * @param data collIterate to backup |
| * @param backup storage |
| * @param forwards boolean to indicate if forwards iteration is used, |
| * false indicates backwards iteration |
| */ |
| static |
| inline void loadState(collIterate *data, const collIterateState *backup, |
| UBool forwards) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| data->flags = backup->flags; |
| data->origFlags = backup->origFlags; |
| if(data->iterator != NULL) { |
| //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); |
| data->iterator->setState(data->iterator, backup->iteratorIndex, &status); |
| if(backup->iteratorMove != 0) { |
| data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); |
| } |
| } |
| data->pos = backup->pos; |
| if ((data->flags & UCOL_ITER_INNORMBUF) && |
| data->writableBuffer != backup->bufferaddress) { |
| /* |
| this is when a new buffer has been reallocated and we'll have to |
| calculate the new position. |
| note the new buffer has to contain the contents of the old buffer. |
| */ |
| if (forwards) { |
| data->pos = data->writableBuffer + |
| (data->pos - backup->bufferaddress); |
| } |
| else { |
| /* backwards direction */ |
| uint32_t temp = backup->buffersize - |
| (data->pos - backup->bufferaddress); |
| data->pos = data->writableBuffer + (data->writableBufSize - temp); |
| } |
| } |
| if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
| /* |
| this is alittle tricky. |
| if we are initially not in the normalization buffer, even if we |
| normalize in the later stage, the data in the buffer will be |
| ignored, since we skip back up to the data string. |
| however if we are already in the normalization buffer, any |
| further normalization will pull data into the normalization |
| buffer and modify the fcdPosition. |
| since we are keeping the data in the buffer for use, the |
| fcdPosition can not be reverted back. |
| arrgghh.... |
| */ |
| data->fcdPosition = backup->fcdPosition; |
| } |
| } |
| |
| |
| /* |
| * collIter_eos() |
| * Checks for a collIterate being positioned at the end of |
| * its source string. |
| * |
| */ |
| static |
| inline UBool collIter_eos(collIterate *s) { |
| if(s->flags & UCOL_USE_ITERATOR) { |
| return !(s->iterator->hasNext(s->iterator)); |
| } |
| if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { |
| // Null terminated string, but not at null, so not at end. |
| // Whether in main or normalization buffer doesn't matter. |
| return FALSE; |
| } |
| |
| // String with length. Can't be in normalization buffer, which is always |
| // null termintated. |
| if (s->flags & UCOL_ITER_HASLEN) { |
| return (s->pos == s->endp); |
| } |
| |
| // We are at a null termination, could be either normalization buffer or main string. |
| if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { |
| // At null at end of main string. |
| return TRUE; |
| } |
| |
| // At null at end of normalization buffer. Need to check whether there there are |
| // any characters left in the main buffer. |
| if(s->origFlags & UCOL_USE_ITERATOR) { |
| return !(s->iterator->hasNext(s->iterator)); |
| } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { |
| // Null terminated main string. fcdPosition is the 'return' position into main buf. |
| return (*s->fcdPosition == 0); |
| } |
| else { |
| // Main string with an end pointer. |
| return s->fcdPosition == s->endp; |
| } |
| } |
| |
| /* |
| * collIter_bos() |
| * Checks for a collIterate being positioned at the start of |
| * its source string. |
| * |
| */ |
| static |
| inline UBool collIter_bos(collIterate *source) { |
| // if we're going backwards, we need to know whether there is more in the |
| // iterator, even if we are in the side buffer |
| if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { |
| return !source->iterator->hasPrevious(source->iterator); |
| } |
| if (source->pos <= source->string || |
| ((source->flags & UCOL_ITER_INNORMBUF) && |
| *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| static |
| inline UBool collIter_SimpleBos(collIterate *source) { |
| // if we're going backwards, we need to know whether there is more in the |
| // iterator, even if we are in the side buffer |
| if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { |
| return !source->iterator->hasPrevious(source->iterator); |
| } |
| if (source->pos == source->string) { |
| return TRUE; |
| } |
| return FALSE; |
| } |
| //return (data->pos == data->string) || |
| |
| |
| /** |
| * Checks and free writable buffer if it is not the original stack buffer |
| * in collIterate. This function does not reassign the writable buffer. |
| * @param data collIterate struct to determine and free the writable buffer |
| */ |
| static |
| inline void freeHeapWritableBuffer(collIterate *data) |
| { |
| if (data->writableBuffer != data->stackWritableBuffer) { |
| uprv_free(data->writableBuffer); |
| } |
| } |
| |
| |
| /****************************************************************************/ |
| /* Following are the open/close functions */ |
| /* */ |
| /****************************************************************************/ |
| |
| static UCollator* |
| ucol_initFromBinary(const uint8_t *bin, int32_t length, |
| const UCollator *base, |
| UCollator *fillIn, |
| UErrorCode *status) |
| { |
| UCollator *result = fillIn; |
| if(U_FAILURE(*status)) { |
| return NULL; |
| } |
| /* |
| if(base == NULL) { |
| // we don't support null base yet |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| */ |
| // We need these and we could be running without UCA |
| uprv_uca_initImplicitConstants(0, 0, status); |
| UCATableHeader *colData = (UCATableHeader *)bin; |
| // do we want version check here? We're trying to figure out whether collators are compatible |
| if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || |
| uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || |
| colData->version[0] != UCOL_BUILDER_VERSION) |
| { |
| *status = U_COLLATOR_VERSION_MISMATCH; |
| return NULL; |
| } |
| else { |
| if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { |
| result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); |
| if(U_FAILURE(*status)){ |
| return NULL; |
| } |
| result->hasRealData = TRUE; |
| } |
| else { |
| if(base) { |
| result = ucol_initCollator(base->image, result, base, status); |
| ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); |
| if(U_FAILURE(*status)){ |
| return NULL; |
| } |
| result->hasRealData = FALSE; |
| } |
| else { |
| *status = U_USELESS_COLLATOR_ERROR; |
| return NULL; |
| } |
| } |
| result->freeImageOnClose = FALSE; |
| } |
| result->validLocale = NULL; |
| result->requestedLocale = NULL; |
| result->rules = NULL; |
| result->rulesLength = 0; |
| result->freeRulesOnClose = FALSE; |
| result->rb = NULL; |
| result->elements = NULL; |
| return result; |
| } |
| |
| U_CAPI UCollator* U_EXPORT2 |
| ucol_openBinary(const uint8_t *bin, int32_t length, |
| const UCollator *base, |
| UErrorCode *status) |
| { |
| return ucol_initFromBinary(bin, length, base, NULL, status); |
| } |
| |
| U_CAPI UCollator* U_EXPORT2 |
| ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) |
| { |
| UCollator * localCollator; |
| int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); |
| char *stackBufferChars = (char *)stackBuffer; |
| int32_t imageSize = 0; |
| int32_t rulesSize = 0; |
| int32_t rulesPadding = 0; |
| uint8_t *image; |
| UChar *rules; |
| UBool colAllocated = FALSE; |
| UBool imageAllocated = FALSE; |
| |
| if (status == NULL || U_FAILURE(*status)){ |
| return 0; |
| } |
| if ((stackBuffer && !pBufferSize) || !coll){ |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| if (coll->rules && coll->freeRulesOnClose) { |
| rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); |
| rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); |
| bufferSizeNeeded += rulesSize + rulesPadding; |
| } |
| |
| if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ |
| *pBufferSize = bufferSizeNeeded; |
| return 0; |
| } |
| |
| /* Pointers on 64-bit platforms need to be aligned |
| * on a 64-bit boundry in memory. |
| */ |
| if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { |
| int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); |
| if (*pBufferSize > offsetUp) { |
| *pBufferSize -= offsetUp; |
| stackBufferChars += offsetUp; |
| } |
| else { |
| /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ |
| *pBufferSize = 1; |
| } |
| } |
| stackBuffer = (void *)stackBufferChars; |
| |
| if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { |
| /* allocate one here...*/ |
| stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); |
| colAllocated = TRUE; |
| if (U_SUCCESS(*status)) { |
| *status = U_SAFECLONE_ALLOCATED_WARNING; |
| } |
| } |
| localCollator = (UCollator *)stackBufferChars; |
| rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); |
| { |
| UErrorCode tempStatus = U_ZERO_ERROR; |
| imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); |
| } |
| if (coll->freeImageOnClose) { |
| image = (uint8_t *)uprv_malloc(imageSize); |
| ucol_cloneBinary(coll, image, imageSize, status); |
| imageAllocated = TRUE; |
| } |
| else { |
| image = (uint8_t *)coll->image; |
| } |
| localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| |
| if (coll->rules) { |
| if (coll->freeRulesOnClose) { |
| localCollator->rules = u_strcpy(rules, coll->rules); |
| //bufferEnd += rulesSize; |
| } |
| else { |
| localCollator->rules = coll->rules; |
| } |
| localCollator->freeRulesOnClose = FALSE; |
| localCollator->rulesLength = coll->rulesLength; |
| } |
| |
| int32_t i; |
| for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { |
| ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); |
| } |
| localCollator->requestedLocale = NULL; // zero copies of pointers |
| localCollator->validLocale = NULL; |
| localCollator->rb = NULL; |
| localCollator->elements = NULL; |
| localCollator->freeOnClose = colAllocated; |
| localCollator->freeImageOnClose = imageAllocated; |
| return localCollator; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucol_close(UCollator *coll) |
| { |
| UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); |
| UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); |
| if(coll != NULL) { |
| // these are always owned by each UCollator struct, |
| // so we always free them |
| if(coll->validLocale != NULL) { |
| uprv_free(coll->validLocale); |
| } |
| if(coll->requestedLocale != NULL) { |
| uprv_free(coll->requestedLocale); |
| } |
| if(coll->resCleaner != NULL) { |
| coll->resCleaner(coll); |
| } |
| if(coll->latinOneCEs != NULL) { |
| uprv_free(coll->latinOneCEs); |
| } |
| if(coll->options != NULL && coll->freeOptionsOnClose) { |
| uprv_free(coll->options); |
| } |
| if(coll->rules != NULL && coll->freeRulesOnClose) { |
| uprv_free((UChar *)coll->rules); |
| } |
| if(coll->image != NULL && coll->freeImageOnClose) { |
| uprv_free((UCATableHeader *)coll->image); |
| } |
| |
| /* Here, it would be advisable to close: */ |
| /* - UData for UCA (unless we stuff it in the root resb */ |
| /* Again, do we need additional housekeeping... HMMM! */ |
| UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); |
| if(coll->freeOnClose){ |
| /* for safeClone, if freeOnClose is FALSE, |
| don't free the other instance data */ |
| uprv_free(coll); |
| } |
| } |
| UTRACE_EXIT(); |
| } |
| |
| /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ |
| /* you should be able to get the binary chunk to write out... Doesn't look very full now */ |
| U_CAPI uint8_t* U_EXPORT2 |
| ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) |
| { |
| uint8_t *result = NULL; |
| if(U_FAILURE(*status)) { |
| return NULL; |
| } |
| if(coll->hasRealData == TRUE) { |
| *length = coll->image->size; |
| result = (uint8_t *)uprv_malloc(*length); |
| /* test for NULL */ |
| if (result == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| uprv_memcpy(result, coll->image, *length); |
| } else { |
| *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); |
| result = (uint8_t *)uprv_malloc(*length); |
| /* test for NULL */ |
| if (result == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| |
| /* build the UCATableHeader with minimal entries */ |
| /* do not copy the header from the UCA file because its values are wrong! */ |
| /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ |
| |
| /* reset everything */ |
| uprv_memset(result, 0, *length); |
| |
| /* set the tailoring-specific values */ |
| UCATableHeader *myData = (UCATableHeader *)result; |
| myData->size = *length; |
| |
| /* offset for the options, the only part of the data that is present after the header */ |
| myData->options = sizeof(UCATableHeader); |
| |
| /* need to always set the expansion value for an upper bound of the options */ |
| myData->expansion = myData->options + sizeof(UColOptionSet); |
| |
| myData->magic = UCOL_HEADER_MAGIC; |
| myData->isBigEndian = U_IS_BIG_ENDIAN; |
| myData->charSetFamily = U_CHARSET_FAMILY; |
| |
| /* copy UCA's version; genrb will override all but the builder version with tailoring data */ |
| uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); |
| |
| uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); |
| uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); |
| uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); |
| myData->jamoSpecial = coll->image->jamoSpecial; |
| |
| /* copy the collator options */ |
| uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); |
| } |
| return result; |
| } |
| |
| void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { |
| if(U_FAILURE(*status)) { |
| return; |
| } |
| result->caseFirst = (UColAttributeValue)opts->caseFirst; |
| result->caseLevel = (UColAttributeValue)opts->caseLevel; |
| result->frenchCollation = (UColAttributeValue)opts->frenchCollation; |
| result->normalizationMode = (UColAttributeValue)opts->normalizationMode; |
| result->strength = (UColAttributeValue)opts->strength; |
| result->variableTopValue = opts->variableTopValue; |
| result->alternateHandling = (UColAttributeValue)opts->alternateHandling; |
| result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; |
| result->numericCollation = (UColAttributeValue)opts->numericCollation; |
| |
| result->caseFirstisDefault = TRUE; |
| result->caseLevelisDefault = TRUE; |
| result->frenchCollationisDefault = TRUE; |
| result->normalizationModeisDefault = TRUE; |
| result->strengthisDefault = TRUE; |
| result->variableTopValueisDefault = TRUE; |
| result->hiraganaQisDefault = TRUE; |
| result->numericCollationisDefault = TRUE; |
| |
| ucol_updateInternalState(result, status); |
| |
| result->options = opts; |
| } |
| |
| |
| /** |
| * Approximate determination if a character is at a contraction end. |
| * Guaranteed to be TRUE if a character is at the end of a contraction, |
| * otherwise it is not deterministic. |
| * @param c character to be determined |
| * @param coll collator |
| */ |
| static |
| inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { |
| if (U16_IS_TRAIL(c)) { |
| return TRUE; |
| } |
| |
| if (c < coll->minContrEndCP) { |
| return FALSE; |
| } |
| |
| int32_t hash = c; |
| uint8_t htbyte; |
| if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { |
| hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; |
| } |
| htbyte = coll->contrEndCP[hash>>3]; |
| return (((htbyte >> (hash & 7)) & 1) == 1); |
| } |
| |
| |
| |
| /* |
| * i_getCombiningClass() |
| * A fast, at least partly inline version of u_getCombiningClass() |
| * This is a candidate for further optimization. Used heavily |
| * in contraction processing. |
| */ |
| static |
| inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { |
| uint8_t sCC = 0; |
| if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { |
| sCC = u_getCombiningClass(c); |
| } |
| return sCC; |
| } |
| |
| UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { |
| UChar c; |
| UCollator *result = fillIn; |
| if(U_FAILURE(*status) || image == NULL) { |
| return NULL; |
| } |
| |
| if(result == NULL) { |
| result = (UCollator *)uprv_malloc(sizeof(UCollator)); |
| if(result == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return result; |
| } |
| result->freeOnClose = TRUE; |
| } else { |
| result->freeOnClose = FALSE; |
| } |
| |
| result->image = image; |
| result->mapping.getFoldingOffset = _getFoldingOffset; |
| const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; |
| utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); |
| if(U_FAILURE(*status)) { |
| if(result->freeOnClose == TRUE) { |
| uprv_free(result); |
| result = NULL; |
| } |
| return result; |
| } |
| |
| /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/ |
| result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); |
| result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); |
| result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); |
| result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); |
| |
| result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options); |
| result->freeOptionsOnClose = FALSE; |
| |
| /* set attributes */ |
| result->caseFirst = (UColAttributeValue)result->options->caseFirst; |
| result->caseLevel = (UColAttributeValue)result->options->caseLevel; |
| result->frenchCollation = (UColAttributeValue)result->options->frenchCollation; |
| result->normalizationMode = (UColAttributeValue)result->options->normalizationMode; |
| result->strength = (UColAttributeValue)result->options->strength; |
| result->variableTopValue = result->options->variableTopValue; |
| result->alternateHandling = (UColAttributeValue)result->options->alternateHandling; |
| result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ; |
| result->numericCollation = (UColAttributeValue)result->options->numericCollation; |
| |
| result->caseFirstisDefault = TRUE; |
| result->caseLevelisDefault = TRUE; |
| result->frenchCollationisDefault = TRUE; |
| result->normalizationModeisDefault = TRUE; |
| result->strengthisDefault = TRUE; |
| result->variableTopValueisDefault = TRUE; |
| result->alternateHandlingisDefault = TRUE; |
| result->hiraganaQisDefault = TRUE; |
| result->numericCollationisDefault = TRUE; |
| |
| /*result->scriptOrder = NULL;*/ |
| |
| result->rules = NULL; |
| result->rulesLength = 0; |
| |
| /* get the version info from UCATableHeader and populate the Collator struct*/ |
| result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ |
| result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ |
| result->dataVersion[2] = 0; |
| result->dataVersion[3] = 0; |
| |
| result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; |
| result->minUnsafeCP = 0; |
| for (c=0; c<0x300; c++) { // Find the smallest unsafe char. |
| if (ucol_unsafeCP(c, result)) break; |
| } |
| result->minUnsafeCP = c; |
| |
| result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; |
| result->minContrEndCP = 0; |
| for (c=0; c<0x300; c++) { // Find the Contraction-ending char. |
| if (ucol_contractionEndCP(c, result)) break; |
| } |
| result->minContrEndCP = c; |
| |
| /* max expansion tables */ |
| result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + |
| result->image->endExpansionCE); |
| result->lastEndExpansionCE = result->endExpansionCE + |
| result->image->endExpansionCECount - 1; |
| result->expansionCESize = (uint8_t*)result->image + |
| result->image->expansionCESize; |
| |
| |
| //result->errorCode = *status; |
| |
| result->latinOneCEs = NULL; |
| |
| result->latinOneRegenTable = FALSE; |
| result->latinOneFailed = FALSE; |
| result->UCA = UCA; |
| result->resCleaner = NULL; |
| |
| ucol_updateInternalState(result, status); |
| |
| |
| return result; |
| } |
| |
| /* new Mark's code */ |
| |
| /** |
| * For generation of Implicit CEs |
| * @author Davis |
| * |
| * Cleaned up so that changes can be made more easily. |
| * Old values: |
| # First Implicit: E26A792D |
| # Last Implicit: E3DC70C0 |
| # First CJK: E0030300 |
| # Last CJK: E0A9DD00 |
| # First CJK_A: E0A9DF00 |
| # Last CJK_A: E0DE3100 |
| */ |
| /* Following is a port of Mark's code for new treatment of implicits. |
| * It is positioned here, since ucol_initUCA need to initialize the |
| * variables below according to the data in the fractional UCA. |
| */ |
| |
| /** |
| * Function used to: |
| * a) collapse the 2 different Han ranges from UCA into one (in the right order), and |
| * b) bump any non-CJK characters by 10FFFF. |
| * The relevant blocks are: |
| * A: 4E00..9FFF; CJK Unified Ideographs |
| * F900..FAFF; CJK Compatibility Ideographs |
| * B: 3400..4DBF; CJK Unified Ideographs Extension A |
| * 20000..XX; CJK Unified Ideographs Extension B (and others later on) |
| * As long as |
| * no new B characters are allocated between 4E00 and FAFF, and |
| * no new A characters are outside of this range, |
| * (very high probability) this simple code will work. |
| * The reordered blocks are: |
| * Block1 is CJK |
| * Block2 is CJK_COMPAT_USED |
| * Block3 is CJK_A |
| * (all contiguous) |
| * Any other CJK gets its normal code point |
| * Any non-CJK gets +10FFFF |
| * When we reorder Block1, we make sure that it is at the very start, |
| * so that it will use a 3-byte form. |
| * Warning: we only pick up the compatibility characters that are |
| * NOT decomposed, so that block is smaller! |
| */ |
| |
| // CONSTANTS |
| static const UChar32 |
| NON_CJK_OFFSET = 0x110000, |
| UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 |
| |
/**
 * Parameters of the implicit weight generation.
 * Precomputed by constructor; all of them start out as 0.
 */
static int32_t final3Multiplier = 0;
static int32_t final4Multiplier = 0;
static int32_t final3Count = 0;
static int32_t final4Count = 0;
static int32_t medialCount = 0;
static int32_t min3Primary = 0;
static int32_t min4Primary = 0;
static int32_t max4Primary = 0;
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
static int32_t min4Boundary = 0;
| |
| static const UChar32 |
| CJK_BASE = 0x4E00, |
| CJK_LIMIT = 0x9FFF+1, |
| CJK_COMPAT_USED_BASE = 0xFA0E, |
| CJK_COMPAT_USED_LIMIT = 0xFA2F+1, |
| CJK_A_BASE = 0x3400, |
| CJK_A_LIMIT = 0x4DBF+1, |
| CJK_B_BASE = 0x20000, |
| CJK_B_LIMIT = 0x2A6DF+1; |
| |
| static UChar32 swapCJK(UChar32 i) { |
| |
| if (i >= CJK_BASE) { |
| if (i < CJK_LIMIT) return i - CJK_BASE; |
| |
| if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE |
| + (CJK_LIMIT - CJK_BASE); |
| if (i < CJK_B_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_B_LIMIT) return i; // non-BMP-CJK |
| |
| return i + NON_CJK_OFFSET; // non-CJK |
| } |
| if (i < CJK_A_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_A_LIMIT) return i - CJK_A_BASE |
| + (CJK_LIMIT - CJK_BASE) |
| + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
| return i + NON_CJK_OFFSET; // non-CJK |
| } |
| |
| U_CAPI UChar32 U_EXPORT2 |
| uprv_uca_getRawFromCodePoint(UChar32 i) { |
| return swapCJK(i)+1; |
| } |
| |
| U_CAPI UChar32 U_EXPORT2 |
| uprv_uca_getCodePointFromRaw(UChar32 i) { |
| i--; |
| UChar32 result = 0; |
| if(i >= NON_CJK_OFFSET) { |
| result = i - NON_CJK_OFFSET; |
| } else if(i >= CJK_B_BASE) { |
| result = i; |
| } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted |
| if(i < CJK_LIMIT - CJK_BASE) { |
| result = i + CJK_BASE; |
| } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { |
| result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); |
| } else { |
| result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
| } |
| } else { |
| result = -1; |
| } |
| return result; |
| } |
| |
| // GET IMPLICIT PRIMARY WEIGHTS |
| // Return value is left justified primary key |
| U_CAPI uint32_t U_EXPORT2 |
| uprv_uca_getImplicitFromRaw(UChar32 cp) { |
| /* |
| if (cp < 0 || cp > UCOL_MAX_INPUT) { |
| throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); |
| } |
| */ |
| int32_t last0 = cp - min4Boundary; |
| if (last0 < 0) { |
| int32_t last1 = cp / final3Count; |
| last0 = cp % final3Count; |
| |
| int32_t last2 = last1 / medialCount; |
| last1 %= medialCount; |
| |
| last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start |
| last1 = minTrail + last1; // offset |
| last2 = min3Primary + last2; // offset |
| /* |
| if (last2 >= min4Primary) { |
| throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); |
| } |
| */ |
| return (last2 << 24) + (last1 << 16) + (last0 << 8); |
| } else { |
| int32_t last1 = last0 / final4Count; |
| last0 %= final4Count; |
| |
| int32_t last2 = last1 / medialCount; |
| last1 %= medialCount; |
| |
| int32_t last3 = last2 / medialCount; |
| last2 %= medialCount; |
| |
| last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start |
| last1 = minTrail + last1; // offset |
| last2 = minTrail + last2; // offset |
| last3 = min4Primary + last3; // offset |
| /* |
| if (last3 > max4Primary) { |
| throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); |
| } |
| */ |
| return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; |
| } |
| } |
| |
| U_CAPI uint32_t U_EXPORT2 |
| uprv_uca_getImplicitPrimary(UChar32 cp) { |
| //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); |
| |
| cp = swapCJK(cp); |
| cp++; |
| // we now have a range of numbers from 0 to 21FFFF. |
| |
| //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); |
| |
| return uprv_uca_getImplicitFromRaw(cp); |
| } |
| |
| /** |
| * Converts implicit CE into raw integer ("code point") |
| * @param implicit |
| * @return -1 if illegal format |
| */ |
| U_CAPI UChar32 U_EXPORT2 |
| uprv_uca_getRawFromImplicit(uint32_t implicit) { |
| UChar32 result; |
| UChar32 b3 = implicit & 0xFF; |
| implicit >>= 8; |
| UChar32 b2 = implicit & 0xFF; |
| implicit >>= 8; |
| UChar32 b1 = implicit & 0xFF; |
| implicit >>= 8; |
| UChar32 b0 = implicit & 0xFF; |
| |
| // simple parameter checks |
| if (b0 < min3Primary || b0 > max4Primary |
| || b1 < minTrail || b1 > maxTrail) return -1; |
| // normal offsets |
| b1 -= minTrail; |
| |
| // take care of the final values, and compose |
| if (b0 < min4Primary) { |
| if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1; |
| b2 -= minTrail; |
| UChar32 remainder = b2 % final3Multiplier; |
| if (remainder != 0) return -1; |
| b0 -= min3Primary; |
| b2 /= final3Multiplier; |
| result = ((b0 * medialCount) + b1) * final3Count + b2; |
| } else { |
| if (b2 < minTrail || b2 > maxTrail |
| || b3 < minTrail || b3 > max4Trail) return -1; |
| b2 -= minTrail; |
| b3 -= minTrail; |
| UChar32 remainder = b3 % final4Multiplier; |
| if (remainder != 0) return -1; |
| b3 /= final4Multiplier; |
| b0 -= min4Primary; |
| result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; |
| } |
| // final check |
| if (result < 0 || result > UCOL_MAX_INPUT) return -1; |
| return result; |
| } |
| |
| |
/**
 * Divides a by b and rounds the quotient up, computed as 1 + (a-1)/b.
 * Note that with truncating division this yields 1 for a == 0.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int quotientBelow = (a - 1) / b;
    return quotientBelow + 1;
}
| |
| /* this function is either called from initUCA or from genUCA before |
| * doing canonical closure for the UCA. |
| */ |
| |
| /** |
| * Set up to generate implicits. |
| * @param minPrimary |
| * @param maxPrimary |
| * @param minTrail final byte |
| * @param maxTrail final byte |
| * @param gap3 the gap we leave for tailoring for 3-byte forms |
| * @param gap4 the gap we leave for tailoring for 4-byte forms |
| */ |
| static void initImplicitConstants(int minPrimary, int maxPrimary, |
| int minTrailIn, int maxTrailIn, |
| int gap3, int primaries3count, |
| UErrorCode *status) { |
| // some simple parameter checks |
| if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| }; |
| if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| }; |
| if (primaries3count < 1) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| }; |
| |
| minTrail = minTrailIn; |
| maxTrail = maxTrailIn; |
| |
| min3Primary = minPrimary; |
| max4Primary = maxPrimary; |
| // compute constants for use later. |
| // number of values we can use in trailing bytes |
| // leave room for empty values between AND above, e.g. if gap = 2 |
| // range 3..7 => +3 -4 -5 -6 -7: so 1 value |
| // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values |
| // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values |
| final3Multiplier = gap3 + 1; |
| final3Count = (maxTrail - minTrail + 1) / final3Multiplier; |
| max3Trail = minTrail + (final3Count - 1) * final3Multiplier; |
| |
| // medials can use full range |
| medialCount = (maxTrail - minTrail + 1); |
| // find out how many values fit in each form |
| int32_t threeByteCount = medialCount * final3Count; |
| // now determine where the 3/4 boundary is. |
| // we use 3 bytes below the boundary, and 4 above |
| int32_t primariesAvailable = maxPrimary - minPrimary + 1; |
| int32_t primaries4count = primariesAvailable - primaries3count; |
| |
| |
| int32_t min3ByteCoverage = primaries3count * threeByteCount; |
| min4Primary = minPrimary + primaries3count; |
| min4Boundary = min3ByteCoverage; |
| // Now expand out the multiplier for the 4 bytes, and redo. |
| |
| int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; |
| int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); |
| //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte); |
| int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); |
| //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte); |
| int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; |
| //if (DEBUG) System.out.println("expandedGap: " + gap4); |
| if (gap4 < 1) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| final4Multiplier = gap4 + 1; |
| final4Count = neededPerFinalByte; |
| max4Trail = minTrail + (final4Count - 1) * final4Multiplier; |
| /* |
| if (DEBUG) { |
| System.out.println("final4Count: " + final4Count); |
| for (int counter = 0; counter <= final4Count; ++counter) { |
| int value = minTrail + (1 + counter)*final4Multiplier; |
| System.out.println(counter + "\t" + value + "\t" + Utility.hex(value)); |
| } |
| } |
| */ |
| } |
| |
| /** |
| * Supply parameters for generating implicit CEs |
| */ |
| U_CAPI void U_EXPORT2 |
| uprv_uca_initImplicitConstants(int32_t, int32_t, UErrorCode *status) { |
| // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. |
| //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); |
| initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); |
| } |
| |
| U_CDECL_BEGIN |
| static UBool U_CALLCONV |
| ucol_cleanup(void) |
| { |
| if (UCA_DATA_MEM) { |
| udata_close(UCA_DATA_MEM); |
| UCA_DATA_MEM = NULL; |
| } |
| if (_staticUCA) { |
| ucol_close(_staticUCA); |
| _staticUCA = NULL; |
| } |
| fcdTrieIndex = NULL; |
| return TRUE; |
| } |
| U_CDECL_END |
| |
| /* do not close UCA returned by ucol_initUCA! */ |
| UCollator * |
| ucol_initUCA(UErrorCode *status) { |
| if(U_FAILURE(*status)) { |
| return NULL; |
| } |
| umtx_lock(NULL); |
| UBool f = (_staticUCA == NULL); |
| umtx_unlock(NULL); |
| |
| if(f) { |
| UCollator *newUCA = NULL; |
| UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status); |
| |
| if(U_FAILURE(*status)) { |
| if (result) { |
| udata_close(result); |
| } |
| uprv_free(newUCA); |
| } |
| |
| // init FCD data |
| if (fcdTrieIndex == NULL) { |
| fcdTrieIndex = unorm_getFCDTrie(status); |
| ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); |
| } |
| |
| if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ |
| newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status); |
| if(U_SUCCESS(*status)){ |
| newUCA->rb = NULL; |
| newUCA->elements = NULL; |
| newUCA->validLocale = NULL; |
| newUCA->requestedLocale = NULL; |
| newUCA->hasRealData = FALSE; // real data lives in .dat file... |
| newUCA->freeImageOnClose = FALSE; |
| umtx_lock(NULL); |
| if(_staticUCA == NULL) { |
| _staticUCA = newUCA; |
| UCA_DATA_MEM = result; |
| result = NULL; |
| newUCA = NULL; |
| } |
| umtx_unlock(NULL); |
| |
| if(newUCA != NULL) { |
| udata_close(result); |
| uprv_free(newUCA); |
| } |
| else { |
| ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); |
| } |
| // Initalize variables for implicit generation |
| const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts); |
| uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status); |
| //_staticUCA->mapping.getFoldingOffset = _getFoldingOffset; |
| }else{ |
| udata_close(result); |
| uprv_free(newUCA); |
| _staticUCA= NULL; |
| } |
| } |
| } |
| return _staticUCA; |
| } |
| |
| |
| /* collIterNormalize Incremental Normalization happens here. */ |
| /* pick up the range of chars identifed by FCD, */ |
| /* normalize it into the collIterate's writable buffer, */ |
| /* switch the collIterate's state to use the writable buffer. */ |
| /* */ |
| static |
| void collIterNormalize(collIterate *collationSource) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| int32_t normLen; |
| UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ |
| UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ |
| |
| normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize, |
| srcP, (int32_t)(endP - srcP), |
| FALSE, 0, |
| &status); |
| if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) { |
| // reallocate and terminate |
| if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, |
| &collationSource->writableBuffer, |
| (int32_t *)&collationSource->writableBufSize, normLen + 1, |
| 0) |
| ) { |
| #ifdef UCOL_DEBUG |
| fprintf(stderr, "collIterNormalize(), out of memory\n"); |
| #endif |
| return; |
| } |
| status = U_ZERO_ERROR; |
| normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize, |
| srcP, (int32_t)(endP - srcP), |
| FALSE, 0, |
| &status); |
| } |
| if (U_FAILURE(status)) { |
| #ifdef UCOL_DEBUG |
| fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status)); |
| #endif |
| return; |
| } |
| |
| if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { |
| collationSource->flags |= UCOL_ITER_ALLOCATED; |
| } |
| collationSource->pos = collationSource->writableBuffer; |
| collationSource->origFlags = collationSource->flags; |
| collationSource->flags |= UCOL_ITER_INNORMBUF; |
| collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
| } |
| |
| |
| // This function takes the iterator and extracts normalized stuff up to the next boundary |
| // It is similar in the end results to the collIterNormalize, but for the cases when we |
| // use an iterator |
| static |
| inline void normalizeIterator(collIterate *collationSource) { |
| UErrorCode status = U_ZERO_ERROR; |
| UBool wasNormalized = FALSE; |
| //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); |
| uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); |
| int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, |
| (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); |
| if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { |
| // reallocate and terminate |
| if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, |
| &collationSource->writableBuffer, |
| (int32_t *)&collationSource->writableBufSize, normLen + 1, |
| 0) |
| ) { |
| #ifdef UCOL_DEBUG |
| fprintf(stderr, "normalizeIterator(), out of memory\n"); |
| #endif |
| return; |
| } |
| status = U_ZERO_ERROR; |
| //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); |
| collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); |
| normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, |
| (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); |
| } |
| // Terminate the buffer - we already checked that it is big enough |
| collationSource->writableBuffer[normLen] = 0; |
| if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { |
| collationSource->flags |= UCOL_ITER_ALLOCATED; |
| } |
| collationSource->pos = collationSource->writableBuffer; |
| collationSource->origFlags = collationSource->flags; |
| collationSource->flags |= UCOL_ITER_INNORMBUF; |
| collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
| } |
| |
| |
| /* Incremental FCD check and normalize */ |
| /* Called from getNextCE when normalization state is suspect. */ |
| /* When entering, the state is known to be this: */ |
| /* o We are working in the main buffer of the collIterate, not the side */ |
| /* writable buffer. When in the side buffer, normalization mode is always off, */ |
| /* so we won't get here. */ |
| /* o The leading combining class from the current character is 0 or */ |
| /* the trailing combining class of the previous char was zero. */ |
| /* True because the previous call to this function will have always exited */ |
| /* that way, and we get called for every char where cc might be non-zero. */ |
| static |
| inline UBool collIterFCD(collIterate *collationSource) { |
| UChar c, c2; |
| const UChar *srcP, *endP; |
| uint8_t leadingCC; |
| uint8_t prevTrailingCC = 0; |
| uint16_t fcd; |
| UBool needNormalize = FALSE; |
| |
| srcP = collationSource->pos-1; |
| |
| if (collationSource->flags & UCOL_ITER_HASLEN) { |
| endP = collationSource->endp; |
| } else { |
| endP = NULL; |
| } |
| |
| // Get the trailing combining class of the current character. If it's zero, |
| // we are OK. |
| c = *srcP++; |
| /* trie access */ |
| fcd = unorm_getFCD16(fcdTrieIndex, c); |
| if (fcd != 0) { |
| if (U16_IS_LEAD(c)) { |
| if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) { |
| ++srcP; |
| fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2); |
| } else { |
| fcd = 0; |
| } |
| } |
| |
| prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| |
| if (prevTrailingCC != 0) { |
| // The current char has a non-zero trailing CC. Scan forward until we find |
| // a char with a leading cc of zero. |
| while (endP == NULL || srcP != endP) |
| { |
| const UChar *savedSrcP = srcP; |
| |
| c = *srcP++; |
| /* trie access */ |
| fcd = unorm_getFCD16(fcdTrieIndex, c); |
| if (fcd != 0 && U16_IS_LEAD(c)) { |
| if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) { |
| ++srcP; |
| fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2); |
| } else { |
| fcd = 0; |
| } |
| } |
| leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| if (leadingCC == 0) { |
| srcP = savedSrcP; // Hit char that is not part of combining sequence. |
| // back up over it. (Could be surrogate pair!) |
| break; |
| } |
| |
| if (leadingCC < prevTrailingCC) { |
| needNormalize = TRUE; |
| } |
| |
| prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| } |
| } |
| } |
| |
| collationSource->fcdPosition = (UChar *)srcP; |
| |
| return needNormalize; |
| } |
| |
| /****************************************************************************/ |
| /* Following are the CE retrieval functions */ |
| /* */ |
| /****************************************************************************/ |
| |
| static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); |
| static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); |
| |
| /* there should be a macro version of this function in the header file */ |
| /* This is the first function that tries to fetch a collation element */ |
| /* If it's not succesfull or it encounters a more difficult situation */ |
| /* some more sofisticated and slower functions are invoked */ |
| static |
| inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { |
| uint32_t order = 0; |
| if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ |
| order = *(collationSource->toReturn++); /* if so, return them */ |
| if(collationSource->CEpos == collationSource->toReturn) { |
| collationSource->CEpos = collationSource->toReturn = collationSource->CEs; |
| } |
| return order; |
| } |
| |
| UChar ch = 0; |
| |
| for (;;) /* Loop handles case when incremental normalize switches */ |
| { /* to or from the side buffer / original string, and we */ |
| /* need to start again to get the next character. */ |
| |
| if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) |
| { |
| // The source string is null terminated and we're not working from the side buffer, |
| // and we're not normalizing. This is the fast path. |
| // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.) |
| ch = *collationSource->pos++; |
| if (ch != 0) { |
| break; |
| } |
| else { |
| return UCOL_NO_MORE_CES; |
| } |
| } |
| |
| if (collationSource->flags & UCOL_ITER_HASLEN) { |
| // Normal path for strings when length is specified. |
| // (We can't be in side buffer because it is always null terminated.) |
| if (collationSource->pos >= collationSource->endp) { |
| // Ran off of the end of the main source string. We're done. |
| return UCOL_NO_MORE_CES; |
| } |
| ch = *collationSource->pos++; |
| } |
| else if(collationSource->flags & UCOL_USE_ITERATOR) { |
| UChar32 iterCh = collationSource->iterator->next(collationSource->iterator); |
| if(iterCh == U_SENTINEL) { |
| return UCOL_NO_MORE_CES; |
| } |
| ch = (UChar)iterCh; |
| } |
| else |
| { |
| // Null terminated string. |
| ch = *collationSource->pos++; |
| if (ch == 0) { |
| // Ran off end of buffer. |
| if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { |
| // Ran off end of main string. backing up one character. |
| collationSource->pos--; |
| return UCOL_NO_MORE_CES; |
| } |
| else |
| { |
| // Hit null in the normalize side buffer. |
| // Usually this means the end of the normalized data, |
| // except for one odd case: a null followed by combining chars, |
| // which is the case if we are at the start of the buffer. |
| if (collationSource->pos == collationSource->writableBuffer+1) { |
| break; |
| } |
| |
| // Null marked end of side buffer. |
| // Revert to the main string and |
| // loop back to top to try again to get a character. |
| collationSource->pos = collationSource->fcdPosition; |
| collationSource->flags = collationSource->origFlags; |
| continue; |
| } |
| } |
| } |
| |
| if(collationSource->flags&UCOL_HIRAGANA_Q) { |
| if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) { |
| collationSource->flags |= UCOL_WAS_HIRAGANA; |
| } else { |
| collationSource->flags &= ~UCOL_WAS_HIRAGANA; |
| } |
| } |
| |
| // We've got a character. See if there's any fcd and/or normalization stuff to do. |
| // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer. |
| if ((collationSource->flags & UCOL_ITER_NORM) == 0) { |
| break; |
| } |
| |
| if (collationSource->fcdPosition >= collationSource->pos) { |
| // An earlier FCD check has already covered the current character. |
| // We can go ahead and process this char. |
| break; |
| } |
| |
| if (ch < ZERO_CC_LIMIT_ ) { |
| // Fast fcd safe path. Trailing combining class == 0. This char is OK. |
| break; |
| } |
| |
| if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| // We need to peek at the next character in order to tell if we are FCD |
| if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) { |
| // We are at the last char of source string. |
| // It is always OK for FCD check. |
| break; |
| } |
| |
| // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test |
| if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| break; |
| } |
| } |
| |
| |
| // Need a more complete FCD check and possible normalization. |
| if (collIterFCD(collationSource)) { |
| collIterNormalize(collationSource); |
| } |
| if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { |
| // No normalization was needed. Go ahead and process the char we already had. |
| break; |
| } |
| |
| // Some normalization happened. Next loop iteration will pick up a char |
| // from the normalization buffer. |
| |
| } // end for (;;) |
| |
| |
| if (ch <= 0xFF) { |
| /* For latin-1 characters we never need to fall back to the UCA table */ |
| /* because all of the UCA data is replicated in the latinOneMapping array */ |
| order = coll->latinOneMapping[ch]; |
| if (order > UCOL_NOT_FOUND) { |
| order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); |
| } |
| } |
| else |
| { |
| order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| if(order > UCOL_NOT_FOUND) { /* if a CE is special */ |
| order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ |
| } |
| if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ |
| /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ |
| order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
| |
| if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ |
| order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); |
| } |
| } |
| } |
| if(order == UCOL_NOT_FOUND) { |
| order = getImplicit(ch, collationSource); |
| } |
| return order; /* return the CE */ |
| } |
| |
| /* ucol_getNextCE, out-of-line version for use from other files. */ |
| U_CAPI uint32_t U_EXPORT2 |
| ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { |
| return ucol_IGetNextCE(coll, collationSource, status); |
| } |
| |
| |
| /** |
| * Incremental previous normalization happens here. Pick up the range of chars |
| * identifed by FCD, normalize it into the collIterate's writable buffer, |
| * switch the collIterate's state to use the writable buffer. |
| * @param data collation iterator data |
| */ |
| static |
| void collPrevIterNormalize(collIterate *data) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UChar *pEnd = data->pos; /* End normalize + 1 */ |
| UChar *pStart; |
| uint32_t normLen; |
| UChar *pStartNorm; |
| |
| /* Start normalize */ |
| if (data->fcdPosition == NULL) { |
| pStart = data->string; |
| } |
| else { |
| pStart = data->fcdPosition + 1; |
| } |
| |
| normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, |
| data->writableBuffer, 0, &status); |
| |
| if (data->writableBufSize <= normLen) { |
| freeHeapWritableBuffer(data); |
| data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) * |
| sizeof(UChar)); |
| if(data->writableBuffer == NULL) { // something is wrong here, return |
| return; |
| } |
| data->flags |= UCOL_ITER_ALLOCATED; |
| /* to handle the zero termination */ |
| data->writableBufSize = normLen + 1; |
| } |
| status = U_ZERO_ERROR; |
| /* |
| this puts the null termination infront of the normalized string instead |
| of the end |
| */ |
| pStartNorm = data->writableBuffer + (data->writableBufSize - normLen); |
| *(pStartNorm - 1) = 0; |
| unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm, |
| normLen, &status); |
| |
| data->pos = data->writableBuffer + data->writableBufSize; |
| data->origFlags = data->flags; |
| data->flags |= UCOL_ITER_INNORMBUF; |
| data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| } |
| |
| |
| /** |
| * Incremental FCD check for previous iteration and normalize. Called from |
| * getPrevCE when normalization state is suspect. |
| * When entering, the state is known to be this: |
| * o We are working in the main buffer of the collIterate, not the side |
| * writable buffer. When in the side buffer, normalization mode is always |
| * off, so we won't get here. |
| * o The leading combining class from the current character is 0 or the |
| * trailing combining class of the previous char was zero. |
| * True because the previous call to this function will have always exited |
| * that way, and we get called for every char where cc might be non-zero. |
| * @param data collation iterate struct |
| * @return normalization status, TRUE for normalization to be done, FALSE |
| * otherwise |
| */ |
| static |
| inline UBool collPrevIterFCD(collIterate *data) |
| { |
| const UChar *src, *start; |
| UChar c, c2; |
| uint8_t leadingCC; |
| uint8_t trailingCC = 0; |
| uint16_t fcd; |
| UBool result = FALSE; |
| |
| start = data->string; |
| src = data->pos + 1; |
| |
| /* Get the trailing combining class of the current character. */ |
| c = *--src; |
| if (!U16_IS_SURROGATE(c)) { |
| fcd = unorm_getFCD16(fcdTrieIndex, c); |
| } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) { |
| --src; |
| fcd = unorm_getFCD16(fcdTrieIndex, c2); |
| if (fcd != 0) { |
| fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); |
| } |
| } else /* unpaired surrogate */ { |
| fcd = 0; |
| } |
| |
| leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| |
| if (leadingCC != 0) { |
| /* |
| The current char has a non-zero leading combining class. |
| Scan backward until we find a char with a trailing cc of zero. |
| */ |
| for (;;) |
| { |
| if (start == src) { |
| data->fcdPosition = NULL; |
| return result; |
| } |
| |
| c = *--src; |
| if (!U16_IS_SURROGATE(c)) { |
| fcd = unorm_getFCD16(fcdTrieIndex, c); |
| } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) { |
| --src; |
| fcd = unorm_getFCD16(fcdTrieIndex, c2); |
| if (fcd != 0) { |
| fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); |
| } |
| } else /* unpaired surrogate */ { |
| fcd = 0; |
| } |
| |
| trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| |
| if (trailingCC == 0) { |
| break; |
| } |
| |
| if (leadingCC < trailingCC) { |
| result = TRUE; |
| } |
| |
| leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| } |
| } |
| |
| data->fcdPosition = (UChar *)src; |
| |
| return result; |
| } |
| |
| /** gets a character from the string at a given offset |
| * Handles both normal and iterative cases. |
| * No error checking - caller beware! |
| */ |
| inline static |
| UChar peekCharacter(collIterate *source, int32_t offset) { |
| if(source->pos != NULL) { |
| return *(source->pos + offset); |
| } else if(source->iterator != NULL) { |
| if(offset != 0) { |
| source->iterator->move(source->iterator, offset, UITER_CURRENT); |
| UChar toReturn = (UChar)source->iterator->next(source->iterator); |
| source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); |
| return toReturn; |
| } else { |
| return (UChar)source->iterator->current(source->iterator); |
| } |
| } else { |
| return (UChar)U_SENTINEL; |
| } |
| } |
| |
| /** |
| * Determines if we are at the start of the data string in the backwards |
| * collation iterator |
| * @param data collation iterator |
| * @return TRUE if we are at the start |
| */ |
| static |
| inline UBool isAtStartPrevIterate(collIterate *data) { |
| if(data->pos == NULL && data->iterator != NULL) { |
| return !data->iterator->hasPrevious(data->iterator); |
| } |
| //return (collIter_bos(data)) || |
| return (data->pos == data->string) || |
| ((data->flags & UCOL_ITER_INNORMBUF) && |
| *(data->pos - 1) == 0 && data->fcdPosition == NULL); |
| } |
| |
| static |
| inline void goBackOne(collIterate *data) { |
| # if 0 |
| // somehow, it looks like we need to keep iterator synced up |
| // at all times, as above. |
| if(data->pos) { |
| data->pos--; |
| } |
| if(data->iterator) { |
| data->iterator->previous(data->iterator); |
| } |
| #endif |
| if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { |
| data->iterator->previous(data->iterator); |
| } |
| if(data->pos) { |
| data->pos --; |
| } |
| } |
| |
| /** |
| * Inline function that gets a simple CE. |
| * So what it does is that it will first check the expansion buffer. If the |
| * expansion buffer is not empty, ie the end pointer to the expansion buffer |
| * is different from the string pointer, we return the collation element at the |
| * return pointer and decrement it. |
| * For more complicated CEs it resorts to getComplicatedCE. |
| * @param coll collator data |
| * @param data collation iterator struct |
| * @param status error status |
| */ |
| static |
| inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, |
| UErrorCode *status) |
| { |
| uint32_t result = (uint32_t)UCOL_NULLORDER; |
| if (data->toReturn > data->CEs) { |
| data->toReturn --; |
| result = *(data->toReturn); |
| if (data->CEs == data->toReturn) { |
| data->CEpos = data->toReturn; |
| } |
| } |
| else { |
| UChar ch = 0; |
| /* |
| Loop handles case when incremental normalize switches to or from the |
| side buffer / original string, and we need to start again to get the |
| next character. |
| */ |
| for (;;) { |
| if (data->flags & UCOL_ITER_HASLEN) { |
| /* |
| Normal path for strings when length is specified. |
| Not in side buffer because it is always null terminated. |
| */ |
| if (data->pos <= data->string) { |
| /* End of the main source string */ |
| return UCOL_NO_MORE_CES; |
| } |
| data->pos --; |
| ch = *data->pos; |
| } |
| // we are using an iterator to go back. Pray for us! |
| else if (data->flags & UCOL_USE_ITERATOR) { |
| UChar32 iterCh = data->iterator->previous(data->iterator); |
| if(iterCh == U_SENTINEL) { |
| return UCOL_NO_MORE_CES; |
| } else { |
| ch = (UChar)iterCh; |
| } |
| } |
| else { |
| data->pos --; |
| ch = *data->pos; |
| /* we are in the side buffer. */ |
| if (ch == 0) { |
| /* |
| At the start of the normalize side buffer. |
| Go back to string. |
| Because pointer points to the last accessed character, |
| hence we have to increment it by one here. |
| */ |
| if (data->fcdPosition == NULL) { |
| data->pos = data->string; |
| return UCOL_NO_MORE_CES; |
| } |
| else { |
| data->pos = data->fcdPosition + 1; |
| } |
| data->flags = data->origFlags; |
| continue; |
| } |
| } |
| |
| if(data->flags&UCOL_HIRAGANA_Q) { |
| if(ch>=0x3040 && ch<=0x309f) { |
| data->flags |= UCOL_WAS_HIRAGANA; |
| } else { |
| data->flags &= ~UCOL_WAS_HIRAGANA; |
| } |
| } |
| |
| /* |
| * got a character to determine if there's fcd and/or normalization |
| * stuff to do. |
| * if the current character is not fcd. |
| * if current character is at the start of the string |
| * Trailing combining class == 0. |
| * Note if pos is in the writablebuffer, norm is always 0 |
| */ |
| if (ch < ZERO_CC_LIMIT_ || |
| // this should propel us out of the loop in the iterator case |
| (data->flags & UCOL_ITER_NORM) == 0 || |
| (data->fcdPosition != NULL && data->fcdPosition <= data->pos) |
| || data->string == data->pos) { |
| break; |
| } |
| |
| if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| /* if next character is FCD */ |
| if (data->pos == data->string) { |
| /* First char of string is always OK for FCD check */ |
| break; |
| } |
| |
| /* Not first char of string, do the FCD fast test */ |
| if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| break; |
| } |
| } |
| |
| /* Need a more complete FCD check and possible normalization. */ |
| if (collPrevIterFCD(data)) { |
| collPrevIterNormalize(data); |
| } |
| |
| if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
| /* No normalization. Go ahead and process the char. */ |
| break; |
| } |
| |
| /* |
| Some normalization happened. |
| Next loop picks up a char from the normalization buffer. |
| */ |
| } |
| |
| /* attempt to handle contractions, after removal of the backwards |
| contraction |
| */ |
| if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) { |
| result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status); |
| } else { |
| if (ch <= 0xFF) { |
| result = coll->latinOneMapping[ch]; |
| } |
| else { |
| result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| } |
| if (result > UCOL_NOT_FOUND) { |
| result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status); |
| } |
| if (result == UCOL_NOT_FOUND) { // Not found in master list |
| if (!isAtStartPrevIterate(data) && |
| ucol_contractionEndCP(ch, data->coll)) { |
| result = UCOL_CONTRACTION; |
| } else { |
| if(coll->UCA) { |
| result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
| } |
| } |
| |
| if (result > UCOL_NOT_FOUND) { |
| if(coll->UCA) { |
| result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status); |
| } |
| } |
| } |
| } |
| if(result == UCOL_NOT_FOUND) { |
| result = getPrevImplicit(ch, data); |
| } |
| } |
| return result; |
| } |
| |
| |
| /* |
|  * ucol_getPrevCE: exported, out-of-line form of ucol_IGetPrevCE, provided |
|  * for use from other files. All arguments are forwarded unchanged and the |
|  * collation element it produces is returned as is. |
|  */ |
| U_CAPI uint32_t U_EXPORT2 |
| ucol_getPrevCE(const UCollator *coll, collIterate *data, |
|                UErrorCode *status) { |
|     const uint32_t previousCE = ucol_IGetPrevCE(coll, data, status); |
|     return previousCE; |
| } |
| |
| |
| /* |
|  * Returns the first collation element produced for the single code unit u. |
|  * A temporary collIterate is initialized over the one-unit string &u and the |
|  * forward CE fetch is run exactly once on it. |
|  * (this should be connected to special Jamo handling) |
|  */ |
| U_CAPI uint32_t U_EXPORT2 |
| ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { |
|     collIterate singleUnitIter; |
|     IInit_collIterate(coll, &u, 1, &singleUnitIter); |
|     return ucol_IGetNextCE(coll, &singleUnitIter, status); |
| } |
| |
| /** |
|  * Inserts the argument character into the end of the buffer pushing back the |
|  * null terminator. |
|  * If the writable buffer has no room for the character plus a new terminator, |
|  * a buffer that is incsize code units larger is allocated, the old contents |
|  * are copied over and the old heap buffer (if any) is released. |
|  * @param data collIterate struct data |
|  * @param pNull pointer to the null termination inside data->writableBuffer |
|  * @param ch character to be appended |
|  * @return the position of the new addition (inside the possibly reallocated |
|  *         buffer), or NULL if the buffer had to grow and allocation failed; |
|  *         in that case the buffer is left untouched |
|  */ |
| static |
| inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch) |
| { |
|     const uint32_t incsize = 5; |
|     const uint32_t oldSize = data->writableBufSize; |
| |
|     /* fast path: both ch and the new terminator fit in the current buffer */ |
|     if ((data->writableBuffer + oldSize) > (pNull + 1)) { |
|         *pNull = ch; |
|         *(pNull + 1) = 0; |
|         return pNull; |
|     } |
| |
|     /* |
|     buffer will always be null terminated at the end. |
|     giving extra space since it is likely that more characters will be added. |
|     */ |
|     /* remember where the terminator sits before the old buffer goes away */ |
|     const uint32_t offset = (uint32_t)(pNull - data->writableBuffer); |
|     const uint32_t newSize = oldSize + incsize; |
|     UChar *newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * newSize); |
|     if (newbuffer == NULL) { /* something wrong, but no status */ |
|         return NULL; |
|     } |
|     uprv_memcpy(newbuffer, data->writableBuffer, oldSize * sizeof(UChar)); |
| |
|     freeHeapWritableBuffer(data); |
|     data->writableBufSize = newSize; |
|     data->writableBuffer = newbuffer; |
| |
|     /* |
|     write at the old terminator position; the previous code indexed with the |
|     already enlarged size and therefore wrote past the end of the allocation. |
|     */ |
|     UChar *inserted = newbuffer + offset; |
|     *inserted = ch; |
|     *(inserted + 1) = 0; |
|     return inserted; |
| } |
| |
| /** |
|  * Inserts the argument string into the end of the buffer pushing back the |
|  * null terminator. |
|  * If the writable buffer is too small, a buffer that exactly holds the |
|  * existing text, the appended string and the terminator is allocated and the |
|  * old heap buffer (if any) is released. |
|  * @param data collIterate struct data |
|  * @param pNull pointer to the null termination inside data->writableBuffer |
|  * @param str string to be appended; it is copied before the old buffer is |
|  *        released |
|  * @param length of the string to be appended |
|  * @return the position of the new addition (inside the possibly reallocated |
|  *         buffer), or NULL if the buffer had to grow and allocation failed; |
|  *         in that case the buffer is left untouched |
|  */ |
| static |
| inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str, |
|                                int32_t length) |
| { |
|     /* number of code units in front of the terminator */ |
|     uint32_t size = pNull - data->writableBuffer; |
|     UChar *newbuffer; |
| |
|     /* fast path: string and terminator fit in the current buffer */ |
|     if (data->writableBuffer + data->writableBufSize > pNull + length + 1) { |
|         uprv_memcpy(pNull, str, length * sizeof(UChar)); |
|         *(pNull + length) = 0; |
|         return pNull; |
|     } |
| |
|     /* |
|     buffer will always be null terminated at the end. |
|     */ |
|     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1)); |
|     if (newbuffer == NULL) { |
|         return NULL; |
|     } |
|     uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar)); |
|     uprv_memcpy(newbuffer + size, str, length * sizeof(UChar)); |
|     /* the allocation reserves one unit for the terminator; actually write it */ |
|     *(newbuffer + size + length) = 0; |
| |
|     freeHeapWritableBuffer(data); |
|     data->writableBufSize = size + length + 1; |
|     data->writableBuffer = newbuffer; |
| |
|     /* same contract as the fast path: point at the first appended unit */ |
|     return newbuffer + size; |
| } |
| |
| /** |
|  * Special normalization function for contraction in the forwards iterator. |
|  * This normalization sequence will place the current character at source->pos |
|  * and its following normalized sequence into the buffer. |
|  * The fcd position, pos will be changed. |
|  * pos will now point to positions in the buffer. |
|  * Flags will be changed accordingly. |
|  * The text from pos - 1 up to fcdPosition is normalized to NFD and appended |
|  * behind what is already kept in the writable buffer. If the buffer has to |
|  * grow and the allocation fails, the function returns without switching the |
|  * iterator into the buffer (there is no status argument to report through). |
|  * @param data collation iterator data |
|  */ |
| static |
| inline void normalizeNextContraction(collIterate *data) |
| { |
|     UChar *buffer = data->writableBuffer; |
|     uint32_t buffersize = data->writableBufSize; |
|     uint32_t strsize; |
|     UErrorCode status = U_ZERO_ERROR; |
|     /* because the pointer points to the next character */ |
|     UChar *pStart = data->pos - 1; |
|     UChar *pEnd; |
|     uint32_t normLen; |
|     UChar *pStartNorm; |
| |
|     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
|         /* buffer not in use yet: seed it with the character before pStart */ |
|         *data->writableBuffer = *(pStart - 1); |
|         strsize = 1; |
|     } |
|     else { |
|         /* keep what is already in the buffer and append behind it */ |
|         strsize = u_strlen(data->writableBuffer); |
|     } |
| |
|     pEnd = data->fcdPosition; |
| |
|     /* first call with capacity 0 only measures the normalized length */ |
|     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0, |
|                               &status); |
| |
|     if (buffersize <= normLen + strsize) { |
|         uint32_t size = strsize + normLen + 1; |
|         UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar)); |
|         if (temp == NULL) { |
|             /* writing normLen units into the old buffer would overflow it */ |
|             return; |
|         } |
|         uprv_memcpy(temp, buffer, sizeof(UChar) * strsize); |
|         freeHeapWritableBuffer(data); |
|         data->writableBuffer = temp; |
|         data->writableBufSize = size; |
|         data->flags |= UCOL_ITER_ALLOCATED; |
|         /* the old buffer may be gone now; continue with the new one */ |
|         buffer = temp; |
|     } |
| |
|     status = U_ZERO_ERROR; |
|     pStartNorm = buffer + strsize; |
|     /* null-termination will be added here */ |
|     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, |
|                     normLen + 1, &status); |
| |
|     data->pos = data->writableBuffer + strsize; |
|     data->origFlags = data->flags; |
|     data->flags |= UCOL_ITER_INNORMBUF; |
|     data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| } |
| |
| /** |
| * Contraction character management function that returns the next character |
| * for the forwards iterator. |
| * Does nothing if the next character is in buffer and not the first character |
| * in it. |
| * Else it checks next character in data string to see if it is normalizable. |
| * If it is not, the character is simply copied into the buffer, else |
| * the whole normalized substring is copied into the buffer, including the |
| * current character. |
| * |
| * Shortcuts taken by the body before any FCD work is done: |
| * - neither UCOL_ITER_NORM nor UCOL_ITER_INNORMBUF is set: the character comes |
| * straight from the UCharIterator (UCOL_USE_ITERATOR) or from *pos; |
| * - pos is inside the normalization buffer and not at its terminating zero, |
| * or pos is in the source string in front of fcdPosition: *pos is returned; |
| * - pos is at the last character of the source string: *pos is returned. |
| * In every case that reads through pos, pos is advanced by one. |
| * |
| * NOTE(review): the pointers returned by insertBufferEnd() are used without a |
| * NULL check, although that helper returns NULL when its allocation fails. |
| * NOTE(review): pEndWritableBuffer is only assigned when fcdPosition is not |
| * NULL; confirm that the later insertBufferEnd() calls cannot be reached with |
| * it still NULL. |
| * @param data collation element iterator data |
| * @return next character |
| */ |
| static |
| inline UChar getNextNormalizedChar(collIterate *data) |
| { |
| UChar nextch; /* character after ch, only peeked at */ |
| UChar ch; /* character consumed from pos and normally returned */ |
| // Here we need to add the iterator code. One problem is the way |
| // end of string is handled. If we just return next char, it could |
| // be the sentinel. Most of the cases already check for this, but we |
| // need to be sure. |
| if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { |
| /* if no normalization and not in buffer. */ |
| if(data->flags & UCOL_USE_ITERATOR) { |
| return (UChar)data->iterator->next(data->iterator); |
| } else { |
| return *(data->pos ++); |
| } |
| } |
| |
| //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { |
| //normalizeIterator(data); |
| //} |
| |
| UChar *pEndWritableBuffer = NULL; /* position of the buffer's terminating zero, saved when pos is moved back to the source string */ |
| UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); /* state on entry; the flags may change further down */ |
| if ((innormbuf && *data->pos != 0) || |
| (data->fcdPosition != NULL && !innormbuf && |
| data->pos < data->fcdPosition)) { |
| /* |
| if next character is in normalized buffer, no further normalization |
| is required |
| */ |
| return *(data->pos ++); |
| } |
| |
| if (data->flags & UCOL_ITER_HASLEN) { |
| /* in data string */ |
| if (data->pos + 1 == data->endp) { /* pos is the last character in front of endp */ |
| return *(data->pos ++); |
| } |
| } |
| else { |
| if (innormbuf) { |
| // inside the normalization buffer, but at the end |
| // (since we encountered zero). This means, in the |
| // case we're using char iterator, that we need to |
| // do another round of normalization. |
| //if(data->origFlags & UCOL_USE_ITERATOR) { |
| // we need to restore original flags, |
| // otherwise, we'll lose them |
| //data->flags = data->origFlags; |
| //normalizeIterator(data); |
| //return *(data->pos++); |
| //} else { |
| /* |
| in writable buffer, at this point fcdPosition can not be |
| pointing to the end of the data string. see contracting tag. |
| */ |
| if(data->fcdPosition) { |
| if (*(data->fcdPosition + 1) == 0 || |
| data->fcdPosition + 1 == data->endp) { |
| /* at the end of the string, dump it into the normalizer */ |
| /* i.e. append *fcdPosition to the buffer, leave pos right behind it and step fcdPosition forward */ |
| data->pos = insertBufferEnd(data, data->pos, |
| *(data->fcdPosition)) + 1; |
| return *(data->fcdPosition ++); |
| } |
| pEndWritableBuffer = data->pos; /* pos is at the buffer's zero here */ |
| data->pos = data->fcdPosition; /* continue reading from the source string */ |
| } else if(data->origFlags & UCOL_USE_ITERATOR) { |
| // if we are here, we're using a normalizing iterator. |
| // we should just continue further. |
| data->flags = data->origFlags; |
| data->pos = NULL; |
| return (UChar)data->iterator->next(data->iterator); |
| } |
| //} |
| } |
| else { |
| if (*(data->pos + 1) == 0) { /* zero-terminated source string: pos is its last character */ |
| return *(data->pos ++); |
| } |
| } |
| } |
| |
| ch = *data->pos ++; /* consume the current character */ |
| nextch = *data->pos; /* look at the one behind it without consuming it */ |
| |
| /* |
| * if the current character is not fcd. |
| * Trailing combining class == 0. |
| */ |
| if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && |
| (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || |
| ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { |
| /* |
| Need a more complete FCD check and possible normalization. |
| normalize substring will be appended to buffer |
| */ |
| if (collIterFCD(data)) { |
| normalizeNextContraction(data); |
| return *(data->pos ++); |
| } |
| else if (innormbuf) { |
| /* fcdposition shifted even when there's no normalization, if we |
| don't input the rest into this, we'll get the wrong position when |
| we reach the end of the writableBuffer */ |
| int32_t length = data->fcdPosition - data->pos + 1; /* units from ch (at pos - 1) up to fcdPosition */ |
| data->pos = insertBufferEnd(data, pEndWritableBuffer, |
| data->pos - 1, length); |
| return *(data->pos ++); |
| } |
| } |
| |
| if (innormbuf) { |
| /* |
| no normalization is to be done hence only one character will be |
| appended to the buffer. |
| */ |
| data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1; |
| } |
| |
| /* points back to the pos in string */ |
| return ch; |
| } |
| |
| |
| |
| /** |
|  * Function to copy the buffer into writableBuffer and sets the fcd position to |
|  * the correct position |
|  * @param source data string source |
|  * @param buffer null-terminated character buffer holding the skipped |
|  *        characters |
|  * @param tempdb current position in buffer that has been used up; when the |
|  *        iterator is already in the normalization buffer, the unread rest of |
|  *        that buffer is appended here |
|  */ |
| static |
| inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer, |
|                                      UChar *tempdb) |
| { |
|     /* okay confusing part here. to ensure that the skipped characters are |
|     considered later, we need to place it in the appropriate position in the |
|     normalization buffer and reassign the pos pointer. simple case if pos |
|     reside in string, simply copy to normalization buffer and |
|     fcdposition = pos, pos = start of normalization buffer. if pos in |
|     normalization buffer, we'll insert the copy infront of pos and point pos |
|     to the start of the normalization buffer. why am i doing these copies? |
|     well, so that the whole chunk of codes in the getNextCE, |
|     ucol_prv_getSpecialCE does not require any changes, which be really |
|     painful. */ |
|     if (source->flags & UCOL_ITER_INNORMBUF) { |
|         u_strcpy(tempdb, source->pos); |
|     } |
|     else { |
|         source->fcdPosition = source->pos; |
|         source->origFlags = source->flags; |
|         source->flags |= UCOL_ITER_INNORMBUF; |
|         source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
|     } |
| |
|     /* measure only now: the append above may have made the text longer, and |
|     the capacity check below has to see the final length */ |
|     uint32_t length = u_strlen(buffer); |
| |
|     if (length >= source->writableBufSize) { |
|         /* allocate first so that a failure leaves the old buffer usable */ |
|         UChar *grown = (UChar *)uprv_malloc((length + 1) * sizeof(UChar)); |
|         if (grown == NULL) { |
|             return; |
|         } |
|         freeHeapWritableBuffer(source); |
|         source->writableBuffer = grown; |
|         /* record the real capacity, terminator slot included */ |
|         source->writableBufSize = length + 1; |
|     } |
| |
|     u_strcpy(source->writableBuffer, buffer); |
|     source->pos = source->writableBuffer; |
| } |
| |
| /** |
| * Function to get the discontiguos collation element within the source. |
| * Note this function will set the position to the appropriate places. |
| * |
| * The start character and every character that is read but does not take |
| * part in a match are collected in a local buffer. On a match that buffer is |
| * handed to setDiscontiguosAttribute() and the matched CE is returned. |
| * When nothing matches, the iterator state saved on entry is restored, the |
| * iterator is moved back by one and the CE stored for constart is returned. |
| * @param coll current collator used |
| * @param source data string source |
| * @param constart index to the start character in the contraction table |
| * @return discontiguos collation element offset |
| */ |
| static |
| uint32_t getDiscontiguous(const UCollator *coll, collIterate *source, |
| const UChar *constart) |
| { |
| /* source->pos currently points to the second combining character after |
| the start character */ |
| UChar *temppos = source->pos; |
| UChar buffer[4*UCOL_MAX_BUFFER]; /* start character plus skipped characters; NOTE(review): the writes through tempdb below are not checked against this size */ |
| UChar *tempdb = buffer; /* next free slot in buffer */ |
| const UChar *tempconstart = constart; /* table position of the contraction currently being extended */ |
| uint8_t tempflags = source->flags; /* flags on entry, written back on the no-match path; NOTE(review): confirm the flags fit into 8 bits */ |
| UBool multicontraction = FALSE; /* set once a shorter contraction with a usable CE has been seen */ |
| UChar *tempbufferpos = 0; /* end of buffer as it was when that shorter contraction was found */ |
| collIterateState discState; |
| |
| backupState(source, &discState); |
| |
| //*tempdb = *(source->pos - 1); |
| *tempdb = peekCharacter(source, -1); |
| tempdb ++; |
| while (TRUE) { |
| UChar *UCharOffset; |
| UChar schar, |
| tchar; |
| uint32_t result; |
| |
| /* stop when the text is exhausted or the next character has combining class 0 */ |
| if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp) |
| || (peekCharacter(source, 0) == 0 && |
| //|| (*source->pos == 0 && |
| ((source->flags & UCOL_ITER_INNORMBUF) == 0 || |
| source->fcdPosition == NULL || |
| source->fcdPosition == source->endp || |
| *(source->fcdPosition) == 0 || |
| u_getCombiningClass(*(source->fcdPosition)) == 0)) || |
| /* end of string in null terminated string or stopped by a |
| null character, note fcd does not always point to a base |
| character after the discontiguos change */ |
| u_getCombiningClass(peekCharacter(source, 0)) == 0) { |
| //u_getCombiningClass(*(source->pos)) == 0) { |
| //constart = (UChar *)coll->image + getContractOffset(CE); |
| if (multicontraction) { |
| /* fall back to the shorter contraction remembered earlier */ |
| *tempbufferpos = 0; |
| source->pos = temppos - 1; |
| setDiscontiguosAttribute(source, buffer, tempdb); |
| return *(coll->contractionCEs + |
| (tempconstart - coll->contractionIndex)); |
| } |
| constart = tempconstart; |
| break; |
| } |
| |
| UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/ |
| schar = getNextNormalizedChar(source); |
| |
| /* walk the table entries until one is not smaller than schar; presumably the entries are sorted and end with a high sentinel - TODO confirm */ |
| while (schar > (tchar = *UCharOffset)) { |
| UCharOffset++; |
| } |
| |
| if (schar != tchar) { |
| /* not the correct codepoint. we stuff the current codepoint into |
| the discontiguos buffer and try the next character */ |
| *tempdb = schar; |
| tempdb ++; |
| continue; |
| } |
| else { |
| /* a character with the same combining class as the one two positions back is skipped as well */ |
| if (u_getCombiningClass(schar) == |
| u_getCombiningClass(peekCharacter(source, -2))) { |
| //u_getCombiningClass(*(source->pos - 2))) { |
| *tempdb = schar; |
| tempdb ++; |
| continue; |
| } |
| result = *(coll->contractionCEs + |
| (UCharOffset - coll->contractionIndex)); |
| } |
| *tempdb = 0; /* terminate the skipped-character buffer */ |
| |
| if (result == UCOL_NOT_FOUND) { |
| break; |
| } else if (isContraction(result)) { |
| /* this is a multi-contraction*/ |
| tempconstart = (UChar *)coll->image + getContractOffset(result); |
| if (*(coll->contractionCEs + (constart - coll->contractionIndex)) |
| != UCOL_NOT_FOUND) { |
| multicontraction = TRUE; |
| temppos = source->pos + 1; |
| tempbufferpos = buffer + u_strlen(buffer); |
| } |
| } else { |
| /* complete match: hand the skipped characters back to the iterator */ |
| setDiscontiguosAttribute(source, buffer, tempdb); |
| return result; |
| } |
| } |
| |
| /* no problems simply reverting just like that, |
| if we are in string before getting into this function, points back to |
| string hence no problem. |
| if we are in normalization buffer before getting into this function, |
| since we'll never use another normalization within this function, we |
| know that fcdposition points to a base character. the normalization buffer |
| never change, hence this revert works. */ |
| loadState(source, &discState, TRUE); |
| goBackOne(source); |
| |
| //source->pos = temppos - 1; |
| source->flags = tempflags; |
| return *(coll->contractionCEs + (constart - coll->contractionIndex)); |
| } |
| |
| /* |
|  * Tells whether cp falls into one of the three ranges this file refuses to |
|  * give an implicit weight: values whose low 16 bits are 0xFFFE or 0xFFFF, |
|  * the block 0xFDD0..0xFDEF, and the surrogate range 0xD800..0xDFFF. |
|  * Returns TRUE for those, FALSE for everything else. |
|  */ |
| static |
| inline UBool isNonChar(UChar32 cp) { |
|     const UBool lowBitsAreFFFEorFFFF = (UBool)((cp & 0xFFFE) == 0xFFFE); |
|     const UBool inFDD0Block = (UBool)(cp >= 0xFDD0 && cp <= 0xFDEF); |
|     const UBool inSurrogateRange = (UBool)(cp >= 0xD800 && cp <= 0xDFFF); |
| |
|     return (UBool)(lowBitsAreFFFEorFFFF || inFDD0Block || inSurrogateRange); |
| } |
| |
| /* |
|  * Builds the implicit collation elements for cp, using Mark's |
|  * getImplicitPrimary code (uprv_uca_getImplicitPrimary). |
|  * Code points rejected by isNonChar() yield 0 and nothing is stored. |
|  * Otherwise a second CE, made from the low 16 bits of the implicit primary |
|  * shifted into the upper half and or-ed with 0xC0, is appended at |
|  * collationSource->CEpos, and the first CE (primary bits or-ed with 0x0505) |
|  * is returned. |
|  */ |
| static |
| inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { |
|     if (!isNonChar(cp)) { |
|         const uint32_t implicitPrimary = uprv_uca_getImplicitPrimary(cp); |
|         const uint32_t queuedCE = |
|             ((implicitPrimary & 0x0000FFFF) << 16) | 0x000000C0; |
| |
|         *(collationSource->CEpos++) = queuedCE; |
|         return (implicitPrimary & UCOL_PRIMARYMASK) | 0x00000505; /* This was 'order' */ |
|     } |
|     return 0; |
| } |
| |
| /** |
|  * Inserts the argument character into the front of the buffer replacing the |
|  * front null terminator. |
|  * The character is written over the terminator at pNull and a new terminator |
|  * is written in the slot in front of it. If there is no slot left in front, |
|  * a buffer with incsize extra leading slots is allocated, the old contents |
|  * are copied behind those slots and the old heap buffer (if any) is released. |
|  * @param data collation element iterator data |
|  * @param pNull pointer to the null terminator inside data->writableBuffer |
|  * @param ch character to be appended |
|  * @return positon of added character (inside the possibly reallocated |
|  *         buffer), or NULL if the buffer had to grow and allocation failed; |
|  *         in that case the buffer is left untouched |
|  */ |
| static |
| inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch) |
| { |
|     const uint32_t incsize = 5; |
|     const uint32_t oldSize = data->writableBufSize; |
| |
|     /* fast path: there is still a free slot in front of the terminator */ |
|     if (pNull > data->writableBuffer + 1) { |
|         *pNull = ch; |
|         *(pNull - 1) = 0; |
|         return pNull; |
|     } |
| |
|     /* |
|     buffer will always be null terminated infront. |
|     giving extra space since it is likely that more characters will be added. |
|     */ |
|     /* the terminator is at offset 0 or 1 here; keep that offset so the |
|     character lands directly in front of the existing text in both cases */ |
|     const uint32_t offset = (uint32_t)(pNull - data->writableBuffer); |
|     const uint32_t newSize = oldSize + incsize; |
|     UChar *newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * newSize); |
|     if (newbuffer == NULL) { |
|         return NULL; |
|     } |
| |
|     /* old contents go behind the incsize new leading slots */ |
|     UChar *oldStart = newbuffer + incsize; |
|     uprv_memcpy(oldStart, data->writableBuffer, oldSize * sizeof(UChar)); |
| |
|     UChar *inserted = oldStart + offset; |
|     *inserted = ch; |
|     *(inserted - 1) = 0; |
| |
|     freeHeapWritableBuffer(data); |
| |
|     data->writableBufSize = newSize; |
|     data->writableBuffer = newbuffer; |
|     return inserted; |
| } |
| |
| /** |
|  * Special normalization function for contraction in the previous iterator. |
|  * This normalization sequence will place the current character at source->pos |
|  * and its following normalized sequence into the buffer. |
|  * The fcd position, pos will be changed. |
|  * pos will now point to positions in the buffer. |
|  * Flags will be changed accordingly. |
|  * The buffer is filled from its end towards its front: the text already kept |
|  * sits at the end, the newly normalized text is placed directly in front of |
|  * it and a null is written in front of that. |
|  * @param data collation iterator data |
|  * @param status set to U_MEMORY_ALLOCATION_ERROR if the buffer has to grow |
|  *        and the allocation fails (the iterator is then left unchanged); |
|  *        also receives the status of the final normalization call |
|  */ |
| static |
| inline void normalizePrevContraction(collIterate *data, UErrorCode *status) |
| { |
|     UChar *buffer = data->writableBuffer; |
|     uint32_t buffersize = data->writableBufSize; |
|     /* index of the first kept code unit; the kept text is |
|     buffer[nulltermsize .. buffersize) */ |
|     uint32_t nulltermsize; |
|     UErrorCode localstatus = U_ZERO_ERROR; |
|     UChar *pEnd = data->pos + 1; /* End normalize + 1 */ |
|     UChar *pStart; |
|     uint32_t normLen; |
|     UChar *pStartNorm; |
| |
|     if (data->flags & UCOL_ITER_HASLEN) { |
|         /* |
|         normalization buffer not used yet, we'll pull down the next |
|         character into the end of the buffer |
|         */ |
|         *(buffer + (buffersize - 1)) = *(data->pos + 1); |
|         nulltermsize = buffersize - 1; |
|     } |
|     else { |
|         /* scan backwards from the end for the null in front of the text */ |
|         nulltermsize = buffersize; |
|         UChar *temp = buffer + (nulltermsize - 1); |
|         while (*(temp --) != 0) { |
|             nulltermsize --; |
|         } |
|     } |
| |
|     /* Start normalize */ |
|     if (data->fcdPosition == NULL) { |
|         pStart = data->string; |
|     } |
|     else { |
|         pStart = data->fcdPosition + 1; |
|     } |
| |
|     /* first call with capacity 0 only measures the normalized length */ |
|     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0, |
|                               &localstatus); |
| |
|     if (nulltermsize <= normLen) { |
|         /* not enough room in front of the kept text: build a buffer laid out |
|         as [null][normLen normalized units][kept text] */ |
|         uint32_t keptLen = buffersize - nulltermsize; |
|         uint32_t size = keptLen + normLen + 1; |
|         UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar)); |
|         if (temp == NULL) { |
|             *status = U_MEMORY_ALLOCATION_ERROR; |
|             return; |
|         } |
|         /* copy the kept text from where it really is to the end of the new |
|         buffer; this has to use the old nulltermsize */ |
|         uprv_memcpy(temp + normLen + 1, buffer + nulltermsize, |
|                     sizeof(UChar) * keptLen); |
|         nulltermsize = normLen + 1; |
|         freeHeapWritableBuffer(data); |
|         data->writableBuffer = temp; |
|         data->writableBufSize = size; |
|         /* the old buffer may be gone now; continue with the new one */ |
|         buffer = temp; |
|     } |
| |
|     /* |
|     this puts the null termination infront of the normalized string instead |
|     of the end |
|     */ |
|     pStartNorm = buffer + (nulltermsize - normLen); |
|     *(pStartNorm - 1) = 0; |
|     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, |
|                     status); |
| |
|     data->pos = data->writableBuffer + nulltermsize; |
|     data->origFlags = data->flags; |
|     data->flags |= UCOL_ITER_INNORMBUF; |
|     data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| } |
| |
| /** |
|  * Contraction character management function that returns the previous character |
|  * for the backwards iterator. |
|  * Does nothing if the previous character is in buffer and not the first |
|  * character in it. |
|  * Else it checks previous character in data string to see if it is |
|  * normalizable. |
|  * If it is not, the character is simply copied into the buffer, else |
|  * the whole normalized substring is copied into the buffer, including the |
|  * current character. |
|  * @param data collation element iterator data |
|  * @param status set to U_MEMORY_ALLOCATION_ERROR when the normalization |
|  *        buffer has to grow and the allocation fails; also passed on to |
|  *        normalizePrevContraction() |
|  * @return previous character |
|  */ |
| static |
| inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) |
| { |
|     UChar prevch; |
|     UChar ch; |
|     UChar *start; |
|     UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); |
|     UChar *pNull = NULL; |
|     UChar *inserted; |
|     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || |
|         (innormbuf && *(data->pos - 1) != 0)) { |
|         /* |
|         if no normalization. |
|         if previous character is in normalized buffer, no further normalization |
|         is required |
|         */ |
|         if(data->flags & UCOL_USE_ITERATOR) { |
|             data->iterator->move(data->iterator, -1, UITER_CURRENT); |
|             return (UChar)data->iterator->next(data->iterator); |
|         } else { |
|             return *(data->pos - 1); |
|         } |
|     } |
| |
|     start = data->pos; |
|     if (data->flags & UCOL_ITER_HASLEN) { |
|         /* in data string */ |
|         if ((start - 1) == data->string) { |
|             return *(start - 1); |
|         } |
|         start --; |
|         ch = *start; |
|         prevch = *(start - 1); |
|     } |
|     else { |
|         /* |
|         in writable buffer, at this point fcdPosition can not be NULL. |
|         see contracting tag. |
|         */ |
|         if (data->fcdPosition == data->string) { |
|             /* at the start of the string, just dump it into the normalizer */ |
|             inserted = insertBufferFront(data, data->pos - 1, |
|                                          *(data->fcdPosition)); |
|             if (inserted == NULL) { |
|                 *status = U_MEMORY_ALLOCATION_ERROR; |
|             } |
|             else { |
|                 /* the buffer may have been reallocated; keep pos one past the |
|                 inserted character (a no-op when nothing moved) */ |
|                 data->pos = inserted + 1; |
|             } |
|             data->fcdPosition = NULL; |
|             return *(data->pos - 1); |
|         } |
|         pNull = data->pos - 1; |
|         start = data->fcdPosition; |
|         ch = *start; |
|         prevch = *(start - 1); |
|     } |
|     /* |
|     * if the current character is not fcd. |
|     * Trailing combining class == 0. |
|     */ |
|     if (data->fcdPosition > start && |
|         (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) |
|     { |
|         /* |
|         Need a more complete FCD check and possible normalization. |
|         normalize substring will be appended to buffer |
|         */ |
|         UChar *backuppos = data->pos; |
|         data->pos = start; |
|         if (collPrevIterFCD(data)) { |
|             normalizePrevContraction(data, status); |
|             return *(data->pos - 1); |
|         } |
|         data->pos = backuppos; |
|         data->fcdPosition ++; |
|     } |
| |
|     if (innormbuf) { |
|         /* |
|         no normalization is to be done hence only one character will be |
|         appended to the buffer. |
|         */ |
|         inserted = insertBufferFront(data, pNull, ch); |
|         if (inserted == NULL) { |
|             *status = U_MEMORY_ALLOCATION_ERROR; |
|         } |
|         else { |
|             /* pNull was pos - 1, so this only changes pos when the buffer |
|             was reallocated */ |
|             data->pos = inserted + 1; |
|         } |
|         data->fcdPosition --; |
|     } |
| |
|     return ch; |
| } |
| |
| /* This function handles the special CEs like contractions, expansions, surrogates, Thai */ |
| /* It is called by getNextCE */ |
| |
| uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { |
| collIterateState entryState; |
| backupState(source, &entryState); |
| UChar32 cp = ch; |
| |
| for (;;) { |
| // This loop will repeat only in the case of contractions, and only when a contraction |
| // is found and the first CE resulting from that contraction is itself a special |
| // (an expansion, for example.) All other special CE types are fully handled the |
| // first time through, and the loop exits. |
| |
| const uint32_t *CEOffset = NULL; |
| switch(getCETag(CE)) { |
| case NOT_FOUND_TAG: |
| /* This one is not found, and we'll let somebody else bother about it... no more games */ |
| return CE; |
| case SURROGATE_TAG: |
| /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ |
| /* two things can happen here: next code point can be a trailing surrogate - we will use it */ |
| /* to retrieve the CE, or it is not a trailing surrogate (or the string
|