| /* |
| ****************************************************************************** |
| * Copyright (c) 1996-2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ****************************************************************************** |
| * File unorm.cpp |
| * |
| * Created by: Vladimir Weinstein 12052000 |
| * |
| * Modification history : |
| * |
| * Date Name Description |
| * 02/01/01 synwee Added normalization quickcheck enum and method. |
| * 02/12/01 synwee Commented out quickcheck util api has been approved |
| * Added private method for doing FCD checks |
| * 02/23/01 synwee Modified quickcheck and checkFCE to run through |
| * string for codepoints < 0x300 for the normalization |
| * mode NFC. |
| */ |
| |
| #include "unicode/unorm.h" |
| #include "unicode/normlzr.h" |
| #include "unicode/ustring.h" |
| #include "unicode/udata.h" |
| #include "cpputils.h" |
| #include "ustr_imp.h" |
| #include "umutex.h" |
| |
| /* added by synwee */ |
| #include "unicode/uchar.h" |
| #include "unicode/utf16.h" |
| |
| /* added by synwee for trie manipulation*/ |
| #define STAGE_1_SHIFT_ 10 |
| #define STAGE_2_SHIFT_ 4 |
| #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F |
| #define STAGE_3_MASK_ 0xF |
| #define LAST_BYTE_MASK_ 0xFF |
| #define SECOND_LAST_BYTE_SHIFT_ 8 |
| |
| /* added by synwee for fast route in quickcheck and fcd */ |
| #define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300 |
| |
| /* |
| * for a description of the file format, |
| * see icu/source/tools/genqchk/genqchk.c |
| */ |
| #define QCHK_DATA_NAME "qchk" |
| #define FCHK_DATA_NAME "fchk" |
| #define DATA_TYPE "dat" |
| |
| static UDataMemory *quickcheckData = NULL; |
| static UDataMemory *fcdcheckData = NULL; |
| |
| /** |
| * Authentication values |
| */ |
| static const uint8_t QCHK_DATA_FORMAT_[] = {0x71, 0x63, 0x68, 0x6b}; |
| static const uint8_t FCHK_DATA_FORMAT_[] = {0x66, 0x63, 0x68, 0x6b}; |
| static const uint8_t QCHK_FORMAT_VERSION_[] = {1, 0, 0, 0}; |
| static const uint8_t FCHK_FORMAT_VERSION_[] = {1, 0, 0, 0}; |
| |
| /** |
| * index values loaded from qchk.dat. |
| * static uint16_t indexes[8]; |
| */ |
| enum { |
| QCHK_INDEX_STAGE_2_BITS, |
| QCHK_INDEX_STAGE_3_BITS, |
| QCHK_INDEX_MIN_VALUES_SIZE, |
| QCHK_INDEX_STAGE_1_INDEX, |
| QCHK_INDEX_STAGE_2_INDEX, |
| QCHK_INDEX_STAGE_3_INDEX |
| }; |
| |
| /** |
| * index values loaded from qchk.dat. |
| * static uint16_t indexes[8]; |
| */ |
| enum { |
| FCHK_INDEX_STAGE_2_BITS, |
| FCHK_INDEX_STAGE_3_BITS, |
| FCHK_INDEX_STAGE_1_INDEX, |
| FCHK_INDEX_STAGE_2_INDEX, |
| FCHK_INDEX_STAGE_3_INDEX |
| }; |
| |
| /** |
| * Array of mask for determining normalization quick check values. |
| * Indexes follows the values in UNormalizationMode |
| */ |
| static const uint8_t QCHK_MASK_[] = {0, 0, 0x11, 0x22, 0x44, 0x88}; |
| /** |
| * Array of minimum codepoints that has UNORM_MAYBE or UNORM_NO quick check |
| * values. Indexes follows the values in UNormalizationMode. |
| * Generated values! Edit at your own risk. |
| */ |
| static const UChar32 *QCHK_MIN_VALUES_; |
| |
| /** |
| * Flag to indicate if data has been loaded |
| */ |
| static UBool isQuickCheckLoaded = FALSE; |
| static UBool isFCDCheckLoaded = FALSE; |
| |
| /** |
| * Minimum value to determine if quickcheck value contains a MAYBE |
| */ |
| static const uint8_t MIN_UNORM_MAYBE_ = 0x10; |
| |
| /** |
| * Array of normalization form corresponding to the index code point. |
| * Hence codepoint 0xABCD will have normalization form QUICK_CHECK_DATA[0xABCD]. |
| * UQUICK_CHECK_DATA[0xABCD] is a byte containing 2 sets of 4 bits information |
| * representing UNORM_MAYBE and UNORM_YES.<br> |
| * bits 1 2 3 4 5678<br> |
| * NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES<br> |
| * ie if UQUICK_CHECK_DATA[0xABCD] = 10000001, this means that 0xABCD is in |
| * NFD form and maybe in NFKC form |
| */ |
| static const uint16_t *QCHK_STAGE_1_; |
| static const uint16_t *QCHK_STAGE_2_; |
| static const uint8_t *QCHK_STAGE_3_; |
| |
| /** |
| * Trie data for FCD. |
| * Each index corresponds to each code point. |
| * Trie value is the combining class of the first and the last character of the |
| * NFD of the codepoint. |
| * size uint16_t for the first 2 stages instead of uint32_t to reduce size. |
| */ |
| static const uint16_t *FCHK_STAGE_1_; |
| static const uint16_t *FCHK_STAGE_2_; |
| static const uint16_t *FCHK_STAGE_3_; |
| |
| U_CAPI int32_t |
| unorm_normalize(const UChar* source, |
| int32_t sourceLength, |
| UNormalizationMode mode, |
| int32_t option, |
| UChar* result, |
| int32_t resultLength, |
| UErrorCode* status) |
| { |
| if(U_FAILURE(*status)) return -1; |
| |
| /* synwee : removed hard coded conversion */ |
| Normalizer::EMode normMode = Normalizer::getNormalizerEMode(mode, *status); |
| if (U_FAILURE(*status)) |
| return -1; |
| |
| int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); |
| const UnicodeString src(sourceLength == -1, source, len); |
| UnicodeString dst(result, 0, resultLength); |
| /* synwee : note quickcheck is added in C ++ normalize method */ |
| if ((option & UNORM_IGNORE_HANGUL) != 0) |
| option = Normalizer::IGNORE_HANGUL; |
| Normalizer::normalize(src, normMode, option, dst, *status); |
| return uprv_fillOutputString(dst, result, resultLength, status); |
| } |
| |
| static UBool |
| isQuickCheckAcceptable(void *context, |
| const char *type, const char *name, |
| const UDataInfo *pInfo) { |
| if (pInfo->size >= 20 && |
| pInfo->isBigEndian == U_IS_BIG_ENDIAN && |
| pInfo->charsetFamily == U_CHARSET_FAMILY && |
| (uprv_memcmp(pInfo->dataFormat, QCHK_DATA_FORMAT_, |
| sizeof(QCHK_DATA_FORMAT_)) == 0) && |
| /* |
| pInfo->dataFormat[0] == 0x71 && |
| pInfo->dataFormat[1] == 0x63 && |
| pInfo->dataFormat[2] == 0x68 && |
| pInfo->dataFormat[3] == 0x6b && |
| pInfo->formatVersion[0] == 1 |
| */ |
| (uprv_memcmp(pInfo->formatVersion, QCHK_FORMAT_VERSION_, |
| sizeof(QCHK_FORMAT_VERSION_)) == 0)) { |
| return TRUE; |
| } else { |
| context = NULL; |
| type = NULL; |
| name = NULL; |
| return FALSE; |
| } |
| } |
| |
| static UBool |
| loadQuickCheckData(UErrorCode *error) { |
| /* load quickcheck data from file if necessary */ |
| if (!isQuickCheckLoaded && U_SUCCESS(*error)) { |
| UDataMemory *data; |
| |
| /* open the data outside the mutex block */ |
| data = udata_openChoice(NULL, DATA_TYPE, QCHK_DATA_NAME, |
| isQuickCheckAcceptable, NULL, error); |
| if (U_FAILURE(*error)) { |
| return isQuickCheckLoaded = FALSE; |
| } |
| |
| /* in the mutex block, set the data for this process */ |
| umtx_lock(NULL); |
| if (quickcheckData == NULL) { |
| const uint16_t *temp = (const uint16_t *)udata_getMemory(data); |
| const uint16_t *indexes = temp; |
| |
| quickcheckData = data; |
| |
| temp += 8; |
| QCHK_MIN_VALUES_ = (const UChar32 *)temp; |
| QCHK_STAGE_1_ = temp + indexes[QCHK_INDEX_STAGE_1_INDEX]; |
| QCHK_STAGE_2_ = temp + indexes[QCHK_INDEX_STAGE_2_INDEX]; |
| QCHK_STAGE_3_ = (const uint8_t *)(temp + |
| indexes[QCHK_INDEX_STAGE_3_INDEX]); |
| data = NULL; |
| } |
| umtx_unlock(NULL); |
| |
| isQuickCheckLoaded = TRUE; |
| |
| /* if a different thread set it first, then close the extra data */ |
| if (data != NULL) { |
| udata_close(data); /* NULL if it was set correctly */ |
| } |
| } |
| |
| return isQuickCheckLoaded; |
| } |
| |
| /** |
| * Performing quick check on a string, to quickly determine if the string is |
| * in a particular normalization format. |
| * Three types of result can be returned UNORM_YES, UNORM_NO or |
| * UNORM_MAYBE. Result UNORM_YES indicates that the argument |
| * string is in the desired normalized format, UNORM_NO determines that |
| * argument string is not in the desired normalized format. A |
| * UNORM_MAYBE result indicates that a more thorough check is required, |
| * the user may have to put the string in its normalized form and compare the |
| * results. |
| * @param source string for determining if it is in a normalized format |
| * @param sourcelength length of source to test |
| * @param mode normalization format from the enum UNormalizationMode |
| * @param status A pointer to an UErrorCode to receive any errors |
| * @return UNORM_YES, UNORM_NO or UNORM_MAYBE |
| */ |
| U_CAPI UNormalizationCheckResult |
| unorm_quickCheck(const UChar *source, |
| int32_t sourcelength, |
| UNormalizationMode mode, |
| UErrorCode* status) |
| { |
| uint8_t oldcombiningclass = 0; |
| uint8_t combiningclass; |
| uint8_t quickcheckvalue; |
| uint8_t mask = QCHK_MASK_[mode]; |
| UChar32 min; |
| UChar32 codepoint; |
| UNormalizationCheckResult result = UNORM_YES; |
| const UChar *psource; |
| const UChar *pend = 0; |
| |
| if (!loadQuickCheckData(status) || U_FAILURE(*status)) { |
| return UNORM_MAYBE; |
| } |
| |
| min = QCHK_MIN_VALUES_[mode]; |
| |
| /* checking argument*/ |
| if (mode >= UNORM_MODE_COUNT || mode < UNORM_NONE) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return UNORM_MAYBE; |
| } |
| |
| if (sourcelength >= 0) { |
| psource = source; |
| pend = source + sourcelength; |
| for (;;) { |
| if (psource >= pend) { |
| return UNORM_YES; |
| } |
| /* fast route : since codepoints < min has combining class 0 and YES |
| looking at the minimum values, surrogates are not a problem */ |
| if (*psource >= min) { |
| break; |
| } |
| psource ++; |
| } |
| } |
| else { |
| psource = source; |
| for (;;) { |
| if (*psource == 0) { |
| return UNORM_YES; |
| } |
| /* fast route : since codepoints < min has combining class 0 and YES |
| looking at the minimum values, surrogates are not a problem */ |
| if (*psource >= min) { |
| break; |
| } |
| psource ++; |
| } |
| } |
| |
| if (sourcelength >= 0) { |
| for (;;) { |
| int count = 0; |
| |
| if (psource >= pend) { |
| break; |
| } |
| UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); |
| combiningclass = u_getCombiningClass(codepoint); |
| /* not in canonical order */ |
| |
| if (oldcombiningclass > combiningclass && combiningclass != 0) { |
| return UNORM_NO; |
| } |
| |
| oldcombiningclass = combiningclass; |
| |
| /* trie access */ |
| quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[ |
| QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + |
| ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + |
| (codepoint & STAGE_3_MASK_)] & mask); |
| /* value is a byte containing 2 sets of 4 bits information. |
| bits 1 2 3 4 5678<br> |
| NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES<br> |
| ie if quick[0xABCD] = 10000001, this means that 0xABCD is in NFD form |
| and maybe in NFKC form. */ |
| if (quickcheckvalue == 0) { |
| return UNORM_NO; |
| } |
| if (quickcheckvalue >= MIN_UNORM_MAYBE_) { |
| result = UNORM_MAYBE; |
| } |
| psource += count; |
| } |
| } |
| else { |
| for (;;) { |
| int count = 0; |
| UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); |
| if (codepoint == 0) { |
| break; |
| } |
| |
| combiningclass = u_getCombiningClass(codepoint); |
| /* not in canonical order */ |
| |
| if (oldcombiningclass > combiningclass && combiningclass != 0) { |
| return UNORM_NO; |
| } |
| |
| oldcombiningclass = combiningclass; |
| |
| /* trie access */ |
| quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[ |
| QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + |
| ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + |
| (codepoint & STAGE_3_MASK_)] & mask); |
| /* value is a byte containing 2 sets of 4 bits information. |
| bits 1 2 3 4 5678<br> |
| NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES<br> |
| ie if quick[0xABCD] = 10000001, this means that 0xABCD is in NFD form |
| and maybe in NFKC form. */ |
| if (quickcheckvalue == 0) { |
| return UNORM_NO; |
| } |
| if (quickcheckvalue >= MIN_UNORM_MAYBE_) { |
| result = UNORM_MAYBE; |
| } |
| psource += count; |
| } |
| } |
| |
| return result; |
| } |
| |
| /* private methods ---------------------------------------------------------- */ |
| |
| static UBool |
| isFCDCheckAcceptable(void *context, |
| const char *type, const char *name, |
| const UDataInfo *pInfo) { |
| if( |
| pInfo->size >= 20 && |
| pInfo->isBigEndian == U_IS_BIG_ENDIAN && |
| pInfo->charsetFamily == U_CHARSET_FAMILY && |
| (uprv_memcmp(pInfo->dataFormat, FCHK_DATA_FORMAT_, |
| sizeof(FCHK_DATA_FORMAT_)) == 0) && |
| /* |
| pInfo->dataFormat[0] == 0x71 && |
| pInfo->dataFormat[1] == 0x63 && |
| pInfo->dataFormat[2] == 0x68 && |
| pInfo->dataFormat[3] == 0x6b && |
| pInfo->formatVersion[0] == 1 |
| */ |
| (uprv_memcmp(pInfo->formatVersion, FCHK_FORMAT_VERSION_, |
| sizeof(FCHK_FORMAT_VERSION_)) == 0)) { |
| return TRUE; |
| } else { |
| context = NULL; |
| type = NULL; |
| name = NULL; |
| return FALSE; |
| } |
| } |
| |
| static UBool |
| loadFCDCheckData(UErrorCode *error) { |
| /* load fcdcheck data from file if necessary */ |
| if (!isFCDCheckLoaded && U_SUCCESS(*error)) { |
| UDataMemory *data; |
| |
| /* open the data outside the mutex block */ |
| data = udata_openChoice(NULL, DATA_TYPE, FCHK_DATA_NAME, |
| isFCDCheckAcceptable, NULL, error); |
| if (U_FAILURE(*error)) { |
| return isFCDCheckLoaded = FALSE; |
| } |
| |
| /* in the mutex block, set the data for this process */ |
| umtx_lock(NULL); |
| if (fcdcheckData == NULL) { |
| const uint16_t *temp = (const uint16_t *)udata_getMemory(data); |
| const uint16_t *indexes = temp; |
| |
| fcdcheckData = data; |
| |
| temp += 8; |
| FCHK_STAGE_1_ = temp + indexes[FCHK_INDEX_STAGE_1_INDEX]; |
| FCHK_STAGE_2_ = temp + indexes[FCHK_INDEX_STAGE_2_INDEX]; |
| FCHK_STAGE_3_ = (const uint16_t *)(temp + |
| indexes[FCHK_INDEX_STAGE_3_INDEX]); |
| data = NULL; |
| } |
| umtx_unlock(NULL); |
| |
| isFCDCheckLoaded = TRUE; |
| |
| /* if a different thread set it first, then close the extra data */ |
| if (data != NULL) { |
| udata_close(data); /* NULL if it was set correctly */ |
| } |
| } |
| |
| return isFCDCheckLoaded; |
| } |
| |
| /** |
| * Gets the stage 1 data for checkFCD. |
| * @param error status |
| * @return checkFCD data stage 1, null if data can not be loaded |
| */ |
| U_CAPI const uint16_t * getFCHK_STAGE_1_(UErrorCode *error) |
| { |
| if (loadFCDCheckData(error)) { |
| return FCHK_STAGE_1_; |
| } |
| return NULL; |
| } |
| |
| /** |
| * Gets the stage 2 data for checkFCD. |
| * @param error status |
| * @return checkFCD data stage 2, null if data can not be loaded |
| */ |
| U_CAPI const uint16_t * getFCHK_STAGE_2_(UErrorCode *error) |
| { |
| if (loadFCDCheckData(error)) { |
| return FCHK_STAGE_2_; |
| } |
| return NULL; |
| } |
| |
| /** |
| * Gets the stage 3 data for checkFCD. |
| * @param error status |
| * @return checkFCD data stage 3, null if data can not be loaded |
| */ |
| U_CAPI const uint16_t * getFCHK_STAGE_3_(UErrorCode *error) |
| { |
| if (loadFCDCheckData(error)) { |
| return FCHK_STAGE_3_; |
| } |
| return NULL; |
| } |
| |
| /** |
| * Private method which performs a quick FCD check on a string, to quickly |
| * determine if a string is in a required FCD format. |
| * FCD is the set of strings such that for each character in the string, |
| * decomposition without any canonical reordering will produce a NFD. |
| * @param source string for determining if it is in a normalized format |
| * @param sourcelength length of source to test |
| * @paran mode normalization format from the enum UNormalizationMode |
| * @param status A pointer to an UErrorCode to receive any errors |
| * @return TRUE if source is in FCD format, FALSE otherwise |
| */ |
| U_CAPI UBool |
| checkFCD(const UChar* source, int32_t sourcelength, UErrorCode* status) |
| { |
| UChar32 codepoint; |
| const UChar *psource; |
| const UChar *pend = 0; |
| uint8_t oldfcdtrail = 0; |
| uint16_t fcd = 0; |
| |
| if (!loadFCDCheckData(status) || U_FAILURE(*status)) { |
| return FALSE; |
| } |
| |
| if (sourcelength >= 0) { |
| psource = source; |
| pend = source + sourcelength; |
| for (;;) { |
| if (psource >= pend) { |
| return TRUE; |
| } |
| /* fast route : since codepoints < NFC_ZER_CC_BLOCK_LIMIT_ has |
| combining class 0. |
| looking at the minimum values, surrogates are not a problem */ |
| if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) { |
| break; |
| } |
| psource ++; |
| } |
| } |
| else { |
| psource = source; |
| for (;;) { |
| if (*psource == 0) { |
| return TRUE; |
| } |
| /* fast route : since codepoints < min has combining class 0 and YES |
| looking at the minimum values, surrogates are not a problem */ |
| if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) { |
| break; |
| } |
| psource ++; |
| } |
| } |
| |
| /* not end of string and yet failed simple compare |
| safe to shift back one char because the previous char has to be < 0x300 or the |
| start of a string */ |
| if (psource == source) { |
| oldfcdtrail = 0; |
| } |
| else { |
| codepoint = *(psource - 1); |
| oldfcdtrail = (uint8_t)(FCHK_STAGE_3_[ |
| FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + |
| ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] |
| + (codepoint & STAGE_3_MASK_)] & LAST_BYTE_MASK_); |
| } |
| |
| if (sourcelength >= 0) { |
| for (;;) { |
| int count = 0; |
| uint8_t lead; |
| |
| if (psource >= pend) { |
| return TRUE; |
| } |
| |
| UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); |
| |
| /* trie access */ |
| fcd = FCHK_STAGE_3_[ |
| FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + |
| ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + |
| (codepoint & STAGE_3_MASK_)]; |
| lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| |
| if (lead != 0 && oldfcdtrail > lead) { |
| return FALSE; |
| } |
| oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| |
| psource += count; |
| } |
| } |
| else { |
| for (;;) { |
| int count = 0; |
| uint8_t lead; |
| |
| UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); |
| if (codepoint == 0) { |
| return TRUE; |
| } |
| /* trie access */ |
| fcd = FCHK_STAGE_3_[ |
| FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + |
| ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + |
| (codepoint & STAGE_3_MASK_)]; |
| |
| lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| |
| if (lead != 0 && oldfcdtrail > lead) { |
| return FALSE; |
| } |
| oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| psource += count; |
| } |
| } |
| return TRUE; |
| } |