| // Copyright (C) 2016 and later: Unicode, Inc. and others. | 
 | // License & terms of use: http://www.unicode.org/copyright.html | 
 | /* | 
 |  ********************************************************************** | 
 |  *   Copyright (C) 2005-2016, International Business Machines | 
 |  *   Corporation and others.  All Rights Reserved. | 
 |  ********************************************************************** | 
 |  */ | 
 |  | 
 | #include "unicode/utypes.h" | 
 |  | 
 | #if !UCONFIG_NO_CONVERSION | 
 |  | 
 | #include "unicode/ucsdet.h" | 
 |  | 
 | #include "csdetect.h" | 
 | #include "csmatch.h" | 
 | #include "uenumimp.h" | 
 |  | 
 | #include "cmemory.h" | 
 | #include "cstring.h" | 
 | #include "umutex.h" | 
 | #include "ucln_in.h" | 
 | #include "uarrsort.h" | 
 | #include "inputext.h" | 
 | #include "csrsbcs.h" | 
 | #include "csrmbcs.h" | 
 | #include "csrutf8.h" | 
 | #include "csrucode.h" | 
 | #include "csr2022.h" | 
 |  | 
 | #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) | 
 | #define DELETE_ARRAY(array) uprv_free((void *) (array)) | 
 |  | 
 | U_NAMESPACE_BEGIN | 
 |  | 
 | struct CSRecognizerInfo : public UMemory { | 
 |     CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) | 
 |         : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}; | 
 |  | 
 |     ~CSRecognizerInfo() {delete recognizer;}; | 
 |  | 
 |     CharsetRecognizer *recognizer; | 
 |     UBool isDefaultEnabled; | 
 | }; | 
 |  | 
 | U_NAMESPACE_END | 
 |  | 
 | static icu::CSRecognizerInfo **fCSRecognizers = NULL; | 
 | static icu::UInitOnce gCSRecognizersInitOnce; | 
 | static int32_t fCSRecognizers_size = 0; | 
 |  | 
 | U_CDECL_BEGIN | 
 | static UBool U_CALLCONV csdet_cleanup(void) | 
 | { | 
 |     U_NAMESPACE_USE | 
 |     if (fCSRecognizers != NULL) { | 
 |         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { | 
 |             delete fCSRecognizers[r]; | 
 |             fCSRecognizers[r] = NULL; | 
 |         } | 
 |  | 
 |         DELETE_ARRAY(fCSRecognizers); | 
 |         fCSRecognizers = NULL; | 
 |         fCSRecognizers_size = 0; | 
 |     } | 
 |     gCSRecognizersInitOnce.reset(); | 
 |  | 
 |     return TRUE; | 
 | } | 
 |  | 
 | static int32_t U_CALLCONV | 
 | charsetMatchComparator(const void * /*context*/, const void *left, const void *right) | 
 | { | 
 |     U_NAMESPACE_USE | 
 |  | 
 |     const CharsetMatch **csm_l = (const CharsetMatch **) left; | 
 |     const CharsetMatch **csm_r = (const CharsetMatch **) right; | 
 |  | 
 |     // NOTE: compare is backwards to sort from highest to lowest. | 
 |     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); | 
 | } | 
 |  | 
 | static void U_CALLCONV initRecognizers(UErrorCode &status) { | 
 |     U_NAMESPACE_USE | 
 |     ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); | 
 |     CSRecognizerInfo *tempArray[] = { | 
 |         new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), | 
 |  | 
 |         new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), | 
 |  | 
 |         new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), | 
 |  | 
 |         new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), | 
 | #if !UCONFIG_ONLY_HTML_CONVERSION | 
 |         new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), | 
 |         new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), | 
 |  | 
 |         new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), | 
 |         new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), | 
 |         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), | 
 |         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) | 
 | #endif | 
 |     }; | 
 |     int32_t rCount = UPRV_LENGTHOF(tempArray); | 
 |  | 
 |     fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); | 
 |  | 
 |     if (fCSRecognizers == NULL) { | 
 |         status = U_MEMORY_ALLOCATION_ERROR; | 
 |     }  | 
 |     else { | 
 |         fCSRecognizers_size = rCount; | 
 |         for (int32_t r = 0; r < rCount; r += 1) { | 
 |             fCSRecognizers[r] = tempArray[r]; | 
 |             if (fCSRecognizers[r] == NULL) { | 
 |                 status = U_MEMORY_ALLOCATION_ERROR; | 
 |             } | 
 |         } | 
 |     } | 
 | } | 
 |  | 
 | U_CDECL_END | 
 |  | 
 | U_NAMESPACE_BEGIN | 
 |  | 
 | void CharsetDetector::setRecognizers(UErrorCode &status) | 
 | { | 
 |     umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); | 
 | } | 
 |  | 
 | CharsetDetector::CharsetDetector(UErrorCode &status) | 
 |   : textIn(new InputText(status)), resultArray(NULL), | 
 |     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), | 
 |     fEnabledRecognizers(NULL) | 
 | { | 
 |     if (U_FAILURE(status)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     setRecognizers(status); | 
 |  | 
 |     if (U_FAILURE(status)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); | 
 |  | 
 |     if (resultArray == NULL) { | 
 |         status = U_MEMORY_ALLOCATION_ERROR; | 
 |         return; | 
 |     } | 
 |  | 
 |     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { | 
 |         resultArray[i] = new CharsetMatch(); | 
 |  | 
 |         if (resultArray[i] == NULL) { | 
 |             status = U_MEMORY_ALLOCATION_ERROR; | 
 |             break; | 
 |         } | 
 |     } | 
 | } | 
 |  | 
 | CharsetDetector::~CharsetDetector() | 
 | { | 
 |     delete textIn; | 
 |  | 
 |     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { | 
 |         delete resultArray[i]; | 
 |     } | 
 |  | 
 |     uprv_free(resultArray); | 
 |  | 
 |     if (fEnabledRecognizers) { | 
 |         uprv_free(fEnabledRecognizers); | 
 |     } | 
 | } | 
 |  | 
 | void CharsetDetector::setText(const char *in, int32_t len) | 
 | { | 
 |     textIn->setText(in, len); | 
 |     fFreshTextSet = TRUE; | 
 | } | 
 |  | 
 | UBool CharsetDetector::setStripTagsFlag(UBool flag) | 
 | { | 
 |     UBool temp = fStripTags; | 
 |     fStripTags = flag; | 
 |     fFreshTextSet = TRUE; | 
 |     return temp; | 
 | } | 
 |  | 
 | UBool CharsetDetector::getStripTagsFlag() const | 
 | { | 
 |     return fStripTags; | 
 | } | 
 |  | 
 | void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const | 
 | { | 
 |     textIn->setDeclaredEncoding(encoding,len); | 
 | } | 
 |  | 
 | int32_t CharsetDetector::getDetectableCount() | 
 | { | 
 |     UErrorCode status = U_ZERO_ERROR; | 
 |  | 
 |     setRecognizers(status); | 
 |  | 
 |     return fCSRecognizers_size;  | 
 | } | 
 |  | 
 | const CharsetMatch *CharsetDetector::detect(UErrorCode &status) | 
 | { | 
 |     int32_t maxMatchesFound = 0; | 
 |  | 
 |     detectAll(maxMatchesFound, status); | 
 |  | 
 |     if(maxMatchesFound > 0) { | 
 |         return resultArray[0]; | 
 |     } else { | 
 |         return NULL; | 
 |     } | 
 | } | 
 |  | 
 | const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) | 
 | { | 
 |     if(!textIn->isSet()) { | 
 |         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set | 
 |  | 
 |         return NULL; | 
 |     } else if (fFreshTextSet) { | 
 |         CharsetRecognizer *csr; | 
 |         int32_t            i; | 
 |  | 
 |         textIn->MungeInput(fStripTags); | 
 |  | 
 |         // Iterate over all possible charsets, remember all that | 
 |         // give a match quality > 0. | 
 |         resultCount = 0; | 
 |         for (i = 0; i < fCSRecognizers_size; i += 1) { | 
 |             csr = fCSRecognizers[i]->recognizer; | 
 |             if (csr->match(textIn, resultArray[resultCount])) { | 
 |                 resultCount++; | 
 |             } | 
 |         } | 
 |  | 
 |         if (resultCount > 1) { | 
 |             uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); | 
 |         } | 
 |         fFreshTextSet = FALSE; | 
 |     } | 
 |  | 
 |     maxMatchesFound = resultCount; | 
 |  | 
 |     return resultArray; | 
 | } | 
 |  | 
 | void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) | 
 | { | 
 |     if (U_FAILURE(status)) { | 
 |         return; | 
 |     } | 
 |  | 
 |     int32_t modIdx = -1; | 
 |     UBool isDefaultVal = FALSE; | 
 |     for (int32_t i = 0; i < fCSRecognizers_size; i++) { | 
 |         CSRecognizerInfo *csrinfo = fCSRecognizers[i]; | 
 |         if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { | 
 |             modIdx = i; | 
 |             isDefaultVal = (csrinfo->isDefaultEnabled == enabled); | 
 |             break; | 
 |         } | 
 |     } | 
 |     if (modIdx < 0) { | 
 |         // No matching encoding found | 
 |         status = U_ILLEGAL_ARGUMENT_ERROR; | 
 |         return; | 
 |     } | 
 |  | 
 |     if (fEnabledRecognizers == NULL && !isDefaultVal) { | 
 |         // Create an array storing the non default setting | 
 |         fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); | 
 |         if (fEnabledRecognizers == NULL) { | 
 |             status = U_MEMORY_ALLOCATION_ERROR; | 
 |             return; | 
 |         } | 
 |         // Initialize the array with default info | 
 |         for (int32_t i = 0; i < fCSRecognizers_size; i++) { | 
 |             fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; | 
 |         } | 
 |     } | 
 |  | 
 |     if (fEnabledRecognizers != NULL) { | 
 |         fEnabledRecognizers[modIdx] = enabled; | 
 |     } | 
 | } | 
 |  | 
 | /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const | 
 | { | 
 |     if( index > fCSRecognizers_size-1 || index < 0) { | 
 |         status = U_INDEX_OUTOFBOUNDS_ERROR; | 
 |  | 
 |         return 0; | 
 |     } else { | 
 |         return fCSRecognizers[index]->getName(); | 
 |     } | 
 | }*/ | 
 |  | 
 | U_NAMESPACE_END | 
 |  | 
 | U_CDECL_BEGIN | 
 | typedef struct { | 
 |     int32_t currIndex; | 
 |     UBool all; | 
 |     UBool *enabledRecognizers; | 
 | } Context; | 
 |  | 
 |  | 
 |  | 
 | static void U_CALLCONV | 
 | enumClose(UEnumeration *en) { | 
 |     if(en->context != NULL) { | 
 |         DELETE_ARRAY(en->context); | 
 |     } | 
 |  | 
 |     DELETE_ARRAY(en); | 
 | } | 
 |  | 
 | static int32_t U_CALLCONV | 
 | enumCount(UEnumeration *en, UErrorCode *) { | 
 |     if (((Context *)en->context)->all) { | 
 |         // ucsdet_getAllDetectableCharsets, all charset detector names | 
 |         return fCSRecognizers_size; | 
 |     } | 
 |  | 
 |     // Otherwise, ucsdet_getDetectableCharsets - only enabled ones | 
 |     int32_t count = 0; | 
 |     UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; | 
 |     if (enabledArray != NULL) { | 
 |         // custom set | 
 |         for (int32_t i = 0; i < fCSRecognizers_size; i++) { | 
 |             if (enabledArray[i]) { | 
 |                 count++; | 
 |             } | 
 |         } | 
 |     } else { | 
 |         // default set | 
 |         for (int32_t i = 0; i < fCSRecognizers_size; i++) { | 
 |             if (fCSRecognizers[i]->isDefaultEnabled) { | 
 |                 count++; | 
 |             } | 
 |         } | 
 |     } | 
 |     return count; | 
 | } | 
 |  | 
 | static const char* U_CALLCONV | 
 | enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { | 
 |     const char *currName = NULL; | 
 |  | 
 |     if (((Context *)en->context)->currIndex < fCSRecognizers_size) { | 
 |         if (((Context *)en->context)->all) { | 
 |             // ucsdet_getAllDetectableCharsets, all charset detector names | 
 |             currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); | 
 |             ((Context *)en->context)->currIndex++; | 
 |         } else { | 
 |             // ucsdet_getDetectableCharsets | 
 |             UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; | 
 |             if (enabledArray != NULL) { | 
 |                 // custome set | 
 |                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { | 
 |                     if (enabledArray[((Context *)en->context)->currIndex]) { | 
 |                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); | 
 |                     } | 
 |                     ((Context *)en->context)->currIndex++; | 
 |                 } | 
 |             } else { | 
 |                 // default set | 
 |                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { | 
 |                     if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { | 
 |                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); | 
 |                     } | 
 |                     ((Context *)en->context)->currIndex++; | 
 |                 } | 
 |             } | 
 |         } | 
 |     } | 
 |  | 
 |     if(resultLength != NULL) { | 
 |         *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); | 
 |     } | 
 |  | 
 |     return currName; | 
 | } | 
 |  | 
 |  | 
 | static void U_CALLCONV | 
 | enumReset(UEnumeration *en, UErrorCode *) { | 
 |     ((Context *)en->context)->currIndex = 0; | 
 | } | 
 |  | 
 | static const UEnumeration gCSDetEnumeration = { | 
 |     NULL, | 
 |     NULL, | 
 |     enumClose, | 
 |     enumCount, | 
 |     uenum_unextDefault, | 
 |     enumNext, | 
 |     enumReset | 
 | }; | 
 |  | 
 | U_CDECL_END | 
 |  | 
 | U_NAMESPACE_BEGIN | 
 |  | 
 | UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) | 
 | { | 
 |  | 
 |     /* Initialize recognized charsets. */ | 
 |     setRecognizers(status); | 
 |  | 
 |     if(U_FAILURE(status)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     UEnumeration *en = NEW_ARRAY(UEnumeration, 1); | 
 |     if (en == NULL) { | 
 |         status = U_MEMORY_ALLOCATION_ERROR; | 
 |         return 0; | 
 |     } | 
 |     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); | 
 |     en->context = (void*)NEW_ARRAY(Context, 1); | 
 |     if (en->context == NULL) { | 
 |         status = U_MEMORY_ALLOCATION_ERROR; | 
 |         DELETE_ARRAY(en); | 
 |         return 0; | 
 |     } | 
 |     uprv_memset(en->context, 0, sizeof(Context)); | 
 |     ((Context*)en->context)->all = TRUE; | 
 |     return en; | 
 | } | 
 |  | 
 | UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const | 
 | { | 
 |     if(U_FAILURE(status)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     UEnumeration *en = NEW_ARRAY(UEnumeration, 1); | 
 |     if (en == NULL) { | 
 |         status = U_MEMORY_ALLOCATION_ERROR; | 
 |         return 0; | 
 |     } | 
 |     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); | 
 |     en->context = (void*)NEW_ARRAY(Context, 1); | 
 |     if (en->context == NULL) { | 
 |         status = U_MEMORY_ALLOCATION_ERROR; | 
 |         DELETE_ARRAY(en); | 
 |         return 0; | 
 |     } | 
 |     uprv_memset(en->context, 0, sizeof(Context)); | 
 |     ((Context*)en->context)->all = FALSE; | 
 |     ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; | 
 |     return en; | 
 | } | 
 |  | 
 | U_NAMESPACE_END | 
 |  | 
 | #endif |