|  | // © 2016 and later: Unicode, Inc. and others. | 
|  | // License & terms of use: http://www.unicode.org/copyright.html | 
|  | /* | 
|  | ************************************************************************************ | 
|  | * Copyright (C) 2006-2016, International Business Machines Corporation | 
|  | * and others. All Rights Reserved. | 
|  | ************************************************************************************ | 
|  | */ | 
|  |  | 
|  | #include "unicode/utypes.h" | 
|  |  | 
|  | #if !UCONFIG_NO_BREAK_ITERATION | 
|  |  | 
|  | #include "unicode/uchar.h" | 
|  | #include "unicode/uniset.h" | 
|  | #include "unicode/chariter.h" | 
|  | #include "unicode/ures.h" | 
|  | #include "unicode/udata.h" | 
|  | #include "unicode/putil.h" | 
|  | #include "unicode/ustring.h" | 
|  | #include "unicode/uscript.h" | 
|  | #include "unicode/ucharstrie.h" | 
|  | #include "unicode/bytestrie.h" | 
|  |  | 
|  | #include "brkeng.h" | 
|  | #include "cmemory.h" | 
|  | #include "dictbe.h" | 
|  | #include "lstmbe.h" | 
|  | #include "charstr.h" | 
|  | #include "dictionarydata.h" | 
|  | #include "mutex.h" | 
|  | #include "uvector.h" | 
|  | #include "umutex.h" | 
|  | #include "uresimp.h" | 
|  | #include "ubrkimpl.h" | 
|  |  | 
|  | U_NAMESPACE_BEGIN | 
|  |  | 
|  | /* | 
|  | ****************************************************************** | 
|  | */ | 
|  |  | 
|  | LanguageBreakEngine::LanguageBreakEngine() { | 
|  | } | 
|  |  | 
|  | LanguageBreakEngine::~LanguageBreakEngine() { | 
|  | } | 
|  |  | 
|  | /* | 
|  | ****************************************************************** | 
|  | */ | 
|  |  | 
|  | LanguageBreakFactory::LanguageBreakFactory() { | 
|  | } | 
|  |  | 
|  | LanguageBreakFactory::~LanguageBreakFactory() { | 
|  | } | 
|  |  | 
|  | /* | 
|  | ****************************************************************** | 
|  | */ | 
|  |  | 
|  | UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) { | 
|  | (void)status; | 
|  | } | 
|  |  | 
|  | UnhandledEngine::~UnhandledEngine() { | 
|  | delete fHandled; | 
|  | fHandled = nullptr; | 
|  | } | 
|  |  | 
|  | UBool | 
|  | UnhandledEngine::handles(UChar32 c) const { | 
|  | return fHandled && fHandled->contains(c); | 
|  | } | 
|  |  | 
|  | int32_t | 
|  | UnhandledEngine::findBreaks( UText *text, | 
|  | int32_t /* startPos */, | 
|  | int32_t endPos, | 
|  | UVector32 &/*foundBreaks*/, | 
|  | UBool /* isPhraseBreaking */, | 
|  | UErrorCode &status) const { | 
|  | if (U_FAILURE(status)) return 0; | 
|  | UChar32 c = utext_current32(text); | 
|  | while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) { | 
|  | utext_next32(text);            // TODO:  recast loop to work with post-increment operations. | 
|  | c = utext_current32(text); | 
|  | } | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | void | 
|  | UnhandledEngine::handleCharacter(UChar32 c) { | 
|  | if (fHandled == nullptr) { | 
|  | fHandled = new UnicodeSet(); | 
|  | if (fHandled == nullptr) { | 
|  | return; | 
|  | } | 
|  | } | 
|  | if (!fHandled->contains(c)) { | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | // Apply the entire script of the character. | 
|  | int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); | 
|  | fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | ****************************************************************** | 
|  | */ | 
|  |  | 
|  | ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { | 
|  | fEngines = 0; | 
|  | } | 
|  |  | 
|  | ICULanguageBreakFactory::~ICULanguageBreakFactory() { | 
|  | if (fEngines != 0) { | 
|  | delete fEngines; | 
|  | } | 
|  | } | 
|  |  | 
|  | U_NAMESPACE_END | 
|  | U_CDECL_BEGIN | 
|  | static void U_CALLCONV _deleteEngine(void *obj) { | 
|  | delete (const icu::LanguageBreakEngine *) obj; | 
|  | } | 
|  | U_CDECL_END | 
|  | U_NAMESPACE_BEGIN | 
|  |  | 
|  | const LanguageBreakEngine * | 
|  | ICULanguageBreakFactory::getEngineFor(UChar32 c) { | 
|  | const LanguageBreakEngine *lbe = nullptr; | 
|  | UErrorCode  status = U_ZERO_ERROR; | 
|  |  | 
|  | static UMutex gBreakEngineMutex; | 
|  | Mutex m(&gBreakEngineMutex); | 
|  |  | 
|  | if (fEngines == nullptr) { | 
|  | LocalPointer<UStack>  engines(new UStack(_deleteEngine, nullptr, status), status); | 
|  | if (U_FAILURE(status) ) { | 
|  | // Note: no way to return error code to caller. | 
|  | return nullptr; | 
|  | } | 
|  | fEngines = engines.orphan(); | 
|  | } else { | 
|  | int32_t i = fEngines->size(); | 
|  | while (--i >= 0) { | 
|  | lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); | 
|  | if (lbe != nullptr && lbe->handles(c)) { | 
|  | return lbe; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // We didn't find an engine. Create one. | 
|  | lbe = loadEngineFor(c); | 
|  | if (lbe != nullptr) { | 
|  | fEngines->push((void *)lbe, status); | 
|  | } | 
|  | return U_SUCCESS(status) ? lbe : nullptr; | 
|  | } | 
|  |  | 
|  | const LanguageBreakEngine * | 
|  | ICULanguageBreakFactory::loadEngineFor(UChar32 c) { | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | UScriptCode code = uscript_getScript(c, &status); | 
|  | if (U_SUCCESS(status)) { | 
|  | const LanguageBreakEngine *engine = nullptr; | 
|  | // Try to use LSTM first | 
|  | const LSTMData *data = CreateLSTMDataForScript(code, status); | 
|  | if (U_SUCCESS(status)) { | 
|  | if (data != nullptr) { | 
|  | engine = CreateLSTMBreakEngine(code, data, status); | 
|  | if (U_SUCCESS(status) && engine != nullptr) { | 
|  | return engine; | 
|  | } | 
|  | if (engine != nullptr) { | 
|  | delete engine; | 
|  | engine = nullptr; | 
|  | } else { | 
|  | DeleteLSTMData(data); | 
|  | } | 
|  | } | 
|  | } | 
|  | status = U_ZERO_ERROR;  // fallback to dictionary based | 
|  | DictionaryMatcher *m = loadDictionaryMatcherFor(code); | 
|  | if (m != nullptr) { | 
|  | switch(code) { | 
|  | case USCRIPT_THAI: | 
|  | engine = new ThaiBreakEngine(m, status); | 
|  | break; | 
|  | case USCRIPT_LAO: | 
|  | engine = new LaoBreakEngine(m, status); | 
|  | break; | 
|  | case USCRIPT_MYANMAR: | 
|  | engine = new BurmeseBreakEngine(m, status); | 
|  | break; | 
|  | case USCRIPT_KHMER: | 
|  | engine = new KhmerBreakEngine(m, status); | 
|  | break; | 
|  |  | 
|  | #if !UCONFIG_NO_NORMALIZATION | 
|  | // CJK not available w/o normalization | 
|  | case USCRIPT_HANGUL: | 
|  | engine = new CjkBreakEngine(m, kKorean, status); | 
|  | break; | 
|  |  | 
|  | // use same BreakEngine and dictionary for both Chinese and Japanese | 
|  | case USCRIPT_HIRAGANA: | 
|  | case USCRIPT_KATAKANA: | 
|  | case USCRIPT_HAN: | 
|  | engine = new CjkBreakEngine(m, kChineseJapanese, status); | 
|  | break; | 
|  | #if 0 | 
|  | // TODO: Have to get some characters with script=common handled | 
|  | // by CjkBreakEngine (e.g. U+309B). Simply subjecting | 
|  | // them to CjkBreakEngine does not work. The engine has to | 
|  | // special-case them. | 
|  | case USCRIPT_COMMON: | 
|  | { | 
|  | UBlockCode block = ublock_getCode(code); | 
|  | if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) | 
|  | engine = new CjkBreakEngine(dict, kChineseJapanese, status); | 
|  | break; | 
|  | } | 
|  | #endif | 
|  | #endif | 
|  |  | 
|  | default: | 
|  | break; | 
|  | } | 
|  | if (engine == nullptr) { | 
|  | delete m; | 
|  | } | 
|  | else if (U_FAILURE(status)) { | 
|  | delete engine; | 
|  | engine = nullptr; | 
|  | } | 
|  | return engine; | 
|  | } | 
|  | } | 
|  | return nullptr; | 
|  | } | 
|  |  | 
|  | DictionaryMatcher * | 
|  | ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | // open root from brkitr tree. | 
|  | UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); | 
|  | b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); | 
|  | int32_t dictnlength = 0; | 
|  | const UChar *dictfname = | 
|  | ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); | 
|  | if (U_FAILURE(status)) { | 
|  | ures_close(b); | 
|  | return nullptr; | 
|  | } | 
|  | CharString dictnbuf; | 
|  | CharString ext; | 
|  | const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot | 
|  | if (extStart != nullptr) { | 
|  | int32_t len = (int32_t)(extStart - dictfname); | 
|  | ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status); | 
|  | dictnlength = len; | 
|  | } | 
|  | dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status); | 
|  | ures_close(b); | 
|  |  | 
|  | UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); | 
|  | if (U_SUCCESS(status)) { | 
|  | // build trie | 
|  | const uint8_t *data = (const uint8_t *)udata_getMemory(file); | 
|  | const int32_t *indexes = (const int32_t *)data; | 
|  | const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; | 
|  | const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; | 
|  | DictionaryMatcher *m = nullptr; | 
|  | if (trieType == DictionaryData::TRIE_TYPE_BYTES) { | 
|  | const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; | 
|  | const char *characters = (const char *)(data + offset); | 
|  | m = new BytesDictionaryMatcher(characters, transform, file); | 
|  | } | 
|  | else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { | 
|  | const UChar *characters = (const UChar *)(data + offset); | 
|  | m = new UCharsDictionaryMatcher(characters, file); | 
|  | } | 
|  | if (m == nullptr) { | 
|  | // no matcher exists to take ownership - either we are an invalid | 
|  | // type or memory allocation failed | 
|  | udata_close(file); | 
|  | } | 
|  | return m; | 
|  | } else if (dictfname != nullptr) { | 
|  | // we don't have a dictionary matcher. | 
|  | // returning nullptr here will cause us to fail to find a dictionary break engine, as expected | 
|  | status = U_ZERO_ERROR; | 
|  | return nullptr; | 
|  | } | 
|  | return nullptr; | 
|  | } | 
|  |  | 
|  | U_NAMESPACE_END | 
|  |  | 
|  | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |