ICU-22100 Incorporate BudouX into ICU (C++)
diff --git a/.github/adaboost.json b/.github/adaboost.json
new file mode 100644
index 0000000..639fd6a
--- /dev/null
+++ b/.github/adaboost.json
@@ -0,0 +1,14 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+//
+// Include Japanese adaboost model.
+{
+ "featureFilters": {
+ "brkitr_adaboost": {
+ "includelist": [
+ "jaml"
+ ]
+ }
+ }
+}
diff --git a/.github/workflows/icu_ci.yml b/.github/workflows/icu_ci.yml
index 90bce1e..1293e5e 100644
--- a/.github/workflows/icu_ci.yml
+++ b/.github/workflows/icu_ci.yml
@@ -334,6 +334,17 @@
make clean;
make -j2 check
+ # Build and test with the BudouX AdaBoost ML phrase-breaking engine enabled
+ adaboost-test:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - run: |
+ cd icu4c/source;
+ ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
+ make clean;
+ make -j2 check
+
# Build and run testmap
testmap:
runs-on: ubuntu-latest
diff --git a/icu4c/source/common/BUILD.bazel b/icu4c/source/common/BUILD.bazel
index e385d3b..47d3d24 100644
--- a/icu4c/source/common/BUILD.bazel
+++ b/icu4c/source/common/BUILD.bazel
@@ -342,6 +342,7 @@
"dictionarydata.cpp",
"filteredbrk.cpp",
"lstmbe.cpp",
+ "mlbe.cpp",
"rbbi.cpp",
"rbbi_cache.cpp",
"rbbidata.cpp",
diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj
index e35e1b0..2b4cc05 100644
--- a/icu4c/source/common/common.vcxproj
+++ b/icu4c/source/common/common.vcxproj
@@ -88,6 +88,7 @@
<ClCompile Include="brkiter.cpp" />
<ClCompile Include="dictbe.cpp" />
<ClCompile Include="lstmbe.cpp" />
+ <ClCompile Include="mlbe.cpp" />
<ClCompile Include="pluralmap.cpp" />
<ClCompile Include="rbbi.cpp" />
<ClCompile Include="rbbidata.cpp" />
@@ -282,6 +283,7 @@
<ClInclude Include="brkeng.h" />
<ClInclude Include="dictbe.h" />
<ClInclude Include="lstmbe.h" />
+ <ClInclude Include="mlbe.h" />
<ClInclude Include="rbbidata.h" />
<ClInclude Include="rbbinode.h" />
<ClInclude Include="rbbirb.h" />
diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters
index 38bc0c1..28a5d90 100644
--- a/icu4c/source/common/common.vcxproj.filters
+++ b/icu4c/source/common/common.vcxproj.filters
@@ -76,6 +76,9 @@
<ClCompile Include="lstmbe.cpp">
<Filter>break iteration</Filter>
</ClCompile>
+ <ClCompile Include="mlbe.cpp">
+ <Filter>break iteration</Filter>
+ </ClCompile>
<ClCompile Include="rbbi.cpp">
<Filter>break iteration</Filter>
</ClCompile>
@@ -660,6 +663,9 @@
<ClInclude Include="lstmbe.h">
<Filter>break iteration</Filter>
</ClInclude>
+ <ClInclude Include="mlbe.h">
+ <Filter>break iteration</Filter>
+ </ClInclude>
<ClInclude Include="rbbidata.h">
<Filter>break iteration</Filter>
</ClInclude>
diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj
index fc16562..5df0d57 100644
--- a/icu4c/source/common/common_uwp.vcxproj
+++ b/icu4c/source/common/common_uwp.vcxproj
@@ -222,6 +222,7 @@
<ClCompile Include="brkiter.cpp" />
<ClCompile Include="dictbe.cpp" />
<ClCompile Include="lstmbe.cpp" />
+ <ClCompile Include="mlbe.cpp" />
<ClCompile Include="pluralmap.cpp" />
<ClCompile Include="rbbi.cpp" />
<ClCompile Include="rbbidata.cpp" />
@@ -417,6 +418,7 @@
<ClInclude Include="brkeng.h" />
<ClInclude Include="dictbe.h" />
<ClInclude Include="lstmbe.h" />
+ <ClInclude Include="mlbe.h" />
<ClInclude Include="rbbidata.h" />
<ClInclude Include="rbbinode.h" />
<ClInclude Include="rbbirb.h" />
diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp
index 9b5434d..0e420c6 100644
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -1054,9 +1054,10 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
*/
static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
-: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
+: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) {
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
+ fMlBreakEngine = nullptr;
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
// Korean dictionary only includes Hangul syllables
fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
@@ -1073,11 +1074,20 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
if (U_SUCCESS(status)) {
setCharacters(fHangulWordSet);
}
- } else { //Chinese and Japanese
+ } else { // Chinese and Japanese
UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
+ isCj = true;
if (U_SUCCESS(status)) {
setCharacters(cjSet);
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+ fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
+ fClosePunctuationSet, status);
+ if (fMlBreakEngine == nullptr) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ }
+#else
initJapanesePhraseParameter(status);
+#endif
}
}
UTRACE_EXIT_STATUS(status);
@@ -1085,6 +1095,7 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
CjkBreakEngine::~CjkBreakEngine(){
delete fDictionary;
+ delete fMlBreakEngine;
}
// The katakanaCost values below are based on the length frequencies of all
@@ -1251,7 +1262,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
}
}
}
-
+
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+ // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
+ if (isPhraseBreaking && isCj) {
+ return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,
+ inputMap, status);
+ }
+#endif
+
// bestSnlp[i] is the snlp of the best segmentation of the first i
// code points in the range to be matched.
UVector32 bestSnlp(numCodePts + 1, status);
diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h
index ca1a3c2..a2c761b 100644
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@@ -16,11 +16,13 @@
#include "brkeng.h"
#include "hash.h"
+#include "mlbe.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
class DictionaryMatcher;
+class MlBreakEngine;
class Normalizer2;
/*******************************************************************
@@ -374,6 +376,8 @@ class CjkBreakEngine : public DictionaryBreakEngine {
DictionaryMatcher *fDictionary;
const Normalizer2 *nfkcNorm2;
+ MlBreakEngine *fMlBreakEngine;
+ bool isCj;
private:
// Load Japanese extensions.
diff --git a/icu4c/source/common/mlbe.cpp b/icu4c/source/common/mlbe.cpp
new file mode 100644
index 0000000..3ccf470
--- /dev/null
+++ b/icu4c/source/common/mlbe.cpp
@@ -0,0 +1,452 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "cmemory.h"
+#include "mlbe.h"
+#include "uassert.h"
+#include "ubrkimpl.h"
+#include "unicode/resbund.h"
+#include "unicode/udata.h"
+#include "unicode/utf16.h"
+#include "uresimp.h"
+#include "util.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+Element::Element() : length(0) {}
+
+void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
+    character = ch;
+    U_ASSERT(idx.length() <= 3);  // ublock holds 3 code units plus the terminating NUL
+    length = static_cast<uint16_t>(idx.length() <= 3 ? idx.length() : 3);  // never overrun ublock
+    idx.extract(0, length, ublock);
+    ublock[length] = '\0';
+}
+
+UChar32 Element::getCharacter() const {
+    return character;  // the code point last stored by setCharAndUblock()
+}
+
+char16_t* Element::getUblock() const {
+    return const_cast<char16_t*>(ublock);  // internal buffer; the signature hands it out non-const
+}
+
+uint16_t Element::getLength() const {
+    return length;  // number of code units of the block string held in ublock
+}
+
+MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+                             const UnicodeSet &closePunctuationSet, UErrorCode &status)
+    : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
+      fClosePunctuationSet(closePunctuationSet),
+      fModel(status),
+      fNegativeSum(0) {
+    // Load the model only if the members above (notably fModel) were built without error.
+    if (U_SUCCESS(status)) {
+        loadMLModel(status);
+    }
+}
+
+MlBreakEngine::~MlBreakEngine() {}
+
+namespace {
+    const char16_t INVALID = u'|';          // stands in for a window slot that holds no character
+    const int32_t MAX_FEATURE = 26;         // most feature keys built for one candidate position
+    const int32_t MAX_FEATURE_LENGTH = 14;  // code units per feature buffer, NUL included
+    // True unless the element's block string is exactly the one-unit INVALID placeholder.
+    bool isValid(const Element& element) {
+        return !(element.getLength() == 1 && element.getUblock()[0] == INVALID);
+    }
+    // Writes `str` followed by the first `length` code points of `arr` into `feature`.
+    void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
+        if (U_SUCCESS(status)) {
+            UnicodeString text(str);
+            for (int32_t next = 0; next < length; ++next) {
+                text.append(arr[next]);
+            }
+            U_ASSERT(text.length() < MAX_FEATURE_LENGTH);
+            // extract() NUL-terminates because the text is shorter than the buffer.
+            text.extract(feature, MAX_FEATURE_LENGTH, status);
+        }
+    }
+    // Copies `str` into `feature`.
+    void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
+        U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
+        str.extract(feature, MAX_FEATURE_LENGTH, status);  // NUL-terminates
+    }
+}
+
+int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+                                     UVector32 &foundBreaks, const UnicodeString &inString,
+                                     const LocalPointer<UVector32> &inputMap,
+                                     UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    if (rangeStart >= rangeEnd) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    // Count the code points once; countChar32() walks the whole string on every call.
+    const int32_t length = inString.countChar32();
+    UVector32 boundary(length + 1, status);
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    int32_t numBreaks = 0;
+    UChar32 ch;
+    UnicodeString index;
+    // The ML model groups six char to evaluate if the 4th char is a breakpoint.
+    // Like a sliding window, the elementList removes the first char and appends the new char from
+    // inString in each iteration so that its size always remains at six.
+    Element elementList[6];
+    // codeUts is the code unit offset in inString of the next character to slide in.
+    int32_t codeUts = initElementList(inString, elementList, status);
+
+    // Add a break for the start.
+    boundary.addElement(0, status);
+    numBreaks++;
+    if (U_FAILURE(status)) return 0;
+
+    for (int32_t i = 1; i < length && U_SUCCESS(status); i++) {
+        evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
+        if (i + 1 >= length) break;
+        // Remove the first element and append a new element
+        uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
+        ch = codeUts < inString.length() ? inString.char32At(codeUts) : INVALID;
+        index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
+        elementList[5].setCharAndUblock(ch, index);
+        if (ch != INVALID) {
+            codeUts += U16_LENGTH(ch);
+        }
+    }
+    if (U_FAILURE(status)) return 0;
+
+    // Add a break for the end if there is not one there already.
+    if (boundary.lastElementi() != length) {
+        boundary.addElement(length, status);
+        numBreaks++;
+    }
+
+    int32_t prevCPPos = -1;
+    int32_t prevUTextPos = -1;
+    int32_t correctedNumBreaks = 0;
+    for (int32_t i = 0; i < numBreaks; i++) {
+        int32_t cpPos = boundary.elementAti(i);
+        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
+        U_ASSERT(cpPos > prevCPPos);
+        U_ASSERT(utextPos >= prevUTextPos);
+
+        if (utextPos > prevUTextPos) {
+            if (utextPos != rangeStart ||
+                (utextPos > 0 &&
+                 fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
+                foundBreaks.push(utextPos, status);
+                correctedNumBreaks++;
+            }
+        } else {
+            // Normalization expanded the input text, the model found a boundary
+            // within the expansion, giving two boundaries with the same index in the
+            // original text. Ignore the second. See ticket #12918.
+            // numBreaks bounds this loop, so it must not shrink here or the tail is skipped.
+        }
+        prevCPPos = cpPos;
+        prevUTextPos = utextPos;
+    }
+    (void)prevCPPos; // suppress compiler warnings about unused variable
+
+    UChar32 nextChar = utext_char32At(inText, rangeEnd);
+    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+        // In phrase breaking, there has to be a breakpoint between Cj character and
+        // the number/open punctuation.
+        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
+        // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
+        if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
+            foundBreaks.popi();
+            correctedNumBreaks--;
+        }
+    }
+
+    return correctedNumBreaks;
+}
+
+void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+                                       UVector32 &boundary, UErrorCode &status) const {
+    char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];  // one NUL-terminated key per row
+    if (U_FAILURE(status)) {
+        return;
+    }
+    // Build every feature key the six-slot window supports, then sum the model scores of the keys.
+    UChar32 arr[4] = {-1, -1, -1, -1};
+    int32_t length = 0, listLength = 0;
+
+    const UChar32 w1 = elementList[0].getCharacter();
+    const UChar32 w2 = elementList[1].getCharacter();
+    const UChar32 w3 = elementList[2].getCharacter();
+    const UChar32 w4 = elementList[3].getCharacter();
+    const UChar32 w5 = elementList[4].getCharacter();
+    const UChar32 w6 = elementList[5].getCharacter();
+
+    length = 1;  // UWn: the single character in slot n
+    if (w1 != INVALID) {
+        arr[0] = w1;
+        concatChar(u"UW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w2 != INVALID) {
+        arr[0] = w2;
+        concatChar(u"UW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID) {
+        arr[0] = w3;
+        concatChar(u"UW3:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID) {
+        arr[0] = w4;
+        concatChar(u"UW4:", arr, length, featureList[listLength++], status);
+    }
+    if (w5 != INVALID) {
+        arr[0] = w5;
+        concatChar(u"UW5:", arr, length, featureList[listLength++], status);
+    }
+    if (w6 != INVALID) {
+        arr[0] = w6;
+        concatChar(u"UW6:", arr, length, featureList[listLength++], status);
+    }
+    length = 2;  // BWn: two adjacent characters
+    if (w2 != INVALID && w3 != INVALID) {
+        arr[0] = w2;
+        arr[1] = w3;
+        concatChar(u"BW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID && w4 != INVALID) {
+        arr[0] = w3;
+        arr[1] = w4;
+        concatChar(u"BW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID && w5 != INVALID) {
+        arr[0] = w4;
+        arr[1] = w5;
+        concatChar(u"BW3:", arr, length, featureList[listLength++], status);
+    }
+    length = 3;  // TWn: three adjacent characters
+    if (w1 != INVALID && w2 != INVALID && w3 != INVALID) {
+        arr[0] = w1;
+        arr[1] = w2;
+        arr[2] = w3;
+        concatChar(u"TW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w2 != INVALID && w3 != INVALID && w4 != INVALID) {
+        arr[0] = w2;
+        arr[1] = w3;
+        arr[2] = w4;
+        concatChar(u"TW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID && w4 != INVALID && w5 != INVALID) {
+        arr[0] = w3;
+        arr[1] = w4;
+        arr[2] = w5;
+        concatChar(u"TW3:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID && w5 != INVALID && w6 != INVALID) {
+        arr[0] = w4;
+        arr[1] = w5;
+        arr[2] = w6;
+        concatChar(u"TW4:", arr, length, featureList[listLength++], status);
+    }
+    if (isValid(elementList[0])) {  // UBn: the unicode block string of slot n
+        writeString(UnicodeString(u"UB1:").append(elementList[0].getUblock(), 0,
+                                                  elementList[0].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1])) {
+        writeString(UnicodeString(u"UB2:").append(elementList[1].getUblock(), 0,
+                                                  elementList[1].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2])) {
+        writeString(UnicodeString(u"UB3:").append(elementList[2].getUblock(), 0,
+                                                  elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3])) {
+        writeString(UnicodeString(u"UB4:").append(elementList[3].getUblock(), 0,
+                                                  elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[4])) {
+        writeString(UnicodeString(u"UB5:").append(elementList[4].getUblock(), 0,
+                                                  elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[5])) {
+        writeString(UnicodeString(u"UB6:").append(elementList[5].getUblock(), 0,
+                                                  elementList[5].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1]) && isValid(elementList[2])) {  // BBn: blocks of two adjacent slots
+        writeString(UnicodeString(u"BB1:")
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2]) && isValid(elementList[3])) {
+        writeString(UnicodeString(u"BB2:")
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3]) && isValid(elementList[4])) {
+        writeString(UnicodeString(u"BB3:")
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {  // TBn
+        writeString(UnicodeString(u"TB1:")
+                        .append(elementList[0].getUblock(), 0, elementList[0].getLength())
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
+        writeString(UnicodeString(u"TB2:")
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
+        writeString(UnicodeString(u"TB3:")
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
+        writeString(UnicodeString(u"TB4:")
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength())
+                        .append(elementList[5].getUblock(), 0, elementList[5].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
+    int32_t score = fNegativeSum;  // start from minus the sum of all model scores
+    for (int32_t j = 0; j < listLength; j++) {
+        UnicodeString key(featureList[j]);
+        if (fModel.containsKey(key)) {
+            score += (2 * fModel.geti(key));  // each key present in the model adds twice its score
+        }
+    }
+    if (score > 0) {  // positive total: record `index` as a break and count it
+        boundary.addElement(index, status);
+        numBreaks++;
+    }
+}
+
+int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
+                                       UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    int32_t index = 0;
+    int32_t length = inString.countChar32();
+    UChar32 w1, w2, w3, w4, w5, w6;
+    w1 = w2 = w3 = w4 = w5 = w6 = INVALID;  // every slot starts out as "no character"
+    if (length > 0) {
+        w3 = inString.char32At(0);
+        index += U16_LENGTH(w3);
+    }
+    if (length > 1) {
+        w4 = inString.char32At(index);
+        index += U16_LENGTH(w4);
+    }
+    if (length > 2) {
+        w5 = inString.char32At(index);
+        index += U16_LENGTH(w5);
+    }
+    if (length > 3) {
+        w6 = inString.char32At(index);
+        index += U16_LENGTH(w6);
+    }
+    // Absent slots get the INVALID block (not the block of '|'), matching divideUpRange().
+    const UnicodeString b1(INVALID);
+    const UnicodeString b2(b1);
+    const UnicodeString b3(w3 != INVALID ? getUnicodeBlock(w3, status) : UnicodeString(INVALID));
+    const UnicodeString b4(w4 != INVALID ? getUnicodeBlock(w4, status) : UnicodeString(INVALID));
+    const UnicodeString b5(w5 != INVALID ? getUnicodeBlock(w5, status) : UnicodeString(INVALID));
+    const UnicodeString b6(w6 != INVALID ? getUnicodeBlock(w6, status) : UnicodeString(INVALID));
+    // Slots 0-1 precede the text; slots 2-5 hold its first four code points, when present.
+    elementList[0].setCharAndUblock(w1, b1);
+    elementList[1].setCharAndUblock(w2, b2);
+    elementList[2].setCharAndUblock(w3, b3);
+    elementList[3].setCharAndUblock(w4, b4);
+    elementList[4].setCharAndUblock(w5, b5);
+    elementList[5].setCharAndUblock(w6, b6);
+    // Code unit offset in inString just past the characters consumed above.
+    return index;
+}
+
+UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
+    // Start from the one-unit INVALID string: the result on failure or when ch has no block.
+    UnicodeString blockText(INVALID);
+    if (U_FAILURE(status)) {
+        return blockText;
+    }
+    const UBlockCode code = ublock_getCode(ch);
+    if (code != UBLOCK_NO_BLOCK && code != UBLOCK_INVALID_CODE) {
+        blockText.remove();
+        // Same as sprintf("%03d", code)
+        ICU_Utility::appendNumber(blockText, static_cast<int32_t>(code), 10, 3);
+    }
+    return blockText;
+}
+
+void MlBreakEngine::loadMLModel(UErrorCode &error) {
+    // BudouX's model consists of pairs of a feature and its score. In jaml.txt the features are
+    // the strings of "modelKeys" and the scores, index for index, the ints of "modelValues".
+    // Fills fModel with the pairs and leaves minus the sum of all scores in fNegativeSum.
+
+    if (U_FAILURE(error)) return;
+
+    int32_t keySize = 0;
+    int32_t valueSize = 0;
+    int32_t stringLength = 0;
+    UnicodeString key;
+    StackUResourceBundle stackTempBundle;
+    ResourceDataValue modelKey;
+    // rbp stays the owner so the bundle is closed on every return path; rb only borrows it.
+    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
+    UResourceBundle* rb = rbp.getAlias();
+    // get modelValues
+    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error));
+    const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
+    if (U_FAILURE(error)) return;
+
+    // get modelKeys
+    ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error);
+    ResourceArray stringArray = modelKey.getArray(error);
+    keySize = stringArray.getSize();
+    if (U_FAILURE(error)) return;
+    if (keySize > valueSize) { error = U_INVALID_FORMAT_ERROR; return; }  // a key without a score
+    for (int32_t idx = 0; idx < keySize; idx++) {
+        stringArray.getValue(idx, modelKey);
+        key = UnicodeString(modelKey.getString(stringLength, error));
+        if (U_SUCCESS(error)) {
+            U_ASSERT(idx < valueSize);
+            fNegativeSum -= value[idx];
+            fModel.puti(key, value[idx], error);
+        }
+    }
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/mlbe.h b/icu4c/source/common/mlbe.h
new file mode 100644
index 0000000..8943fa3
--- /dev/null
+++ b/icu4c/source/common/mlbe.h
@@ -0,0 +1,152 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef MLBREAKENGINE_H
+#define MLBREAKENGINE_H
+
+#include "hash.h"
+#include "unicode/uniset.h"
+#include "unicode/utext.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/**
+ * A class used to encapsulate a character and its unicode block index
+ */
+class Element : public UMemory {
+   public:
+    /**
+     * Default constructor; the stored unicode block string starts out empty (length 0).
+     */
+    Element();
+
+    /**
+     * Set the character and its unicode block.
+     *
+     * @param ch A unicode character.
+     * @param ublock The unicode block of the character, as a string of at most 3 code units.
+     */
+    void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
+
+    /**
+     * Get the unicode character.
+     *
+     * @return The unicode character last passed to setCharAndUblock().
+     */
+    UChar32 getCharacter() const;
+
+    /**
+     * Get the unicode character's unicode block.
+     *
+     * @return Pointer to the internal NUL-terminated buffer holding getLength() code units.
+     */
+    char16_t* getUblock() const;
+
+    /**
+     * Get the length of the unicode block.
+     *
+     * @return The number of code units in the unicode block string.
+     */
+    uint16_t getLength() const;
+
+   private:
+    UChar32 character;   // the code point stored by setCharAndUblock()
+    char16_t ublock[4];  // block string: up to 3 code units plus the terminating NUL
+    uint16_t length;     // number of code units in ublock, NUL excluded
+};
+
+/**
+ * A machine learning break engine for the phrase breaking in Japanese.
+ */
+class MlBreakEngine : public UMemory {
+   public:
+    /**
+     * Constructor. Copies both sets and loads the ML model.
+     *
+     * @param digitOrOpenPunctuationOrAlphabetSet A UnicodeSet with the digit, open punctuation and
+     * alphabet.
+     * @param closePunctuationSet A UnicodeSet with close punctuation.
+     * @param status Information on any errors encountered.
+     */
+    MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+                  const UnicodeSet &closePunctuationSet, UErrorCode &status);
+
+    /**
+     * Virtual destructor.
+     */
+    virtual ~MlBreakEngine();
+
+   public:
+    /**
+     * Divide up a range of characters handled by this break engine.
+     *
+     * @param inText A UText representing the text
+     * @param rangeStart The start of the range of the characters
+     * @param rangeEnd The end of the range of the characters; must be greater than rangeStart
+     * @param foundBreaks Output vector onto which the break positions found are pushed
+     * @param inString The normalized string of text ranging from rangeStart to rangeEnd
+     * @param inputMap The vector storing the native index of inText; may hold no vector
+     * @param status Information on any errors encountered.
+     * @return The number of break positions this call left on foundBreaks
+     */
+    int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+                          UVector32 &foundBreaks, const UnicodeString &inString,
+                          const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
+
+   private:
+    /**
+     * Load the machine learning's model file into fModel and compute fNegativeSum.
+     *
+     * @param error Information on any errors encountered.
+     */
+    void loadMLModel(UErrorCode &error);
+
+    /**
+     * Get the character's unicode block code defined in UBlockCode.
+     *
+     * @param ch A character.
+     * @param status Information on any errors encountered.
+     * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
+     * is less than 3 digits.
+     * A one-character placeholder string is returned instead when ch has no block.
+     */
+    UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
+
+    /**
+     * Initialize the element list from the input string.
+     *
+     * @param inString A input string to be segmented.
+     * @param elementList A list of six elements: two placeholders, then up to four characters.
+     * @param status Information on any errors encountered.
+     * @return The number of code units of the characters read from inString (at most four).
+     */
+    int32_t initElementList(const UnicodeString &inString, Element* elementList,
+                            UErrorCode &status) const;
+
+    /**
+     * Evaluate whether the index is a potential breakpoint.
+     *
+     * @param elementList A list including 6 elements for the breakpoint evaluation.
+     * @param index The breakpoint index to be evaluated.
+     * @param numBreaks The accumulated number of breakpoints; incremented when a break is added.
+     * @param boundary A vector including the index of the breakpoint.
+     * @param status Information on any errors encountered.
+     */
+    void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+                            UVector32 &boundary, UErrorCode &status) const;
+
+    UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
+    UnicodeSet fClosePunctuationSet;
+    Hashtable fModel;      // feature string -> score, filled by loadMLModel()
+    int32_t fNegativeSum;  // minus the sum of all scores in fModel
+};
+
+#endif
+
+U_NAMESPACE_END
+
+/* MLBREAKENGINE_H */
+#endif
diff --git a/icu4c/source/common/sources.txt b/icu4c/source/common/sources.txt
index e5c39dd..90171fe 100644
--- a/icu4c/source/common/sources.txt
+++ b/icu4c/source/common/sources.txt
@@ -43,6 +43,7 @@
lsr.cpp
lstmbe.cpp
messagepattern.cpp
+mlbe.cpp
normalizer2.cpp
normalizer2impl.cpp
normlzr.cpp
diff --git a/icu4c/source/common/unicode/uconfig.h b/icu4c/source/common/unicode/uconfig.h
index bbc232d..3818ca0 100644
--- a/icu4c/source/common/unicode/uconfig.h
+++ b/icu4c/source/common/unicode/uconfig.h
@@ -323,6 +323,16 @@
# define UCONFIG_NO_NORMALIZATION 0
#endif
+/**
+ * \def UCONFIG_USE_ML_PHRASE_BREAKING
+ * This switch turns on BudouX ML phrase-based line breaking, rather than using the dictionary.
+ *
+ * @internal
+ */
+#ifndef UCONFIG_USE_ML_PHRASE_BREAKING
+# define UCONFIG_USE_ML_PHRASE_BREAKING 0
+#endif
+
#if UCONFIG_NO_NORMALIZATION
/* common library */
/* ICU 50 CJK dictionary BreakIterator uses normalization */
diff --git a/icu4c/source/data/BUILDRULES.py b/icu4c/source/data/BUILDRULES.py
index 899cba2..2608cb0 100644
--- a/icu4c/source/data/BUILDRULES.py
+++ b/icu4c/source/data/BUILDRULES.py
@@ -27,6 +27,7 @@ def generate(config, io, common_vars):
requests += generate_conversion_mappings(config, io, common_vars)
requests += generate_brkitr_brk(config, io, common_vars)
requests += generate_brkitr_lstm(config, io, common_vars)
+ requests += generate_brkitr_adaboost(config, io, common_vars)
requests += generate_stringprep(config, io, common_vars)
requests += generate_brkitr_dictionaries(config, io, common_vars)
requests += generate_normalization(config, io, common_vars)
@@ -184,7 +185,7 @@ def generate_brkitr_brk(config, io, common_vars):
category = "brkitr_rules",
dep_targets =
[DepTarget("cnvalias"),
- DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")],
+ DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res"), DepTarget("adaboost_res")],
input_files = input_files,
output_files = output_files,
tool = IcuTool("genbrk"),
@@ -506,6 +507,32 @@ def generate_brkitr_lstm(config, io, common_vars):
)
]
+def generate_brkitr_adaboost(config, io, common_vars):
+ input_files = [InFile(filename) for filename in io.glob("brkitr/adaboost/*.txt")]
+ input_basenames = [v.filename[16:] for v in input_files]
+ output_files = [
+ OutFile("brkitr/%s.res" % v[:-4])
+ for v in input_basenames
+ ]
+ return [
+ RepeatedOrSingleExecutionRequest(
+ name = "adaboost_res",
+ category = "brkitr_adaboost",
+ dep_targets = [],
+ input_files = input_files,
+ output_files = output_files,
+ tool = IcuTool("genrb"),
+ args = "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} "
+ "-k "
+ "{INPUT_BASENAME}",
+ format_with = {
+ },
+ repeat_with = {
+ "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
+ }
+ )
+ ]
+
def generate_tree(
config,
io,
diff --git a/icu4c/source/data/brkitr/adaboost/jaml.txt b/icu4c/source/data/brkitr/adaboost/jaml.txt
new file mode 100644
index 0000000..0500ff7
--- /dev/null
+++ b/icu4c/source/data/brkitr/adaboost/jaml.txt
@@ -0,0 +1,940 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+jaml {
+ modelKeys {
+ "BB2:062071",
+ "UB3:061",
+ "UB3:071",
+ "TB2:062062062",
+ "TB4:062062062",
+ "UB3:063",
+ "UB4:071",
+ "BB3:062062",
+ "UB4:062",
+ "BB1:062071",
+ "BB1:062061",
+ "UB4:061",
+ "TB1:071071062",
+ "TB3:062063063",
+ "UB2:061",
+ "TB1:062071062",
+ "TB3:062062062",
+ "BB2:063063",
+ "UW3:は",
+ "UW3:に",
+ "TB3:062071062",
+ "UW3:が",
+ "UW4:こ",
+ "UB5:061",
+ "UW3:と",
+ "TB4:063063063",
+ "UW4:て",
+ "TB2:062062061",
+ "UW3:。",
+ "UW4:お",
+ "UW3:の",
+ "BB3:071071",
+ "BB3:062071",
+ "UW3:お",
+ "UW3:し",
+ "UW4:、",
+ "UW4:の",
+ "UW3:を",
+ "UW4:。",
+ "UW3:、",
+ "UW5:で",
+ "UW4:あ",
+ "BB2:062062",
+ "UW4:っ",
+ "UW5:っ",
+ "UW3:も",
+ "UW5:う",
+ "UW3:「",
+ "UW5:な",
+ "UW4:そ",
+ "UW4:る",
+ "UW3:っ",
+ "UW4:「",
+ "UW4:い",
+ "BB2:087087",
+ "UB4:087",
+ "UW5:に",
+ "BW3:もの",
+ "UW5:し",
+ "UW6:う",
+ "BW2:とい",
+ "UW4:に",
+ "UW3:る",
+ "TB2:071062071",
+ "UW4:で",
+ "UW5:が",
+ "BB1:071071",
+ "UW5:は",
+ "UW4:は",
+ "UW4:れ",
+ "UW5:き",
+ "BB2:071062",
+ "BB2:071071",
+ "UW3:・",
+ "BB2:071087",
+ "BB2:061062",
+ "TB1:062061062",
+ "UW3:れ",
+ "BB2:087062",
+ "TB2:087087087",
+ "UW4:ら",
+ "TB1:071071071",
+ "UB2:071",
+ "TB1:062062087",
+ "UW5:す",
+ "UW5:ん",
+ "UW3:で",
+ "UW4:が",
+ "UW3:こ",
+ "TB4:071062062",
+ "UW3:ら",
+ "UW6:に",
+ "UW6:。",
+ "UW3:た",
+ "TB1:061071071",
+ "UW5:く",
+ "UB1:063",
+ "UW1:そ",
+ "UW3:う",
+ "BW3:とい",
+ "BW3:とこ",
+ "UW3:ま",
+ "BW3:こと",
+ "UW2:っ",
+ "UW5:・",
+ "TB3:062062061",
+ "UW3:き",
+ "UW4:ん",
+ "UB3:062",
+ "UW3:く",
+ "UW3:」",
+ "UW5:あ",
+ "BB2:062087",
+ "BW3:いう",
+ "UW5:れ",
+ "UW2:一",
+ "UW3:,",
+ "UW1:に",
+ "UW2:と",
+ "TB2:071071062",
+ "TB2:071071071",
+ "UW5:を",
+ "UW4:り",
+ "BW1:から",
+ "UW3:ち",
+ "BW3:いい",
+ "UW2:は",
+ "UW6:た",
+ "TB1:063063062",
+ "UW4:1",
+ "UW4:や",
+ "UW2:ん",
+ "UW3:]",
+ "UW4:ほ",
+ "TB3:062087087",
+ "BW2:であ",
+ "UW4:だ",
+ "BB3:071062",
+ "TB1:087087087",
+ "BW3:・・",
+ "BW3:とき",
+ "UW4:を",
+ "UW3:て",
+ "UW4:か",
+ "UW2:そ",
+ "TB4:071071062",
+ "TB2:062061071",
+ "UW2:を",
+ "UW4:ご",
+ "UW2:で",
+ "TB3:071071071",
+ "BB1:087087",
+ "UW2:し",
+ "UW4:出",
+ "UW2:ま",
+ "UW4:,",
+ "UW5:と",
+ "UW4:ど",
+ "BW3:して",
+ "UW1:で",
+ "BB2:061071",
+ "BW3:ため",
+ "BW2:とし",
+ "BW2:ない",
+ "BW2:てい",
+ "UW3:間",
+ "UW3:!",
+ "UW5:ー",
+ "UW4:す",
+ "UW4:!",
+ "BW1:とが",
+ "UW5:の",
+ "TB4:062062071",
+ "TB2:061071071",
+ "UW6:・",
+ "UW3:.",
+ "UW2:て",
+ "UW3:笑",
+ "UW2:こ",
+ "UW5:も",
+ "BW3:よう",
+ "UW3:人",
+ "UW2:の",
+ "UW3:か",
+ "UW3:日",
+ "UW1:い",
+ "BW2:とこ",
+ "UW4:私",
+ "UW3:…",
+ "UW2:に",
+ "UW3:今",
+ "BB3:087062",
+ "UB3:055",
+ "UW4:(",
+ "BB1:087071",
+ "UW1:な",
+ "BB3:063063",
+ "UW5:来",
+ "UW3:?",
+ "TW3:ている",
+ "UW4:」",
+ "UW4:前",
+ "BW1:いう",
+ "UW4:つ",
+ "UW3:)",
+ "BW1:では",
+ "UW2:る",
+ "UW5:そ",
+ "UW4:ー",
+ "TW2:気に入",
+ "UW4:笑",
+ "UW4:ひ",
+ "TB4:087087087",
+ "UW4:け",
+ "UW2:も",
+ "BW3:ちょ",
+ "BW3:出来",
+ "TB2:062071062",
+ "UW4:『",
+ "UW3:[",
+ "UW4:2",
+ "UW5:つ",
+ "TB1:061071062",
+ "UW3:1",
+ "BW3:から",
+ "UB5:071",
+ "UW4:ま",
+ "UW3:ば",
+ "UW3:り",
+ "BW3:その",
+ "UW3:ご",
+ "UW4:わ",
+ "BW2:てお",
+ "TB2:071062062",
+ "BW1:ない",
+ "UW2:よ",
+ "UB2:087",
+ "UW6:の",
+ "UW2:毎",
+ "UW2:結",
+ "TW4:の京都",
+ "UW3:さ",
+ "UW2:最",
+ "BW2:です",
+ "UW2:」",
+ "UW5:え",
+ "UW3:だ",
+ "TW4:ところ",
+ "UW4:.",
+ "UB1:062",
+ "UW6:て",
+ "UW1:が",
+ "BW2:、と",
+ "UW3:0",
+ "UW3:ん",
+ "UW3:中",
+ "UW4:よ",
+ "BW3:この",
+ "UW2:が",
+ "UW3:み",
+ "TW2:ではな",
+ "UW6:と",
+ "UW4:[",
+ "TW3:、ある",
+ "BW3:ころ",
+ "UW4:?",
+ "UW6:、",
+ "UW4:電",
+ "BB1:062040",
+ "UW3:後",
+ "UW5:い",
+ "UW2:、",
+ "UW5:て",
+ "BB2:062040",
+ "UW3:真",
+ "UW3:そ",
+ "UW5:さ",
+ "UB5:087",
+ "TW3:という",
+ "UW3:分",
+ "UB6:071",
+ "BW3:なっ",
+ "UW4:ろ",
+ "BB2:061061",
+ "TW3:ところ",
+ "UB1:071",
+ "UW1:、",
+ "BW1:とか",
+ "UW3:な",
+ "UW6:り",
+ "UW4:間",
+ "UW3:べ",
+ "UW5:べ",
+ "TB4:062071062",
+ "UW4:]",
+ "BW2:には",
+ "UW5:々",
+ "BW1:。・",
+ "BW1:その",
+ "UW1:す",
+ "UW4:)",
+ "UW6:っ",
+ "TB3:063063063",
+ "TB3:062071071",
+ "UB5:063",
+ "BW1:かも",
+ "UW6:る",
+ "TB4:062063063",
+ "UW3:ど",
+ "TW3:である",
+ "TW4:くらい",
+ "BW1:最近",
+ "BW1:しい",
+ "BW1:とも",
+ "BW2:と同",
+ "TW1:という",
+ "UW2:さ",
+ "BW2:帯電",
+ "TB1:071062062",
+ "BW3:そし",
+ "UW2:。",
+ "UW5:か",
+ "UW5:こ",
+ "BW3:ない",
+ "BW1:んな",
+ "BW2:でき",
+ "UW4:3",
+ "UW3:け",
+ "TW4:ことが",
+ "BW1:こと",
+ "UB3:087",
+ "UW3:電",
+ "UW3:よ",
+ "BW1:たと",
+ "UW5:ま",
+ "UW5:た",
+ "UW5:ち",
+ "UW2:け",
+ "UW5:だ",
+ "UW3:度",
+ "BW1:たい",
+ "UW4:使",
+ "UW2:き",
+ "TW4:かなり",
+ "UB6:063",
+ "BB1:062062",
+ "UW4:込",
+ "TW3:と言っ",
+ "UW6:だ",
+ "UW5:り",
+ "UW5:よ",
+ "BW3:どう",
+ "UW4:…",
+ "UW3:や",
+ "BW1:かし",
+ "BW3:かっ",
+ "UW4:今",
+ "UW3:『",
+ "UW4:思",
+ "UB2:063",
+ "UW4:く",
+ "UW3:京",
+ "UW6:ー",
+ "UW1:ん",
+ "BW1:うな",
+ "TB2:062061061",
+ "UW1:と",
+ "TB4:062063062",
+ "TB2:061062062",
+ "BW1:この",
+ "BW2:ので",
+ "UW4:み",
+ "UW5:わ",
+ "UW6:や",
+ "BW1:れて",
+ "UW2:や",
+ "UW6:こ",
+ "UW4:な",
+ "UW5:め",
+ "BW1:もう",
+ "TB4:071062071",
+ "BW1:より",
+ "UW4:合",
+ "UW6:け",
+ "BW1:少し",
+ "BW2:でし",
+ "UW4:と",
+ "TB1:063063063",
+ "UW3:ー",
+ "BW2:くな",
+ "UW2:く",
+ "UW2:我",
+ "BW2:いも",
+ "BW3:わか",
+ "TB2:071063071",
+ "UW4:も",
+ "UW1:あ",
+ "UW4:最",
+ "BW1:るの",
+ "UW2:全",
+ "UW6:0",
+ "UW4:放",
+ "UW4:京",
+ "BW3:かけ",
+ "UW2:少",
+ "BW3:もう",
+ "UW2:多",
+ "UW2:う",
+ "TB1:062062040",
+ "UW1:を",
+ "UW3:光",
+ "BW1:!!",
+ "UW2:ャ",
+ "BW3:すぐ",
+ "UW4:帯",
+ "UW6:し",
+ "BW3:でも",
+ "BW2:、そ",
+ "TB3:071087087",
+ "TB2:063062071",
+ "UW3:わ",
+ "UB4:063",
+ "TB4:071071071",
+ "UW5:都",
+ "UW5:ず",
+ "UW2:バ",
+ "UW2:京",
+ "UW3:ゃ",
+ "BW1:い、",
+ "BW3:よく",
+ "BW1:たら",
+ "BW2:のよ",
+ "UW2:思",
+ "BW1:うに",
+ "BW1:の間",
+ "UW6:ん",
+ "UW6:ず",
+ "BW1:った",
+ "TW3:ること",
+ "BW3:とて",
+ "TW1:ような",
+ "UW6:ぱ",
+ "TB3:063071062",
+ "TW4:って、",
+ "TW4:なんて",
+ "TW2:その後",
+ "UW6:ら",
+ "TW4:ことに",
+ "UW3:>",
+ "TW3:てしま",
+ "UW3:い",
+ "TB4:071062061",
+ "UW2:ひ",
+ "UW6:め",
+ "UW6:で",
+ "BW3:なる",
+ "UW5:ご",
+ "BW2:りし",
+ "UW6:電",
+ "UW1:は",
+ "BW1:いも",
+ "BW3:すご",
+ "UW4:通",
+ "BW3:おり",
+ "BW3:かか",
+ "BW1:思い",
+ }
+ modelValues:intvector {
+ 1800,
+ 271,
+ -857,
+ -417,
+ 285,
+ -583,
+ 388,
+ 828,
+ -853,
+ -820,
+ 502,
+ -708,
+ 358,
+ 1341,
+ -586,
+ -451,
+ 257,
+ -1876,
+ 2052,
+ 1698,
+ -458,
+ 2048,
+ 1182,
+ -551,
+ 980,
+ 773,
+ -1453,
+ -152,
+ 3201,
+ 2865,
+ 1203,
+ 144,
+ -369,
+ -2539,
+ -613,
+ -3574,
+ -1111,
+ 3110,
+ -3022,
+ 2039,
+ -1091,
+ 1241,
+ -560,
+ -1412,
+ 625,
+ 1350,
+ 297,
+ -2404,
+ -595,
+ 1007,
+ -1829,
+ -1662,
+ 3213,
+ 270,
+ -911,
+ 178,
+ -727,
+ 2716,
+ -484,
+ -344,
+ 929,
+ -1236,
+ 760,
+ -299,
+ -419,
+ -728,
+ 122,
+ -704,
+ -605,
+ -1507,
+ 545,
+ -68,
+ -320,
+ 1498,
+ 953,
+ -323,
+ -575,
+ -673,
+ 520,
+ -450,
+ -1767,
+ -247,
+ 56,
+ 231,
+ -764,
+ 536,
+ 794,
+ -703,
+ -566,
+ 51,
+ 390,
+ 52,
+ -182,
+ 466,
+ 133,
+ 354,
+ 107,
+ 492,
+ 488,
+ -1194,
+ 1145,
+ -847,
+ 812,
+ 151,
+ -517,
+ -314,
+ -553,
+ -783,
+ -117,
+ 736,
+ -88,
+ -598,
+ 569,
+ 606,
+ 287,
+ 744,
+ 1739,
+ -217,
+ -219,
+ -144,
+ 234,
+ -649,
+ -757,
+ 834,
+ -819,
+ 869,
+ -275,
+ -267,
+ 154,
+ 653,
+ 594,
+ 255,
+ 1018,
+ 1124,
+ 284,
+ -1624,
+ -372,
+ 440,
+ -184,
+ -1936,
+ 1318,
+ -1124,
+ 453,
+ -92,
+ -343,
+ 175,
+ 182,
+ -886,
+ 930,
+ -223,
+ -57,
+ -113,
+ 103,
+ -200,
+ 510,
+ -2099,
+ -498,
+ 385,
+ 80,
+ -156,
+ 360,
+ 1289,
+ 771,
+ -1114,
+ -399,
+ 870,
+ 1230,
+ 79,
+ 472,
+ -1596,
+ -1092,
+ -572,
+ 55,
+ -151,
+ -124,
+ 1316,
+ -248,
+ 1280,
+ -125,
+ -284,
+ -1023,
+ 862,
+ 84,
+ 417,
+ 568,
+ -88,
+ -528,
+ 910,
+ 674,
+ -212,
+ 894,
+ -121,
+ 1108,
+ 762,
+ 260,
+ -197,
+ 91,
+ -53,
+ 1117,
+ -645,
+ -868,
+ -611,
+ 220,
+ 422,
+ 1431,
+ -532,
+ -157,
+ -476,
+ -846,
+ -1309,
+ -1614,
+ 1225,
+ 302,
+ -738,
+ -260,
+ 892,
+ -778,
+ -193,
+ 1221,
+ -779,
+ 489,
+ 420,
+ -85,
+ -525,
+ -830,
+ 26,
+ 270,
+ 439,
+ -120,
+ 1263,
+ -795,
+ 291,
+ -1310,
+ -23,
+ 347,
+ 312,
+ -107,
+ -114,
+ 701,
+ 830,
+ 1309,
+ -451,
+ 260,
+ -1080,
+ 536,
+ 188,
+ -60,
+ 643,
+ -1184,
+ 31,
+ -194,
+ -51,
+ -514,
+ -442,
+ -120,
+ 649,
+ 410,
+ 882,
+ -75,
+ -341,
+ -718,
+ -128,
+ 340,
+ -1245,
+ -164,
+ -1052,
+ 70,
+ -256,
+ 279,
+ 786,
+ 40,
+ -177,
+ 97,
+ -411,
+ 222,
+ -89,
+ -277,
+ -146,
+ 414,
+ 483,
+ 21,
+ -339,
+ -406,
+ -360,
+ -450,
+ -14,
+ -36,
+ 513,
+ 252,
+ 54,
+ -501,
+ -478,
+ 450,
+ -36,
+ -644,
+ -392,
+ 714,
+ 643,
+ -341,
+ 91,
+ -1018,
+ 34,
+ -177,
+ 123,
+ 80,
+ -695,
+ -44,
+ -357,
+ 253,
+ -389,
+ 613,
+ 515,
+ 418,
+ -396,
+ -553,
+ 193,
+ 298,
+ -334,
+ -57,
+ -315,
+ -77,
+ 33,
+ 88,
+ 137,
+ 280,
+ -448,
+ 196,
+ -136,
+ -295,
+ -329,
+ -92,
+ -360,
+ -132,
+ -288,
+ -45,
+ -43,
+ 174,
+ 75,
+ -60,
+ 330,
+ 360,
+ 217,
+ 130,
+ 473,
+ -41,
+ -23,
+ -340,
+ -530,
+ -69,
+ -71,
+ -115,
+ 297,
+ -240,
+ 229,
+ 507,
+ -348,
+ 171,
+ -320,
+ 239,
+ 16,
+ -195,
+ -277,
+ -41,
+ 69,
+ 280,
+ -264,
+ 30,
+ 249,
+ -97,
+ -163,
+ -221,
+ 96,
+ 83,
+ 82,
+ -218,
+ -93,
+ -53,
+ 40,
+ 28,
+ 285,
+ 27,
+ 283,
+ -211,
+ -92,
+ 214,
+ -225,
+ -54,
+ 53,
+ 105,
+ -198,
+ -53,
+ -277,
+ 198,
+ 184,
+ -264,
+ -106,
+ 14,
+ 185,
+ -155,
+ 185,
+ 106,
+ -119,
+ 53,
+ 208,
+ 92,
+ 262,
+ 106,
+ -52,
+ 105,
+ -25,
+ -79,
+ 104,
+ 141,
+ 129,
+ -114,
+ 26,
+ 64,
+ -113,
+ 26,
+ 77,
+ -64,
+ 13,
+ 13,
+ 26,
+ 89,
+ 115,
+ -49,
+ 89,
+ -114,
+ 51,
+ 64,
+ -64,
+ -51,
+ -38,
+ 89,
+ 13,
+ -64,
+ 13,
+ -48,
+ 76,
+ 63,
+ 62,
+ 13,
+ 112,
+ -76,
+ -50,
+ -13,
+ -49,
+ 63,
+ -50,
+ 13,
+ 13,
+ -50,
+ 24,
+ -12,
+ 24,
+ 12,
+ 24,
+ 12,
+ -12,
+ -24,
+ 12,
+ -12,
+ -12,
+ 12,
+ -12,
+ }
+}
\ No newline at end of file
diff --git a/icu4c/source/python/icutools/databuilder/filtration.py b/icu4c/source/python/icutools/databuilder/filtration.py
index 27d08b0..e9339a0 100644
--- a/icu4c/source/python/icutools/databuilder/filtration.py
+++ b/icu4c/source/python/icutools/databuilder/filtration.py
@@ -273,8 +273,8 @@ def _preprocess_file_filters(requests, config, io):
default_filter_json = "exclude" if config.strategy == "additive" else "include"
for category in all_categories:
filter_json = default_filter_json
- # Special default for category "brkitr_lstm" as "exclude" for now.
- if "brkitr_lstm" == category:
+ # Special default of "exclude" for the categories "brkitr_lstm" and "brkitr_adaboost" for now.
+ if "brkitr_lstm" == category or "brkitr_adaboost" == category:
filter_json = "exclude"
# Figure out the correct filter to create for now.
if "featureFilters" in json_data and category in json_data["featureFilters"]:
diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt
index 9676ed4..7460caa 100644
--- a/icu4c/source/test/depstest/dependencies.txt
+++ b/icu4c/source/test/depstest/dependencies.txt
@@ -211,7 +211,7 @@
brkiter.o brkeng.o ubrk.o
rbbi.o rbbinode.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o
rbbidata.o rbbirb.o rbbi_cache.o
- dictionarydata.o dictbe.o lstmbe.o
+ dictionarydata.o dictbe.o lstmbe.o mlbe.o
# BreakIterator::makeInstance() factory implementation makes for circular dependency
# between BreakIterator base and FilteredBreakIteratorBuilder.
filteredbrk.o
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 17c05fb..7afdb9a 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -42,6 +42,7 @@
#include "charstr.h"
#include "cmemory.h"
#include "cstr.h"
+#include "cstring.h"
#include "intltest.h"
#include "lstmbe.h"
#include "rbbitst.h"
@@ -835,9 +836,28 @@ void RBBITest::TestExtended() {
delete tp.bi;
tp.bi = BreakIterator::createLineInstance(locale, status);
skipTest = false;
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+ if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
+ // skip <line> test cases of JP's phrase breaking when ML is enabled.
+ skipTest = true;
+ }
+#endif
charIdx += 5;
break;
}
+ if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
+ delete tp.bi;
+ tp.bi = BreakIterator::createLineInstance(locale, status);
+ skipTest = false;
+#if !UCONFIG_USE_ML_PHRASE_BREAKING
+ if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
+ // skip <lineML> test cases of JP's phrase breaking when ML is disabled.
+ skipTest = true;
+ }
+#endif
+ charIdx += 7;
+ break;
+ }
if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createSentenceInstance(locale, status);
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
index 72bd158..40c6745 100644
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -1913,6 +1913,26 @@
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
<locale ja@lw=phrase>
+#phrase breaking test cases for the ML solution
+<lineML>
+#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
+<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
+#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
+<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
+#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019) -> \uD82C\uDC19
+#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
+<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
+#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
+<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
+#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!
+<data>•\u3057\u304B\u3082•\u30ED\u30B4\u304C•\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01•</data>
+#バッテリーを長持ちさせ、充電を最適化します -> バッテリーを▁長持ちさせ、▁充電を▁最適化します
+<data>•\u30D0\u30C3\u30C6\u30EA\u30FC\u3092•\u9577\u6301\u3061\u3055\u305B\u3001•\u5145\u96FB\u3092•\u6700\u9069\u5316\u3057\u307E\u3059•</data>
+#データのコピー、スマートフォンでのお支払いなど -> データの▁コピー、▁スマートフォンでの▁お支払いなど
+<data>•\u30C7\u30FC\u30BF\u306E•\u30B3\u30D4\u30FC\u3001•\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306E•\u304A\u652F\u6255\u3044\u306A\u3069•</data>
+
+<locale ja@lw=phrase>
+#phrase breaking test cases for the dictionary based solution
<line>
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
@@ -2005,8 +2025,8 @@
#大韓民國은 民主共和國이다
#<data>•大韓民國은 •民主•共和國이다•</data>
# All the tests for ja@lw=phrase should also work in Korean.
-#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
-<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
+#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
+<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>