ICU-20991 Trace BreakIterator/BreakEngine creation
See #1014
diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp
index 3d1366a..b9b6ca6 100644
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@@ -38,6 +38,7 @@
#include "uresimp.h"
#include "uassert.h"
#include "ubrkimpl.h"
+#include "utracimp.h"
#include "charstr.h"
// *****************************************************************************
@@ -412,14 +413,23 @@
BreakIterator *result = NULL;
switch (kind) {
case UBRK_CHARACTER:
- result = BreakIterator::buildInstance(loc, "grapheme", status);
+ {
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
+ result = BreakIterator::buildInstance(loc, "grapheme", status);
+ UTRACE_EXIT_STATUS(status);
+ }
break;
case UBRK_WORD:
- result = BreakIterator::buildInstance(loc, "word", status);
+ {
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
+ result = BreakIterator::buildInstance(loc, "word", status);
+ UTRACE_EXIT_STATUS(status);
+ }
break;
case UBRK_LINE:
- uprv_strcpy(lbType, "line");
{
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
+ uprv_strcpy(lbType, "line");
char lbKeyValue[kKeyValueLenMax] = {0};
UErrorCode kvStatus = U_ZERO_ERROR;
int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus);
@@ -427,13 +437,17 @@
uprv_strcat(lbType, "_");
uprv_strcat(lbType, lbKeyValue);
}
+ result = BreakIterator::buildInstance(loc, lbType, status);
+
+ UTRACE_DATA1(UTRACE_INFO, "lb=%s", lbKeyValue);
+ UTRACE_EXIT_STATUS(status);
}
- result = BreakIterator::buildInstance(loc, lbType, status);
break;
case UBRK_SENTENCE:
- result = BreakIterator::buildInstance(loc, "sentence", status);
-#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
{
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
+ result = BreakIterator::buildInstance(loc, "sentence", status);
+#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
char ssKeyValue[kKeyValueLenMax] = {0};
UErrorCode kvStatus = U_ZERO_ERROR;
int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
@@ -444,11 +458,16 @@
delete fbiBuilder;
}
}
- }
#endif
+ UTRACE_EXIT_STATUS(status);
+ }
break;
case UBRK_TITLE:
- result = BreakIterator::buildInstance(loc, "title", status);
+ {
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
+ result = BreakIterator::buildInstance(loc, "title", status);
+ UTRACE_EXIT_STATUS(status);
+ }
break;
default:
status = U_ILLEGAL_ARGUMENT_ERROR;
diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp
index c769138..b42cdf0 100644
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -18,6 +18,7 @@
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
+#include "utracimp.h"
#include "uvectr32.h"
#include "uvector.h"
#include "uassert.h"
@@ -194,6 +195,8 @@
: DictionaryBreakEngine(),
fDictionary(adoptDictionary)
{
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
+ UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
setCharacters(fThaiWordSet);
@@ -213,6 +216,7 @@
fEndWordSet.compact();
fBeginWordSet.compact();
fSuffixSet.compact();
+ UTRACE_EXIT_STATUS(status);
}
ThaiBreakEngine::~ThaiBreakEngine() {
@@ -436,6 +440,8 @@
: DictionaryBreakEngine(),
fDictionary(adoptDictionary)
{
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
+ UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
setCharacters(fLaoWordSet);
@@ -452,6 +458,7 @@
fMarkSet.compact();
fEndWordSet.compact();
fBeginWordSet.compact();
+ UTRACE_EXIT_STATUS(status);
}
LaoBreakEngine::~LaoBreakEngine() {
@@ -632,6 +639,8 @@
: DictionaryBreakEngine(),
fDictionary(adoptDictionary)
{
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
+ UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
setCharacters(fBurmeseWordSet);
@@ -645,6 +654,7 @@
fMarkSet.compact();
fEndWordSet.compact();
fBeginWordSet.compact();
+ UTRACE_EXIT_STATUS(status);
}
BurmeseBreakEngine::~BurmeseBreakEngine() {
@@ -825,6 +835,8 @@
: DictionaryBreakEngine(),
fDictionary(adoptDictionary)
{
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
+ UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
setCharacters(fKhmerWordSet);
@@ -850,6 +862,7 @@
fEndWordSet.compact();
fBeginWordSet.compact();
// fSuffixSet.compact();
+ UTRACE_EXIT_STATUS(status);
}
KhmerBreakEngine::~KhmerBreakEngine() {
@@ -1045,6 +1058,8 @@
static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
+ UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
// Korean dictionary only includes Hangul syllables
fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
@@ -1066,6 +1081,7 @@
setCharacters(cjSet);
}
}
+ UTRACE_EXIT_STATUS(status);
}
CjkBreakEngine::~CjkBreakEngine(){
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
index f80c3e0..43ba58b 100644
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -1117,7 +1117,7 @@
* Release all static memory held by breakiterator.
*/
U_CDECL_BEGIN
-static UBool U_CALLCONV rbbi_cleanup(void) {
+UBool U_CALLCONV rbbi_cleanup(void) {
delete gLanguageBreakFactories;
gLanguageBreakFactories = nullptr;
delete gEmptyString;
diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h
index b7de6ce..7b9b8d8 100644
--- a/icu4c/source/common/rbbidata.h
+++ b/icu4c/source/common/rbbidata.h
@@ -192,6 +192,8 @@
U_NAMESPACE_END
+U_CFUNC UBool rbbi_cleanup(void);
+
#endif /* C++ */
#endif
diff --git a/icu4c/source/common/unicode/utrace.h b/icu4c/source/common/unicode/utrace.h
index 5afcd9f..5b4a049 100644
--- a/icu4c/source/common/unicode/utrace.h
+++ b/icu4c/source/common/unicode/utrace.h
@@ -177,6 +177,71 @@
UTRACE_RES_DATA_LIMIT,
#endif // U_HIDE_INTERNAL_API
+#ifndef U_HIDE_DRAFT_API
+ /**
+ * The lowest break iterator location.
+ * @draft ICU 67
+ */
+ UTRACE_UBRK_START=0x4000,
+
+ /**
+ * Indicates that a character instance of break iterator was created.
+ *
+ * @draft ICU 67
+ */
+ UTRACE_UBRK_CREATE_CHARACTER = UTRACE_UBRK_START,
+
+ /**
+ * Indicates that a word instance of break iterator was created.
+ *
+ * @draft ICU 67
+ */
+ UTRACE_UBRK_CREATE_WORD,
+
+ /**
+ * Indicates that a line instance of break iterator was created.
+ *
+ * Provides one C-style string to UTraceData: the lb value ("",
+ * "loose", "strict", or "normal").
+ *
+ * @draft ICU 67
+ */
+ UTRACE_UBRK_CREATE_LINE,
+
+ /**
+ * Indicates that a sentence instance of break iterator was created.
+ *
+ * @draft ICU 67
+ */
+ UTRACE_UBRK_CREATE_SENTENCE,
+
+ /**
+ * Indicates that a title instance of break iterator was created.
+ *
+ * @draft ICU 67
+ */
+ UTRACE_UBRK_CREATE_TITLE,
+
+ /**
+ * Indicates that an internal dictionary break engine was created.
+ *
+ * Provides one C-style string to UTraceData: the script code that
+ * the break engine covers ("Hani", "Khmr", "Laoo", "Mymr", or "Thai").
+ *
+ * @draft ICU 67
+ */
+ UTRACE_UBRK_CREATE_BREAK_ENGINE,
+
+#endif // U_HIDE_DRAFT_API
+
+#ifndef U_HIDE_INTERNAL_API
+ /**
+ * One more than the highest normal break iterator trace location.
+ * @internal The numeric value may change over time, see ICU ticket #12420.
+ */
+ UTRACE_UBRK_LIMIT,
+#endif // U_HIDE_INTERNAL_API
+
} UTraceFunctionNumber;
/**
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index a5b8f13..0ca89f6 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -14,6 +14,7 @@
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
+#include <sstream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -35,6 +36,7 @@
#include "unicode/uscript.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
+#include "unicode/utrace.h"
#include "charstr.h"
#include "cmemory.h"
@@ -126,6 +128,19 @@
TESTCASE_AUTO(TestReverse);
TESTCASE_AUTO(TestBug13692);
TESTCASE_AUTO(TestDebugRules);
+
+#if U_ENABLE_TRACING
+ TESTCASE_AUTO(TestTraceCreateCharacter);
+ TESTCASE_AUTO(TestTraceCreateWord);
+ TESTCASE_AUTO(TestTraceCreateSentence);
+ TESTCASE_AUTO(TestTraceCreateTitle);
+ TESTCASE_AUTO(TestTraceCreateLine);
+ TESTCASE_AUTO(TestTraceCreateLineNormal);
+ TESTCASE_AUTO(TestTraceCreateLineLoose);
+ TESTCASE_AUTO(TestTraceCreateLineStrict);
+ TESTCASE_AUTO(TestTraceCreateBreakEngine);
+#endif
+
TESTCASE_AUTO_END;
}
@@ -4865,6 +4880,182 @@
#endif
}
+#if U_ENABLE_TRACING
+static std::vector<std::string> gData;  // string argument captured by traceData for each recorded call
+static std::vector<int32_t> gEntryFn;  // function numbers recorded by traceEntry, in call order
+static std::vector<int32_t> gExitFn;  // function numbers recorded by traceExit, in call order
+static std::vector<int32_t> gDataFn;  // function numbers recorded by traceData, parallel to gData
+static void U_CALLCONV traceData(  // utrace data callback: records the string argument of break-iterator trace data
+    const void*,   // context handed to utrace_setFunctions (unused)
+    int32_t fnNumber,
+    int32_t,       // trace level (unused)
+    const char *,  // format (unused): the callers in this patch pass exactly one "%s" argument
+    va_list args) {
+    if (UTRACE_UBRK_START <= fnNumber && fnNumber < UTRACE_UBRK_LIMIT) {  // LIMIT is one past the last location
+        const char* data = va_arg(args, const char*);
+        gDataFn.push_back(fnNumber);
+        gData.push_back(data != nullptr ? data : "");  // never build a std::string from a null pointer
+    }
+}
+
+static void U_CALLCONV traceEntry(const void *, int32_t fnNumber) {  // utrace entry callback: records break-iterator entries
+    if (UTRACE_UBRK_START <= fnNumber && fnNumber < UTRACE_UBRK_LIMIT) {  // LIMIT is one past the last location
+        gEntryFn.push_back(fnNumber);
+    }
+}
+
+static void U_CALLCONV traceExit(const void *, int32_t fnNumber, const char *, va_list) {  // utrace exit callback: records break-iterator exits
+    if (UTRACE_UBRK_START <= fnNumber && fnNumber < UTRACE_UBRK_LIMIT) {  // LIMIT is one past the last location
+        gExitFn.push_back(fnNumber);
+    }
+}
+
+
+void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {  // one entry/exit for fnNumber; one data string iff expectedData != nullptr
+    assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
+    assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn.empty() ? -1 : gEntryFn[0]);
+    assertEquals("utrace_exit should be called ", 1, gExitFn.size());
+    assertEquals("utrace_exit should be called with ", fnNumber, gExitFn.empty() ? -1 : gExitFn[0]);
+    // -1 and "<missing>" stand in for an absent record, so a failed count is reported without reading an empty vector.
+    if (expectedData == nullptr) {
+        assertEquals("utrace_data should not be called ", 0, gDataFn.size());
+        assertEquals("utrace_data should not be called ", 0, gData.size());
+    } else {
+        assertEquals("utrace_data should be called ", 1, gDataFn.size());
+        assertEquals("utrace_data should be called with ", fnNumber, gDataFn.empty() ? -1 : gDataFn[0]);
+        assertEquals("utrace_data should be called ", 1, gData.size());
+        assertEquals("utrace_data should pass in ", expectedData, gData.empty() ? "<missing>" : gData[0].c_str());
+    }
+}
+
+void SetupTestTrace() {  // Empties the recorded trace state, then installs the recording callbacks at UTRACE_INFO.
+    // Start from an empty record. NOTE(review): nothing visible here restores the trace functions/level afterwards.
+    gData.clear();
+    gDataFn.clear();
+    gExitFn.clear();
+    gEntryFn.clear();
+    // No context object is needed: the callbacks record into the file-level vectors.
+    utrace_setFunctions(nullptr, traceEntry, traceExit, traceData);
+    utrace_setLevel(UTRACE_INFO);
+}
+
+void RBBITest::TestTraceCreateCharacter(void) {
+    // Expect one entry/exit pair for UTRACE_UBRK_CREATE_CHARACTER and no trace data.
+    SetupTestTrace();
+    IcuTestErrorCode errorCode(*this, "TestTraceCreateCharacter");
+    LocalPointer<BreakIterator> iter(BreakIterator::createCharacterInstance("zh-CN", errorCode));
+    errorCode.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
+}
+
+void RBBITest::TestTraceCreateTitle(void) {
+    // Expect one entry/exit pair for UTRACE_UBRK_CREATE_TITLE and no trace data.
+    SetupTestTrace();
+    IcuTestErrorCode errorCode(*this, "TestTraceCreateTitle");
+    LocalPointer<BreakIterator> iter(BreakIterator::createTitleInstance("zh-CN", errorCode));
+    errorCode.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
+}
+
+void RBBITest::TestTraceCreateSentence(void) {
+    // Expect one entry/exit pair for UTRACE_UBRK_CREATE_SENTENCE and no trace data.
+    SetupTestTrace();
+    IcuTestErrorCode errorCode(*this, "TestTraceCreateSentence");
+    LocalPointer<BreakIterator> iter(BreakIterator::createSentenceInstance("zh-CN", errorCode));
+    errorCode.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
+}
+
+void RBBITest::TestTraceCreateWord(void) {
+    // Expect one entry/exit pair for UTRACE_UBRK_CREATE_WORD and no trace data.
+    SetupTestTrace();
+    IcuTestErrorCode errorCode(*this, "TestTraceCreateWord");
+    LocalPointer<BreakIterator> iter(BreakIterator::createWordInstance("zh-CN", errorCode));
+    errorCode.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
+}
+
+void RBBITest::TestTraceCreateLine(void) {
+    // No "lb" keyword in the locale: expect UTRACE_UBRK_CREATE_LINE with an empty data string.
+    SetupTestTrace();
+    IcuTestErrorCode errorCode(*this, "TestTraceCreateLine");
+    LocalPointer<BreakIterator> iter(BreakIterator::createLineInstance("zh-CN", errorCode));
+    errorCode.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
+}
+
+void RBBITest::TestTraceCreateLineStrict(void) {
+    // Locale carries lb=strict: expect UTRACE_UBRK_CREATE_LINE with data "strict".
+    SetupTestTrace();
+    IcuTestErrorCode errorCode(*this, "TestTraceCreateLineStrict");
+    LocalPointer<BreakIterator> iter(BreakIterator::createLineInstance("zh-CN-u-lb-strict", errorCode));
+    errorCode.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
+}
+
+void RBBITest::TestTraceCreateLineNormal(void) {
+    // Locale carries lb=normal: expect UTRACE_UBRK_CREATE_LINE with data "normal".
+    SetupTestTrace();
+    IcuTestErrorCode errorCode(*this, "TestTraceCreateLineNormal");
+    LocalPointer<BreakIterator> iter(BreakIterator::createLineInstance("zh-CN-u-lb-normal", errorCode));
+    errorCode.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
+}
+
+void RBBITest::TestTraceCreateLineLoose(void) {
+    // Locale carries lb=loose: expect UTRACE_UBRK_CREATE_LINE with data "loose".
+    SetupTestTrace();
+    IcuTestErrorCode errorCode(*this, "TestTraceCreateLineLoose");
+    LocalPointer<BreakIterator> iter(BreakIterator::createLineInstance("zh-CN-u-lb-loose", errorCode));
+    errorCode.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
+}
+
+void RBBITest::TestTraceCreateBreakEngine(void) {
+    rbbi_cleanup();  // frees the static break factories; presumably so the engines below are created (and traced) anew
+    SetupTestTrace();
+    IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
+    LocalPointer<BreakIterator> brkitr(
+        BreakIterator::createWordInstance("zh-CN", status));
+    status.errIfFailureAndReset();
+    assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
+    if (brkitr.isNull()) { return; }  // creation failed (already reported above); there is nothing to iterate
+    // To word break the following text, BreakIterator will create 5 dictionary
+    // break engines internally.
+    brkitr->setText(
+        u"test "
+        u"測試 " // Hani
+        u"សាកល្បង " // Khmr
+        u"ທົດສອບ " // Laoo
+        u"စမ်းသပ်မှု " // Mymr
+        u"ทดสอบ " // Thai
+        u"test "
+    );
+
+    // Loop through all the text.
+    while (brkitr->next() > 0) ;
+
+    assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
+    assertEquals("utrace_exit should be called ", 6, gExitFn.size());
+    assertEquals("utrace_data should be called ", 5, gDataFn.size());
+    // Entry/exit slot 0 belongs to the word iterator itself; stop at the shortest record so a failed count cannot index out of range.
+    for (std::vector<int32_t>::size_type i = 0; i < gDataFn.size() && i + 1 < gEntryFn.size() && i + 1 < gExitFn.size(); i++) {
+        assertEquals("utrace_entry should be called ",
+                     UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
+        assertEquals("utrace_exit should be called ",
+                     UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
+        assertEquals("utrace_data should be called ",
+                     UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
+    }
+    if (gData.size() >= 5) {  // a wrong count was already reported above; do not read past the recorded data
+        assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
+        assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
+        assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
+        assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
+        assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
+    }
+}
+#endif
#endif // #if !UCONFIG_NO_BREAK_ITERATION
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index 96c2882..8f667e5 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -87,6 +87,18 @@
void TestDebug();
void TestProperties();
+#if U_ENABLE_TRACING
+ void TestTraceCreateCharacter();
+ void TestTraceCreateWord();
+ void TestTraceCreateSentence();
+ void TestTraceCreateTitle();
+ void TestTraceCreateLine();
+ void TestTraceCreateLineNormal();
+ void TestTraceCreateLineStrict();
+ void TestTraceCreateLineLoose();
+ void TestTraceCreateBreakEngine();
+#endif
+
/***********************/
private:
/**
@@ -120,6 +132,11 @@
// Test parameters, from the test framework and test invocation.
const char* fTestParams;
+
+#if U_ENABLE_TRACING
+ void assertTestTraceResult(int32_t fnNumber, const char* expectedData);
+#endif
+
};
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */