// blob: 8edaf497860fad46d162417a4fc435709122c88e [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 1996-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.rbbi;
//Regression testing of RuleBasedBreakIterator
//
// TODO: These tests should be mostly retired.
// Much of the test data that was originally here was removed when the RBBI rules
// were updated to match the Unicode boundary TRs, and the data was found to be invalid.
// Much of the remaining data has been moved into the rbbitst.txt test data file,
// which is common between ICU4C and ICU4J. The remaining test data should also be moved,
// or simply retired if it is no longer interesting.
import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.RBBIDataWrapper;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ULocale;
@RunWith(JUnit4.class)
public class RBBITest extends TestFmwk {
/** Creates the test instance; the constructor body is empty, so no state is set up here. */
public RBBITest() {
}
/**
 * Exercises the Thai word and line break iterators.
 *
 * First walks a word iterator forward and then backward over mixed Latin/Thai text,
 * comparing every boundary with a fixed list. Then checks word and line boundaries for
 * Thai text that contains spaces and an invalid sequence, and finally calls
 * preceding(), following() and getRuleStatusVec() on the line iterator.
 */
@Test
public void TestThaiDictionaryBreakIterator() {
    // Expected boundaries: the forward pass (1 .. 12) followed by the backward pass (11 .. 0).
    final int[] expectedPositions = { 1, 2, 5, 10, 11, 12, 11, 10, 5, 2, 1, 0 };
    // A Latin letter and a space, a run of Thai characters, then a space and a Latin letter.
    final String mixedText =
            "\u0041\u0020"
            + "\u0E01\u0E32\u0E23\u0E17\u0E14\u0E25\u0E2D\u0E07"
            + "\u0020\u0041";

    BreakIterator wordIter = BreakIterator.getWordInstance(ULocale.createCanonical("th"));
    wordIter.setText(mixedText);

    // The cursor runs through expectedPositions across both passes.
    int cursor = 0;

    // Test forward iteration
    for (int pos = wordIter.next(); pos != BreakIterator.DONE; pos = wordIter.next()) {
        int expected = expectedPositions[cursor++];
        if (pos != expected) {
            errln("Error with ThaiDictionaryBreakIterator forward iteration test at " + pos
                    + ".\nShould have been " + expected);
        }
    }

    // Test backward iteration
    for (int pos = wordIter.previous(); pos != BreakIterator.DONE; pos = wordIter.previous()) {
        int expected = expectedPositions[cursor++];
        if (pos != expected) {
            errln("Error with ThaiDictionaryBreakIterator backward iteration test at " + pos
                    + ".\nShould have been " + expected);
        }
    }

    // Test invalid sequence and spaces
    final String thaiText =
            "\u0E01\u0E39\u0020\u0E01\u0E34\u0E19\u0E01\u0E38\u0E49\u0E07\u0020\u0E1B"
            + "\u0E34\u0E49\u0E48\u0E07\u0E2D\u0E22\u0E39\u0E48\u0E43\u0E19"
            + "\u0E16\u0E49\u0E33";
    final int textLength = thaiText.length();
    final int[] expectedWordResult = { 2, 3, 6, 10, 11, 15, 17, 20, 22 };
    final int[] expectedLineResult = { 3, 6, 11, 15, 17, 20, 22 };

    // Word boundaries; the boundary at the end of the text is not compared.
    BreakIterator brk = BreakIterator.getWordInstance(new ULocale("th"));
    brk.setText(thaiText);
    cursor = 0;
    for (int pos = brk.next(); pos != BreakIterator.DONE && pos < textLength; pos = brk.next()) {
        int expected = expectedWordResult[cursor++];
        if (pos != expected) {
            errln("Incorrect break given by thai word break iterator. Expected: " + expected
                    + " Got: " + pos);
        }
    }

    // Line boundaries over the same text.
    brk = BreakIterator.getLineInstance(new ULocale("th"));
    brk.setText(thaiText);
    cursor = 0;
    for (int pos = brk.next(); pos != BreakIterator.DONE && pos < textLength; pos = brk.next()) {
        int expected = expectedLineResult[cursor++];
        if (pos != expected) {
            errln("Incorrect break given by thai line break iterator. Expected: " + expected
                    + " Got: " + pos);
        }
    }

    // Improve code coverage
    int before = brk.preceding(expectedLineResult[1]);
    if (before != expectedLineResult[0]) {
        errln("Incorrect preceding position.");
    }
    int after = brk.following(expectedLineResult[1]);
    if (after != expectedLineResult[2]) {
        errln("Incorrect following position.");
    }
    int[] fillInArray = new int[2];
    int statusCount = ((RuleBasedBreakIterator) brk).getRuleStatusVec(fillInArray);
    if (statusCount != 1 || fillInArray[0] != 0) {
        errln("Error: Since getRuleStatusVec is not supported in DictionaryBasedBreakIterator, it should return 1 and fillInArray[0] == 0.");
    }
}
// TODO: Move these test cases to rbbitst.txt if they aren't there already, then remove this test. It is redundant.
/**
 * Runs a small table of locale-specific break cases (Greek sentence breaks, Thai
 * character breaks) forwards and backwards and compares the boundaries found with the
 * expected offsets.
 */
@Test
public void TestTailoredBreaks() {
    /** One test case: a break type, a locale, the text to break and the expected offsets. */
    class TBItem {
        // Upper bound on the number of boundaries collected for a single case.
        private static final int maxOffsetCount = 128;

        private final int type;
        private final ULocale locale;
        private final String text;
        private final int[] expectOffsets;

        TBItem(int typ, ULocale loc, String txt, int[] eOffs) {
            type = typ;
            locale = loc;
            text = txt;
            expectOffsets = eOffs;
        }

        /** Returns true when the first foundOffsetsLength entries equal expectOffsets exactly. */
        private boolean offsetsMatchExpected(int[] foundOffsets, int foundOffsetsLength) {
            boolean matches = (foundOffsetsLength == expectOffsets.length);
            for (int i = 0; matches && i < foundOffsetsLength; i++) {
                matches = (foundOffsets[i] == expectOffsets[i]);
            }
            return matches;
        }

        /** Formats the first length offsets as a space-prefixed list, e.g. " 1 2 3". */
        private String formatOffsets(int[] offsets, int length) {
            StringBuilder out = new StringBuilder(4 * maxOffsetCount);
            for (int i = 0; i < length; i++) {
                out.append(' ').append(offsets[i]);
            }
            return out.toString();
        }

        /** Builds the part of the failure message shared by the forward and backward checks. */
        private String describeCase() {
            String textToDisplay = (text.length() <= 16) ? text : text.substring(0, 16);
            return "For type " + type + " " + locale + ", text \"" + textToDisplay + "...\""
                    + "; expect " + expectOffsets.length + " offsets:"
                    + formatOffsets(expectOffsets, expectOffsets.length);
        }

        public void doTest() {
            BreakIterator brkIter;
            if (type == BreakIterator.KIND_CHARACTER) {
                brkIter = BreakIterator.getCharacterInstance(locale);
            } else if (type == BreakIterator.KIND_WORD) {
                brkIter = BreakIterator.getWordInstance(locale);
            } else if (type == BreakIterator.KIND_LINE) {
                brkIter = BreakIterator.getLineInstance(locale);
            } else if (type == BreakIterator.KIND_SENTENCE) {
                brkIter = BreakIterator.getSentenceInstance(locale);
            } else {
                errln("Unsupported break iterator type " + type);
                return;
            }
            brkIter.setText(text);

            // do forwards iteration test
            int[] foundOffsets = new int[maxOffsetCount];
            int foundOffsetsCount = 0;
            while (foundOffsetsCount < maxOffsetCount) {
                int offset = brkIter.next();
                if (offset == BreakIterator.DONE) {
                    break;
                }
                foundOffsets[foundOffsetsCount++] = offset;
            }

            if (!offsetsMatchExpected(foundOffsets, foundOffsetsCount)) {
                // log error for forwards test
                errln(describeCase()
                        + "; found " + foundOffsetsCount + " offsets fwd:"
                        + formatOffsets(foundOffsets, foundOffsetsCount));
                return;
            }

            // do backwards iteration test: the iterator sits on the last offset found,
            // so previous() must walk back through the remaining offsets in reverse.
            for (int idx = foundOffsetsCount - 2; idx >= 0; idx--) {
                int offset = brkIter.previous();
                if (offset != foundOffsets[idx]) {
                    // log error for backwards test
                    errln(describeCase()
                            + "; found rev offset " + offset + " where expect " + foundOffsets[idx]);
                    break;
                }
            }
        }
    }

    // KIND_SENTENCE "el"
    final String elSentText = "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " +
            "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3";
    final int[] elSentTOffsets = { 8, 14, 20, 27, 35, 36 };
    final int[] elSentROffsets = { 20, 27, 35, 36 };

    // KIND_CHARACTER "th"
    final String thCharText = "\u0E01\u0E23\u0E30\u0E17\u0E48\u0E2D\u0E21\u0E23\u0E08\u0E19\u0E32 " +
            "(\u0E2A\u0E38\u0E0A\u0E32\u0E15\u0E34-\u0E08\u0E38\u0E11\u0E32\u0E21\u0E32\u0E28) " +
            "\u0E40\u0E14\u0E47\u0E01\u0E21\u0E35\u0E1B\u0E31\u0E0D\u0E2B\u0E32 ";
    final int[] thCharTOffsets = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
            12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
            29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };
    // Starting in Unicode 6.1, root behavior should be the same as Thai above, so the
    // root case below reuses thCharTOffsets instead of a separate root offset list.

    final TBItem[] tests = {
        new TBItem( BreakIterator.KIND_SENTENCE, new ULocale("el"), elSentText, elSentTOffsets ),
        new TBItem( BreakIterator.KIND_SENTENCE, ULocale.ROOT, elSentText, elSentROffsets ),
        new TBItem( BreakIterator.KIND_CHARACTER, new ULocale("th"), thCharText, thCharTOffsets ),
        new TBItem( BreakIterator.KIND_CHARACTER, ULocale.ROOT, thCharText, thCharTOffsets ),
    };
    for (TBItem item : tests) {
        item.doTest();
    }
}
/**
 * Tests the method public Object clone().
 *
 * Sets the iterator's text to null, clones it, and verifies that the clone also reports
 * null text and that none of these steps throws an exception.
 */
@Test
public void TestClone() {
    RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");
    try {
        rbbi.setText((CharacterIterator) null);
        RuleBasedBreakIterator copy = (RuleBasedBreakIterator) rbbi.clone();
        if (copy.getText() != null) {
            errln("RuleBasedBreakIterator.clone() was supposed to return "
                    + "an iterator with null text because fText is set to null.");
        }
    } catch (Exception e) {
        // Include the exception itself so that the failure can be diagnosed from the log.
        errln("RuleBasedBreakIterator.clone() was not supposed to throw "
                + "an exception: " + e);
    }
}
/**
 * Tests the method public boolean equals(Object that): one iterator with null text,
 * both iterators with null text, and comparison against objects of unrelated types.
 */
@Test
public void TestEquals() {
    final String rules = ".;";
    RuleBasedBreakIterator first = new RuleBasedBreakIterator(rules);
    RuleBasedBreakIterator second = new RuleBasedBreakIterator(rules);

    // TODO: Tests when "if (fRData != other.fRData && (fRData == null || other.fRData == null))" is true

    // Tests when "if (fText == null || other.fText == null)" is true
    first.setText((CharacterIterator) null);
    boolean equalWithOneNullText = first.equals(second);
    if (equalWithOneNullText) {
        errln("RuleBasedBreakIterator.equals(Object) was not suppose to return "
                + "true when the other object has a null fText.");
    }

    // Tests when "if (fText == null && other.fText == null)" is true
    second.setText((CharacterIterator) null);
    boolean equalWithBothNullText = first.equals(second);
    if (!equalWithBothNullText) {
        errln("RuleBasedBreakIterator.equals(Object) was not suppose to return "
                + "false when both objects has a null fText.");
    }

    // Tests when an exception occurs
    Object boxedInt = Integer.valueOf(0);
    if (first.equals(boxedInt)) {
        errln("RuleBasedBreakIterator.equals(Object) was suppose to return " + "false when comparing to integer 0.");
    }
    Object boxedDouble = Double.valueOf(0.0);
    if (first.equals(boxedDouble)) {
        errln("RuleBasedBreakIterator.equals(Object) was suppose to return " + "false when comparing to float 0.0.");
    }
    Object text = "0";
    if (first.equals(text)) {
        errln("RuleBasedBreakIterator.equals(Object) was suppose to return "
                + "false when comparing to string '0'.");
    }
}
/**
 * Tests the method public int first(), both with null text and with real text.
 */
@Test
public void TestFirst() {
    RuleBasedBreakIterator iter = new RuleBasedBreakIterator(".;");

    // Tests when "if (fText == null)" is true
    iter.setText((CharacterIterator) null);
    int firstWithoutText = iter.first();
    assertEquals("RuleBasedBreakIterator.first()", BreakIterator.DONE, firstWithoutText);

    // With text set, first() is expected at offset 0 and the following boundary at offset 1.
    iter.setText("abc");
    int firstWithText = iter.first();
    assertEquals("RuleBasedBreakIterator.first()", 0, firstWithText);
    int boundaryAfterFirst = iter.next();
    assertEquals("RuleBasedBreakIterator.next()", 1, boundaryAfterFirst);
}
/**
 * Tests the method public int last() on an iterator whose text is null.
 */
@Test
public void TestLast() {
    RuleBasedBreakIterator iter = new RuleBasedBreakIterator(".;");

    // Tests when "if (fText == null)" is true
    iter.setText((CharacterIterator) null);
    int lastWithoutText = iter.last();
    boolean returnedDone = (lastWithoutText == BreakIterator.DONE);
    if (!returnedDone) {
        errln("RuleBasedBreakIterator.last() was suppose to return "
                + "BreakIterator.DONE when the object has a null fText.");
    }
}
/**
 * Tests the method public int following(int offset) with an offset before the text start.
 */
@Test
public void TestFollowing() {
    RuleBasedBreakIterator iter = new RuleBasedBreakIterator(".;");

    // Tests when "else if (offset < fText.getBeginIndex())" is true
    iter.setText("dummy");
    int boundary = iter.following(-1);
    if (boundary != 0) {
        errln("RuleBasedBreakIterator.following(-1) was suppose to return "
                + "0 when the object has a fText of dummy.");
    }
}
/**
 * Tests the method public int preceding(int offset).
 *
 * Covers an iterator with null text, where BreakIterator.DONE is expected, and an
 * offset before the start of the text, where 0 is expected.
 */
@Test
public void TestPreceding() {
    RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");
    // Tests when "if (fText == null || offset > fText.getEndIndex())" is true
    rbbi.setText((CharacterIterator)null);
    if (rbbi.preceding(-1) != BreakIterator.DONE) {
        // The message names the value actually checked for (DONE, not 0).
        errln("RuleBasedBreakIterator.preceding(-1) was supposed to return "
                + "BreakIterator.DONE when the object has a fText of null.");
    }
    // Tests when "else if (offset < fText.getBeginIndex())" is true
    rbbi.setText("dummy");
    if (rbbi.preceding(-1) != 0) {
        errln("RuleBasedBreakIterator.preceding(-1) was suppose to return "
                + "0 when the object has a fText of dummy.");
    }
}
/**
 * Tests the method public int current(), with null text and with real text.
 */
@Test
public void TestCurrent() {
    RuleBasedBreakIterator iter = new RuleBasedBreakIterator(".;");

    // Tests when "(fText != null) ? fText.getIndex() : BreakIterator.DONE" is true and false
    iter.setText((CharacterIterator) null);
    int currentWithoutText = iter.current();
    if (currentWithoutText != BreakIterator.DONE) {
        errln("RuleBasedBreakIterator.current() was suppose to return "
                + "BreakIterator.DONE when the object has a fText of null.");
    }

    iter.setText("dummy");
    int currentWithText = iter.current();
    if (currentWithText != 0) {
        errln("RuleBasedBreakIterator.current() was suppose to return "
                + "0 when the object has a fText of dummy.");
    }
}
/**
 * Verifies that constructing a RuleBasedBreakIterator from empty rules throws
 * IllegalArgumentException rather than succeeding or throwing something else.
 */
@Test
public void TestBug7547() {
    try {
        new RuleBasedBreakIterator("");
    } catch (IllegalArgumentException expected) {
        // expected exception with empty rules.
        return;
    } catch (Exception e) {
        fail("TestBug7547: Unexpected exception while creating RuleBasedBreakIterator: " + e);
        return;
    }
    // Reached only when the constructor returned normally.
    fail("TestBug7547: RuleBasedBreakIterator constructor failed to throw an exception with empty rules.");
}
/**
 * Rule chaining test: with the chained rules below, the first boundary after the start
 * of "abc" is expected at offset 3.
 */
@Test
public void TestBug12797() {
    final String chainedRules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
    RuleBasedBreakIterator iter = new RuleBasedBreakIterator(chainedRules);
    iter.setText("abc");
    iter.first();
    int firstBoundary = iter.next();
    assertEquals("Rule chaining test", 3, firstBoundary);
}
/**
 * Regression test for bug 12873: concurrent use of a break iterator and its clones.
 *
 * Four worker threads each clone a shared line break iterator and repeatedly iterate
 * over a string of regional indicator symbols. Any failure raised inside a worker is
 * captured and rethrown on the test thread after all workers have been joined.
 */
@Test
public void TestBug12873() {
    // Bug with RuleBasedBreakIterator's internal structure for recording potential look-ahead
    // matches not being cloned when a break iterator is cloned. This resulted in usage
    // collisions if the original break iterator and its clone were used concurrently.
    // The Line Break rules for Regional Indicators make use of look-ahead rules, and
    // show the bug. U+1F1E6 (surrogate pair D83C DDE6) = REGIONAL INDICATOR SYMBOL LETTER A
    // Regional indicators group into pairs, expect breaks after two code points, which
    // is after four 16 bit code units.
    final String dataToBreak = "\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6";
    final RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance();
    final AssertionError[] assertErr = new AssertionError[1]; // saves an error found from within a thread
    class WorkerThread implements Runnable {
        @Override
        public void run() {
            try {
                RuleBasedBreakIterator localBI = (RuleBasedBreakIterator)bi.clone();
                localBI.setText(dataToBreak);
                for (int loop=0; loop<100; loop++) {
                    int nextExpectedBreak = 0;
                    for (int actualBreak = localBI.first(); actualBreak != BreakIterator.DONE;
                            actualBreak = localBI.next(), nextExpectedBreak+= 4) {
                        assertEquals("", nextExpectedBreak, actualBreak);
                    }
                    assertEquals("", dataToBreak.length()+4, nextExpectedBreak);
                }
            } catch (AssertionError e) {
                assertErr[0] = e;
            } catch (RuntimeException e) {
                // Without this, an unexpected exception in a worker would only end that
                // thread and the test would still pass; report it as a failure instead.
                assertErr[0] = new AssertionError("Unexpected exception in worker thread", e);
            }
        }
    }
    List<Thread> threads = new ArrayList<>();
    for (int n = 0; n<4; ++n) {
        threads.add(new Thread(new WorkerThread()));
    }
    for (Thread thread: threads) {
        thread.start();
    }
    for (Thread thread: threads) {
        try {
            thread.join();
        } catch (InterruptedException e) {
            fail(e.toString());
        }
    }
    // JUnit won't see failures from within the worker threads, so
    // check again if one occurred.
    if (assertErr[0] != null) {
        throw assertErr[0];
    }
}
/**
 * Breaks text that contains every code point, for every kind of break iterator, and
 * verifies that the boundaries returned are strictly increasing.
 */
@Test
public void TestBreakAllChars() {
    // Make a "word" from each code point, separated by spaces.
    // For dictionary based breaking, runs the start-of-range
    // logic with all possible dictionary characters.
    StringBuilder sb = new StringBuilder();
    for (int c=0; c<0x110000; ++c) {
        sb.appendCodePoint(c);
        sb.appendCodePoint(c);
        sb.appendCodePoint(c);
        sb.appendCodePoint(c);
        sb.append(' ');
    }
    String s = sb.toString();
    for (int breakKind=BreakIterator.KIND_CHARACTER; breakKind<=BreakIterator.KIND_TITLE; ++breakKind) {
        RuleBasedBreakIterator bi =
                (RuleBasedBreakIterator)BreakIterator.getBreakInstance(ULocale.ENGLISH, breakKind);
        bi.setText(s);
        int lastb = -1;
        for (int b = bi.first(); b != BreakIterator.DONE; b = bi.next()) {
            assertTrue("(lastb < b) : (" + lastb + " < " + b + ")", lastb < b);
            // Remember this boundary so the next one is compared against it,
            // not against the initial -1.
            lastb = b;
        }
    }
}
/**
 * Regression test for bug 12918: iterates word boundaries over a two-character string
 * and verifies that each boundary lies strictly after the previous one.
 */
@Test
public void TestBug12918() {
    // This test triggered an assertion failure in ICU4C, in dictbe.cpp
    // The equivalent code in ICU4J is structured slightly differently,
    // and does not appear vulnerable to the same issue.
    //
    // U+3325 decomposes with normalization, then the CJK dictionary
    // finds a break within the decomposition.
    String crasherString = "\u3325\u4a16";
    BreakIterator iter = BreakIterator.getWordInstance(ULocale.ENGLISH);
    iter.setText(crasherString);
    iter.first();
    int pos;
    int lastPos = -1;
    while((pos = iter.next()) != BreakIterator.DONE) {
        assertTrue("Boundaries must be strictly increasing: (" + lastPos + " < " + pos + ")",
                pos > lastPos);
        // Track the previous boundary so the comparison above is meaningful.
        lastPos = pos;
    }
}
/**
 * Checks that word break iterators report the expected valid locale, that iterators for
 * different locales still compare equal, and that clones keep both equality and locale.
 */
@Test
public void TestBug12519() {
    RuleBasedBreakIterator english =
            (RuleBasedBreakIterator) BreakIterator.getWordInstance(ULocale.ENGLISH);
    RuleBasedBreakIterator french =
            (RuleBasedBreakIterator) BreakIterator.getWordInstance(ULocale.FRANCE);

    ULocale englishValid = english.getLocale(ULocale.VALID_LOCALE);
    assertEquals("", ULocale.ENGLISH, englishValid);
    ULocale frenchValid = french.getLocale(ULocale.VALID_LOCALE);
    assertEquals("", ULocale.FRENCH, frenchValid);
    assertEquals("Locales do not participate in BreakIterator equality.", english, french);

    // The English clone must equal its source and report the same valid locale.
    RuleBasedBreakIterator englishClone = (RuleBasedBreakIterator) english.clone();
    assertEquals("", english, englishClone);
    ULocale englishCloneValid = englishClone.getLocale(ULocale.VALID_LOCALE);
    assertEquals("", ULocale.ENGLISH, englishCloneValid);

    // Same checks for the French clone.
    RuleBasedBreakIterator frenchClone = (RuleBasedBreakIterator) french.clone();
    assertEquals("", french, frenchClone);
    ULocale frenchCloneValid = frenchClone.getLocale(ULocale.VALID_LOCALE);
    assertEquals("", ULocale.FRENCH, frenchCloneValid);
}
/**
 * Worker thread for TestBug13512. The constructor computes the expected word boundaries
 * of the text; run() recomputes them up to 10000 times from clones of a shared break
 * iterator and stops at the first result that differs from the expected one.
 */
static class T13512Thread extends Thread {
    private final String fText;
    /** Boundaries from the most recent pass in run(); unset until run() has executed. */
    public List<Integer> fBoundaries;
    /** Boundaries computed once in the constructor. */
    public List<Integer> fExpectedBoundaries;

    T13512Thread(String text) {
        fText = text;
        fExpectedBoundaries = getBoundary(fText);
    }

    @Override
    public void run() {
        for (int i= 0; i<10000; ++i) {
            fBoundaries = getBoundary(fText);
            if (!fBoundaries.equals(fExpectedBoundaries)) {
                // Leave the mismatching result in fBoundaries for the caller to inspect.
                break;
            }
        }
    }

    // Shared prototype; it is only ever cloned, and the clones do the iteration.
    private static final BreakIterator BREAK_ITERATOR_CACHE = BreakIterator.getWordInstance(ULocale.ROOT);

    /**
     * Returns every boundary of toParse, from first() up to the last boundary before
     * DONE, using a fresh clone of the shared word break iterator.
     */
    public static List<Integer> getBoundary(String toParse) {
        List<Integer> retVal = new ArrayList<>();
        BreakIterator bi = (BreakIterator) BREAK_ITERATOR_CACHE.clone();
        bi.setText(toParse);
        for (int boundary=bi.first(); boundary != BreakIterator.DONE; boundary = bi.next()) {
            retVal.add(boundary);
        }
        return retVal;
    }
}
/**
 * Runs two T13512Thread workers concurrently, one over Thai text and one over Japanese
 * text, and verifies that the boundaries each worker last computed equal the boundaries
 * it computed up front.
 */
@Test
public void TestBug13512() {
    String japanese = "コンピューターは、本質的には数字しか扱うことができません。コンピューターは、文字や記号などのそれぞれに番号を割り振る"
            + "ことによって扱えるようにします。ユニコードが出来るまでは、これらの番号を割り振る仕組みが何百種類も存在しました。どの一つをとっても、十分な"
            + "文字を含んではいませんでした。例えば、欧州連合一つを見ても、そのすべての言語をカバーするためには、いくつかの異なる符号化の仕"
            + "組みが必要でした。英語のような一つの言語に限っても、一つだけの符号化の仕組みでは、一般的に使われるすべての文字、句読点、技術"
            + "的な記号などを扱うには不十分でした。";
    String thai = "โดยพื้นฐานแล้ว, คอมพิวเตอร์จะเกี่ยวข้องกับเรื่องของตัวเลข. คอมพิวเตอร์จัดเก็บตัวอักษรและอักขระอื่นๆ"
            + " โดยการกำหนดหมายเลขให้สำหรับแต่ละตัว. ก่อนหน้าที่๊ Unicode จะถูกสร้างขึ้น, ได้มีระบบ encoding "
            + "อยู่หลายร้อยระบบสำหรับการกำหนดหมายเลขเหล่านี้. ไม่มี encoding ใดที่มีจำนวนตัวอักขระมากเพียงพอ: ยกตัวอย่างเช่น, "
            + "เฉพาะในกลุ่มสหภาพยุโรปเพียงแห่งเดียว ก็ต้องการหลาย encoding ในการครอบคลุมทุกภาษาในกลุ่ม. "
            + "หรือแม้แต่ในภาษาเดี่ยว เช่น ภาษาอังกฤษ ก็ไม่มี encoding ใดที่เพียงพอสำหรับทุกตัวอักษร, "
            + "เครื่องหมายวรรคตอน และสัญลักษณ์ทางเทคนิคที่ใช้กันอยู่ทั่วไป.\n" +
            "ระบบ encoding เหล่านี้ยังขัดแย้งซึ่งกันและกัน. นั่นก็คือ, ในสอง encoding สามารถใช้หมายเลขเดียวกันสำหรับตัวอักขระสองตัวที่แตกต่างกัน,"
            + "หรือใช้หมายเลขต่างกันสำหรับอักขระตัวเดียวกัน. ในระบบคอมพิวเตอร์ (โดยเฉพาะเซิร์ฟเวอร์) ต้องมีการสนับสนุนหลาย"
            + " encoding; และเมื่อข้อมูลที่ผ่านไปมาระหว่างการเข้ารหัสหรือแพล็ตฟอร์มที่ต่างกัน, ข้อมูลนั้นจะเสี่ยงต่อการผิดพลาดเสียหาย.";

    T13512Thread thaiWorker = new T13512Thread(thai);
    T13512Thread japaneseWorker = new T13512Thread(japanese);
    try {
        // Start both before joining either so that they run at the same time.
        thaiWorker.start();
        japaneseWorker.start();
        thaiWorker.join();
        japaneseWorker.join();
    } catch (Exception e) {
        fail(e.toString());
    }
    assertEquals("", thaiWorker.fExpectedBoundaries, thaiWorker.fBoundaries);
    assertEquals("", japaneseWorker.fExpectedBoundaries, japaneseWorker.fBoundaries);
}
/**
 * Check that stripping of comments from rules (toString() here, getRules() in C++) is
 * not confused by '#' characters in the rules that do not introduce comments.
 */
@Test
public void TestBug12677() {
    StringBuilder source = new StringBuilder();
    source.append("!!forward; \n");
    source.append("$x = [ab#]; # a set with a # literal. \n");
    source.append(" # .; # a comment that looks sort of like a rule. \n");
    source.append(" '#' '?'; # a rule with a quoted # \n");

    RuleBasedBreakIterator iter = new RuleBasedBreakIterator(source.toString());
    String strippedRules = iter.toString();
    assertEquals("Break Iterator rule stripping test", "!!forward;$x=[ab#];'#''?';", strippedRules);
}
/**
 * Rebuilds the English line break rules from source and checks that the resulting
 * forward state table has no duplicate columns (character categories) and no duplicate
 * rows (states).
 */
@Test
public void TestTableRedundancies() {
    // Build a break iterator from source rules.
    // Want to check the rule builder in Java, not the pre-built rules that are imported from ICU4C.
    RuleBasedBreakIterator prebuilt =
            (RuleBasedBreakIterator) BreakIterator.getLineInstance(Locale.ENGLISH);
    RuleBasedBreakIterator bi = new RuleBasedBreakIterator(prebuilt.toString());

    RBBIDataWrapper dw = bi.fRData;
    RBBIDataWrapper.RBBIStateTable fwtbl = dw.fFTable;
    final int numCharClasses = dw.fHeader.fCatCount;
    final int numStates = fwtbl.fNumStates;

    // Check for duplicate columns (character categories).
    // Each column is flattened to a string of its next-state values for states 1 and up.
    List<String> columns = new ArrayList<>();
    for (int column = 0; column < numCharClasses; column++) {
        StringBuilder cells = new StringBuilder();
        for (int state = 1; state < numStates; state++) {
            int nextStatesBase = dw.getRowIndex(state) + RBBIDataWrapper.NEXTSTATES;
            cells.append(fwtbl.fTable[nextStatesBase + column]);
        }
        columns.add(cells.toString());
    }

    // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
    for (int c1 = 1; c1 < numCharClasses; c1++) {
        // A column below fDictCategoriesStart is only compared with other columns below it.
        int limit;
        if (c1 < fwtbl.fDictCategoriesStart) {
            limit = fwtbl.fDictCategoriesStart;
        } else {
            limit = numCharClasses;
        }
        String left = columns.get(c1);
        for (int c2 = c1 + 1; c2 < limit; c2++) {
            boolean duplicate = left.equals(columns.get(c2));
            assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), duplicate);
        }
    }

    // Check for duplicate states.
    // Each row is flattened to its accepting, lookahead and tags fields plus all next-state values.
    List<String> rows = new ArrayList<>();
    for (int state = 0; state < numStates; state++) {
        int row = dw.getRowIndex(state);
        StringBuilder cells = new StringBuilder();
        cells.append(fwtbl.fTable[row + RBBIDataWrapper.ACCEPTING]);
        cells.append(fwtbl.fTable[row + RBBIDataWrapper.LOOKAHEAD]);
        cells.append(fwtbl.fTable[row + RBBIDataWrapper.TAGSIDX]);
        for (int column = 0; column < numCharClasses; column++) {
            cells.append(fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column]);
        }
        rows.add(cells.toString());
    }
    for (int r1 = 0; r1 < numStates; r1++) {
        String upper = rows.get(r1);
        for (int r2 = r1 + 1; r2 < numStates; r2++) {
            boolean duplicate = upper.equals(rows.get(r2));
            assertFalse(String.format("Duplicate states (%d, %d)", r1, r2), duplicate);
        }
    }
}
/**
 * Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
 * even after next() has returned DONE.
 */
@Test
public void TestBug13447() {
    RuleBasedBreakIterator wordIter =
            (RuleBasedBreakIterator) BreakIterator.getWordInstance(Locale.ENGLISH);
    wordIter.setText("1234");

    // Before any iteration.
    int initialStatus = wordIter.getRuleStatus();
    assertEquals("", BreakIterator.WORD_NONE, initialStatus);

    // The only boundary after the start is the end of the number.
    int firstBoundary = wordIter.next();
    assertEquals("", 4, firstBoundary);
    int statusAtEnd = wordIter.getRuleStatus();
    assertEquals("", BreakIterator.WORD_NUMBER, statusAtEnd);

    // Running off the end must leave both position and status unchanged.
    int pastEnd = wordIter.next();
    assertEquals("", BreakIterator.DONE, pastEnd);
    int positionAfterDone = wordIter.current();
    assertEquals("", 4, positionAfterDone);
    int statusAfterDone = wordIter.getRuleStatus();
    assertEquals("", BreakIterator.WORD_NUMBER, statusAfterDone);
}
/**
 * Test to verify that rebuilding the state tables from rule source for the standard
 * break iterator types yields the same tables as are imported from ICU4C as part of
 * the default data.
 */
@Test
public void TestTableRebuild() {
    RuleBasedBreakIterator[] prebuiltIterators = {
        (RuleBasedBreakIterator) BreakIterator.getCharacterInstance(Locale.ENGLISH),
        (RuleBasedBreakIterator) BreakIterator.getWordInstance(Locale.ENGLISH),
        (RuleBasedBreakIterator) BreakIterator.getSentenceInstance(Locale.ENGLISH),
        (RuleBasedBreakIterator) BreakIterator.getLineInstance(Locale.ENGLISH),
    };
    for (RuleBasedBreakIterator prebuilt : prebuiltIterators) {
        // toString() supplies the rule source from which the second iterator is built.
        RuleBasedBreakIterator rebuilt = new RuleBasedBreakIterator(prebuilt.toString());

        boolean forwardSame = RBBIDataWrapper.equals(prebuilt.fRData.fFTable, rebuilt.fRData.fFTable);
        assertTrue("Forward Table", forwardSame);

        boolean reverseSame = RBBIDataWrapper.equals(prebuilt.fRData.fRTable, rebuilt.fRData.fRTable);
        assertTrue("Reverse Table", reverseSame);
    }
}
/**
 * Helper function to test 8/16 bits of trie and 8/16 bits of state table.
 *
 * Builds rules with one quoted two-character literal per character for the first
 * numChar characters starting at U+4E00, checks the sizes and bit widths of the
 * generated tables, then iterates forwards and backwards over test text.
 *
 * @param numChar number of distinct characters that get a literal rule
 * @param expectUCPTrieValueWidthIn8Bits whether the trie is expected to use 8-bit values
 * @param expectStateRowIn8Bits whether both state tables are expected to use 8-bit rows
 */
private void testTrieStateTable(int numChar, boolean expectUCPTrieValueWidthIn8Bits,
        boolean expectStateRowIn8Bits) {
    final char firstChar = 0x4e00;
    final char limitChar = 0x5000;
    final String suffix = "numChar=" + numChar;

    // Text are duplicate characters from U+4E00 to U+4FFF
    StringBuilder textBuilder = new StringBuilder(2 * (limitChar - firstChar));
    for (char c = firstChar; c < limitChar; c++) {
        textBuilder.append(c).append(c);
    }
    String text = textBuilder.toString();

    // Generate rules which will cause numChar+4 character classes and
    // numChar+3 states
    StringBuilder ruleBuilder = new StringBuilder(100 + 6 * numChar);
    ruleBuilder.append("!!quoted_literals_only;");
    for (char c = firstChar; c < firstChar + numChar; c++) {
        ruleBuilder.append("'").append(c).append(c).append("';");
    }
    ruleBuilder.append(".;");

    RuleBasedBreakIterator bi = new RuleBasedBreakIterator(ruleBuilder.toString());
    RBBIDataWrapper dw = bi.fRData;
    RBBIDataWrapper.RBBIStateTable forward = dw.fFTable;
    RBBIDataWrapper.RBBIStateTable reverse = dw.fRTable;

    boolean forwardRowsIn8Bits = (forward.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
    boolean reverseRowsIn8Bits = (reverse.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
    boolean trieIn8Bits = dw.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;

    assertEquals("Number of char classes mismatch " + suffix, numChar + 4, dw.fHeader.fCatCount);
    assertEquals("Number of states in Forward Table mismatch " + suffix, numChar + 3, forward.fNumStates);
    assertEquals("Number of states in Reverse Table mismatch " + suffix, numChar + 3, reverse.fNumStates);
    assertEquals("Trie width mismatch " + suffix, expectUCPTrieValueWidthIn8Bits, trieIn8Bits);
    assertEquals("Bits of Forward State table mismatch " + suffix,
            expectStateRowIn8Bits, forwardRowsIn8Bits);
    assertEquals("Bits of Reverse State table mismatch " + suffix,
            expectStateRowIn8Bits, reverseRowsIn8Bits);

    bi.setText(text);

    // Forward: count is the number of boundaries seen so far, including the current one.
    int count = 0;
    for (int pos = bi.next(); pos > 0; pos = bi.next()) {
        boolean withinPairs = count < numChar;
        count++;
        // The first numChar boundaries should not break between the pair;
        // after that, break on each character.
        int expected = withinPairs ? count * 2 : count + numChar;
        assertEquals("next() mismatch " + suffix, expected, pos);
    }

    // Backward: undo the count one boundary at a time and apply the same position rule.
    for (int pos = bi.previous(); pos > 0; pos = bi.previous()) {
        count--;
        int expected = (count < numChar) ? count * 2 : count + numChar;
        assertEquals("previous() mismatch " + suffix, expected, pos);
    }
}
/** Runs testTrieStateTable with 251 characters, expecting an 8-bit trie and 8-bit state rows. */
@Test
public void Test8BitsTrieWith8BitStateTable() {
    final int numChar = 251;
    final boolean expectUCPTrieValueWidthIn8Bits = true;
    final boolean expectStateRowIn8Bits = true;
    testTrieStateTable(numChar, expectUCPTrieValueWidthIn8Bits, expectStateRowIn8Bits);
}
/** Runs testTrieStateTable with 252 characters, expecting a 16-bit trie and 8-bit state rows. */
@Test
public void Test16BitsTrieWith8BitStateTable() {
    final int numChar = 252;
    final boolean expectUCPTrieValueWidthIn8Bits = false;
    final boolean expectStateRowIn8Bits = true;
    testTrieStateTable(numChar, expectUCPTrieValueWidthIn8Bits, expectStateRowIn8Bits);
}
/** Runs testTrieStateTable with 253 characters, expecting a 16-bit trie and 16-bit state rows. */
@Test
public void Test16BitsTrieWith16BitStateTable() {
    final int numChar = 253;
    final boolean expectUCPTrieValueWidthIn8Bits = false;
    final boolean expectStateRowIn8Bits = false;
    testTrieStateTable(numChar, expectUCPTrieValueWidthIn8Bits, expectStateRowIn8Bits);
}
/**
 * Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Uses a different approach from
 * testTrieStateTable to create a state table in 16 bits: one long literal made of a
 * single repeated character.
 */
@Test
public void Test8BitsTrieWith16BitStateTable() {
    final int textLength = 510;
    final int literalLength = 254;

    // Generate 510 'a' as text
    StringBuilder textBuilder = new StringBuilder(textLength);
    for (int n = textLength; n > 0; n--) {
        textBuilder.append('a');
    }
    String text = textBuilder.toString();

    // 254 'a' in the rule will cause 256 states
    StringBuilder ruleBuilder = new StringBuilder(550);
    ruleBuilder.append("!!quoted_literals_only;'");
    for (int n = literalLength; n > 0; n--) {
        ruleBuilder.append('a');
    }
    ruleBuilder.append("';.;");

    RuleBasedBreakIterator bi = new RuleBasedBreakIterator(ruleBuilder.toString());
    RBBIDataWrapper dw = bi.fRData;
    RBBIDataWrapper.RBBIStateTable forward = dw.fFTable;
    boolean rowsIn8Bits = (forward.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
    boolean trieIn8Bits = dw.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;
    assertFalse("State table should be in 16 bits", rowsIn8Bits);
    assertTrue("Trie should be in 8 bits", trieIn8Bits);

    bi.setText(text);
    // break positions:
    // 254, 508, 509, 510
    int firstBoundary = bi.next();
    assertEquals("next()", 254, firstBoundary);
    int expectedForward = 508;
    for (int pos = bi.next(); pos > 0; pos = bi.next()) {
        assertEquals("next()", expectedForward, pos);
        expectedForward++;
    }

    // Walk back from the end: single-character steps down to 508, then the literal boundary at 254.
    int steps = 0;
    for (int pos = bi.previous(); pos > 0; pos = bi.previous()) {
        steps++;
        int expectedBackward = (pos >= 508) ? 510 - steps : 254;
        assertEquals("previous()", expectedBackward, pos);
    }
}
/**
 * Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
 * that there are no problems with rules at the size that transitions between the two.
 *
 * A rule that matches a literal string, like 'abcdefghij', will require one state and
 * one character class per character in the string. So we can make a rule to tickle the
 * boundaries by using literal strings of various lengths.
 *
 * The test runs literal lengths from 120 through 260. It asserts that the shortest rule
 * produces 8-bit state rows and an 8-bit trie and that the longest produces 16-bit ones,
 * so the switch between the formats happens somewhere inside the tested range. The wide
 * range leaves margin for changes to the number of values reserved by the rule builder.
 */
@Test
public void TestTable_8_16_Bits() {
    // testStr serves as both the source of the rule string (truncated to the desired length)
    // and as test data to check matching behavior. A break rule consisting of the first 120
    // characters of testStr will match the first 120 chars of the full-length testStr.
    StringBuilder testStrBuilder = new StringBuilder(0x200);
    for (char c = 0x3000; c < 0x3200; ++c) {
        testStrBuilder.append(c);
    }
    final String testStr = testStrBuilder.toString();

    final int startLength = 120; // The shortest rule string to test.
    final int endLength = 260;   // The longest rule string to test

    for (int ruleLen = startLength; ruleLen <= endLength; ruleLen++) {
        String literal = testStr.substring(0, ruleLen);
        String ruleString = "!!quoted_literals_only; '" + literal + "';";
        RuleBasedBreakIterator bi = new RuleBasedBreakIterator(ruleString);

        // Verify that the break iterator is functioning - that the first boundary found
        // in testStr is at the length of the rule string.
        bi.setText(testStr);
        int firstBoundary = bi.next();
        assertEquals("The first boundary found in testStr should be at the length of the rule string",
                ruleLen, firstBoundary);

        // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
        // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
        bi.setText(testStr);
        int result = bi.preceding(ruleLen);
        assertEquals("Reverse iteration should find the boundary at 0", 0, result);

        // Verify that the range of rule lengths being tested covers the transitions
        // from 8 to 16 bit data.
        RBBIDataWrapper dw = bi.fRData;
        RBBIDataWrapper.RBBIStateTable forward = dw.fFTable;
        boolean rowsIn8Bits = (forward.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0;
        boolean trieIn8Bits = dw.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8;
        if (ruleLen == startLength) {
            assertTrue("State table should be in 8 bits", rowsIn8Bits);
            assertTrue("Trie should be in 8 bits", trieIn8Bits);
        }
        if (ruleLen == endLength) {
            assertFalse("State table should be in 16 bits", rowsIn8Bits);
            assertFalse("Trie should be in 16 bits", trieIn8Bits);
        }
    }
}
/**
 * Test handling of a large number of look-ahead rules.
 * The number of rules in the test exceeds the implementation limits prior to the
 * improvements introduced with #13590.
 *
 * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
 * The text being matched is sequential, "ABCDEFGHI..."
 *
 * The upshot is that the look-ahead rules all match on their preceding context,
 * and consequently must save a potential result, but then fail to match on their
 * trailing context, so that they don't actually cause a boundary.
 *
 * Additionally, add a ".*" rule, so there are no boundaries unless a
 * look-ahead hard-break rule forces one.
 */
@Test
public void TestBug13590() {
    final int numLookaheadRules = 50;
    final char startingChar = '\u5000';

    StringBuilder rules = new StringBuilder("!!quoted_literals_only; !!chain; .*;\n");
    char firstChar = 0;
    for (int ruleNum = 0; ruleNum < numLookaheadRules; ++ruleNum) {
        firstChar = (char) (startingChar + ruleNum * 2);
        // Preceding context: two consecutive characters.
        rules.append("'").append(firstChar).append((char) (firstChar + 1)).append("'");
        rules.append(" / ");
        // Trailing context: skips one character, so it does not match sequential text.
        rules.append("'").append((char) (firstChar + 2)).append((char) (firstChar + 4)).append("'");
        rules.append(";\n");
    }

    // Change the last rule added from the form "UV / WY" to "UV / WX".
    // Changes the rule so that it will match - all 4 chars are in ascending sequence.
    char skippedTo = (char) (firstChar + 4);
    char sequential = (char) (firstChar + 3);
    String rulesStr = rules.toString().replace(skippedTo, sequential);
    RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rulesStr);
    // bi.dump(System.out);

    // Sequential text that starts 200 characters before the first rule character.
    StringBuilder testString = new StringBuilder();
    for (char c = (char) (startingChar - 200); c < startingChar + numLookaheadRules * 4; ++c) {
        testString.append(c);
    }
    bi.setText(testString);

    int breaksFound = 0;
    for (int pos = bi.next(); pos != BreakIterator.DONE; pos = bi.next()) {
        ++breaksFound;
    }

    // Two matches are expected, one from the last rule that was explicitly modified,
    // and one at the end of the text.
    assertEquals("Wrong number of breaks found", 2, breaksFound);
}
}