// blob: d35195b49c2c5a81b7e2e4df6874c15c4bff54fb [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
* Created on May 5, 2004
*
* Copyright (C) 2004-2016 International Business Machines Corporation and others.
* All Rights Reserved.
*
*/
package com.ibm.icu.dev.test.rbbi;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;
/**
* Rule based break iterator data driven test.
* Perform the tests from the file rbbitst.txt.
* The test data file is common to both ICU4C and ICU4J.
* See the data file for a description of the tests.
*
*/
@RunWith(JUnit4.class)
public class RBBITestExtended extends TestFmwk {
/**
 * Creates the test instance. No state is set up here; each test method
 * builds whatever it needs.
 */
public RBBITestExtended() {
    super();
}
/**
 * State shared between the test-file parser and {@code executeTest}:
 * the break iterator under test, the text of the current data block, and
 * tables indexed by offset into that text.
 */
static class TestParams {
    /** Number of slots in each of the per-offset tables below. */
    private static final int TABLE_SIZE = 4000;

    /** Break iterator selected by the most recent type or rules tag; null until one is seen. */
    BreakIterator bi;

    /** Text of the current data block, with escapes already resolved. */
    StringBuilder dataToBreak = new StringBuilder();

    /** Per offset: 0 = no break expected, -1 = break expected, other = break expected with that status value. */
    int[] expectedBreaks = new int[TABLE_SIZE];

    /** Per offset: line of the test file the text at that offset came from (error reporting only). */
    int[] srcLine = new int[TABLE_SIZE];

    /** Per offset: column of the test file the text at that offset came from (error reporting only). */
    int[] srcCol = new int[TABLE_SIZE];

    /** Locale used when a break iterator instance is requested. */
    ULocale currentLocale = new ULocale("en_US");
}
/**
 * Data driven break iterator test. Reads the resource rbbitst.txt, parses it
 * with a small state machine, and calls {@link #executeTest} once for every
 * data block it finds.
 *
 * <p>Constructs recognized by the parser:
 * <ul>
 *   <li>{@code #} starts a comment that runs to the end of the line.</li>
 *   <li>{@code <word>}, {@code <char>}, {@code <line>}, {@code <sent>}, {@code <title>}
 *       select a break iterator of that type for the current locale.</li>
 *   <li>{@code <locale name>} changes the current locale.</li>
 *   <li>{@code <rules> ... </rules>} builds a RuleBasedBreakIterator from the enclosed text;
 *       {@code <badrules> ... </badrules>} expects that construction to fail.</li>
 *   <li>{@code <data> ... </data>} holds text to break. Inside it, a bullet character or
 *       {@code <>} marks an expected break, {@code <nnn>} marks an expected break with
 *       rule status nnn, {@code \N{name}} inserts a named character, a backslash directly
 *       before a line end is a line continuation, and other backslash sequences are
 *       handed to {@code Utility.unescapeAt}.</li>
 * </ul>
 *
 * <p>Parsing stops at the first tag-level or numeric-tag syntax error.
 */
@Test
public void TestExtended() {
    TestParams tp = new TestParams();

    String testString = readTestFile();
    if (testString == null) {
        // The failure has already been reported.
        return;
    }

    final int PARSE_COMMENT = 1;
    final int PARSE_TAG = 2;
    final int PARSE_DATA = 3;
    final int PARSE_NUM = 4;
    final int PARSE_RULES = 5;

    int parseState = PARSE_TAG;
    int savedState = PARSE_TAG;   // State to return to when a comment ends.
    int lineNum = 1;
    int colStart = 0;             // Index just past the most recent line end.
    int column = 0;
    int charIdx = 0;
    int tagValue = 0;             // The numeric value of a <nnn> tag.
    StringBuilder rules = new StringBuilder();  // Holds rules from a <rules> ... </rules> block
    int rulesFirstLine = 0;       // Line number of the start of current <rules> block
    int len = testString.length();

    for (charIdx = 0; charIdx < len; ) {
        int c = testString.codePointAt(charIdx);
        // Step over the whole code point. Advancing by a single char would make the
        // next iteration see the low surrogate of a supplementary character again.
        charIdx += Character.charCount(c);
        if (c == '\r' && charIdx < len && testString.charAt(charIdx) == '\n') {
            // treat CRLF as a unit
            c = '\n';
            charIdx++;
        }
        if (c == '\n' || c == '\r') {
            lineNum++;
            colStart = charIdx;
        }
        column = charIdx - colStart + 1;

        // Every tag starts with an ASCII character, so when a tag matches below,
        // charIdx-1 is the index of the character just read.
        switch (parseState) {
        case PARSE_COMMENT:
            if (c == 0x0a || c == 0x0d) {
                parseState = savedState;
            }
            break;

        case PARSE_TAG:
            {
                if (c == '#') {
                    parseState = PARSE_COMMENT;
                    savedState = PARSE_TAG;
                    break;
                }
                if (UCharacter.isWhitespace(c)) {
                    break;
                }
                if (testString.startsWith("<word>", charIdx-1)) {
                    tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
                    charIdx += 5;
                    break;
                }
                if (testString.startsWith("<char>", charIdx-1)) {
                    tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
                    charIdx += 5;
                    break;
                }
                if (testString.startsWith("<line>", charIdx-1)) {
                    tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
                    charIdx += 5;
                    break;
                }
                if (testString.startsWith("<sent>", charIdx-1)) {
                    tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
                    charIdx += 5;
                    break;
                }
                if (testString.startsWith("<title>", charIdx-1)) {
                    tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
                    charIdx += 6;
                    break;
                }
                if (testString.startsWith("<rules>", charIdx-1) ||
                        testString.startsWith("<badrules>", charIdx-1)) {
                    // The matched tag contains '>', so indexOf cannot fail here.
                    charIdx = testString.indexOf('>', charIdx) + 1;
                    parseState = PARSE_RULES;
                    rules.setLength(0);
                    rulesFirstLine = lineNum;
                    break;
                }
                if (testString.startsWith("<locale ", charIdx-1)) {
                    int closeIndex = testString.indexOf(">", charIdx);
                    if (closeIndex < 0) {
                        errln("line" + lineNum + ": missing close on <locale tag.");
                        break;
                    }
                    // charIdx+6 is the blank that ends "<locale "; trim() removes it.
                    String localeName = testString.substring(charIdx+6, closeIndex);
                    localeName = localeName.trim();
                    tp.currentLocale = new ULocale(localeName);
                    charIdx = closeIndex+1;
                    break;
                }
                if (testString.startsWith("<data>", charIdx-1)) {
                    parseState = PARSE_DATA;
                    charIdx += 5;
                    tp.dataToBreak.setLength(0);
                    Arrays.fill(tp.expectedBreaks, 0);
                    Arrays.fill(tp.srcCol, 0);
                    Arrays.fill(tp.srcLine, 0);
                    break;
                }
                errln("line" + lineNum + ": Tag expected in test file.");
                return;
            }

        case PARSE_RULES:
            if (testString.startsWith("</rules>", charIdx-1)) {
                charIdx += 7;
                parseState = PARSE_TAG;
                try {
                    tp.bi = new RuleBasedBreakIterator(rules.toString());
                } catch (IllegalArgumentException e) {
                    errln(String.format("rbbitst.txt:%d  Error creating break iterator from rules.  %s", lineNum, e));
                }
            } else if (testString.startsWith("</badrules>", charIdx-1)) {
                charIdx += 10;
                parseState = PARSE_TAG;
                boolean goodRules = true;
                try {
                    new RuleBasedBreakIterator(rules.toString());
                } catch (IllegalArgumentException e) {
                    // Expected outcome for a <badrules> block.
                    goodRules = false;
                }
                if (goodRules) {
                    errln(String.format(
                            "rbbitst.txt:%d  Expected, but did not get, a failure creating break iterator from rules.",
                            lineNum));
                }
            } else {
                rules.appendCodePoint(c);
            }
            break;

        case PARSE_DATA:
            if (c == '•') {
                int breakIdx = tp.dataToBreak.length();
                if (tp.expectedBreaks[breakIdx] != 0) {
                    errln(String.format(
                            "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
                            lineNum, column));
                }
                tp.expectedBreaks[breakIdx] = -1;
                tp.srcLine[breakIdx] = lineNum;
                tp.srcCol[breakIdx] = column;
                break;
            }

            if (testString.startsWith("</data>", charIdx-1)) {
                // Add final entry to mappings from break location to source file position.
                // Need one extra because last break position returned is after the
                // last char in the data, not at the last char.
                int idx = tp.dataToBreak.length();
                tp.srcLine[idx] = lineNum;
                tp.srcCol[idx] = column;

                parseState = PARSE_TAG;
                charIdx += 6;

                // RUN THE TEST!
                executeTest(tp);
                break;
            }

            if (testString.startsWith("\\N{", charIdx-1)) {
                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
                int nameEndIdx = testString.indexOf('}', charIdx);
                if (nameEndIdx == -1) {
                    errln("Error in named character in test file at line " + lineNum +
                            ", col " + column);
                    // Without a closing brace there is no name to extract; calling
                    // substring() with -1 would throw. Skip the lookup and carry on.
                    break;
                }
                // Get the code point from the name and insert it into the test data.
                String charName = testString.substring(charIdx+2, nameEndIdx);
                c = UCharacter.getCharFromName(charName);
                if (c == -1) {
                    errln("Error in named character in test file at line " + lineNum +
                            ", col " + column);
                } else {
                    // Named code point was recognized. Insert it
                    // into the test data.
                    tp.dataToBreak.appendCodePoint(c);
                    markSourcePosition(tp, lineNum, column);
                }
                // The closing brace lies beyond "N{", so this always moves forward.
                charIdx = nameEndIdx+1;
                break;
            }

            if (testString.startsWith("<>", charIdx-1)) {
                charIdx++;
                int breakIdx = tp.dataToBreak.length();
                tp.expectedBreaks[breakIdx] = -1;
                tp.srcLine[breakIdx] = lineNum;
                tp.srcCol[breakIdx] = column;
                break;
            }

            if (c == '<') {
                tagValue = 0;
                parseState = PARSE_NUM;
                break;
            }

            if (c == '#' && column==3) {   // TODO: why is column off so far?
                parseState = PARSE_COMMENT;
                savedState = PARSE_DATA;
                break;
            }

            if (c == '\\') {
                if (charIdx >= len) {
                    // Backslash is the last character of the file: there is nothing to
                    // escape or continue. The loop ends here and the unclosed <data>
                    // block is reported after it.
                    break;
                }
                // Check for \ at end of line, a line continuation.
                // Advance over (discard) the newline
                int cp = testString.codePointAt(charIdx);
                if (cp == '\r' && charIdx+1 < len && testString.charAt(charIdx+1) == '\n') {
                    // We have a CR LF
                    // Need an extra increment of the input ptr to move over both of them
                    charIdx++;
                }
                if (cp == '\n' || cp == '\r') {
                    lineNum++;
                    column = 0;
                    charIdx++;
                    colStart = charIdx;
                    break;
                }

                // Let unescape handle the back slash.
                int[] charIdxAr = new int[1];
                charIdxAr[0] = charIdx;
                cp = Utility.unescapeAt(testString, charIdxAr);
                if (cp != -1) {
                    // Escape sequence was recognized. Insert the char
                    // into the test data.
                    charIdx = charIdxAr[0];
                    tp.dataToBreak.appendCodePoint(cp);
                    markSourcePosition(tp, lineNum, column);
                    break;
                }

                // Not a recognized backslash escape sequence.
                // Take the next char as a literal.
                // TODO: Should this be an error?
                c = testString.codePointAt(charIdx);
                charIdx = testString.offsetByCodePoints(charIdx, 1);
            }

            // Normal, non-escaped data char.
            tp.dataToBreak.appendCodePoint(c);

            // Save the mapping from offset in the data to line/column numbers in
            // the original input file. Will be used for better error messages only.
            markSourcePosition(tp, lineNum, column);
            break;

        case PARSE_NUM:
            // We are parsing an expected numeric tag value, like <1234>,
            // within a chunk of data.
            if (UCharacter.isWhitespace(c)) {
                break;
            }

            if (c == '>') {
                // Finished the number. Add the info to the expected break data,
                // and switch parse state back to doing plain data.
                parseState = PARSE_DATA;
                if (tagValue == 0) {
                    // 0 means "no break expected" in expectedBreaks, so a break with
                    // status 0 is stored as -1.
                    tagValue = -1;
                }
                int breakIdx = tp.dataToBreak.length();
                if (tp.expectedBreaks[breakIdx] != 0) {
                    errln(String.format(
                            "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
                            lineNum, column));
                }
                tp.expectedBreaks[breakIdx] = tagValue;
                tp.srcLine[breakIdx] = lineNum;
                tp.srcCol[breakIdx] = column;
                break;
            }

            if (UCharacter.isDigit(c)) {
                tagValue = tagValue*10 + UCharacter.digit(c);
                break;
            }

            errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
            return;
        }
    }

    // Reached end of test file. Raise an error if parseState indicates that we are
    // within a block that should have been terminated.
    if (parseState == PARSE_RULES) {
        errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
                lineNum, rulesFirstLine));
    }
    if (parseState == PARSE_DATA) {
        errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
    }
}

/**
 * Reads the resource rbbitst.txt as UTF-8 and returns its contents, dropping a
 * leading byte order mark if there is one.
 *
 * @return the file contents, or null if the file could not be opened or read
 *         (the problem is reported through errln before returning)
 */
private String readTestFile() {
    StringBuilder testFileBuf = new StringBuilder();
    InputStream is = null;
    try {
        is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
        if (is == null) {
            errln("Could not open test data file rbbitst.txt");
            return null;
        }
        InputStreamReader isr = new InputStreamReader(is, "UTF-8");
        try {
            int count = 0;
            for (;;) {
                int c = isr.read();
                if (c < 0) {
                    break;
                }
                count++;
                if (c == 0xFEFF && count == 1) {
                    // BOM in the test data file. Discard it.
                    continue;
                }
                // read() delivers one UTF-16 code unit at a time.
                testFileBuf.append((char) c);
            }
        } finally {
            isr.close();
        }
    } catch (IOException e) {
        errln(e.toString());
        try {
            if (is != null) {
                is.close();
            }
        } catch (IOException ignored) {
            // Nothing more can be done; the read failure was reported above.
        }
        return null;
    }
    return testFileBuf.toString();
}

/**
 * Records the source file position for the text just appended to
 * {@code tp.dataToBreak}. Walks back from the last offset and fills every slot
 * whose line number is still 0, so a slot already set by an expected-break
 * marker is left alone.
 */
private static void markSourcePosition(TestParams tp, int lineNum, int column) {
    for (int i = tp.dataToBreak.length() - 1; i >= 0 && tp.srcLine[i] == 0; i--) {
        tp.srcLine[i] = lineNum;
        tp.srcCol[i] = column;
    }
}
/**
 * Runs the break iterator {@code t.bi} over the text in {@code t.dataToBreak} and
 * compares what it reports with {@code t.expectedBreaks}. The checks, in order:
 * forward iteration (including rule status), reverse iteration (including rule
 * status), isBoundary(), following() and preceding() at every offset.
 * Does nothing when no break iterator has been selected.
 *
 * @param t the iterator, the text and the expected-break tables for one data block
 */
void executeTest(TestParams t) {
    // TODO: also rerun tests with a break iterator re-created from bi.getRules()
    //       and from bi.clone(). If in exhaustive mode only.
    final BreakIterator iter = t.bi;
    if (iter == null) {
        return;
    }

    iter.setText(t.dataToBreak.toString());
    final int textLength = t.dataToBreak.length();

    //
    // Forward pass.
    //
    int previous = -1;
    int pos = iter.first();
    while (pos != BreakIterator.DONE) {
        if (pos == previous) {
            // The iterator is stuck; give up on this direction.
            errln("Forward Iteration, no forward progress. Break Pos=" + pos +
                    " File line,col=" + t.srcLine[pos] + ", " + t.srcCol[pos]);
            break;
        }

        // Any expected break strictly between the previous boundary and this one was missed.
        for (int k = previous + 1; k < pos; k++) {
            if (t.expectedBreaks[k] != 0) {
                errln("Forward Iteration, break expected, but not found. Pos=" + k + sourcePos(t, k));
            }
        }

        int expected = t.expectedBreaks[pos];
        if (expected == 0) {
            errln("Forward Iteration, break found, but not expected. Pos=" + pos + sourcePos(t, pos));
        } else {
            // An expected break: its rule status has to match the tag value as well.
            int wantStatus = tagStatus(expected);
            int gotStatus = iter.getRuleStatus();
            if (gotStatus != wantStatus) {
                errln("Incorrect status for forward break. Pos = " + pos +
                        ". File line,col = " + t.srcLine[pos] + ", " + t.srcCol[pos] + "\n" +
                        " Actual, Expected status = " + gotStatus + ", " + wantStatus);
            }
            int[] statusVec = new int[4];
            int statusCount = iter.getRuleStatusVec(statusVec);
            assertTrue("", statusCount >= 1);
            assertEquals("", wantStatus, statusVec[0]);
        }

        previous = pos;
        pos = iter.next();
    }

    // Expected breaks beyond the last boundary found were missed too.
    for (int k = previous + 1; k < textLength + 1; k++) {
        if (t.expectedBreaks[k] != 0) {
            errln("Forward Iteration, break expected, but not found. Pos=" + k + sourcePos(t, k));
        }
    }

    //
    // Reverse pass; it has to find the same boundaries.
    //
    previous = textLength + 2;   // Phony starting value, past any possible boundary.
    pos = iter.last();
    while (pos != BreakIterator.DONE) {
        if (pos == previous) {
            errln("Reverse Iteration, no progress. Break Pos=" + pos +
                    "File line,col=" + t.srcLine[pos] + " " + t.srcCol[pos]);
            break;
        }

        // Any expected break strictly between this boundary and the previous one was missed.
        for (int k = previous - 1; k > pos; k--) {
            if (t.expectedBreaks[k] != 0) {
                errln("Reverse Itertion, break expected, but not found. Pos=" + k + sourcePos(t, k));
            }
        }

        int expected = t.expectedBreaks[pos];
        if (expected == 0) {
            errln("Reverse Itertion, break found, but not expected. Pos=" + pos + sourcePos(t, pos));
        } else {
            int wantStatus = tagStatus(expected);
            int gotStatus = iter.getRuleStatus();
            if (gotStatus != wantStatus) {
                errln("Incorrect status for reverse break. Pos = " + pos +
                        sourcePos(t, pos) + "\n" +
                        " Actual, Expected status = " + gotStatus + ", " + wantStatus);
            }
        }

        previous = pos;
        pos = iter.previous();
    }

    // Expected breaks in front of the first boundary reached were missed.
    for (int k = previous - 1; k >= 0; k--) {
        if (t.expectedBreaks[k] != 0) {
            errln("Reverse Itertion, break expected, but not found. Pos=" + k + sourcePos(t, k));
        }
    }

    //
    // isBoundary() at every offset, including the end of the text.
    //
    for (int k = 0; k <= textLength; k++) {
        boolean wanted = t.expectedBreaks[k] != 0;
        boolean got = iter.isBoundary(k);
        if (wanted != got) {
            errln("isBoundary(" + k + ") incorrect.\n" +
                    sourcePos(t, k) +
                    " Expected, Actual= " + wanted + ", " + got);
        }
    }

    //
    // following() at every offset.
    //
    for (int k = 0; k <= textLength; k++) {
        int got = iter.following(k);
        int wanted = firstExpectedAfter(t, k);
        if (wanted != got) {
            errln("following(" + k + ") incorrect.\n" +
                    sourcePos(t, k) +
                    " Expected, Actual= " + wanted + ", " + got);
        }
    }

    //
    // preceding() at every offset, working from the end towards the start.
    //
    for (int k = textLength; k >= 0; k--) {
        int got = iter.preceding(k);
        int wanted = lastExpectedBefore(t, k);
        if (wanted != got) {
            errln("preceding(" + k + ") incorrect.\n" +
                    sourcePos(t, k) +
                    " Expected, Actual= " + wanted + ", " + got);
        }
    }
}

/** Formats the test file line and column recorded for data offset {@code idx}. */
private static String sourcePos(TestParams t, int idx) {
    return " File line,col= " + t.srcLine[idx] + ", " + t.srcCol[idx];
}

/** Converts an expectedBreaks entry to a rule status value; the marker -1 stands for status 0. */
private static int tagStatus(int expectedEntry) {
    return expectedEntry == -1 ? 0 : expectedEntry;
}

/** Returns the smallest offset greater than {@code from} with an expected break, or DONE if there is none. */
private static int firstExpectedAfter(TestParams t, int from) {
    for (int j = from + 1; j < t.expectedBreaks.length; j++) {
        if (t.expectedBreaks[j] != 0) {
            return j;
        }
    }
    return BreakIterator.DONE;
}

/** Returns the largest offset less than {@code from} with an expected break, or DONE if there is none. */
private static int lastExpectedBefore(TestParams t, int from) {
    for (int j = from - 1; j >= 0; j--) {
        if (t.expectedBreaks[j] != 0) {
            return j;
        }
    }
    return BreakIterator.DONE;
}
}