| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html#License |
| /* |
| * Created on May 5, 2004 |
| * |
| * Copyright (C) 2004-2016 International Business Machines Corporation and others. |
| * All Rights Reserved. |
| * |
| */ |
| package com.ibm.icu.dev.test.rbbi; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.util.Arrays; |
| |
| import org.junit.Test; |
| import org.junit.runner.RunWith; |
| import org.junit.runners.JUnit4; |
| |
| import com.ibm.icu.dev.test.TestFmwk; |
| import com.ibm.icu.impl.Utility; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.text.BreakIterator; |
| import com.ibm.icu.text.RuleBasedBreakIterator; |
| import com.ibm.icu.util.ULocale; |
| |
| |
| /** |
| * Rule based break iterator data driven test. |
| * Perform the tests from the file rbbitst.txt. |
| * The test data file is common to both ICU4C and ICU4J. |
| * See the data file for a description of the tests. |
| * |
| */ |
| @RunWith(JUnit4.class) |
| public class RBBITestExtended extends TestFmwk { |
/**
 * Creates the test object. The class declares no instance fields, so there
 * is nothing to set up beyond running the superclass constructor.
 */
public RBBITestExtended() {
    super();
}
| |
| |
| |
| static class TestParams { |
| BreakIterator bi; |
| StringBuilder dataToBreak = new StringBuilder(); |
| int[] expectedBreaks = new int[4000]; |
| int[] srcLine = new int[4000]; |
| int[] srcCol = new int[4000]; |
| ULocale currentLocale = new ULocale("en_US"); |
| } |
| |
| |
| @Test |
| public void TestExtended() { |
| TestParams tp = new TestParams(); |
| |
| |
| // |
| // Open and read the test data file. |
| // |
| StringBuilder testFileBuf = new StringBuilder(); |
| InputStream is = null; |
| try { |
| is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt"); |
| if (is == null) { |
| errln("Could not open test data file rbbitst.txt"); |
| return; |
| } |
| InputStreamReader isr = new InputStreamReader(is, "UTF-8"); |
| try { |
| int c; |
| int count = 0; |
| for (;;) { |
| c = isr.read(); |
| if (c < 0) { |
| break; |
| } |
| count++; |
| if (c == 0xFEFF && count == 1) { |
| // BOM in the test data file. Discard it. |
| continue; |
| } |
| |
| testFileBuf.appendCodePoint(c); |
| } |
| } finally { |
| isr.close(); |
| } |
| } catch (IOException e) { |
| errln(e.toString()); |
| try { |
| is.close(); |
| } catch (IOException ignored) { |
| } |
| return; |
| } |
| |
| String testString = testFileBuf.toString(); |
| |
| |
| final int PARSE_COMMENT = 1; |
| final int PARSE_TAG = 2; |
| final int PARSE_DATA = 3; |
| final int PARSE_NUM = 4; |
| final int PARSE_RULES = 5; |
| |
| int parseState = PARSE_TAG; |
| |
| int savedState = PARSE_TAG; |
| |
| int lineNum = 1; |
| int colStart = 0; |
| int column = 0; |
| int charIdx = 0; |
| int i; |
| |
| int tagValue = 0; // The numeric value of a <nnn> tag. |
| |
| StringBuilder rules = new StringBuilder(); // Holds rules from a <rules> ... </rules> block |
| int rulesFirstLine = 0; // Line number of the start of current <rules> block |
| |
| int len = testString.length(); |
| |
| for (charIdx = 0; charIdx < len; ) { |
| int c = testString.codePointAt(charIdx); |
| charIdx++; |
| if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') { |
| // treat CRLF as a unit |
| c = '\n'; |
| charIdx++; |
| } |
| if (c == '\n' || c == '\r') { |
| lineNum++; |
| colStart = charIdx; |
| } |
| column = charIdx - colStart + 1; |
| |
| switch (parseState) { |
| case PARSE_COMMENT: |
| if (c == 0x0a || c == 0x0d) { |
| parseState = savedState; |
| } |
| break; |
| |
| case PARSE_TAG: |
| { |
| if (c == '#') { |
| parseState = PARSE_COMMENT; |
| savedState = PARSE_TAG; |
| break; |
| } |
| if (UCharacter.isWhitespace(c)) { |
| break; |
| } |
| if (testString.startsWith("<word>", charIdx-1)) { |
| tp.bi = BreakIterator.getWordInstance(tp.currentLocale); |
| charIdx += 5; |
| break; |
| } |
| if (testString.startsWith("<char>", charIdx-1)) { |
| tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale); |
| charIdx += 5; |
| break; |
| } |
| if (testString.startsWith("<line>", charIdx-1)) { |
| tp.bi = BreakIterator.getLineInstance(tp.currentLocale); |
| charIdx += 5; |
| break; |
| } |
| if (testString.startsWith("<sent>", charIdx-1)) { |
| tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale); |
| charIdx += 5; |
| break; |
| } |
| if (testString.startsWith("<title>", charIdx-1)) { |
| tp.bi = BreakIterator.getTitleInstance(tp.currentLocale); |
| charIdx += 6; |
| break; |
| } |
| if (testString.startsWith("<rules>", charIdx-1) || |
| testString.startsWith("<badrules>", charIdx-1)) { |
| charIdx = testString.indexOf('>', charIdx) + 1; |
| parseState = PARSE_RULES; |
| rules.setLength(0); |
| rulesFirstLine = lineNum; |
| break; |
| } |
| |
| if (testString.startsWith("<locale ", charIdx-1)) { |
| int closeIndex = testString.indexOf(">", charIdx); |
| if (closeIndex < 0) { |
| errln("line" + lineNum + ": missing close on <locale tag."); |
| break; |
| } |
| String localeName = testString.substring(charIdx+6, closeIndex); |
| localeName = localeName.trim(); |
| tp.currentLocale = new ULocale(localeName); |
| charIdx = closeIndex+1; |
| break; |
| } |
| if (testString.startsWith("<data>", charIdx-1)) { |
| parseState = PARSE_DATA; |
| charIdx += 5; |
| tp.dataToBreak.setLength(0); |
| Arrays.fill(tp.expectedBreaks, 0); |
| Arrays.fill(tp.srcCol, 0); |
| Arrays.fill(tp.srcLine, 0); |
| break; |
| } |
| |
| errln("line" + lineNum + ": Tag expected in test file."); |
| return; |
| //parseState = PARSE_COMMENT; |
| //savedState = PARSE_DATA; |
| } |
| |
| case PARSE_RULES: |
| if (testString.startsWith("</rules>", charIdx-1)) { |
| charIdx += 7; |
| parseState = PARSE_TAG; |
| try { |
| tp.bi = new RuleBasedBreakIterator(rules.toString()); |
| } catch (IllegalArgumentException e) { |
| errln(String.format("rbbitst.txt:%d Error creating break iterator from rules. %s", lineNum, e)); |
| } |
| } else if (testString.startsWith("</badrules>", charIdx-1)) { |
| charIdx += 10; |
| parseState = PARSE_TAG; |
| boolean goodRules = true; |
| try { |
| new RuleBasedBreakIterator(rules.toString()); |
| } catch (IllegalArgumentException e) { |
| goodRules = false; |
| } |
| if (goodRules) { |
| errln(String.format( |
| "rbbitst.txt:%d Expected, but did not get, a failure creating break iterator from rules.", |
| lineNum)); |
| } |
| } else { |
| rules.appendCodePoint(c); |
| } |
| break; |
| |
| case PARSE_DATA: |
| if (c == '•') { |
| int breakIdx = tp.dataToBreak.length(); |
| if (tp.expectedBreaks[breakIdx] != 0) { |
| errln(String.format( |
| "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text", |
| lineNum, column)); |
| } |
| tp.expectedBreaks[breakIdx] = -1; |
| tp.srcLine[breakIdx] = lineNum; |
| tp.srcCol[breakIdx] = column; |
| break; |
| } |
| |
| if (testString.startsWith("</data>", charIdx-1)) { |
| // Add final entry to mappings from break location to source file position. |
| // Need one extra because last break position returned is after the |
| // last char in the data, not at the last char. |
| int idx = tp.dataToBreak.length(); |
| tp.srcLine[idx] = lineNum; |
| tp.srcCol[idx] = column; |
| |
| parseState = PARSE_TAG; |
| charIdx += 6; |
| |
| // RUN THE TEST! |
| executeTest(tp); |
| break; |
| } |
| |
| if (testString.startsWith("\\N{", charIdx-1)) { |
| int nameEndIdx = testString.indexOf('}', charIdx); |
| if (nameEndIdx == -1) { |
| errln("Error in named character in test file at line " + lineNum + |
| ", col " + column); |
| } |
| // Named character, e.g. \N{COMBINING GRAVE ACCENT} |
| // Get the code point from the name and insert it into the test data. |
| String charName = testString.substring(charIdx+2, nameEndIdx); |
| c = UCharacter.getCharFromName(charName); |
| if (c == -1) { |
| errln("Error in named character in test file at line " + lineNum + |
| ", col " + column); |
| } else { |
| // Named code point was recognized. Insert it |
| // into the test data. |
| tp.dataToBreak.appendCodePoint(c); |
| for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { |
| tp.srcLine[i] = lineNum; |
| tp.srcCol[i] = column; |
| } |
| |
| } |
| if (nameEndIdx > charIdx) { |
| charIdx = nameEndIdx+1; |
| } |
| break; |
| } |
| |
| if (testString.startsWith("<>", charIdx-1)) { |
| charIdx++; |
| int breakIdx = tp.dataToBreak.length(); |
| tp.expectedBreaks[breakIdx] = -1; |
| tp.srcLine[breakIdx] = lineNum; |
| tp.srcCol[breakIdx] = column; |
| break; |
| } |
| |
| if (c == '<') { |
| tagValue = 0; |
| parseState = PARSE_NUM; |
| break; |
| } |
| |
| if (c == '#' && column==3) { // TODO: why is column off so far? |
| parseState = PARSE_COMMENT; |
| savedState = PARSE_DATA; |
| break; |
| } |
| |
| if (c == '\\') { |
| // Check for \ at end of line, a line continuation. |
| // Advance over (discard) the newline |
| int cp = testString.codePointAt(charIdx); |
| if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') { |
| // We have a CR LF |
| // Need an extra increment of the input ptr to move over both of them |
| charIdx++; |
| } |
| if (cp == '\n' || cp == '\r') { |
| lineNum++; |
| column = 0; |
| charIdx++; |
| colStart = charIdx; |
| break; |
| } |
| |
| // Let unescape handle the back slash. |
| int charIdxAr[] = new int[1]; |
| charIdxAr[0] = charIdx; |
| cp = Utility.unescapeAt(testString, charIdxAr); |
| if (cp != -1) { |
| // Escape sequence was recognized. Insert the char |
| // into the test data. |
| charIdx = charIdxAr[0]; |
| tp.dataToBreak.appendCodePoint(cp); |
| for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { |
| tp.srcLine[i] = lineNum; |
| tp.srcCol[i] = column; |
| } |
| |
| break; |
| } |
| |
| |
| // Not a recognized backslash escape sequence. |
| // Take the next char as a literal. |
| // TODO: Should this be an error? |
| c = testString.codePointAt(charIdx); |
| charIdx = testString.offsetByCodePoints(charIdx, 1); |
| } |
| |
| // Normal, non-escaped data char. |
| tp.dataToBreak.appendCodePoint(c); |
| |
| // Save the mapping from offset in the data to line/column numbers in |
| // the original input file. Will be used for better error messages only. |
| // If there's an expected break before this char, the slot in the mapping |
| // vector will already be set for this char; don't overwrite it. |
| for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { |
| tp.srcLine[i] = lineNum; |
| tp.srcCol[i] = column; |
| } |
| break; |
| |
| |
| case PARSE_NUM: |
| // We are parsing an expected numeric tag value, like <1234>, |
| // within a chunk of data. |
| if (UCharacter.isWhitespace(c)) { |
| break; |
| } |
| |
| if (c == '>') { |
| // Finished the number. Add the info to the expected break data, |
| // and switch parse state back to doing plain data. |
| parseState = PARSE_DATA; |
| if (tagValue == 0) { |
| tagValue = -1; |
| } |
| int breakIdx = tp.dataToBreak.length(); |
| if (tp.expectedBreaks[breakIdx] != 0) { |
| errln(String.format( |
| "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text", |
| lineNum, column)); |
| } |
| tp.expectedBreaks[breakIdx] = tagValue; |
| tp.srcLine[breakIdx] = lineNum; |
| tp.srcCol[breakIdx] = column; |
| break; |
| } |
| |
| if (UCharacter.isDigit(c)) { |
| tagValue = tagValue*10 + UCharacter.digit(c); |
| break; |
| } |
| |
| errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column)); |
| return; |
| } |
| } |
| |
| // Reached end of test file. Raise an error if parseState indicates that we are |
| // within a block that should have been terminated. |
| if (parseState == PARSE_RULES) { |
| errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.", |
| lineNum, rulesFirstLine)); |
| } |
| if (parseState == PARSE_DATA) { |
| errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum)); |
| } |
| } |
| |
| void executeTest(TestParams t) { |
| // TODO: also rerun tests with a break iterator re-created from bi.getRules() |
| // and from bi.clone(). If in exhaustive mode only. |
| int bp; |
| int prevBP; |
| int i; |
| |
| if (t.bi == null) { |
| return; |
| } |
| |
| t.bi.setText(t.dataToBreak.toString()); |
| // |
| // Run the iterator forward |
| // |
| prevBP = -1; |
| for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) { |
| if (prevBP == bp) { |
| // Fail for lack of forward progress. |
| errln("Forward Iteration, no forward progress. Break Pos=" + bp + |
| " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]); |
| break; |
| } |
| |
| // Check that there were we didn't miss an expected break between the last one |
| // and this one. |
| for (i=prevBP+1; i<bp; i++) { |
| if (t.expectedBreaks[i] != 0) { |
| errln("Forward Iteration, break expected, but not found. Pos=" + i + |
| " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); |
| } |
| } |
| |
| // Check that the break we did find was expected |
| if (t.expectedBreaks[bp] == 0) { |
| errln("Forward Iteration, break found, but not expected. Pos=" + bp + |
| " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]); |
| } else { |
| // The break was expected. |
| // Check that the {nnn} tag value is correct. |
| int expectedTagVal = t.expectedBreaks[bp]; |
| if (expectedTagVal == -1) { |
| expectedTagVal = 0; |
| } |
| int line = t.srcLine[bp]; |
| int rs = t.bi.getRuleStatus(); |
| if (rs != expectedTagVal) { |
| errln("Incorrect status for forward break. Pos = " + bp + |
| ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" + |
| " Actual, Expected status = " + rs + ", " + expectedTagVal); |
| } |
| int[] fillInArray = new int[4]; |
| int numStatusVals = t.bi.getRuleStatusVec(fillInArray); |
| assertTrue("", numStatusVals >= 1); |
| assertEquals("", expectedTagVal, fillInArray[0]); |
| } |
| |
| |
| prevBP = bp; |
| } |
| |
| // Verify that there were no missed expected breaks after the last one found |
| for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) { |
| if (t.expectedBreaks[i] != 0) { |
| errln("Forward Iteration, break expected, but not found. Pos=" + i + |
| " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); |
| } |
| } |
| |
| |
| // |
| // Run the iterator backwards, verify that the same breaks are found. |
| // |
| prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen. |
| for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) { |
| if (prevBP == bp) { |
| // Fail for lack of progress. |
| errln("Reverse Iteration, no progress. Break Pos=" + bp + |
| "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]); |
| break; |
| } |
| |
| // Check that we didn't miss an expected break between the last one |
| // and this one. (UVector returns zeros for index out of bounds.) |
| for (i=prevBP-1; i>bp; i--) { |
| if (t.expectedBreaks[i] != 0) { |
| errln("Reverse Itertion, break expected, but not found. Pos=" + i + |
| " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); |
| } |
| } |
| |
| // Check that the break we did find was expected |
| if (t.expectedBreaks[bp] == 0) { |
| errln("Reverse Itertion, break found, but not expected. Pos=" + bp + |
| " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]); |
| } else { |
| // The break was expected. |
| // Check that the {nnn} tag value is correct. |
| int expectedTagVal = t.expectedBreaks[bp]; |
| if (expectedTagVal == -1) { |
| expectedTagVal = 0; |
| } |
| int line = t.srcLine[bp]; |
| int rs = t.bi.getRuleStatus(); |
| if (rs != expectedTagVal) { |
| errln("Incorrect status for reverse break. Pos = " + bp + |
| " File line,col= " + line + ", " + t.srcCol[bp] + "\n" + |
| " Actual, Expected status = " + rs + ", " + expectedTagVal); |
| } |
| } |
| |
| prevBP = bp; |
| } |
| |
| // Verify that there were no missed breaks prior to the last one found |
| for (i=prevBP-1; i>=0; i--) { |
| if (t.expectedBreaks[i] != 0) { |
| errln("Reverse Itertion, break expected, but not found. Pos=" + i + |
| " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); |
| } |
| } |
| // Check isBoundary() |
| for (i=0; i<=t.dataToBreak.length(); i++) { |
| boolean boundaryExpected = (t.expectedBreaks[i] != 0); |
| boolean boundaryFound = t.bi.isBoundary(i); |
| if (boundaryExpected != boundaryFound) { |
| errln("isBoundary(" + i + ") incorrect.\n" + |
| " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] + |
| " Expected, Actual= " + boundaryExpected + ", " + boundaryFound); |
| } |
| } |
| |
| // Check following() |
| for (i=0; i<=t.dataToBreak.length(); i++) { |
| int actualBreak = t.bi.following(i); |
| int expectedBreak = BreakIterator.DONE; |
| for (int j=i+1; j < t.expectedBreaks.length; j++) { |
| if (t.expectedBreaks[j] != 0) { |
| expectedBreak = j; |
| break; |
| } |
| } |
| if (expectedBreak != actualBreak) { |
| errln("following(" + i + ") incorrect.\n" + |
| " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] + |
| " Expected, Actual= " + expectedBreak + ", " + actualBreak); |
| } |
| } |
| |
| // Check preceding() |
| for (i=t.dataToBreak.length(); i>=0; i--) { |
| int actualBreak = t.bi.preceding(i); |
| int expectedBreak = BreakIterator.DONE; |
| |
| for (int j=i-1; j >= 0; j--) { |
| if (t.expectedBreaks[j] != 0) { |
| expectedBreak = j; |
| break; |
| } |
| } |
| if (expectedBreak != actualBreak) { |
| errln("preceding(" + i + ") incorrect.\n" + |
| " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] + |
| " Expected, Actual= " + expectedBreak + ", " + actualBreak); |
| } |
| } |
| |
| } |
| |
| |
| |
| |
| } |