ICU-20303 Break Iterator, improve handling of look-ahead rules.
- Merge the look-ahead results slots used when multiple rules share a common accepting state.
- Sequentially number the look-ahead result slot. Will eventually allow replacing the runtime map with an array.
- Inhibit chaining out of look-ahead rules. This could never actually happen; when a hard break
rule matches, the engine is stopped immediately, but the state table was being constructed
as if it could happen. Reduces table size for line break rules.
- Remove incorrect handling of fAccepting and fLookAhead fields of a state table row
when removing duplicate states. Look-ahead slot number was being mis-interpreted as a state number.
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
index 01dae48..f80c3e0 100644
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -883,9 +883,15 @@
return lookaheadResult;
}
}
+
+ // If we are at the position of the '/' in a look-ahead (hard break) rule;
+ // record the current position, to be returned later, if the full rule matches.
+ // TODO: Move this check before the previous check of fAccepting.
+ // This would enable hard-break rules with no following context.
+ // But there are line break test failures when trying this. Investigate.
+ // Issue ICU-20837
int16_t rule = row->fLookAhead;
if (rule != 0) {
- // At the position of a '/' in a look-ahead match. Record it.
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
lookAheadMatches.setPosition(rule, pos);
}
diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp
index f536ab5..4eb324b 100644
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@@ -1274,6 +1274,10 @@
}
+int32_t RBBIRuleScanner::numRules() {
+ return fRuleNum;
+}
+
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/rbbiscan.h b/icu4c/source/common/rbbiscan.h
index c51b4cf..6828ba3 100644
--- a/icu4c/source/common/rbbiscan.h
+++ b/icu4c/source/common/rbbiscan.h
@@ -73,6 +73,8 @@
// reverse rules,
// and a list of UnicodeSets encountered.
+ int32_t numRules(); // Return the number of rules that have been seen.
+
/**
* Return a rules string without unnecessary
* characters.
diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp
index a20b517..960ef7e 100644
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@@ -18,6 +18,7 @@
#include "unicode/unistr.h"
#include "rbbitblb.h"
#include "rbbirb.h"
+#include "rbbiscan.h"
#include "rbbisetb.h"
#include "rbbidata.h"
#include "cstring.h"
@@ -52,6 +53,7 @@
}
delete fDStates;
delete fSafeTable;
+ delete fLookAheadRuleMap;
}
@@ -121,7 +123,7 @@
}
cn->fLeftChild = fTree;
fTree->fParent = cn;
- cn->fRightChild = new RBBINode(RBBINode::endMark);
+ RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
// Delete and exit if memory allocation failed.
if (cn->fRightChild == NULL) {
*fStatus = U_MEMORY_ALLOCATION_ERROR;
@@ -164,7 +166,7 @@
// For "chained" rules, modify the followPos sets
//
if (fRB->fChainRules) {
- calcChainedFollowPos(fTree);
+ calcChainedFollowPos(fTree, endMarkerNode);
}
//
@@ -178,6 +180,7 @@
// Build the DFA state transition tables.
//
buildStateTable();
+ mapLookAheadRules();
flagAcceptingStates();
flagLookAheadStates();
flagTaggedStates();
@@ -401,19 +404,13 @@
// to implement rule chaining. NOT described by Aho
//
//-----------------------------------------------------------------------------
-void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
+void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
- UVector endMarkerNodes(*fStatus);
UVector leafNodes(*fStatus);
- int32_t i;
-
if (U_FAILURE(*fStatus)) {
return;
}
- // get a list of all endmarker nodes.
- tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
-
// get a list all leaf nodes
tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
if (U_FAILURE(*fStatus)) {
@@ -442,28 +439,26 @@
int32_t startNodeIx;
for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
- RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
- RBBINode *endNode = NULL;
+ RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
// Identify leaf nodes that correspond to overall rule match positions.
- // These include an endMarkerNode in their followPos sets.
- for (i=0; i<endMarkerNodes.size(); i++) {
- if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
- endNode = tNode;
- break;
- }
- }
- if (endNode == NULL) {
- // node wasn't an end node. Try again with the next.
+ // These include the endMarkNode in their followPos sets.
+ //
+ // Note: do not consider other end marker nodes, those that are added to
+ // look-ahead rules. These can't chain; a match immediately stops
+ // further matching. This leaves exactly one end marker node, the one
+ // at the end of the complete tree.
+
+ if (!endNode->fFollowPos->contains(endMarkNode)) {
continue;
}
// We've got a node that can end a match.
- // Line Break Specific hack: If this node's val correspond to the $CM char class,
- // don't chain from it.
- // TODO: Add rule syntax for this behavior, get specifics out of here and
- // into the rule file.
+ // !!LBCMNoChain implementation: If this node's val correspond to
+ // the Line Break $CM char class, don't chain from it.
+ // TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
+ // by any of the standard ICU rules.
if (fRB->fLBCMNoChain) {
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
if (c != -1) {
@@ -475,7 +470,6 @@
}
}
-
// Now iterate over the nodes that can start a match, looking for ones
// with the same char class as our ending node.
RBBINode *startNode;
@@ -705,6 +699,77 @@
}
+/**
+ * mapLookAheadRules
+ *
+ */
+void RBBITableBuilder::mapLookAheadRules() {
+ fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
+ if (fLookAheadRuleMap == nullptr) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ }
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
+ fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
+ int32_t laSlotsInUse = 0;
+
+ for (int32_t n=0; n<fDStates->size(); n++) {
+ RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
+ int32_t laSlotForState = 0;
+
+ // Establish the look-ahead slot for this state, if the state covers
+ // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
+
+ // If any of the look-ahead nodes already have a slot assigned, use it,
+ // otherwise assign a new one.
+
+ bool sawLookAheadNode = false;
+ for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
+ RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
+ if (node->fType != RBBINode::NodeType::lookAhead) {
+ continue;
+ }
+ sawLookAheadNode = true;
+ int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
+ U_ASSERT(ruleNum < fLookAheadRuleMap->size());
+ U_ASSERT(ruleNum > 0);
+ int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
+ if (laSlot != 0) {
+ if (laSlotForState == 0) {
+ laSlotForState = laSlot;
+ } else {
+ // TODO: figure out if this can fail, change to setting an error code if so.
+ U_ASSERT(laSlot == laSlotForState);
+ }
+ }
+ }
+ if (!sawLookAheadNode) {
+ continue;
+ }
+
+ if (laSlotForState == 0) {
+ laSlotForState = ++laSlotsInUse;
+ }
+
+ // For each look ahead node covered by this state,
+ // set the mapping from the node's rule number to the look ahead slot.
+ // There can be multiple nodes/rule numbers going to the same la slot.
+
+ for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
+ RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
+ if (node->fType != RBBINode::NodeType::lookAhead) {
+ continue;
+ }
+ int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
+ int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
+ (void)existingVal;
+ U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
+ fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
+ }
+ }
+
+}
//-----------------------------------------------------------------------------
//
@@ -744,28 +809,19 @@
if (sd->fAccepting==0) {
// State hasn't been marked as accepting yet. Do it now.
- sd->fAccepting = endMarker->fVal;
+ sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
if (sd->fAccepting == 0) {
sd->fAccepting = -1;
}
}
if (sd->fAccepting==-1 && endMarker->fVal != 0) {
// Both lookahead and non-lookahead accepting for this state.
- // Favor the look-ahead. Expedient for line break.
- // TODO: need a more elegant resolution for conflicting rules.
- sd->fAccepting = endMarker->fVal;
+ // Favor the look-ahead, because a look-ahead match needs to
+ // immediately stop the run-time engine. First match, not longest.
+ sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
}
// implicit else:
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
-
- // If the end marker node is from a look-ahead rule, set
- // the fLookAhead field for this state also.
- if (endMarker->fLookAheadEnd) {
- // TODO: don't change value if already set?
- // TODO: allow for more than one active look-ahead rule in engine.
- // Make value here an index to a side array in engine?
- sd->fLookAhead = sd->fAccepting;
- }
}
}
}
@@ -792,11 +848,20 @@
}
for (i=0; i<lookAheadNodes.size(); i++) {
lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
+ U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
- if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
- sd->fLookAhead = lookAheadNode->fVal;
+ int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
+ if (positionsIdx >= 0) {
+ U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
+ int32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
+ U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
+ // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
+ // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
+ // __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
+ // }
+ sd->fLookAhead = lookaheadSlot;
}
}
}
@@ -1204,16 +1269,6 @@
}
sd->fDtran->setElementAt(newVal, col);
}
- if (sd->fAccepting == duplState) {
- sd->fAccepting = keepState;
- } else if (sd->fAccepting > duplState) {
- sd->fAccepting--;
- }
- if (sd->fLookAhead == duplState) {
- sd->fLookAhead = keepState;
- } else if (sd->fLookAhead > duplState) {
- sd->fLookAhead--;
- }
}
}
diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h
index bc6077b..c2b574f 100644
--- a/icu4c/source/common/rbbitblb.h
+++ b/icu4c/source/common/rbbitblb.h
@@ -91,9 +91,10 @@
void calcFirstPos(RBBINode *n);
void calcLastPos(RBBINode *n);
void calcFollowPos(RBBINode *n);
- void calcChainedFollowPos(RBBINode *n);
+ void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
void bofFixup();
void buildStateTable();
+ void mapLookAheadRules();
void flagAcceptingStates();
void flagLookAheadStates();
void flagTaggedStates();
@@ -175,6 +176,9 @@
/** Synthesized safe table, UVector of UnicodeString, one string per table row. */
UVector *fSafeTable;
+ /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
+ UVector32 *fLookAheadRuleMap = nullptr;
+
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 30ef79f..c4960b6 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -125,6 +125,7 @@
TESTCASE_AUTO(TestBug13447);
TESTCASE_AUTO(TestReverse);
TESTCASE_AUTO(TestBug13692);
+ TESTCASE_AUTO(TestDebugRules);
TESTCASE_AUTO_END;
}
@@ -4778,6 +4779,19 @@
assertSuccess(WHERE, status);
}
+
+void RBBITest::TestProperties() {
+ UErrorCode errorCode = U_ZERO_ERROR;
+ UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
+ if (!prependSet.isEmpty()) {
+ errln(
+ "[:GCB=Prepend:] is not empty any more. "
+ "Uncomment relevant lines in source/data/brkitr/char.txt and "
+ "change this test to the opposite condition.");
+ }
+}
+
+
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked
@@ -4796,15 +4810,56 @@
assertSuccess(WHERE, status);
}
-void RBBITest::TestProperties() {
- UErrorCode errorCode = U_ZERO_ERROR;
- UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
- if (!prependSet.isEmpty()) {
- errln(
- "[:GCB=Prepend:] is not empty any more. "
- "Uncomment relevant lines in source/data/brkitr/char.txt and "
- "change this test to the opposite condition.");
+
+//
+// TestDebugRules A stub test for use in debugging rule compilation problems.
+// Can be freely altered as needed or convenient.
+// Leave disabled - #ifdef'ed out - when not activley debugging. The rule source
+// data files may not be available in all environments.
+// Any permanent test cases should be moved to rbbitst.txt
+// (see Bug 20303 in that file, for example), or to another test function in this file.
+//
+void RBBITest::TestDebugRules() {
+#if 0
+ const char16_t *rules = u""
+ "!!quoted_literals_only; \n"
+ "!!chain; \n"
+ "!!lookAheadHardBreak; \n"
+ " \n"
+ // "[a] / ; \n"
+ "[a] [b] / [c] [d]; \n"
+ "[a] [b] / [c] [d] {100}; \n"
+ "[x] [a] [b] / [c] [d] {100}; \n"
+ "[a] [b] [c] / [d] {100}; \n"
+ //" [c] [d] / [e] [f]; \n"
+ //"[a] [b] / [c]; \n"
+ ;
+
+ UErrorCode status = U_ZERO_ERROR;
+ CharString path(pathToDataDirectory(), status);
+ path.appendPathPart("brkitr", status);
+ path.appendPathPart("rules", status);
+ path.appendPathPart("line.txt", status);
+ int len;
+ std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
+ if (!assertSuccess(WHERE, status)) {
+ return;
}
+
+ UParseError pe;
+ // rules = testFile.get();
+ RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
+
+ if (!assertSuccess(WHERE, status)) {
+ delete bi;
+ return;
+ }
+ // bi->dumpTables();
+
+ delete bi;
+#endif
}
+
+
#endif // #if !UCONFIG_NO_BREAK_ITERATION
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index cfaf688..96c2882 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -82,6 +82,7 @@
void TestReverse();
void TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi);
void TestBug13692();
+ void TestDebugRules();
void TestDebug();
void TestProperties();
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
index 240dbdd..9962f94 100644
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -1991,3 +1991,23 @@
•
•より<400>詳しい<400>こと<400>を<400>お<400>知<400>り<400>に<400>なり<400>たい<400>方<400>は<400>、•Glossary<200>,• •Technical<200> •Introduction<200> •および<400> •Useful<200> •Resources<200>を<400>ご<400>参照<400>くだ<400>さい<400>。•
•</data>
+
+
+#
+# Bug 20303 Multiple Look-ahead rules with similar contexts.
+# Check that samples of such rules are being handled correctly.
+#
+
+<rules>
+!!forward;
+!!quoted_literals_only;
+!!chain;
+[a] [b] / [c] [d];
+[a] [b] / [c] [d] {100};
+[a] [b] / [e] [f] {200};
+[a] [b] / [e] [g] {300};
+[a] [b] [c] [h] {400};
+[x] [a] [b] / [c] [d] {500};
+[y] [a] [b] [c] [d] {600};
+</rules>
+<data>•ab<100>c•d•ab<200>e•f•ab<300>e•g•abch<400>xab<500>c•d•yabcd<600></data>
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
index 5751e41..3bcc30d 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
@@ -73,7 +73,7 @@
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
// $variable symbols.
- HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
+ HashMap<String, RBBISetTableEl> fSetTable = new HashMap<>(); // UnicocodeSet hash table, holds indexes to
// the sets created while parsing rules.
// The key is the string used for creating
// the set.
@@ -933,7 +933,7 @@
// Perform any action specified by this row in the state table.
if (doParseActions(tableEl.fAction) == false) {
// Break out of the state machine loop if the
- // the action signalled some kind of error, or
+ // the action signaled some kind of error, or
// the action was to exit, occurs on normal end-of-rules-input.
break;
}
@@ -1070,7 +1070,7 @@
error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
}
- // Advance the RBBI parse postion over the UnicodeSet pattern.
+ // Advance the RBBI parse position over the UnicodeSet pattern.
// Don't just set fScanIndex because the line/char positions maintained
// for error reporting would be thrown off.
i = pos.getIndex();
@@ -1089,12 +1089,17 @@
n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
// findSetFor() serves several purposes here:
// - Adopts storage for the UnicodeSet, will be responsible for deleting.
- // - Mantains collection of all sets in use, needed later for establishing
+ // - Maintains collection of all sets in use, needed later for establishing
// character categories for run time engine.
- // - Eliminates mulitiple instances of the same set.
+ // - Eliminates multiple instances of the same set.
// - Creates a new uset node if necessary (if this isn't a duplicate.)
findSetFor(n.fText, n, uset);
}
+ /**
+ * @return the number of rules that have been seen.
+ */
+ int numRules() {
+ return fRuleNum;
+ }
}
-
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
index 9ccafe8..c5ae21d 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
@@ -53,8 +53,8 @@
// in RBBITableBuilder.fDStates
RBBIStateDescriptor(int maxInputSymbol) {
- fTagVals = new TreeSet<Integer>();
- fPositions = new HashSet<RBBINode>();
+ fTagVals = new TreeSet<>();
+ fPositions = new HashSet<>();
fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized.
// It is indexed by input symbols, and will
// hold the next state number for each
@@ -74,6 +74,9 @@
/** Synthesized safe table, a List of row arrays. */
private List<short[]> fSafeTable;
+ /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
+ int[] fLookAheadRuleMap;
+
//-----------------------------------------------------------------------------
//
// Constructor for RBBITableBuilder.
@@ -84,7 +87,7 @@
RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
fRootIx = rootNodeIx;
fRB = rb;
- fDStates = new ArrayList<RBBIStateDescriptor>();
+ fDStates = new ArrayList<>();
}
@@ -137,7 +140,7 @@
RBBINode cn = new RBBINode(RBBINode.opCat);
cn.fLeftChild = fRB.fTreeRoots[fRootIx];
fRB.fTreeRoots[fRootIx].fParent = cn;
- cn.fRightChild = new RBBINode(RBBINode.endMark);
+ RBBINode endMarkerNode = cn.fRightChild = new RBBINode(RBBINode.endMark);
cn.fRightChild.fParent = cn;
fRB.fTreeRoots[fRootIx] = cn;
@@ -172,7 +175,7 @@
// For "chained" rules, modify the followPos sets
//
if (fRB.fChainRules) {
- calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
+ calcChainedFollowPos(fRB.fTreeRoots[fRootIx], endMarkerNode);
}
//
@@ -186,6 +189,7 @@
// Build the DFA state transition tables.
//
buildStateTable();
+ mapLookAheadRules();
flagAcceptingStates();
flagLookAheadStates();
flagTaggedStates();
@@ -390,13 +394,9 @@
// to implement rule chaining. NOT described by Aho
//
//-----------------------------------------------------------------------------
- void calcChainedFollowPos(RBBINode tree) {
+ void calcChainedFollowPos(RBBINode tree, RBBINode endMarkNode) {
- List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
- List<RBBINode> leafNodes = new ArrayList<RBBINode>();
-
- // get a list of all endmarker nodes.
- tree.findNodes(endMarkerNodes, RBBINode.endMark);
+ List<RBBINode> leafNodes = new ArrayList<>();
// get a list all leaf nodes
tree.findNodes(leafNodes, RBBINode.leafChar);
@@ -405,10 +405,10 @@
// with inbound chaining enabled, which is the union of the
// firstPosition sets from each of the rule root nodes.
- List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>();
+ List<RBBINode> ruleRootNodes = new ArrayList<>();
addRuleRootNodes(ruleRootNodes, tree);
- Set<RBBINode> matchStartNodes = new HashSet<RBBINode>();
+ Set<RBBINode> matchStartNodes = new HashSet<>();
for (RBBINode node: ruleRootNodes) {
if (node.fChainIn) {
matchStartNodes.addAll(node.fFirstPosSet);
@@ -417,28 +417,26 @@
// Iterate over all leaf nodes,
//
- for (RBBINode tNode : leafNodes) {
- RBBINode endNode = null;
+ for (RBBINode endNode : leafNodes) {
// Identify leaf nodes that correspond to overall rule match positions.
- // These include an endMarkerNode in their followPos sets.
- for (RBBINode endMarkerNode : endMarkerNodes) {
- if (tNode.fFollowPos.contains(endMarkerNode)) {
- endNode = tNode;
- break;
- }
- }
- if (endNode == null) {
- // node wasn't an end node. Try again with the next.
+ // These include the endMarkNode in their followPos sets.
+ //
+ // Note: do not consider other end marker nodes, those that are added to
+ // look-ahead rules. These can't chain; a match immediately stops
+ // further matching. This leaves exactly one end marker node, the one
+ // at the end of the complete tree.
+
+ if (!endNode.fFollowPos.contains(endMarkNode)) {
continue;
}
// We've got a node that can end a match.
- // Line Break Specific hack: If this node's val correspond to the $CM char class,
- // don't chain from it.
- // TODO: Add rule syntax for this behavior, get specifics out of here and
- // into the rule file.
+ // !!LBCMNoChain implementation: If this node's val correspond to
+ // the Line Break $CM char class, don't chain from it.
+ // TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
+ // by any of the standard ICU rules.
if (fRB.fLBCMNoChain) {
int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
if (c != -1) {
@@ -571,7 +569,7 @@
for (RBBINode p : T.fPositions) {
if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
if (U == null) {
- U = new HashSet<RBBINode>();
+ U = new HashSet<>();
}
U.addAll(p.fFollowPos);
}
@@ -610,7 +608,66 @@
}
}
+ /**
+ * mapLookAheadRules
+ *
+ */
+ void mapLookAheadRules() {
+ fLookAheadRuleMap = new int[fRB.fScanner.numRules() + 1];
+ int laSlotsInUse = 0;
+ for (RBBIStateDescriptor sd: fDStates) {
+ int laSlotForState = 0;
+
+ // Establish the look-ahead slot for this state, if the state covers
+ // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
+
+ // If any of the look-ahead nodes already have a slot assigned, use it,
+ // otherwise assign a new one.
+
+ boolean sawLookAheadNode = false;
+ for (RBBINode node: sd.fPositions) {
+ if (node.fType != RBBINode.lookAhead) {
+ continue;
+ }
+ sawLookAheadNode = true;
+ int ruleNum = node.fVal; // Set when rule was originally parsed.
+ assert(ruleNum < fLookAheadRuleMap.length);
+ assert(ruleNum > 0);
+ int laSlot = fLookAheadRuleMap[ruleNum];
+ if (laSlot != 0) {
+ if (laSlotForState == 0) {
+ laSlotForState = laSlot;
+ } else {
+ // TODO: figure out if this can fail, change to setting an error code if so.
+ assert(laSlot == laSlotForState);
+ }
+ }
+ }
+ if (!sawLookAheadNode) {
+ continue;
+ }
+
+ if (laSlotForState == 0) {
+ laSlotForState = ++laSlotsInUse;
+ }
+
+ // For each look ahead node covered by this state,
+ // set the mapping from the node's rule number to the look ahead slot.
+ // There can be multiple nodes/rule numbers going to the same la slot.
+
+ for (RBBINode node: sd.fPositions) {
+ if (node.fType != RBBINode.lookAhead) {
+ continue;
+ }
+ int ruleNum = node.fVal; // Set when rule was originally parsed.
+ int existingVal = fLookAheadRuleMap[ruleNum];
+ assert(existingVal == 0 || existingVal == laSlotForState);
+ fLookAheadRuleMap[ruleNum] = laSlotForState;
+ }
+ }
+
+ }
//-----------------------------------------------------------------------------
//
@@ -622,7 +679,7 @@
//
//-----------------------------------------------------------------------------
void flagAcceptingStates() {
- List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
+ List<RBBINode> endMarkerNodes = new ArrayList<>();
RBBINode endMarker;
int i;
int n;
@@ -640,29 +697,20 @@
// If no other value was specified, force it to -1.
if (sd.fAccepting==0) {
- // State hasn't been marked as accepting yet. Do it now.
- sd.fAccepting = endMarker.fVal;
+ // State hasn't been marked as accepting yet. Do it now.
+ sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
if (sd.fAccepting == 0) {
sd.fAccepting = -1;
- }
+ }
}
if (sd.fAccepting==-1 && endMarker.fVal != 0) {
- // Both lookahead and non-lookahead accepting for this state.
- // Favor the look-ahead. Expedient for line break.
- // TODO: need a more elegant resolution for conflicting rules.
- sd.fAccepting = endMarker.fVal;
- }
- // implicit else:
- // if sd.fAccepting already had a value other than 0 or -1, leave it be.
-
- // If the end marker node is from a look-ahead rule, set
- // the fLookAhead field for this state also.
- if (endMarker.fLookAheadEnd) {
- // TODO: don't change value if already set?
- // TODO: allow for more than one active look-ahead rule in engine.
- // Make value here an index to a side array in engine?
- sd.fLookAhead = sd.fAccepting;
+ // Both lookahead and non-lookahead accepting for this state.
+ // Favor the look-ahead, because a look-ahead match needs to
+ // immediately stop the run-time engine. First match, not longest.
+ sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
}
+ // implicit else:
+ // if sd.fAccepting already had a value other than 0 or -1, leave it be.
}
}
}
@@ -675,7 +723,7 @@
//
//-----------------------------------------------------------------------------
void flagLookAheadStates() {
- List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>();
+ List<RBBINode> lookAheadNodes = new ArrayList<>();
RBBINode lookAheadNode;
int i;
int n;
@@ -683,11 +731,12 @@
fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
for (i=0; i<lookAheadNodes.size(); i++) {
lookAheadNode = lookAheadNodes.get(i);
-
for (n=0; n<fDStates.size(); n++) {
RBBIStateDescriptor sd = fDStates.get(n);
if (sd.fPositions.contains(lookAheadNode)) {
- sd.fLookAhead = lookAheadNode.fVal;
+ int lookaheadSlot = fLookAheadRuleMap[lookAheadNode.fVal];
+ assert(sd.fLookAhead == 0 || sd.fLookAhead == lookaheadSlot);
+ sd.fLookAhead = lookaheadSlot;
}
}
}
@@ -702,7 +751,7 @@
//
//-----------------------------------------------------------------------------
void flagTaggedStates() {
- List<RBBINode> tagNodes = new ArrayList<RBBINode>();
+ List<RBBINode> tagNodes = new ArrayList<>();
RBBINode tagNode;
int i;
int n;
@@ -765,10 +814,10 @@
fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group
fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero
- SortedSet<Integer> s0 = new TreeSet<Integer>();
+ SortedSet<Integer> s0 = new TreeSet<>();
Integer izero = Integer.valueOf(0);
fRB.fStatusSets.put(s0, izero);
- SortedSet<Integer> s1 = new TreeSet<Integer>();
+ SortedSet<Integer> s1 = new TreeSet<>();
s1.add(izero);
fRB.fStatusSets.put(s0, izero);
}
@@ -986,16 +1035,6 @@
}
sd.fDtran[col] = newVal;
}
- if (sd.fAccepting == duplState) {
- sd.fAccepting = keepState;
- } else if (sd.fAccepting > duplState) {
- sd.fAccepting--;
- }
- if (sd.fLookAhead == duplState) {
- sd.fLookAhead = keepState;
- } else if (sd.fLookAhead > duplState) {
- sd.fLookAhead--;
- }
}
}
@@ -1167,7 +1206,7 @@
// fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
assert(fSafeTable == null);
- fSafeTable = new ArrayList<short[]>();
+ fSafeTable = new ArrayList<>();
for (int row=0; row<numCharClasses + 2; ++row) {
fSafeTable.add(new short[numCharClasses]);
}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
index b9680f3..8b0f525 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@@ -937,9 +937,14 @@
}
}
+ // If we are at the position of the '/' in a look-ahead (hard break) rule;
+ // record the current position, to be returned later, if the full rule matches.
+ // TODO: Move this check before the previous check of fAccepting.
+ // This would enable hard-break rules with no following context.
+ // But there are line break test failures when trying this. Investigate.
+ // Issue ICU-20837
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
- // At the position of a '/' in a look-ahead match. Record it.
int pos = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
// The iterator has been left in the middle of a surrogate pair.
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar
index 298a004..418ee34 100644
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:f9b73d720421a85704fc64aa0949c94d52e450a44af96c715881e9e6ab0fa3e6
-size 12998988
+oid sha256:6b14ef66277d196e8b01365e9426a1585ea25a9cc346b4a454db2ecc157aed41
+size 12995957
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
index 240dbdd..9962f94 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -1991,3 +1991,23 @@
•
•より<400>詳しい<400>こと<400>を<400>お<400>知<400>り<400>に<400>なり<400>たい<400>方<400>は<400>、•Glossary<200>,• •Technical<200> •Introduction<200> •および<400> •Useful<200> •Resources<200>を<400>ご<400>参照<400>くだ<400>さい<400>。•
•</data>
+
+
+#
+# Bug 20303 Multiple Look-ahead rules with similar contexts.
+# Check that samples of such rules are being handled correctly.
+#
+
+<rules>
+!!forward;
+!!quoted_literals_only;
+!!chain;
+[a] [b] / [c] [d];
+[a] [b] / [c] [d] {100};
+[a] [b] / [e] [f] {200};
+[a] [b] / [e] [g] {300};
+[a] [b] [c] [h] {400};
+[x] [a] [b] / [c] [d] {500};
+[y] [a] [b] [c] [d] {600};
+</rules>
+<data>•ab<100>c•d•ab<200>e•f•ab<300>e•g•abch<400>xab<500>c•d•yabcd<600></data>