| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| // |
| // Copyright (C) 2002-2014, International Business Machines Corporation and others. |
| // All Rights Reserved. |
| // |
| // |
| |
| package com.ibm.icu.text; |
| |
| import java.io.DataOutputStream; |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.nio.charset.StandardCharsets; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import com.ibm.icu.impl.Assert; |
| import com.ibm.icu.impl.ICUBinary; |
| import com.ibm.icu.impl.ICUDebug; |
| import com.ibm.icu.impl.RBBIDataWrapper; |
| |
| class RBBIRuleBuilder { |
| // This is the main class for building (compiling) break rules into the tables |
| // required by the runtime RBBI engine. |
| // |
| |
| String fDebugEnv; // controls debug trace output |
| String fRules; // The rule string that we are compiling |
| StringBuilder fStrippedRules; // The rule string, with comments stripped. |
| RBBIRuleScanner fScanner; // The scanner. |
| |
| |
| // |
| // There are four separate parse trees generated, one for each of the |
| // forward rules, reverse rules, safe forward rules and safe reverse rules. |
| // This array references the root of each of the trees. |
| // Only fForwardTree data is actually used to generate a state table. |
| // The other three are retained for back compatibility with old rule files, |
| // which may have safe and reverse rules. These are still parsed. |
| // |
| RBBINode[] fTreeRoots = new RBBINode[4]; |
| static final int fForwardTree = 0; // Indexes into the above fTreeRoots array |
| static final int fReverseTree = 1; // for each of the trees. |
| static final int fSafeFwdTree = 2; // (in C, these are pointer variables and |
| static final int fSafeRevTree = 3; // there is no array.) |
| int fDefaultTree = fForwardTree; // For rules not qualified with a ! |
| // the tree to which they belong to. |
| |
| boolean fChainRules; // True for chained Unicode TR style rules. |
| // False for traditional regexp rules. |
| |
| boolean fLBCMNoChain; // True: suppress chaining of rules on |
| // chars with LineBreak property == CM. |
| |
| boolean fLookAheadHardBreak; // True: Look ahead matches cause an |
| // immediate break, no continuing for the |
| // longest match. |
| |
| RBBISetBuilder fSetBuilder; // Set and Character Category builder. |
| List<RBBINode> fUSetNodes; // Vector of all used nodes. |
| RBBITableBuilder fForwardTable; // State transition tables |
| |
| // |
| // Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.). |
| // |
| Map<Set<Integer>, Integer> fStatusSets = new HashMap<>(); // Status value sets encountered so far. |
| // Map Key is the set of values. |
| // Map Value is the runtime array index. |
| |
| List<Integer> fRuleStatusVals; // List of Integer objects. Has same layout as the |
| // runtime array of status (tag) values - |
| // number of values in group 1 |
| // first status value in group 1 |
| // 2nd status value in group 1 |
| // ... |
| // number of values in group 2 |
| // first status value in group 2 |
| // etc. |
| // |
| // Error codes from ICU4C. |
| // using these simplified the porting, and consolidated the |
| // creation of Java exceptions |
| // |
| static final int U_BRK_ERROR_START = 0x10200; |
| /**< Start of codes indicating Break Iterator failures */ |
| |
| static final int U_BRK_INTERNAL_ERROR = 0x10201; |
| /**< An internal error (bug) was detected. */ |
| |
| static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202; |
| /**< Hex digits expected as part of a escaped char in a rule. */ |
| |
| static final int U_BRK_SEMICOLON_EXPECTED = 0x10203; |
| /**< Missing ';' at the end of a RBBI rule. */ |
| |
| static final int U_BRK_RULE_SYNTAX = 0x10204; |
| /**< Syntax error in RBBI rule. */ |
| |
| static final int U_BRK_UNCLOSED_SET = 0x10205; |
| /**< UnicodeSet writing an RBBI rule missing a closing ']'. */ |
| |
| static final int U_BRK_ASSIGN_ERROR = 0x10206; |
| /**< Syntax error in RBBI rule assignment statement. */ |
| |
| static final int U_BRK_VARIABLE_REDFINITION = 0x10207; |
| /**< RBBI rule $Variable redefined. */ |
| |
| static final int U_BRK_MISMATCHED_PAREN = 0x10208; |
| /**< Mis-matched parentheses in an RBBI rule. */ |
| |
| static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209; |
| /**< Missing closing quote in an RBBI rule. */ |
| |
| static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a; |
| /**< Use of an undefined $Variable in an RBBI rule. */ |
| |
| static final int U_BRK_INIT_ERROR = 0x1020b; |
| /**< Initialization failure. Probable missing ICU Data. */ |
| |
| static final int U_BRK_RULE_EMPTY_SET = 0x1020c; |
| /**< Rule contains an empty Unicode Set. */ |
| |
| static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d; |
| /**< !!option in RBBI rules not recognized. */ |
| |
| static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e; |
| /**< The {nnn} tag on a rule is mal formed */ |
| static final int U_BRK_MALFORMED_SET = 0x1020f; |
| |
| static final int U_BRK_ERROR_LIMIT = 0x10210; |
| /**< This must always be the last value to indicate the limit for Break Iterator failures */ |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Constructor. |
| // |
| //---------------------------------------------------------------------------------------- |
| RBBIRuleBuilder(String rules) |
| { |
| fDebugEnv = ICUDebug.enabled("rbbi") ? |
| ICUDebug.value("rbbi") : null; |
| fRules = rules; |
| fStrippedRules = new StringBuilder(rules); |
| fUSetNodes = new ArrayList<>(); |
| fRuleStatusVals = new ArrayList<>(); |
| fScanner = new RBBIRuleScanner(this); |
| fSetBuilder = new RBBISetBuilder(this); |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // flattenData() - Collect up the compiled RBBI rule data and put it into |
| // the format for saving in ICU data files, |
| // |
| // See the ICU4C file common/rbidata.h for a detailed description. |
| // |
| //---------------------------------------------------------------------------------------- |
| static final int align8(int i) |
| { |
| return (i + 7) & 0xfffffff8; |
| } |
| |
| void flattenData(OutputStream os) throws IOException { |
| DataOutputStream dos = new DataOutputStream(os); |
| int i; |
| |
| // Remove whitespace from the rules to make it smaller. |
| // The rule parser has already removed comments. |
| String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString()); |
| |
| // Calculate the size of each section in the data in bytes. |
| // Sizes here are padded up to a multiple of 8 for better memory alignment. |
| // Sections sizes actually stored in the header are for the actual data |
| // without the padding. |
| // |
| int headerSize = RBBIDataWrapper.DH_SIZE * 4; // align8(sizeof(RBBIDataHeader)); |
| int forwardTableSize = align8(fForwardTable.getTableSize()); |
| int reverseTableSize = align8(fForwardTable.getSafeTableSize()); |
| int trieSize = align8(fSetBuilder.getTrieSize()); |
| int statusTableSize = align8(fRuleStatusVals.size() * 4); |
| |
| byte[] strippedRulesUTF8 = strippedRules.getBytes(StandardCharsets.UTF_8); |
| int rulesSize = align8(strippedRulesUTF8.length + 1); |
| |
| int totalSize = headerSize |
| + forwardTableSize |
| + reverseTableSize |
| + statusTableSize + trieSize + rulesSize; |
| int outputPos = 0; // Track stream position, starting from RBBIDataHeader. |
| |
| // |
| // Write out an ICU Data Header |
| // |
| ICUBinary.writeHeader(RBBIDataWrapper.DATA_FORMAT, RBBIDataWrapper.FORMAT_VERSION, 0, dos); |
| |
| // |
| // Write out the RBBIDataHeader |
| // |
| int[] header = new int[RBBIDataWrapper.DH_SIZE]; // sizeof struct RBBIDataHeader |
| header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0; |
| header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION; |
| header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections. |
| header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); |
| |
| header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable |
| header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen |
| |
| header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable |
| header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen |
| |
| header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_RTABLE] |
| + header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie |
| header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen |
| header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE] |
| + trieSize; |
| header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen |
| header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE] |
| + statusTableSize; |
| header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRulesUTF8.length; |
| for (i = 0; i < header.length; i++) { |
| dos.writeInt(header[i]); |
| outputPos += 4; |
| } |
| |
| // Write out the actual state tables. |
| RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable(); |
| assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]); |
| outputPos += table.put(dos); |
| |
| table = fForwardTable.exportSafeTable(); |
| Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]); |
| outputPos += table.put(dos); |
| |
| // write out the Trie table |
| Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]); |
| fSetBuilder.serializeTrie(os); |
| outputPos += header[RBBIDataWrapper.DH_TRIELEN]; |
| while (outputPos % 8 != 0) { // pad to an 8 byte boundary |
| dos.write(0); |
| outputPos += 1; |
| } |
| |
| // Write out the status {tag} table. |
| Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]); |
| for (Integer val : fRuleStatusVals) { |
| dos.writeInt(val.intValue()); |
| outputPos += 4; |
| } |
| |
| while (outputPos % 8 != 0) { // pad to an 8 byte boundary |
| dos.write(0); |
| outputPos += 1; |
| } |
| |
| // Write out the stripped rules (rules with extra spaces removed |
| // These go last in the data area, even though they are not last in the header. |
| Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]); |
| dos.write(strippedRulesUTF8, 0, strippedRulesUTF8.length); |
| dos.write(0); // Null termination |
| outputPos += strippedRulesUTF8.length + 1; |
| while (outputPos % 8 != 0) { // pad to an 8 byte boundary |
| dos.write(0); |
| outputPos += 1; |
| } |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // compileRules compile source rules, placing the compiled form into a output stream |
| // The compiled form is identical to that from ICU4C (Big Endian). |
| // |
| //---------------------------------------------------------------------------------------- |
| static void compileRules(String rules, OutputStream os) throws IOException |
| { |
| // |
| // Read the input rules, generate a parse tree, symbol table, |
| // and list of all Unicode Sets referenced by the rules. |
| // |
| RBBIRuleBuilder builder = new RBBIRuleBuilder(rules); |
| builder.build(os); |
| } |
| |
| /** |
| * Compile rules to the binary form, write that to an ouput stream. |
| * |
| */ |
| void build(OutputStream os) throws IOException { |
| fScanner.parse(); |
| |
| // |
| // UnicodeSet processing. |
| // Munge the Unicode Sets to create an initial set of character categories. |
| // |
| fSetBuilder.buildRanges(); |
| |
| // |
| // Generate the DFA state transition table. |
| // |
| fForwardTable = new RBBITableBuilder(this, fForwardTree); |
| fForwardTable.buildForwardTable(); |
| // State table and character category optimization. |
| // Merge equivalent rows and columns. |
| // Note that this process alters the the initial set of character categories, |
| // causing the representation of UnicodeSets in the parse tree to become invalid. |
| optimizeTables(); |
| fForwardTable.buildSafeReverseTable(); |
| |
| |
| if (fDebugEnv != null |
| && fDebugEnv.indexOf("states") >= 0) { |
| fForwardTable.printStates(); |
| fForwardTable.printRuleStatusTable(); |
| fForwardTable.printReverseTable(); |
| } |
| // Generate the mapping tables (TRIE) from input code points to |
| // the character categories. |
| // |
| fSetBuilder.buildTrie(); |
| // |
| // Package up the compiled data, writing it to an output stream |
| // in the serialization format. This is the same as the ICU4C runtime format. |
| // |
| flattenData(os); |
| } |
| |
| static class IntPair { |
| int first = 0; |
| int second = 0; |
| IntPair() {}; |
| IntPair(int f, int s) { |
| first = f; |
| second = s; |
| } |
| } |
| |
| void optimizeTables() { |
| boolean didSomething; |
| do { |
| didSomething = false; |
| // Begin looking for duplicates with char class 3. |
| // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, |
| // and should not have other categories merged into them. |
| IntPair duplPair = new IntPair(3, 0); |
| while (fForwardTable.findDuplCharClassFrom(duplPair)) { |
| fSetBuilder.mergeCategories(duplPair); |
| fForwardTable.removeColumn(duplPair.second); |
| didSomething = true; |
| } |
| while (fForwardTable.removeDuplicateStates() > 0) { |
| didSomething = true; |
| }; |
| } while (didSomething); |
| } |
| } |