/*
 *******************************************************************************
 * Copyright (C) 1996-2000, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/tools/normalizer/Attic/NormalizerBuilder.java,v $
 * $Date: 2001/03/15 23:36:02 $
 * $Revision: 1.12 $
 *
 *****************************************************************************************
 */
| |
| package com.ibm.tools.normalizer; |
| |
| import java.io.*; |
| import java.util.*; |
| import com.ibm.text.*; |
| import com.ibm.util.CompactByteArray; |
| import com.ibm.util.CompactCharArray; |
| import com.ibm.util.Utility; |
| import com.ibm.icu.internal.UInfo; |
| |
| public final class NormalizerBuilder |
| { |
| public static void main(String args[]) throws IOException { |
| try { |
| NormalizerBuilder foo = new NormalizerBuilder(args); |
| } catch (Throwable e) { |
| System.err.println(e.getLocalizedMessage()); |
| e.printStackTrace(); |
| System.in.read(); |
| } |
| } |
| |
| private UInfo uinfo; |
| |
| /** |
| * Map char->String. Each entry maps a character with a |
| * decomposition (either canonical or compatibility) to that |
| * decomposition. The decomposition is in canonical order. |
| */ |
| private DecompMap decomps = new DecompMap(); |
| |
| /** |
| * Map of characters whose full canonical decomposition is |
| * DIFFERENT from their full compatibility decomposition. |
| */ |
| private DecompMap explodeCompat = new DecompMap(); |
| |
| /** |
| * Map of characters with a decomposition that are neither |
| * in explodeCompat nor in permutedCompositions. |
| */ |
| private DecompMap explodeOnly = new DecompMap(); |
| |
| /** |
| * Map of String->char of permutations that compose to a |
| * character. This does not include singletons or other |
| * composition exclusions. It is an inverse list, with valid |
| * permutations, for canonical decomposition. |
| */ |
| private CompMap permutedCompositions = new CompMap(); |
| private CompMap binaryCompositions = new CompMap(); |
| |
| /** |
| * A set of characters that form the base of a combining |
| * sequence. |
| */ |
| private CharSet bases = new CharSet(); |
| |
| /** |
| * A set of characters that form the combining character of |
| * a combining sequence. |
| */ |
| private CharSet combining = new CharSet(); |
| |
| private Map pairExplosions = new HashMap(); |
| |
| private boolean fVerbose = false; |
| private boolean fWriteData = false; |
| private boolean fShowSizes = false; |
| private boolean fPrompt = false; |
| private boolean fJava = true; |
| private boolean fCPP = false; |
| private String fOutDir = null; // output directory for either Java or C++ |
| |
| /** |
| * The highest Unicode character that has a canonical |
| * decomposition. (i.e. largest char that can result from a |
| * primary canonical composition.) This is the largest char in |
| * permutedCompositions. |
| */ |
| char largestChar = 0; |
| |
| public NormalizerBuilder(String[] args) throws IOException |
| { |
| // Parse my command line |
| for (int i = 0; i < args.length; i++) |
| { |
| if (args[i].equals("-data")) { |
| uinfo = new UInfo(args[++i], args[++i]); |
| } |
| else if (args[i].equals("-write")) { |
| fWriteData = true; |
| } |
| else if (args[i].equals("-verbose") || args[i]. equals("-v")) { |
| fVerbose = true; |
| } |
| else if (args[i].equals("-size")) { |
| fShowSizes = true; |
| } |
| else if (args[i].equals("-prompt")) { |
| fPrompt = true; |
| } |
| else if (args[i].equals("-java")) { |
| fJava = true; |
| fCPP = false; |
| } |
| else if (args[i].equals("-cpp")) { |
| fCPP = true; |
| fJava = false; |
| } |
| else if (args[i].equals("-outdir")) { |
| fOutDir = args[++i]; |
| } |
| } |
| if (uinfo == null) { |
| uinfo = new UInfo(); |
| } |
| if (fOutDir == null) { |
| fOutDir = fJava ? "src/com/ibm/text/" |
| : "./"; |
| } |
| if (!fOutDir.endsWith("/")) { fOutDir += '/'; } |
| |
| boolean canonicalOnly = true; |
| |
| // Build decomps, a char->String mapping of characters to their |
| // decompositions, either canonical or compatibility. |
| createDecompositions(); |
| |
| outv("\nGenerating permuted compositions..."); |
| |
| // Form the list of all the permuted sequences that are |
| // canonically equivalent to the canonical decompositions. As |
| // a by-product, find out which are not combining character |
| // sequences. |
| |
| for (char ch = 0; ch < 0xFFFF; ch++) { |
| String decomp = decomps.get(ch); |
| |
| if (decomp != null) { |
| boolean done = false; |
| |
| if (!uinfo.getFullDecomposition(ch,true).equals( |
| uinfo.getFullDecomposition(ch,false))) |
| { |
| explodeCompat.put(ch, uinfo.getFullDecomposition(ch, false)); |
| done = true; |
| } |
| // It's always a combining base sequence, so removed last check - liu |
| if (uinfo.hasCanonicalDecomposition(ch) && decomp.length() > 1 |
| && !uinfo.isExcludedComposition(ch) /*&& uinfo.isCBS(decomp)*/) |
| { |
| if (decomp.length() <= 2) { |
| permutedCompositions.put(decomp, ch); |
| } |
| else { |
| /* Create a comprehensive list of |
| * permutations. Assume the first char is a |
| * base char, so don't permute it into the |
| * middle of the string -- just concatenate it |
| * onto the front. However, there may be |
| * embedded base characters, so we do a |
| * further check for canonical decomposition |
| * equivalence below. */ |
| List alternatives = concat(decomp.charAt(0), |
| jumble(decomp.substring(1, decomp.length()))); |
| |
| for (int i = 0; i < alternatives.size(); ++i) |
| { |
| String variant = (String)alternatives.get(i); |
| String normalized = uinfo.fixCanonical(variant); |
| |
| if (normalized.equals(decomp)) { |
| permutedCompositions.put(variant, ch); |
| } |
| } |
| } |
| largestChar = ch; |
| done = true; |
| } |
| if (!done) { |
| explodeOnly.put(ch, decomp); // Disparaged |
| } |
| } |
| } |
| |
| outv("\nLargest composed char: " + Utility.hex(largestChar)); |
| |
| // Form the binary compositions |
| outv("\nGenerating pairwise compositions..."); |
| |
| Iterator list = permutedCompositions.keySet().iterator(); |
| while (list.hasNext()) { |
| String decomp = (String)list.next(); |
| char ch = permutedCompositions.get(decomp); |
| |
| if (decomp.length() > 2) { |
| // |
| // If this is a composition of more than two characters, |
| // see if its initial portion is also a composition. If so, that lets |
| // us build up this composed character iteratively. |
| // |
| for (int i = decomp.length()-1; i > 1; --i) { |
| String partial = decomp.substring(0,i); |
| char partialMap = permutedCompositions.get(partial); |
| if (partialMap != 0) { |
| decomp = partialMap + decomp.substring(i); |
| break; |
| } |
| } |
| } |
| if (decomp.length() <= 2) { |
| binaryCompositions.put(decomp, ch); |
| } else { |
| // |
| // The composition takes more than two characters, and there's |
| // no way to build it up from smaller ones. |
| // |
| if (decomp.equals(uinfo.fixCanonical(decomp))) |
| { |
| // If the decomp is in canonical order, we're in trouble, |
| // since that means there's no way to generate this composed |
| // character from its canonically decomposed equivalent. |
| err("No pairwise compose of " + Utility.hex(decomp) + |
| " > " + Utility.hex(ch) + " " + uinfo.getName(ch,true) ); |
| } |
| else { |
| // If the decomp is *not* in canonical order, it's not as |
| // bad, since composition will still work as long as |
| warn("No pairwise compose of non-canon " + Utility.hex(decomp) + |
| " > " + Utility.hex(ch) + " " + uinfo.getName(ch,true) ); |
| } |
| } |
| |
| bases.add(decomp.charAt(0)); |
| |
| // add to list of all combining characters in composites |
| for (int q = 1; q < decomp.length(); ++q) { |
| combining.add(decomp.charAt(q)); |
| } |
| } |
| |
| |
| // Generate the pairwise explosions, where a composed char + combining char |
| // transforms into a different pair of characters, usually because the |
| // canonical combining classes are reversed. |
| |
| outv("\nGenerating exploding pairs...."); |
| |
| List binaryValues = new ArrayList(binaryCompositions.values()); |
| Collections.sort(binaryValues); |
| |
| for (char addOn = 0; addOn < 0xFFFF; addOn++) { |
| if (combining.contains(addOn)) |
| { |
| list = binaryValues.iterator(); |
| |
| while (list.hasNext()) { |
| MutableChar unichar = (MutableChar)list.next(); |
| String chStr = String.valueOf(unichar.value); |
| String source = chStr + addOn; |
| |
| String comp = binaryComposition(source); |
| |
| if (comp.length() == 1) continue; // don't care if combines |
| if (comp.charAt(0) == addOn || comp.charAt(1) == addOn) continue; // rearranges |
| |
| if (!source.equals(comp)) { |
| String decomp = fullDecomposition(source); |
| pairExplosions.put(source,comp); |
| bases.add(unichar); |
| } |
| } |
| } |
| } |
| |
| buildDecompData(); |
| buildComposeData(); |
| outv("Success!"); |
| |
| if (fPrompt) { |
| System.out.println("\nHit any key to continue..."); |
| System.in.read(); |
| } |
| } |
| |
| public String fullDecomposition(String s) { |
| return fullDecomposition(s, new StringBuffer()).toString(); |
| } |
| |
| public StringBuffer fullDecomposition(char ch, StringBuffer output) { |
| String value = decomps.get(ch); |
| if (value == null) { |
| bubbleAppend(output, ch); |
| } |
| else { |
| bubbleAppend(output, value); |
| } |
| return output; |
| } |
| |
| public StringBuffer fullDecomposition(String s, StringBuffer output) { |
| for (int i = 0; i < s.length(); ++i) { |
| fullDecomposition(s.charAt(i),output); |
| } |
| return output; |
| } |
| |
| public String binaryComposition(String sr) { |
| // set up decomposed string, init variables |
| StringBuffer output = new StringBuffer(); |
| StringBuffer decomp = new StringBuffer(); |
| |
| if (sr.length() == 0) return output.toString(); |
| |
| // First generate the full decomposition of the input string |
| fullDecomposition(sr, decomp); |
| int basePosition = 0; |
| char base = decomp.charAt(0); |
| output.append(base); |
| |
| // handle degenerate case--no base character at start |
| if (uinfo.getCanonicalClass(base) != 0) { |
| // later |
| } |
| |
| // loop through, composing items with base |
| for (int i = 1; i < decomp.length(); ++i) { |
| char ch = decomp.charAt(i); |
| short can = uinfo.getCanonicalClass(ch); |
| |
| char value = binaryCompositions.get(String.valueOf(base) + ch); |
| |
| if (value != 0 && noObstructions(output, basePosition, can)) { |
| base = value; |
| output.setCharAt(basePosition, base); |
| } else if (can == 0) { |
| basePosition = output.length(); |
| base = ch; |
| output.append(ch); |
| } else { |
| bubbleAppend(output, ch, can); |
| } |
| } |
| return output.toString(); |
| } |
| |
| public boolean noObstructions(StringBuffer buffer, int pos, short can) { |
| for (int j = buffer.length()-1; j > pos; --j) { |
| if (can == uinfo.getCanonicalClass(buffer.charAt(j))) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| public void bubbleAppend(StringBuffer buffer, char ch, short can) { |
| for (int j = buffer.length()-1; j >= 0; --j) { |
| if (can >= uinfo.getCanonicalClass(buffer.charAt(j))) { |
| buffer.insert(j + 1, ch); |
| return; |
| } |
| } |
| buffer.insert(0, ch); |
| } |
| |
| public void bubbleAppend(StringBuffer buffer, char ch) { |
| bubbleAppend(buffer, ch, uinfo.getCanonicalClass(ch)); |
| } |
| |
| public void bubbleAppend(StringBuffer buffer, String s) { |
| for (int i = 0; i < s.length(); ++i) { |
| bubbleAppend(buffer, s.charAt(i)); |
| } |
| } |
| |
| String getDecomposition(char ch) { |
| return decomps.get(ch); |
| } |
| |
| |
| /** |
| * Generate a Map of all decompositions in Unicode. The keys in |
| * the map are MutableChar objects, one for each character that |
| * has a decomposition. The values are String objects containing |
| * the full decomposition for the character, in canonical order. |
| */ |
| private void createDecompositions() |
| { |
| outv("\nGenerating Full decompositions..."); |
| StringBuffer temp = new StringBuffer(); |
| |
| short compatCount=0, canonCount=0; |
| |
| for (char ch = 0; ch < 0xFFFF; ++ch) { |
| if (ch >= '\u4E00' && ch <= '\uD7A3') continue; // skip ideos |
| |
| short category = uinfo.getCategory(ch); |
| |
| if (category == uinfo.UNASSIGNED) continue; //skip reserved |
| if (category == uinfo.CONTROL) continue; |
| if (category == uinfo.FORMAT) continue; |
| if (category == uinfo.PRIVATE_USE) continue; |
| if (category == uinfo.SURROGATE) continue; |
| |
| boolean canon = uinfo.hasCanonicalDecomposition(ch); |
| boolean compat = uinfo.hasCompatibilityDecomposition(ch); |
| |
| if (canon) canonCount++; |
| if (compat) compatCount++; |
| |
| if (canon || compat) { |
| String decomp = uinfo.getFullDecomposition(ch, canon); |
| temp.setLength(0); |
| temp.append(decomp); |
| uinfo.fixCanonical(temp); // put into canonical order |
| |
| decomps.put(ch, temp.toString() ); |
| } |
| } |
| } |
| |
| /** |
| * Modify a list in place by prepending the given character to all |
| * of its elements, which are assumed to be strings. |
| */ |
| static List concat(char ch, List a) { |
| for (int i = 0; i < a.size(); ++i) { |
| a.set(i, ch + (String)a.get(i)); |
| } |
| return a; |
| } |
| |
| /** |
| * Return a list of Strings for all possible permutations of the |
| * characters in the input string. |
| */ |
| static List jumble (String source) |
| { |
| ArrayList result = new ArrayList(); |
| if (source.length() == 1) { |
| result.add(source); |
| } else for (int i = 0; i < source.length(); ++i) { |
| result.addAll( concat( source.charAt(i), |
| jumble(source.substring(0,i) |
| + source.substring(i+1,source.length())))); |
| } |
| return result; |
| } |
| |
| static final int STR_INDEX_SHIFT = 2; |
| static final int STR_LENGTH_MASK = 0x0003; |
| |
| static final int DECOMP_RECURSE = 0x00008000; |
| static final int DECOMP_MASK = 0x00007FFF; |
| |
| /** |
| * Generate a new "DecompData.java" that contains the CompactArray definitions |
| * used in the {@link Normalizer.DECOMPOSE} operation. |
| */ |
| void buildDecompData() throws IOException { |
| |
| outv("\nGenerating DecompData.java...."); |
| // |
| // For each Unicode character that has a decomposition, we put its |
| // fully-decomposed form at the end of the "contents" string, followed |
| // by a null, and we put its index in "contents" into the CompactArray. |
| // If it does not have a decomposition, we store a bogus index. |
| // |
| // We do this first for all of the compatibility decompositions, save |
| // the index in MAX_COMPAT, and then do it again for the canonical |
| // decompositions. When the array is used later, any character whose |
| // decomp has an index greater than MAX_COMPAT is a canonical decomp. |
| // |
| int canonIndex = 0; |
| int compatIndex = 0; |
| |
| // Map from Unicode character to replacement string index |
| CompactCharArray offsets = new CompactCharArray((char)0); |
| |
| // We also need a place to store the replacement strings. Add a char at |
| // the front so that "0" won't be the index of any of the replacement strings. |
| StringBuffer replace = new StringBuffer().append("\uffff"); |
| |
| for (char ch = 0; ch < 0xFFFF; ch++) { |
| if (uinfo.hasCompatibilityDecomposition(ch)) { |
| compatIndex = putLength(replace, decomps.get(ch), 0); |
| offsets.setElementAt(ch, (char)compatIndex); |
| } |
| } |
| |
| // Add the canonical decomps. Their indices must be > compatIndex. |
| for (char ch = 0; ch < 0xFFFF; ch++) { |
| if (uinfo.hasCanonicalDecomposition(ch)) { |
| |
| if (ch == 0x0f77) { |
| outv("0F77: decomps.get() = " + Utility.hex(decomps.get(ch))); |
| outv("0F77: fullDecomp = " + Utility.hex(uinfo.getFullDecomposition(ch,false))); |
| } |
| |
| canonIndex = putLength(replace, decomps.get(ch), compatIndex); |
| |
| // If this character's full compatibility decomposition is different from |
| // its canonical decomp, that means one of the characters in its |
| // canonical decomp itself has a compatibility decomp. To deal with this, |
| // we set a bit flag telling the decomposer to recurse on this character. |
| |
| if (!uinfo.getFullDecomposition(ch,true).equals(uinfo.getFullDecomposition(ch,false))) { |
| offsets.setElementAt(ch, (char)(canonIndex | DECOMP_RECURSE)); |
| } else { |
| offsets.setElementAt(ch, (char)canonIndex); |
| } |
| } |
| } |
| |
| // |
| // Now generate another CompactArray containing the combining class of every |
| // character in Unicode |
| // |
| final byte BASE = 0; |
| CompactByteArray canonClasses = new CompactByteArray(BASE); |
| |
| for (char ch = 0; ch < 0xFFFF; ch++) { |
| short canonClass = uinfo.getCanonicalClass(ch); |
| if (canonClass != 0) { |
| canonClasses.setElementAt(ch, (byte)canonClass); |
| } |
| } |
| |
| // Finally, write the data out to a compilable Java source file |
| |
| if (fJava) { |
| String f = fOutDir + "DecompData"; |
| out("Writing " + f); |
| writeDecompData(new JavaWriter(f), |
| canonIndex, compatIndex, BASE, offsets, replace, canonClasses); |
| |
| } |
| |
| if (fCPP) { |
| String f = fOutDir + "dcmpdata"; |
| out("Writing " + f + ".(cpp|h)"); |
| writeDecompData(new CPPWriter(f, "DecompData"), |
| canonIndex, compatIndex, BASE, offsets, replace, canonClasses); |
| } |
| |
| outv("Decomp data: MAX_CANONICAL = " + canonIndex + ", MAX_DECOMP = " + compatIndex); |
| |
| if (fShowSizes) { |
| int offsetSize = offsets.getIndexArray().length * 2 + offsets.getValueArray().length * 2; |
| int canonSize = canonClasses.getIndexArray().length * 2 + canonClasses.getValueArray().length; |
| int replaceLength = replace.length(); |
| |
| outv("Total runtime size of decomp data is " |
| + (offsetSize + canonSize + replaceLength)); |
| |
| outv(" offsets: " + offsetSize); |
| outv(" canonClasses: " + canonSize); |
| outv(" replace: " + replaceLength); |
| } |
| } |
| |
| void writeDecompData(SourceWriter out, int maxCanon, int maxCompat, short BASE, |
| CompactCharArray offsets, StringBuffer contents, |
| CompactByteArray canonClasses) |
| { |
| out.write("MAX_CANONICAL", maxCanon ); |
| out.write("MAX_COMPAT", maxCompat ); |
| out.write("DECOMP_MASK", DECOMP_MASK ); |
| out.write("DECOMP_RECURSE", DECOMP_RECURSE ); |
| out.write("BASE", BASE ); |
| out.write("offsets", offsets ); |
| out.write("contents", contents ); |
| out.write("canonClass", canonClasses ); |
| out.close(); |
| } |
| |
| |
| //========================================================================================== |
| // Methods for generating and writing the composition data |
| // |
| final int TYPE_MASK = 0x0007; |
| final int INDEX_MASK = 0xFFF8; |
| final int INDEX_SHIFT = 3; |
| |
| // MAX_BASES is used to map a 2-diminsional (base,combining) index pair onto a |
| // one-dimensional CompactArray. We could just use baseCount, but making it a power |
| // of two allows slightly better compaction. |
| |
| final int MAX_BASES = 1024; // Product must be <= 64K |
| final int MAX_COMBINE = 65536/MAX_BASES; |
| |
| final char // for character types |
| IGNORE = 0, |
| BASE = 1, |
| EXPLODING_BASE = 2, |
| COMBINING = 3, |
| INITIAL_JAMO = 4, |
| MEDIAL_JAMO = 5, |
| FINAL_JAMO = 6, |
| HANGUL = 7; |
| |
| // These variables actually hold the composition data. |
| short baseCount = 1; // Leave 0 as an invalid index |
| short combineCount = 1; // Leave 0 as an invalid index |
| short nccCount = 0; |
| int maxCompat = 0; |
| int maxCanon = 0; |
| |
| // This array contains types (from the set above) and indices into the "replace" |
| // and "actions" arrays |
| CompactCharArray lookup = new CompactCharArray(IGNORE); |
| |
| // We also need a place to store the strings that result from replacements, |
| // explosions, and combinations. Add a char at the front so that "0" won't |
| // be the index of any of the replacement strings. |
| StringBuffer replace = new StringBuffer().append(" "); |
| |
| // We need to represent each canonical character class as a single bit |
| // so that we can OR together a mask of all combining char classes seen |
| // Build an array that maps from combining class to a compacted integer |
| // from 0..n-1, where n is the number of distinct combining classes. |
| // E.g., in 3.0, there are 53 distinct combining classes. |
| int[] classMap = new int[256]; |
| int[] typeBit; |
| |
| // Build a two-dimensional array of the action to take for each base/combining pair |
| CompactCharArray actions = new CompactCharArray((char)0); |
| |
| char[] actionIndex; |
| |
| /** |
| * Generate a new "ComposeData.java" that contains the CompactArray definitions |
| * used in the {@link Normalizer.COMPOSE} operation. |
| */ |
| void buildComposeData() throws IOException |
| { |
| outv("\nGenerating ComposeData.java...."); |
| |
| BitSet usedIndices = new BitSet(); |
| CharSet explodingBases = new CharSet(); |
| NonComposingCombiningMap nccMap = new NonComposingCombiningMap(); |
| |
| // Find all characters that are both bases *and* have compatibility |
| // decompositions. These are weird |
| for (char ch = 0; ch < 0xFFFF; ch++) { |
| if (bases.contains(ch) && uinfo.hasCompatibilityDecomposition(ch)) { |
| // |
| // Add this character's explosion to the replacement string list. |
| // We're going to make sure that its "base index", i.e. the |
| // index for it in the actions array, is the same as the |
| // explosion's index in the replace string. This lets |
| // us use the same index for the character's two behaviors |
| // |
| int index = put(replace, explodeCompat.get(ch), 0); |
| |
| outv(Utility.hex(ch) + " is base and has compat explosion " |
| + Utility.hex(explodeCompat.get(ch)) ); |
| |
| addChar(lookup, ch, EXPLODING_BASE, index); |
| usedIndices.set(index); |
| explodingBases.add(ch); |
| } |
| } |
| |
| // First add the base characters to the array. |
| // At the same time, compute their indices. |
| // Leave an empty base index of 0 as a placeholder for null operations. |
| // |
| |
| for (char ch = 0; ch < 0xFFFF; ch++) |
| { |
| |
| if (explodingBases.contains(ch)) { |
| continue; |
| } |
| |
| short cclass = uinfo.getCanonicalClass(ch); |
| |
| if (bases.contains(ch)) { |
| // Make sure that we don't use a base index that was already used |
| // for an exploding base character. |
| while (usedIndices.get(baseCount)) { |
| baseCount++; |
| } |
| // Now add the character to lookup as a base |
| addChar(lookup, ch, BASE, baseCount++); |
| } |
| if (combining.contains(ch)) { |
| classMap[cclass] = 1; // Mark this combining class as being used |
| addChar(lookup, ch, COMBINING, combineCount++); |
| } |
| |
| if (ch >= '\u1100' && ch < '\u1160') { |
| addChar(lookup, ch, INITIAL_JAMO, 0); |
| } |
| if (ch >= '\u1161' && ch < '\u11a6') { |
| addChar(lookup, ch, MEDIAL_JAMO, 0); |
| } |
| if (ch >= '\u11a7' && ch < '\u11fa') { |
| addChar(lookup, ch, FINAL_JAMO, 0); |
| } |
| if (ch >= 0xac00 && ch <= 0xd7a4) { |
| addChar(lookup, ch, HANGUL, 0); |
| } |
| |
| // Add explosions for all compatibility decompositions, |
| // including the Jamo --> Conjoining Jamo decomps. |
| // If the canonical decomposition is exactly one character |
| // one (4 hex digits) then we deal with it separately below. |
| if (explodeCompat.contains(ch) && |
| uinfo.getDecomposition(ch).length() != 4) |
| { |
| maxCompat = put(replace, explodeCompat.get(ch), 0); |
| addExplosion(lookup, ch, maxCompat); |
| } |
| } |
| |
| // Now add the explosions resulting from canonical decompositions |
| // These will all have indices greater than "maxCompat" so we can distinguish them. |
| // |
| for (char ch = 0; ch < 0xFFFF; ch++) { |
| short cclass = uinfo.getCanonicalClass(ch); |
| String explosion = null; |
| |
| if (explodeOnly.contains(ch) && uinfo.hasCanonicalDecomposition(ch)) { |
| maxCanon = put(replace, explodeOnly.get(ch), maxCompat); |
| addExplosion(lookup, ch, maxCanon); |
| } |
| |
| // else if (!combining.contains(ch) && cclass != 0 && classMap[cclass] != 0) { |
| // // |
| // // If a combining character didn't happen to end up in one of |
| // // the pairwise combinations or explosions we use but still has |
| // // a combining class that is the same as a character we *do* use, |
| // // we need to save its class so that we don't combine things "past" it. |
| // // |
| // // However, if the character has an explosion we *don't* need it, because |
| // // we'll never see it, only the results of its explosion. |
| // // |
| // addChar(lookup, ch, COMBINING, 0); |
| // nccCount++; |
| // } |
| |
| // I'm rewriting this logic. Having an index of zero means that |
| // the typeBit[index] gets overwritten with multiple different |
| // values. So we must use real index values that are unique |
| // per combining class. Also, it doesn't matter if the class |
| // has been seen or not; we still need to record the character |
| // in order to have its type and class during composition. |
| else if (!combining.contains(ch) && cclass != 0) { |
| // If a combining character didn't happen to end up in one of |
| // the pairwise combinations or explosions we use but still has |
| // a combining class that is the same as a character we *do* use, |
| // we need to save its class. |
| |
| // As our index, use combineCount and up. Reuse values by |
| // mapping them through nccMap, which keeps track of previously |
| // used values and allocates new ones only as needed, starting |
| // with zero. - Liu |
| classMap[cclass] = 1; // Mark this combining class as being used |
| addChar(lookup, ch, COMBINING, combineCount + nccMap.getIndexFor(cclass)); |
| } |
| } |
| |
| nccCount = (short) nccMap.getIndexCount(); // Liu |
| |
| // Remap characters that have a canonical decomposition to a singleton, |
| // and also different compatibility and canonical full decompositions |
| // (that is, also are members of explodeCompat). These characters can't |
| // be exploded to their full decomposition since that breaks canonical |
| // composition (normalization form C). Instead, we place their |
| // singleton decomposition in the table, at the end. This works because |
| // the singleton will get recursively exploded by Normalizer. As of |
| // Unicode 3.0, this fix applies to U+1FFE, 1FFD, 2000, and 2001. - Liu |
| int singleton = replace.length(); |
| for (char ch = 0; ch < 0xFFFF; ch++) { |
| if (!explodingBases.contains(ch) && |
| explodeCompat.contains(ch) && |
| uinfo.getDecomposition(ch).length() == 4) { |
| |
| // There might be a cleaner way to do this, perhaps by folding |
| // this logic into the code above (perhaps calling |
| // addExplosion() instead of addChar()), but I couldn't find it. |
| char remap = (char) |
| Integer.parseInt(uinfo.getDecomposition(ch), 16); |
| |
| int index = put(replace, String.valueOf(remap), singleton); |
| addChar(lookup, ch, EXPLODING_BASE, index); |
| |
| outv("Canonical singleton " + Utility.hex(ch) + |
| " remaps to " + Utility.hex(remap) + " index=" + index); |
| } |
| } |
| |
| // Now run through the combining classes again and assign bit numbers |
| // in the same ascending order as the canonical classes |
| int maskShift = 0; |
| int bit = 0; |
| for (int i = 0; i < 256; i++) { |
| if (classMap[i] != 0) { |
| classMap[i] = ++bit; |
| } |
| } |
| if (bit >= 64) { |
| err(String.valueOf(bit+1) + " combining classes; max is 64"); |
| } |
| outv("# of combining classes is " + (bit+1)); |
| |
| outv("baseCount=" + baseCount + ", combineCount=" + combineCount |
| + ", nccCount=" + nccCount); |
| |
| if (baseCount > MAX_BASES) { |
| err(Integer.toString(baseCount) + " bases, limit is " + MAX_BASES); |
| err(Integer.toString(combineCount) + " combining chars, limit is " + MAX_COMBINE); |
| } |
| |
| // Now build the "actions" array that tells what to do when each base / |
| // combining pair is seen. |
| // |
| // First do character pairs that combine into a single character... |
| // |
| Iterator iter = binaryCompositions.keySet().iterator(); |
| while (iter.hasNext()) { |
| String source = (String)iter.next(); |
| char ch = binaryCompositions.get(source); |
| |
| int baseIndex = lookup.elementAt(source.charAt(0)) >>> INDEX_SHIFT; |
| int combiningIndex = lookup.elementAt(source.charAt(1)) >>> INDEX_SHIFT; |
| |
| actions.setElementAt((char)(baseIndex + MAX_BASES*combiningIndex), ch); |
| } |
| |
| |
| // |
| // Pair explosions: base/combining pairs that explode into something else |
| // We're squeezing the indices for these in between MAX_COMPOSED and 0xFFFF, |
| // which means they can't be indexes into the "replace" string; those are too big. |
| // Instead they're indexes into the "actionIndex" array, which in turn contains |
| // indices in "replace" |
| // |
| actionIndex = new char[ pairExplosions.size() ]; |
| short index = 0; |
| |
| iter = pairExplosions.keySet().iterator(); |
| while (iter.hasNext()) { |
| String source = (String)iter.next(); |
| char base = source.charAt(0); |
| char combining = source.charAt(1); |
| |
| int strIndex = put(replace, (String)pairExplosions.get(source), 0); |
| actionIndex[index] = (char)strIndex; |
| |
| int baseIndex = lookup.elementAt(base) >>> INDEX_SHIFT; |
| int combiningIndex = lookup.elementAt(combining) >>> INDEX_SHIFT; |
| |
| actions.setElementAt((char)(baseIndex + MAX_BASES*combiningIndex), |
| (char)(index + largestChar)); |
| index++; |
| } |
| |
| // Fill in the array that maps from combining class value |
| // to a bit numbe representing the canonical combining class. |
| // That is, map from 0..240 (in 3.0) to 0..52. |
| typeBit = new int[combineCount + nccCount]; |
| |
| for (char ch = 0; ch < 0xFFFF; ch++) { |
| int value = lookup.elementAt(ch); |
| int type = value & TYPE_MASK; |
| |
| if (type == COMBINING) { |
| int ind = value >>> INDEX_SHIFT; |
| int cclass = uinfo.getCanonicalClass(ch); |
| if (typeBit[ind] != 0 && typeBit[ind] != classMap[cclass]) { |
| err("Overwriting typeBit[" + ind + "], was " + |
| typeBit[ind] + ", changing to " + classMap[cclass] + " for class " + cclass); |
| } |
| typeBit[ind] = classMap[cclass]; |
| } |
| } |
| |
| if (fJava) { |
| String f = fOutDir + "ComposeData"; |
| out("Writing " + f); |
| writeComposeData(new JavaWriter(f)); |
| } |
| if (fCPP) { |
| String f = fOutDir + "compdata"; |
| out("Writing " + f + ".(cpp|h)"); |
| writeComposeData(new CPPWriter(f, "ComposeData")); |
| } |
| |
| if (fShowSizes) { |
| int lookupSize = lookup.getIndexArray().length * 2 + lookup.getValueArray().length * 2; |
| int actionSize = actions.getIndexArray().length * 2 + actions.getValueArray().length * 2; |
| int actIndexSize = actionIndex.length * 2; |
| int replaceSize = replace.length(); |
| int typeBitSize = typeBit.length * 2; |
| |
| outv("Total runtime size of compose data is " |
| + (lookupSize + actionSize + actIndexSize + replaceSize + typeBitSize)); |
| |
| outv(" lookup: " + lookupSize); |
| outv(" actions: " + actionSize); |
| outv(" actionIndex: " + actIndexSize); |
| outv(" typeBit: " + typeBitSize); |
| outv(" replace: " + replaceSize); |
| } |
| } |
| |
| void writeComposeData(SourceWriter out) { |
| out.write("BASE_COUNT", baseCount); |
| out.write("COMBINING_COUNT", combineCount); |
| out.write("MAX_COMPAT", maxCompat); |
| out.write("MAX_CANONICAL", maxCanon); |
| |
| out.writeHex("MAX_COMPOSED", largestChar); |
| |
| int maxIndex = replace.length(); |
| out.write("MAX_INDEX", maxIndex ); |
| out.write("INITIAL_JAMO_INDEX", maxIndex + 1); |
| out.write("MEDIAL_JAMO_INDEX", maxIndex + 2); |
| |
| out.write("MAX_BASES", MAX_BASES ); |
| out.write("MAX_COMBINE", MAX_COMBINE); |
| |
| out.writeHex("TYPE_MASK", TYPE_MASK); |
| out.write("INDEX_SHIFT", INDEX_SHIFT); |
| |
| // The character types |
| out.write("IGNORE", (int)IGNORE); |
| out.write("BASE", (int)BASE); |
| out.write("NON_COMPOSING_COMBINING", (int)EXPLODING_BASE); |
| out.write("COMBINING", (int)COMBINING); |
| out.write("INITIAL_JAMO", (int)INITIAL_JAMO); |
| out.write("MEDIAL_JAMO", (int)MEDIAL_JAMO); |
| out.write("FINAL_JAMO", (int)FINAL_JAMO); |
| out.write("HANGUL", (int)HANGUL); |
| |
| out.write("lookup", lookup ); |
| out.write("actions", actions ); |
| out.write("actionIndex", actionIndex ); |
| out.write("replace", replace ); |
| out.write("typeBit", typeBit); |
| |
| out.close(); |
| } |
| |
| void addChar(CompactCharArray lookup, char ch, int type, int index) |
| { |
| // First make sure it's not already present |
| if (lookup.elementAt(ch) != IGNORE) |
| { |
| char oldValue = lookup.elementAt(ch); |
| err(typeName(type) + " char is also " |
| + typeName(oldValue & TYPE_MASK) + ": " |
| + Utility.hex(ch) + " " + uinfo.getName(ch,true)); |
| } |
| else if ((index << INDEX_SHIFT) > 65536) { |
| err("not enough bits: index " + index + " << INDEX_SHIFT = " + (index << INDEX_SHIFT)); |
| } else { |
| lookup.setElementAt(ch, (char)(type | (index << INDEX_SHIFT))); |
| } |
| } |
| |
| void addExplosion(CompactCharArray lookup, char ch, int index) |
| { |
| // First make sure it doesn't already have an index |
| char oldValue = lookup.elementAt(ch); |
| int oldIndex = oldValue >>> INDEX_SHIFT; |
| |
| if (oldValue != IGNORE) { |
| err("Exploding char is already " + typeName(oldValue & TYPE_MASK) |
| + " (index " + oldIndex + "): " |
| + Utility.hex(ch) + " " + uinfo.getName(ch,true)); |
| } |
| |
| if (oldIndex != 0) { |
| err("Exploding char is already " + typeName(oldValue & TYPE_MASK) |
| + " (index " + oldIndex + "): " |
| + Utility.hex(ch) + " " + uinfo.getName(ch,true)); |
| } |
| else if ((index << INDEX_SHIFT) > 65536) { |
| err("not enough bits: index " + index + " << INDEX_SHIFT = " + (index << INDEX_SHIFT)); |
| } else { |
| lookup.setElementAt(ch, (char)((oldValue & ~INDEX_MASK) | (index << INDEX_SHIFT))); |
| } |
| } |
| |
| String typeName(int type) { |
| switch (type) { |
| case IGNORE: return "Ignored"; |
| case BASE: return "Base"; |
| case EXPLODING_BASE: return "Exploding Base"; |
| case COMBINING: return "Combining"; |
| case INITIAL_JAMO: return "Initial Jamo"; |
| case MEDIAL_JAMO: return "Medial Jamo"; |
| case FINAL_JAMO: return "Final Jamo"; |
| case HANGUL: return "Hangul"; |
| default: return "Unknown"; |
| } |
| } |
| |
| |
| static final int put(StringBuffer buf, String str, int minIndex) |
| { |
| str = str + '\u0000'; // Add trailing null |
| |
| int index = buf.toString().indexOf(str); |
| if (index <= minIndex) { |
| index = buf.length(); |
| buf.append(str); |
| } |
| return index; |
| } |
| |
| static final int putLength(StringBuffer buf, String str, int minIndex) { |
| int length = str.length(); |
| |
| if (length >= (1 << STR_INDEX_SHIFT)) { |
| // There's no room to store the length in the index, so |
| // add a null terminator and use a 0 length to flag this |
| str = str + '\u0000'; |
| length = 0; |
| } |
| |
| int index = buf.toString().indexOf(str); |
| if (index <= minIndex) { |
| index = buf.length(); |
| buf.append(str); |
| } |
| return (index << STR_INDEX_SHIFT) | length; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // Output & formatting |
| |
| void out(String str) { |
| System.out.println(str); |
| } |
| void outv(String str) { |
| if (fVerbose) System.out.println(str); |
| } |
| void warn(String str) { |
| System.err.println("Warning: " + str); |
| } |
| void err(String str) { |
| System.err.println("ERROR: " + str); |
| } |
| } |
| |
| //----------------------------------------------------------------------------- |
| // Utility classes |
| //----------------------------------------------------------------------------- |
| |
| class DecompMap extends HashMap { |
| public DecompMap() { |
| } |
| |
| void put(char ch, String value) { |
| put(new MutableChar(ch), value); |
| } |
| |
| String get(char ch) { |
| Object obj = get(probe.set(ch)); |
| return (obj != null) ? (String)obj : null; |
| } |
| |
| boolean contains(char ch) { |
| return containsKey(probe.set(ch)); |
| } |
| |
| MutableChar probe = new MutableChar(' '); |
| } |
| |
| class CompMap extends HashMap { |
| public CompMap() { |
| } |
| |
| void put(String key, char value) { |
| put(key, new MutableChar(value)); |
| } |
| |
| char get(String key) { |
| Object obj = get((Object)key); |
| return (obj != null) ? ((MutableChar)obj).value : 0; |
| } |
| } |
| |
| class CharSet extends HashSet { |
| public CharSet() { |
| } |
| |
| public void add(char ch) { |
| add(new MutableChar(ch)); |
| } |
| |
| public boolean contains(char ch) { |
| return contains(probe.set(ch)); |
| } |
| MutableChar probe = new MutableChar(' '); |
| } |
| |
/**
 * An int->int map that hands out consecutive integers. The first time
 * a key is looked up it is bound to the next unused value, starting
 * from zero; later lookups of the same key return the same value.
 */
class NonComposingCombiningMap {
    /** The value that the next previously unseen key will receive. */
    int index;

    /** The bindings made so far, Integer key to Integer value. */
    Hashtable hash;

    public NonComposingCombiningMap() {
        this.hash = new Hashtable();
        this.index = 0;
    }

    /**
     * Looks up the value bound to a class, binding the class to the next
     * unused value first if it has not been seen before. Values are
     * assigned in the order zero, one, two, and so on.
     */
    public int getIndexFor(int cclass) {
        final Integer key = new Integer(cclass);
        final Object existing = hash.get(key);
        if (existing == null) {
            final int assigned = index;
            hash.put(key, new Integer(assigned));
            index = assigned + 1;
            return assigned;
        }
        return ((Integer) existing).intValue();
    }

    /**
     * Reports how many distinct keys have been bound so far; the values
     * handed out by getIndexFor() are 0..getIndexCount()-1.
     */
    public int getIndexCount() {
        return this.index;
    }
}