blob: f14aa07a03ce43e187a41982aeeb9541680faf38 [file] [log] [blame]
/*
 * Copyright (C) 1996-2000, International Business Machines Corporation and *
 * others. All Rights Reserved. *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/tools/normalizer/Attic/,v $
 * $Date: 2001/09/08 01:18:03 $
 * $Revision: 1.13 $
 */
import java.util.*;
public final class NormalizerBuilder
public static void main(String args[]) throws IOException {
try {
NormalizerBuilder foo = new NormalizerBuilder(args);
foo = null;
} catch (Throwable e) {
private UInfo uinfo;
* Map char->String. Each entry maps a character with a
* decomposition (either canonical or compatibility) to that
* decomposition. The decomposition is in canonical order.
private DecompMap decomps = new DecompMap();
* Map of characters whose full canonical decomposition is
* DIFFERENT from their full compatibility decomposition.
private DecompMap explodeCompat = new DecompMap();
* Map of characters with a decomposition that are neither
* in explodeCompat nor in permutedCompositions.
private DecompMap explodeOnly = new DecompMap();
* Map of String->char of permutations that compose to a
* character. This does not include singletons or other
* composition exclusions. It is an inverse list, with valid
* permutations, for canonical decomposition.
private CompMap permutedCompositions = new CompMap();
private CompMap binaryCompositions = new CompMap();
* A set of characters that form the base of a combining
* sequence.
private CharSet bases = new CharSet();
* A set of characters that form the combining character of
* a combining sequence.
private CharSet combining = new CharSet();
private Map pairExplosions = new HashMap();
private boolean fVerbose = false;
private boolean fWriteData = false;
private boolean fShowSizes = false;
private boolean fPrompt = false;
private boolean fJava = true;
private boolean fCPP = false;
private String fOutDir = null; // output directory for either Java or C++
* The highest Unicode character that has a canonical
* decomposition. (i.e. largest char that can result from a
* primary canonical composition.) This is the largest char in
* permutedCompositions.
char largestChar = 0;
// Constructor: parses the command line, then derives the composition
// tables (permutedCompositions, binaryCompositions, explodeCompat,
// explodeOnly) from the character data in uinfo.
//
// NOTE(review): this text has lost its brace-only lines and several
// statements (marked below). Restore them from version control before
// changing any logic here.
public NormalizerBuilder(String[] args) throws IOException
// Parse my command line
for (int i = 0; i < args.length; i++)
if (args[i].equals("-data")) {
// "-data" consumes the next two arguments.
uinfo = new UInfo(args[++i], args[++i]);
else if (args[i].equals("-write")) {
fWriteData = true;
else if (args[i].equals("-verbose") || args[i]. equals("-v")) {
fVerbose = true;
else if (args[i].equals("-size")) {
fShowSizes = true;
else if (args[i].equals("-prompt")) {
fPrompt = true;
// "-java" and "-cpp" are mutually exclusive; the last one given wins.
else if (args[i].equals("-java")) {
fJava = true;
fCPP = false;
else if (args[i].equals("-cpp")) {
fCPP = true;
fJava = false;
else if (args[i].equals("-outdir")) {
// "-outdir" consumes the next argument.
fOutDir = args[++i];
// No "-data" given: fall back to the no-argument UInfo.
if (uinfo == null) {
uinfo = new UInfo();
// Default output directory depends on the target language.
if (fOutDir == null) {
fOutDir = fJava ? "src/com/ibm/text/"
: "./";
// Later code concatenates file names directly onto fOutDir.
if (!fOutDir.endsWith("/")) { fOutDir += '/'; }
// not used boolean canonicalOnly = true;
// Build decomps, a char->String mapping of characters to their
// decompositions, either canonical or compatibility.
// NOTE(review): no statement follows this comment; presumably a call to
// createDecompositions() belongs here - confirm.
outv("\nGenerating permuted compositions...");
// Form the list of all the permuted sequences that are
// canonically equivalent to the canonical decompositions. As
// a by-product, find out which are not combining character
// sequences.
// The loop stops before U+FFFF (ch < 0xFFFF).
for (char ch = 0; ch < 0xFFFF; ch++) {
String decomp = decomps.get(ch);
if (decomp != null) {
boolean done = false;
// NOTE(review): the argument of equals( is missing on the next line.
if (!uinfo.getFullDecomposition(ch,true).equals(
explodeCompat.put(ch, uinfo.getFullDecomposition(ch, false));
done = true;
// It's always a combining base sequence, so removed last check - liu
if (uinfo.hasCanonicalDecomposition(ch) && decomp.length() > 1
&& !uinfo.isExcludedComposition(ch) /*&& uinfo.isCBS(decomp)*/)
// One- or two-character decompositions have nothing to permute.
if (decomp.length() <= 2) {
permutedCompositions.put(decomp, ch);
else {
/* Create a comprehensive list of
* permutations. Assume the first char is a
* base char, so don't permute it into the
* middle of the string -- just concatenate it
* onto the front. However, there may be
* embedded base characters, so we do a
* further check for canonical decomposition
* equivalence below. */
List alternatives = concat(decomp.charAt(0),
jumble(decomp.substring(1, decomp.length())));
for (int i = 0; i < alternatives.size(); ++i)
String variant = (String)alternatives.get(i);
// Keep only permutations that canonically reorder back to decomp.
String normalized = uinfo.fixCanonical(variant);
if (normalized.equals(decomp)) {
permutedCompositions.put(variant, ch);
largestChar = ch;
done = true;
if (!done) {
explodeOnly.put(ch, decomp); // Disparaged
outv("\nLargest composed char: " + Utility.hex(largestChar));
// Form the binary compositions
outv("\nGenerating pairwise compositions...");
Iterator list = permutedCompositions.keySet().iterator();
while (list.hasNext()) {
// NOTE(review): the cast below has no operand; presumably the next
// element of the iterator - confirm.
String decomp = (String);
char ch = permutedCompositions.get(decomp);
if (decomp.length() > 2) {
// If this is a composition of more than two characters,
// see if its initial portion is also a composition. If so, that lets
// us build up this composed character iteratively.
for (int i = decomp.length()-1; i > 1; --i) {
String partial = decomp.substring(0,i);
// CompMap.get returns 0 when there is no mapping.
char partialMap = permutedCompositions.get(partial);
if (partialMap != 0) {
decomp = partialMap + decomp.substring(i);
if (decomp.length() <= 2) {
binaryCompositions.put(decomp, ch);
} else {
// The composition takes more than two characters, and there's
// no way to build it up from smaller ones.
if (decomp.equals(uinfo.fixCanonical(decomp)))
// If the decomp is in canonical order, we're in trouble,
// since that means there's no way to generate this composed
// character from its canonically decomposed equivalent.
err("No pairwise compose of " + Utility.hex(decomp) +
" > " + Utility.hex(ch) + " " + uinfo.getName(ch,true) );
else {
// If the decomp is *not* in canonical order, it's not as
// bad, since composition will still work as long as
warn("No pairwise compose of non-canon " + Utility.hex(decomp) +
" > " + Utility.hex(ch) + " " + uinfo.getName(ch,true) );
// add to list of all combining characters in composites
// NOTE(review): the body of this loop is missing; nothing here fills
// bases or combining - confirm.
for (int q = 1; q < decomp.length(); ++q) {
// Generate the pairwise explosions, where a composed char + combining char
// transforms into a different pair of characters, usually because the
// canonical combining classes are reversed.
outv("\nGenerating exploding pairs....");
List binaryValues = new ArrayList(binaryCompositions.values());
for (char addOn = 0; addOn < 0xFFFF; addOn++) {
if (combining.contains(addOn))
list = binaryValues.iterator();
while (list.hasNext()) {
// NOTE(review): cast without operand, as above.
MutableChar unichar = (MutableChar);
String chStr = String.valueOf(unichar.value);
String source = chStr + addOn;
String comp = binaryComposition(source);
if (comp.length() == 1) continue; // don't care if combines
if (comp.charAt(0) == addOn || comp.charAt(1) == addOn) continue; // rearranges
if (!source.equals(comp)) {
// NOTE(review): decomp is computed and then immediately cleared, and
// nothing here writes to pairExplosions; presumably the store was lost.
String decomp = fullDecomposition(source);
decomp = null;
if (fPrompt) {
System.out.println("\nHit any key to continue...");;
public String fullDecomposition(String s) {
return fullDecomposition(s, new StringBuffer()).toString();
public StringBuffer fullDecomposition(char ch, StringBuffer output) {
String value = decomps.get(ch);
if (value == null) {
bubbleAppend(output, ch);
else {
bubbleAppend(output, value);
return output;
public StringBuffer fullDecomposition(String s, StringBuffer output) {
for (int i = 0; i < s.length(); ++i) {
return output;
public String binaryComposition(String sr) {
// set up decomposed string, init variables
StringBuffer output = new StringBuffer();
StringBuffer decomp = new StringBuffer();
if (sr.length() == 0) return output.toString();
// First generate the full decomposition of the input string
fullDecomposition(sr, decomp);
int basePosition = 0;
char base = decomp.charAt(0);
// handle degenerate case--no base character at start
if (uinfo.getCanonicalClass(base) != 0) {
// later
// loop through, composing items with base
for (int i = 1; i < decomp.length(); ++i) {
char ch = decomp.charAt(i);
short can = uinfo.getCanonicalClass(ch);
char value = binaryCompositions.get(String.valueOf(base) + ch);
if (value != 0 && noObstructions(output, basePosition, can)) {
base = value;
output.setCharAt(basePosition, base);
} else if (can == 0) {
basePosition = output.length();
base = ch;
} else {
bubbleAppend(output, ch, can);
return output.toString();
public boolean noObstructions(StringBuffer buffer, int pos, short can) {
for (int j = buffer.length()-1; j > pos; --j) {
if (can == uinfo.getCanonicalClass(buffer.charAt(j))) {
return false;
return true;
public void bubbleAppend(StringBuffer buffer, char ch, short can) {
for (int j = buffer.length()-1; j >= 0; --j) {
if (can >= uinfo.getCanonicalClass(buffer.charAt(j))) {
buffer.insert(j + 1, ch);
buffer.insert(0, ch);
public void bubbleAppend(StringBuffer buffer, char ch) {
bubbleAppend(buffer, ch, uinfo.getCanonicalClass(ch));
public void bubbleAppend(StringBuffer buffer, String s) {
for (int i = 0; i < s.length(); ++i) {
bubbleAppend(buffer, s.charAt(i));
String getDecomposition(char ch) {
return decomps.get(ch);
* Generate a Map of all decompositions in Unicode. The keys in
* the map are MutableChar objects, one for each character that
* has a decomposition. The values are String objects containing
* the full decomposition for the character, in canonical order.
private void createDecompositions()
outv("\nGenerating Full decompositions...");
StringBuffer temp = new StringBuffer();
short compatCount=0, canonCount=0;
for (char ch = 0; ch < 0xFFFF; ++ch) {
if (ch >= '\u4E00' && ch <= '\uD7A3') continue; // skip ideos
short category = uinfo.getCategory(ch);
if (category == uinfo.UNASSIGNED) continue; //skip reserved
if (category == uinfo.CONTROL) continue;
if (category == uinfo.FORMAT) continue;
if (category == uinfo.PRIVATE_USE) continue;
if (category == uinfo.SURROGATE) continue;
boolean canon = uinfo.hasCanonicalDecomposition(ch);
boolean compat = uinfo.hasCompatibilityDecomposition(ch);
if (canon) canonCount++;
if (compat) compatCount++;
if (canon || compat) {
String decomp = uinfo.getFullDecomposition(ch, canon);
uinfo.fixCanonical(temp); // put into canonical order
decomps.put(ch, temp.toString() );
* Modify a list in place by prepending the given character to all
* of its elements, which are assumed to be strings.
static List concat(char ch, List a) {
for (int i = 0; i < a.size(); ++i) {
a.set(i, ch + (String)a.get(i));
return a;
* Return a list of Strings for all possible permutations of the
* characters in the input string.
static List jumble (String source)
ArrayList result = new ArrayList();
if (source.length() == 1) {
} else for (int i = 0; i < source.length(); ++i) {
result.addAll( concat( source.charAt(i),
+ source.substring(i+1,source.length()))));
return result;
// Layout of the packed value returned by putLength():
// (index << STR_INDEX_SHIFT) | length.
static final int STR_INDEX_SHIFT = 2;
// presumably the mask that recovers the length bits from a packed value
// (it is not referenced in this file section) - confirm against the reader
static final int STR_LENGTH_MASK = 0x0003;
// Flag OR'ed into an offsets entry by buildDecompData() when the decomposer
// must recurse on the character.
static final int DECOMP_RECURSE = 0x00008000;
// The bits of an offsets entry that remain once DECOMP_RECURSE is excluded.
static final int DECOMP_MASK = 0x00007FFF;
* Generate a new "" that contains the CompactArray definitions
* used in the {@link Normalizer.DECOMPOSE} operation.
void buildDecompData() throws IOException {
// For each Unicode character that has a decomposition, we put its
// fully-decomposed form at the end of the "contents" string, followed
// by a null, and we put its index in "contents" into the CompactArray.
// If it does not have a decomposition, we store a bogus index.
// We do this first for all of the compatibility decompositions, save
// the index in MAX_COMPAT, and then do it again for the canonical
// decompositions. When the array is used later, any character whose
// decomp has an index greater than MAX_COMPAT is a canonical decomp.
int canonIndex = 0;
int compatIndex = 0;
// Map from Unicode character to replacement string index
CompactCharArray offsets = new CompactCharArray((char)0);
// We also need a place to store the replacement strings. Add a char at
// the front so that "0" won't be the index of any of the replacement strings.
StringBuffer replace = new StringBuffer().append("\uffff");
for (char ch = 0; ch < 0xFFFF; ch++) {
if (uinfo.hasCompatibilityDecomposition(ch)) {
compatIndex = putLength(replace, decomps.get(ch), 0);
offsets.setElementAt(ch, (char)compatIndex);
// Add the canonical decomps. Their indices must be > compatIndex.
for (char ch = 0; ch < 0xFFFF; ch++) {
if (uinfo.hasCanonicalDecomposition(ch)) {
if (ch == 0x0f77) {
outv("0F77: decomps.get() = " + Utility.hex(decomps.get(ch)));
outv("0F77: fullDecomp = " + Utility.hex(uinfo.getFullDecomposition(ch,false)));
canonIndex = putLength(replace, decomps.get(ch), compatIndex);
// If this character's full compatibility decomposition is different from
// its canonical decomp, that means one of the characters in its
// canonical decomp itself has a compatibility decomp. To deal with this,
// we set a bit flag telling the decomposer to recurse on this character.
if (!uinfo.getFullDecomposition(ch,true).equals(uinfo.getFullDecomposition(ch,false))) {
offsets.setElementAt(ch, (char)(canonIndex | DECOMP_RECURSE));
} else {
offsets.setElementAt(ch, (char)canonIndex);
// Now generate another CompactArray containing the combining class of every
// character in Unicode
final byte BASE = 0;
CompactByteArray canonClasses = new CompactByteArray(BASE);
for (char ch = 0; ch < 0xFFFF; ch++) {
short canonClass = uinfo.getCanonicalClass(ch);
if (canonClass != 0) {
canonClasses.setElementAt(ch, (byte)canonClass);
// Finally, write the data out to a compilable Java source file
if (fJava) {
String f = fOutDir + "DecompData";
out("Writing " + f);
writeDecompData(new JavaWriter(f),
canonIndex, compatIndex, BASE, offsets, replace, canonClasses);
if (fCPP) {
String f = fOutDir + "dcmpdata";
out("Writing " + f + ".(cpp|h)");
writeDecompData(new CPPWriter(f, "DecompData"),
canonIndex, compatIndex, BASE, offsets, replace, canonClasses);
outv("Decomp data: MAX_CANONICAL = " + canonIndex + ", MAX_DECOMP = " + compatIndex);
if (fShowSizes) {
int offsetSize = offsets.getIndexArray().length * 2 + offsets.getValueArray().length * 2;
int canonSize = canonClasses.getIndexArray().length * 2 + canonClasses.getValueArray().length;
int replaceLength = replace.length();
outv("Total runtime size of decomp data is "
+ (offsetSize + canonSize + replaceLength));
outv(" offsets: " + offsetSize);
outv(" canonClasses: " + canonSize);
outv(" replace: " + replaceLength);
void writeDecompData(SourceWriter out, int maxCanon, int maxCompat, short BASE,
CompactCharArray offsets, StringBuffer contents,
CompactByteArray canonClasses)
out.write("MAX_CANONICAL", maxCanon );
out.write("MAX_COMPAT", maxCompat );
out.write("BASE", BASE );
out.write("offsets", offsets );
out.write("contents", contents );
out.write("canonClass", canonClasses );
// Methods for generating and writing the composition data
final int TYPE_MASK = 0x0007;
final int INDEX_MASK = 0xFFF8;
final int INDEX_SHIFT = 3;
// MAX_BASES is used to map a 2-diminsional (base,combining) index pair onto a
// one-dimensional CompactArray. We could just use baseCount, but making it a power
// of two allows slightly better compaction.
final int MAX_BASES = 1024; // Product must be <= 64K
final int MAX_COMBINE = 65536/MAX_BASES;
final char // for character types
BASE = 1,
// These variables actually hold the composition data.
short baseCount = 1; // Leave 0 as an invalid index
short combineCount = 1; // Leave 0 as an invalid index
short nccCount = 0;
int maxCompat = 0;
int maxCanon = 0;
// This array contains types (from the set above) and indices into the "replace"
// and "actions" arrays
CompactCharArray lookup = new CompactCharArray(IGNORE);
// We also need a place to store the strings that result from replacements,
// explosions, and combinations. Add a char at the front so that "0" won't
// be the index of any of the replacement strings.
StringBuffer replace = new StringBuffer().append(" ");
// We need to represent each canonical character class as a single bit
// so that we can OR together a mask of all combining char classes seen
// Build an array that maps from combining class to a compacted integer
// from 0..n-1, where n is the number of distinct combining classes.
// E.g., in 3.0, there are 53 distinct combining classes.
int[] classMap = new int[256];
int[] typeBit;
// Build a two-dimensional array of the action to take for each base/combining pair
CompactCharArray actions = new CompactCharArray((char)0);
char[] actionIndex;
/**
 * Generate the composition data source ("ComposeData" for Java,
 * "compdata" for C++) that contains the CompactArray definitions
 * used in the {@link Normalizer.COMPOSE} operation.
 */
// Fills the composition tables (lookup, replace, actions, actionIndex,
// typeBit, classMap) and hands them to writeComposeData() for each enabled
// output language.
//
// NOTE(review): this text has lost its brace-only lines and several
// statements (marked below). Restore them from version control before
// changing any logic here.
void buildComposeData() throws IOException
BitSet usedIndices = new BitSet();
CharSet explodingBases = new CharSet();
NonComposingCombiningMap nccMap = new NonComposingCombiningMap();
// Find all characters that are both bases *and* have compatibility
// decompositions. These are weird
for (char ch = 0; ch < 0xFFFF; ch++) {
if (bases.contains(ch) && uinfo.hasCompatibilityDecomposition(ch)) {
// Add this character's explosion to the replacement string list.
// We're going to make sure that its "base index", i.e. the
// index for it in the actions array, is the same as the
// explosion's index in the replace string. This lets
// us use the same index for the character's two behaviors
int index = put(replace, explodeCompat.get(ch), 0);
outv(Utility.hex(ch) + " is base and has compat explosion "
+ Utility.hex(explodeCompat.get(ch)) );
addChar(lookup, ch, EXPLODING_BASE, index);
// NOTE(review): nothing visible adds ch to explodingBases or marks index
// in usedIndices, although both are consulted below - presumably lost.
// First add the base characters to the array.
// At the same time, compute their indices.
// Leave an empty base index of 0 as a placeholder for null operations.
for (char ch = 0; ch < 0xFFFF; ch++)
// NOTE(review): the body of this test is missing; presumably exploding
// bases are skipped because they were handled above - confirm.
if (explodingBases.contains(ch)) {
short cclass = uinfo.getCanonicalClass(ch);
if (bases.contains(ch)) {
// Make sure that we don't use a base index that was already used
// for an exploding base character.
// NOTE(review): the loop body is missing; as shown it cannot advance.
while (usedIndices.get(baseCount)) {
// Now add the character to lookup as a base
addChar(lookup, ch, BASE, baseCount++);
if (combining.contains(ch)) {
classMap[cclass] = 1; // Mark this combining class as being used
addChar(lookup, ch, COMBINING, combineCount++);
// The Jamo ranges below use an exclusive upper bound.
if (ch >= '\u1100' && ch < '\u1160') {
addChar(lookup, ch, INITIAL_JAMO, 0);
if (ch >= '\u1161' && ch < '\u11a6') {
addChar(lookup, ch, MEDIAL_JAMO, 0);
if (ch >= '\u11a7' && ch < '\u11fa') {
addChar(lookup, ch, FINAL_JAMO, 0);
// NOTE(review): this bound is inclusive and ends at U+D7A4, whereas
// createDecompositions() ends its skipped range at U+D7A3 - confirm.
if (ch >= 0xac00 && ch <= 0xd7a4) {
addChar(lookup, ch, HANGUL, 0);
// Add explosions for all compatibility decompositions,
// including the Jamo --> Conjoining Jamo decomps.
// If the canonical decomposition is exactly one character
// one (4 hex digits) then we deal with it separately below.
if (explodeCompat.contains(ch) &&
uinfo.getDecomposition(ch).length() != 4)
maxCompat = put(replace, explodeCompat.get(ch), 0);
addExplosion(lookup, ch, maxCompat);
// Now add the explosions resulting from canonical decompositions
// These will all have indices greater than "maxCompat" so we can distinguish them.
for (char ch = 0; ch < 0xFFFF; ch++) {
short cclass = uinfo.getCanonicalClass(ch);
// not used String explosion = null;
if (explodeOnly.contains(ch) && uinfo.hasCanonicalDecomposition(ch)) {
maxCanon = put(replace, explodeOnly.get(ch), maxCompat);
addExplosion(lookup, ch, maxCanon);
// else if (!combining.contains(ch) && cclass != 0 && classMap[cclass] != 0) {
// //
// // If a combining character didn't happen to end up in one of
// // the pairwise combinations or explosions we use but still has
// // a combining class that is the same as a character we *do* use,
// // we need to save its class so that we don't combine things "past" it.
// //
// // However, if the character has an explosion we *don't* need it, because
// // we'll never see it, only the results of its explosion.
// //
// addChar(lookup, ch, COMBINING, 0);
// nccCount++;
// }
// I'm rewriting this logic. Having an index of zero means that
// the typeBit[index] gets overwritten with multiple different
// values. So we must use real index values that are unique
// per combining class. Also, it doesn't matter if the class
// has been seen or not; we still need to record the character
// in order to have its type and class during composition.
else if (!combining.contains(ch) && cclass != 0) {
// If a combining character didn't happen to end up in one of
// the pairwise combinations or explosions we use but still has
// a combining class that is the same as a character we *do* use,
// we need to save its class.
// As our index, use combineCount and up. Reuse values by
// mapping them through nccMap, which keeps track of previously
// used values and allocates new ones only as needed, starting
// with zero. - Liu
classMap[cclass] = 1; // Mark this combining class as being used
addChar(lookup, ch, COMBINING, combineCount + nccMap.getIndexFor(cclass));
nccCount = (short) nccMap.getIndexCount(); // Liu
// Remap characters that have a canonical decomposition to a singleton,
// and also different compatibility and canonical full decompositions
// (that is, also are members of explodeCompat). These characters can't
// be exploded to their full decomposition since that breaks canonical
// composition (normalization form C). Instead, we place their
// singleton decomposition in the table, at the end. This works because
// the singleton will get recursively exploded by Normalizer. As of
// Unicode 3.0, this fix applies to U+1FFE, 1FFD, 2000, and 2001. - Liu
// Everything appended from here on has an index of at least `singleton`.
int singleton = replace.length();
for (char ch = 0; ch < 0xFFFF; ch++) {
if (!explodingBases.contains(ch) &&
explodeCompat.contains(ch) &&
uinfo.getDecomposition(ch).length() == 4) {
// There might be a cleaner way to do this, perhaps by folding
// this logic into the code above (perhaps calling
// addExplosion() instead of addChar()), but I couldn't find it.
// The 4-character decomposition is parsed as one hex code point.
char remap = (char)
Integer.parseInt(uinfo.getDecomposition(ch), 16);
int index = put(replace, String.valueOf(remap), singleton);
addChar(lookup, ch, EXPLODING_BASE, index);
outv("Canonical singleton " + Utility.hex(ch) +
" remaps to " + Utility.hex(remap) + " index=" + index);
// Now run through the combining classes again and assign bit numbers
// in the same ascending order as the canonical classes
// not used int maskShift = 0;
int bit = 0;
for (int i = 0; i < 256; i++) {
// Replace the "used" marker (1) with a 1-based bit number.
if (classMap[i] != 0) {
classMap[i] = ++bit;
if (bit >= 64) {
err(String.valueOf(bit+1) + " combining classes; max is 64");
outv("# of combining classes is " + (bit+1));
outv("baseCount=" + baseCount + ", combineCount=" + combineCount
+ ", nccCount=" + nccCount);
if (baseCount > MAX_BASES) {
err(Integer.toString(baseCount) + " bases, limit is " + MAX_BASES);
// NOTE(review): the next err() has no visible guard; presumably a
// combineCount > MAX_COMBINE test was lost - confirm.
err(Integer.toString(combineCount) + " combining chars, limit is " + MAX_COMBINE);
// Now build the "actions" array that tells what to do when each base /
// combining pair is seen.
// First do character pairs that combine into a single character...
Iterator iter = binaryCompositions.keySet().iterator();
while (iter.hasNext()) {
// NOTE(review): the cast below has no operand; presumably the next
// element of iter - confirm.
String source = (String);
char ch = binaryCompositions.get(source);
int baseIndex = lookup.elementAt(source.charAt(0)) >>> INDEX_SHIFT;
int combiningIndex = lookup.elementAt(source.charAt(1)) >>> INDEX_SHIFT;
// The (base, combining) pair is flattened into one index.
actions.setElementAt((char)(baseIndex + MAX_BASES*combiningIndex), ch);
// Pair explosions: base/combining pairs that explode into something else
// We're squeezing the indices for these in between MAX_COMPOSED and 0xFFFF,
// which means they can't be indexes into the "replace" string; those are too big.
// Instead they're indexes into the "actionIndex" array, which in turn contains
// indices in "replace"
actionIndex = new char[ pairExplosions.size() ];
short index = 0;
iter = pairExplosions.keySet().iterator();
while (iter.hasNext()) {
// NOTE(review): cast without operand, as above.
String source = (String);
char base = source.charAt(0);
char combining = source.charAt(1);
int strIndex = put(replace, (String)pairExplosions.get(source), 0);
actionIndex[index] = (char)strIndex;
int baseIndex = lookup.elementAt(base) >>> INDEX_SHIFT;
int combiningIndex = lookup.elementAt(combining) >>> INDEX_SHIFT;
// NOTE(review): index is never incremented in the visible code, so every
// pair would reuse actionIndex[0] - presumably an increment was lost.
actions.setElementAt((char)(baseIndex + MAX_BASES*combiningIndex),
(char)(index + largestChar));
// Fill in the array that maps from combining class value
// to a bit numbe representing the canonical combining class.
// That is, map from 0..240 (in 3.0) to 0..52.
typeBit = new int[combineCount + nccCount];
for (char ch = 0; ch < 0xFFFF; ch++) {
int value = lookup.elementAt(ch);
int type = value & TYPE_MASK;
if (type == COMBINING) {
int ind = value >>> INDEX_SHIFT;
int cclass = uinfo.getCanonicalClass(ch);
// Two characters sharing an index must agree on the class bit.
if (typeBit[ind] != 0 && typeBit[ind] != classMap[cclass]) {
err("Overwriting typeBit[" + ind + "], was " +
typeBit[ind] + ", changing to " + classMap[cclass] + " for class " + cclass);
typeBit[ind] = classMap[cclass];
if (fJava) {
String f = fOutDir + "ComposeData";
out("Writing " + f);
writeComposeData(new JavaWriter(f));
if (fCPP) {
String f = fOutDir + "compdata";
out("Writing " + f + ".(cpp|h)");
writeComposeData(new CPPWriter(f, "ComposeData"));
if (fShowSizes) {
int lookupSize = lookup.getIndexArray().length * 2 + lookup.getValueArray().length * 2;
int actionSize = actions.getIndexArray().length * 2 + actions.getValueArray().length * 2;
int actIndexSize = actionIndex.length * 2;
int replaceSize = replace.length();
// NOTE(review): typeBit is an int[] but is counted at 2 bytes per entry -
// confirm how the runtime stores it.
int typeBitSize = typeBit.length * 2;
outv("Total runtime size of compose data is "
+ (lookupSize + actionSize + actIndexSize + replaceSize + typeBitSize));
outv(" lookup: " + lookupSize);
outv(" actions: " + actionSize);
outv(" actionIndex: " + actIndexSize);
outv(" typeBit: " + typeBitSize);
outv(" replace: " + replaceSize);
void writeComposeData(SourceWriter out) {
out.write("BASE_COUNT", baseCount);
out.write("COMBINING_COUNT", combineCount);
out.write("MAX_COMPAT", maxCompat);
out.write("MAX_CANONICAL", maxCanon);
out.writeHex("MAX_COMPOSED", largestChar);
int maxIndex = replace.length();
out.write("MAX_INDEX", maxIndex );
out.write("INITIAL_JAMO_INDEX", maxIndex + 1);
out.write("MEDIAL_JAMO_INDEX", maxIndex + 2);
out.write("MAX_BASES", MAX_BASES );
out.writeHex("TYPE_MASK", TYPE_MASK);
// The character types
out.write("IGNORE", (int)IGNORE);
out.write("BASE", (int)BASE);
out.write("COMBINING", (int)COMBINING);
out.write("INITIAL_JAMO", (int)INITIAL_JAMO);
out.write("MEDIAL_JAMO", (int)MEDIAL_JAMO);
out.write("FINAL_JAMO", (int)FINAL_JAMO);
out.write("HANGUL", (int)HANGUL);
out.write("lookup", lookup );
out.write("actions", actions );
out.write("actionIndex", actionIndex );
out.write("replace", replace );
out.write("typeBit", typeBit);
void addChar(CompactCharArray lookup, char ch, int type, int index)
// First make sure it's not already present
if (lookup.elementAt(ch) != IGNORE)
char oldValue = lookup.elementAt(ch);
err(typeName(type) + " char is also "
+ typeName(oldValue & TYPE_MASK) + ": "
+ Utility.hex(ch) + " " + uinfo.getName(ch,true));
else if ((index << INDEX_SHIFT) > 65536) {
err("not enough bits: index " + index + " << INDEX_SHIFT = " + (index << INDEX_SHIFT));
} else {
lookup.setElementAt(ch, (char)(type | (index << INDEX_SHIFT)));
// Stores an explosion index for ch in lookup, keeping the non-index bits
// of the existing entry (oldValue & ~INDEX_MASK).
//
// NOTE(review): brace-only lines are missing, so it is not certain whether
// the two "already" checks below are siblings or nested - confirm against
// version control before editing.
void addExplosion(CompactCharArray lookup, char ch, int index)
// First make sure it doesn't already have an index
char oldValue = lookup.elementAt(ch);
int oldIndex = oldValue >>> INDEX_SHIFT;
if (oldValue != IGNORE) {
err("Exploding char is already " + typeName(oldValue & TYPE_MASK)
+ " (index " + oldIndex + "): "
+ Utility.hex(ch) + " " + uinfo.getName(ch,true));
if (oldIndex != 0) {
err("Exploding char is already " + typeName(oldValue & TYPE_MASK)
+ " (index " + oldIndex + "): "
+ Utility.hex(ch) + " " + uinfo.getName(ch,true));
// NOTE(review): a shifted index of exactly 65536 passes this test and is
// then truncated to 0 by the char cast below; the bound looks off by one.
else if ((index << INDEX_SHIFT) > 65536) {
err("not enough bits: index " + index + " << INDEX_SHIFT = " + (index << INDEX_SHIFT));
} else {
lookup.setElementAt(ch, (char)((oldValue & ~INDEX_MASK) | (index << INDEX_SHIFT)));
String typeName(int type) {
switch (type) {
case IGNORE: return "Ignored";
case BASE: return "Base";
case EXPLODING_BASE: return "Exploding Base";
case COMBINING: return "Combining";
case INITIAL_JAMO: return "Initial Jamo";
case MEDIAL_JAMO: return "Medial Jamo";
case FINAL_JAMO: return "Final Jamo";
case HANGUL: return "Hangul";
default: return "Unknown";
static final int put(StringBuffer buf, String str, int minIndex)
str = str + '\u0000'; // Add trailing null
int index = buf.toString().indexOf(str);
if (index <= minIndex) {
index = buf.length();
return index;
static final int putLength(StringBuffer buf, String str, int minIndex) {
int length = str.length();
if (length >= (1 << STR_INDEX_SHIFT)) {
// There's no room to store the length in the index, so
// add a null terminator and use a 0 length to flag this
str = str + '\u0000';
length = 0;
int index = buf.toString().indexOf(str);
if (index <= minIndex) {
index = buf.length();
return (index << STR_INDEX_SHIFT) | length;
// Output & formatting
void out(String str) {
void outv(String str) {
if (fVerbose) System.out.println(str);
void warn(String str) {
System.err.println("Warning: " + str);
void err(String str) {
System.err.println("ERROR: " + str);
// Utility classes
class DecompMap extends HashMap {
public DecompMap() {
void put(char ch, String value) {
put(new MutableChar(ch), value);
String get(char ch) {
Object obj = get(probe.set(ch));
return (obj != null) ? (String)obj : null;
boolean contains(char ch) {
return containsKey(probe.set(ch));
MutableChar probe = new MutableChar(' ');
class CompMap extends HashMap {
public CompMap() {
void put(String key, char value) {
put(key, new MutableChar(value));
char get(String key) {
Object obj = get((Object)key);
return (obj != null) ? ((MutableChar)obj).value : 0;
class CharSet extends HashSet {
public CharSet() {
public void add(char ch) {
add(new MutableChar(ch));
public boolean contains(char ch) {
return contains(probe.set(ch));
MutableChar probe = new MutableChar(' ');
/**
 * An int->int map. Each time a non-existent key is looked up,
 * create a new mapping to the next available integer value.
 */
class NonComposingCombiningMap {
    /** The value the next new mapping will receive. */
    int index;
    /** Integer key -> Integer assigned value. */
    Hashtable hash;

    public NonComposingCombiningMap() {
        index = 0;
        hash = new Hashtable();
    }

    /**
     * Return the existing mapping of class. If no such mapping
     * exists, create one and return it. New mappings map to
     * zero, then one, etc.
     */
    public int getIndexFor(int cclass) {
        Integer key = new Integer(cclass);
        Integer known = (Integer) hash.get(key);
        if (known == null) {
            // First sighting: hand out the next free value.
            hash.put(key, new Integer(index));
            return index++;
        }
        return known.intValue();
    }

    /**
     * Return the number of mappings made so far. That is, getIndexFor()
     * has returned integers 0..getIndexCount()-1.
     */
    public int getIndexCount() {
        return index;
    }
}