/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/tools/normalizer/Attic/NormalizerBuilder.java,v $
* $Date: 2000/03/10 04:17:56 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
package com.ibm.tools.normalizer;
import java.io.*;
import java.util.*;
import com.ibm.text.*;
import com.ibm.util.CompactByteArray;
import com.ibm.util.CompactCharArray;
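/**
 * NormalizerBuilder is a command-line tool that reads a UnicodeData file and
 * generates the DecompData and ComposeData source files used by Normalizer.
 * Recognized options are -data <UnicodeData file>, -write, -verbose, -size,
 * -prompt, -java (the default) and -cpp.
 *
 * A typical invocation (the path below is illustrative only) is:
 *
 *    java com.ibm.tools.normalizer.NormalizerBuilder -data UnicodeData.txt -verbose -java
 */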
public final class NormalizerBuilder
{
public static void main(String args[]) throws IOException {
try {
new NormalizerBuilder(args);
} catch (Throwable e) {
System.err.println(e.getLocalizedMessage());
e.printStackTrace();
System.in.read();
}
}
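// Working state built up from the UnicodeData file:
//   decomps              full decomposition (in canonical order) for each decomposable char
//   explodeCompat        chars whose full compatibility decomposition differs from the canonical one
//   explodeOnly          decomposable chars that are only exploded, never recomposed
//   permutedCompositions maps each canonically equivalent permutation of a decomposition to its composed char
//   binaryCompositions   maps a two-char base+combining sequence to the char it composes into
//   bases, combining     the base and combining chars that take part in compositions
//   pairExplosions       base+combining pairs that rewrite to a different pair of chars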
private UInfo uinfo;
private DecompMap decomps = new DecompMap();
private DecompMap explodeCompat = new DecompMap();
private DecompMap explodeOnly = new DecompMap();
private CompMap permutedCompositions = new CompMap();
private CompMap binaryCompositions = new CompMap();
private CharSet bases = new CharSet();
private CharSet combining = new CharSet();
private Map pairExplosions = new HashMap();
private boolean fVerbose = false;
private boolean fWriteData = false;
private boolean fShowSizes = false;
private boolean fPrompt = false;
private boolean fJava = true;
private boolean fCPP = false;
/**
* The highest Unicode character that results from a primary canonical
* composition, i.e. the largest composed char the composer can produce.
*/
char largestChar = 0;
public NormalizerBuilder(String[] args) throws IOException
{
// Parse my command line
for (int i = 0; i < args.length; i++)
{
if (args[i].equals("-data")) {
uinfo = new UInfo(args[++i]);
}
else if (args[i].equals("-write")) {
fWriteData = true;
}
else if (args[i].equals("-verbose")) {
fVerbose = true;
}
else if (args[i].equals("-size")) {
fShowSizes = true;
}
else if (args[i].equals("-prompt")) {
fPrompt = true;
}
else if (args[i].equals("-java")) {
fJava = true;
fCPP = false;
}
else if (args[i].equals("-cpp")) {
fCPP = true;
fJava = false;
}
}
if (uinfo == null) {
uinfo = new UInfo("../src/data/unicode/UnicodeData.txt");
}
boolean canonicalOnly = true;
createDecompositions();
out("\nGenerating permuted compositions...");
// Form the list of all the permuted sequences that are canonically
// equivalent to the canonical decompositions.
// As a by-product, find out which are not combining character sequences.
for (char ch = 0; ch < 0xFFFF; ch++) {
String decomp = decomps.get(ch);
if (decomp != null) {
boolean done = false;
if (!uinfo.getFullDecomposition(ch,true).equals(
uinfo.getFullDecomposition(ch,false)))
{
explodeCompat.put(ch, uinfo.getFullDecomposition(ch, false));
done = true;
}
if (uinfo.hasCanonicalDecomposition(ch) && decomp.length() > 1
&& !uinfo.isExcludedComposition(ch) && uinfo.isCBS(decomp))
{
if (decomp.length() <= 2) {
permutedCompositions.put(decomp, ch);
}
else {
List alternatives = concat(decomp.charAt(0),
jumble(decomp.substring(1, decomp.length())));
for (int i = 0; i < alternatives.size(); ++i)
{
String variant = (String)alternatives.get(i);
String normalized = uinfo.fixCanonical(variant);
if (normalized.equals(decomp)) {
permutedCompositions.put(variant, ch);
}
}
}
largestChar = ch;
done = true;
}
if (!done) {
explodeOnly.put(ch, decomp); // Disparaged
}
}
}
out("\nLargest composed char: " + uinfo.hex(largestChar));
// Form the binary compositions
out("\nGenerating pairwise compositions...");
Iterator list = permutedCompositions.keySet().iterator();
while (list.hasNext()) {
String decomp = (String)list.next();
char ch = permutedCompositions.get(decomp);
if (decomp.length() > 2) {
//
// If this is a composition of more than two characters,
// see if its initial portion is also a composition. If so, that lets
// us build up this composed character iteratively.
//
for (int i = decomp.length()-1; i > 1; --i) {
String partial = decomp.substring(0,i);
char partialMap = permutedCompositions.get(partial);
if (partialMap != 0) {
decomp = partialMap + decomp.substring(i);
break;
}
}
}
if (decomp.length() <= 2) {
binaryCompositions.put(decomp, ch);
} else {
//
// The composition takes more than two characters, and there's
// no way to build it up from smaller ones.
//
if (decomp.equals(uinfo.fixCanonical(decomp)))
{
// If the decomp is in canonical order, we're in trouble,
// since that means there's no way to generate this composed
// character from its canonically decomposed equivalent.
err("No pairwise compose of " + uinfo.hex(decomp) +
" > " + uinfo.hex(ch) + " " + uinfo.getName(ch,true) );
}
else {
// If the decomp is *not* in canonical order, it's not as bad, since
// composition will still work as long as the canonically ordered form
// of this sequence can still be composed pairwise.
warn("No pairwise compose of non-canon " + uinfo.hex(decomp) +
" > " + uinfo.hex(ch) + " " + uinfo.getName(ch,true) );
}
}
bases.add(decomp.charAt(0));
// add to list of all combining characters in composites
for (int q = 1; q < decomp.length(); ++q) {
combining.add(decomp.charAt(q));
}
}
// Generate the pairwise explosions, where a composed char + combining char
// transforms into a different pair of characters, usually because the
// canonical combining classes are reversed.
out("\nGenerating exploding pairs....");
List binaryValues = new ArrayList(binaryCompositions.values());
Collections.sort(binaryValues);
for (char addOn = 0; addOn < 0xFFFF; addOn++) {
if (combining.contains(addOn))
{
list = binaryValues.iterator();
while (list.hasNext()) {
MutableChar unichar = (MutableChar)list.next();
String chStr = String.valueOf(unichar.value);
String source = chStr + addOn;
String comp = binaryComposition(source);
if (comp.length() == 1) continue; // don't care if combines
if (comp.charAt(0) == addOn || comp.charAt(1) == addOn) continue; // rearranges
if (!source.equals(comp)) {
String decomp = fullDecomposition(source);
pairExplosions.put(source,comp);
bases.add(unichar);
}
}
}
}
buildDecompData();
buildComposeData();
out("Success!");
if (fPrompt) {
System.out.println("\nHit any key to continue...");
System.in.read();
}
}
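/**
 * Return the full decomposition of every character in the given string,
 * appended together in canonical order.
 */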
public String fullDecomposition(String s) {
return fullDecomposition(s, new StringBuffer()).toString();
}
public StringBuffer fullDecomposition(char ch, StringBuffer output) {
String value = decomps.get(ch);
if (value == null) {
bubbleAppend(output, ch);
}
else {
bubbleAppend(output, value);
}
return output;
}
public StringBuffer fullDecomposition(String s, StringBuffer output) {
for (int i = 0; i < s.length(); ++i) {
fullDecomposition(s.charAt(i),output);
}
return output;
}
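/**
 * Compose a string by taking its full decomposition and then recombining
 * base + combining pairs using the binaryCompositions table. For example,
 * once the tables have been built, "A" followed by U+030A (combining ring
 * above) should compose to U+00C5.
 */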
public String binaryComposition(String sr) {
// set up decomposed string, init variables
StringBuffer output = new StringBuffer();
StringBuffer decomp = new StringBuffer();
if (sr.length() == 0) return output.toString();
// First generate the full decomposition of the input string
fullDecomposition(sr, decomp);
int basePosition = 0;
char base = decomp.charAt(0);
output.append(base);
// handle degenerate case--no base character at start
if (uinfo.getCanonicalClass(base) != 0) {
// later
}
// loop through, composing items with base
for (int i = 1; i < decomp.length(); ++i) {
char ch = decomp.charAt(i);
short can = uinfo.getCanonicalClass(ch);
char value = binaryCompositions.get(String.valueOf(base) + ch);
if (value != 0 && noObstructions(output, basePosition, can)) {
base = value;
output.setCharAt(basePosition, base);
} else if (can == 0) {
basePosition = output.length();
base = ch;
output.append(ch);
} else {
bubbleAppend(output, ch, can);
}
}
return output.toString();
}
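/**
 * Return true if no character after position pos in the buffer has the same
 * canonical class as "can", i.e. nothing blocks a new combining character of
 * that class from combining with the base at pos.
 */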
public boolean noObstructions(StringBuffer buffer, int pos, short can) {
for (int j = buffer.length()-1; j > pos; --j) {
if (can == uinfo.getCanonicalClass(buffer.charAt(j))) {
return false;
}
}
return true;
}
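/**
 * Insert a character into the buffer so that combining characters remain
 * sorted by canonical class: it is placed just after the last character
 * whose canonical class is less than or equal to "can".
 */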
public void bubbleAppend(StringBuffer buffer, char ch, short can) {
for (int j = buffer.length()-1; j >= 0; --j) {
if (can >= uinfo.getCanonicalClass(buffer.charAt(j))) {
buffer.insert(j + 1, ch);
return;
}
}
buffer.insert(0, ch);
}
public void bubbleAppend(StringBuffer buffer, char ch) {
bubbleAppend(buffer, ch, uinfo.getCanonicalClass(ch));
}
public void bubbleAppend(StringBuffer buffer, String s) {
for (int i = 0; i < s.length(); ++i) {
bubbleAppend(buffer, s.charAt(i));
}
}
String getDecomposition(char ch) {
return decomps.get(ch);
}
/**
* Populate the decomps map with the full decompositions of all Unicode characters.
* The keys in the map are MutableChar objects, one for each character that has a decomposition.
* The values are String objects containing the full decomposition for the character,
* in canonical order.
*/
private void createDecompositions()
{
out("\nGenerating Full decompositions...");
StringBuffer temp = new StringBuffer();
short compatCount=0, canonCount=0;
for (char ch = 0; ch < 0xFFFF; ++ch) {
if (ch >= '\u4E00' && ch <= '\uD7A3') continue; // skip CJK ideographs through Hangul syllables
short category = uinfo.getCategory(ch);
if (category == uinfo.UNASSIGNED) continue; //skip reserved
if (category == uinfo.CONTROL) continue;
if (category == uinfo.FORMAT) continue;
if (category == uinfo.PRIVATE_USE) continue;
if (category == uinfo.SURROGATE) continue;
boolean canon = uinfo.hasCanonicalDecomposition(ch);
if (uinfo.hasCanonicalDecomposition(ch)) canonCount++;
if (uinfo.hasCompatibilityDecomposition(ch)) compatCount++;
if (canon || uinfo.hasCompatibilityDecomposition(ch)) {
String decomp = uinfo.getFullDecomposition(ch, canon);
temp.setLength(0);
temp.append(decomp);
uinfo.fixCanonical(temp);
decomps.put(ch, temp.toString() );
}
}
}
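/**
 * Prepend the given character to every string in the list (in place) and
 * return the same list.
 */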
static List concat(char ch, List a) {
for (int i = 0; i < a.size(); ++i) {
a.set(i, ch + (String)a.get(i));
}
return a;
}
/**
* Return a list of Strings for all possible permutations of the
* characters in the input string.
*/
static List jumble (String source)
{
ArrayList result = new ArrayList();
if (source.length() == 1) {
result.add(source);
} else for (int i = 0; i < source.length(); ++i) {
result.addAll( concat( source.charAt(i),
jumble(source.substring(0,i)
+ source.substring(i+1,source.length()))));
}
return result;
}
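// Layout of the entries written into the decomposition "offsets" array:
// the index of the replacement string is stored in the bits above
// STR_INDEX_SHIFT and its length in the low STR_LENGTH_MASK bits; a stored
// length of 0 means the string is null-terminated instead (see putLength).
// DECOMP_RECURSE is OR'd into an entry when the decomposer must recurse on
// the replacement string, and DECOMP_MASK strips that flag off again.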
static final int STR_INDEX_SHIFT = 2;
static final int STR_LENGTH_MASK = 0x0003;
static final int DECOMP_RECURSE = 0x00008000;
static final int DECOMP_MASK = 0x00007FFF;
/**
* Generate a new "DecompData.java" that contains the CompactArray definitions
* used in the {@link Normalizer.DECOMPOSE} operation.
*/
void buildDecompData() throws IOException {
out("\nGenerating DecompData.java....");
//
// For each Unicode character that has a decomposition, we put its
// fully-decomposed form at the end of the "contents" string, followed
// by a null, and we put its index in "contents" into the CompactArray.
// If it does not have a decomposition, we store a bogus index.
//
// We do this first for all of the compatibility decompositions, save
// the index in MAX_COMPAT, and then do it again for the canonical
// decompositions. When the array is used later, any character whose
// decomp has an index greater than MAX_COMPAT is a canonical decomp.
//
int canonIndex = 0;
int compatIndex = 0;
// Map from Unicode character to replacement string index
CompactCharArray offsets = new CompactCharArray((char)0);
// We also need a place to store the replacement strings. Add a char at
// the front so that "0" won't be the index of any of the replacement strings.
StringBuffer replace = new StringBuffer().append("\uffff");
for (char ch = 0; ch < 0xFFFF; ch++) {
if (uinfo.hasCompatibilityDecomposition(ch)) {
compatIndex = putLength(replace, decomps.get(ch), 0);
offsets.setElementAt(ch, (char)compatIndex);
}
}
// Add the canonical decomps. Their indices must be > compatIndex.
for (char ch = 0; ch < 0xFFFF; ch++) {
if (uinfo.hasCanonicalDecomposition(ch)) {
if (ch == 0x0f77) {
out("0F77: decomps.get() = " + uinfo.hex(decomps.get(ch)));
out("0F77: fullDecomp = " + uinfo.hex(uinfo.getFullDecomposition(ch,false)));
}
canonIndex = putLength(replace, decomps.get(ch), compatIndex);
// If this character's full compatibility decomposition is different from
// its canonical decomp, that means one of the characters in its
// canonical decomp itself has a compatibility decomp. To deal with this,
// we set a bit flag telling the decomposer to recurse on this character.
if (!uinfo.getFullDecomposition(ch,true).equals(uinfo.getFullDecomposition(ch,false))) {
offsets.setElementAt(ch, (char)(canonIndex | DECOMP_RECURSE));
} else {
offsets.setElementAt(ch, (char)canonIndex);
}
}
}
//
// Now generate another CompactArray containing the combining class of every
// character in Unicode
//
final byte BASE = 0;
CompactByteArray canonClasses = new CompactByteArray(BASE);
for (char ch = 0; ch < 0xFFFF; ch++) {
short canonClass = uinfo.getCanonicalClass(ch);
if (canonClass != 0) {
canonClasses.setElementAt(ch, (byte)canonClass);
}
}
// Finally, write the data out to a compilable Java source file
if (fJava) {
writeDecompData(new JavaWriter("../src/com/ibm/text/DecompData"),
canonIndex, compatIndex, BASE, offsets, replace, canonClasses);
}
if (fCPP) {
writeDecompData(new CPPWriter("/intlwork/source/collate/CPP/dcmpdata", "DecompData"),
canonIndex, compatIndex, BASE, offsets, replace, canonClasses);
}
out("Decomp data: MAX_CANONICAL = " + canonIndex + ", MAX_DECOMP = " + compatIndex);
if (fShowSizes) {
int offsetSize = offsets.getIndexArray().length * 2 + offsets.getValueArray().length * 2;
int canonSize = canonClasses.getIndexArray().length * 2 + canonClasses.getValueArray().length;
int replaceLength = replace.length();
out("Total runtime size of decomp data is "
+ (offsetSize + canonSize + replaceLength));
out(" offsets: " + offsetSize);
out(" canonClasses: " + canonSize);
out(" replace: " + replaceLength);
}
}
void writeDecompData(SourceWriter out, int maxCanon, int maxCompat, short BASE,
CompactCharArray offsets, StringBuffer contents,
CompactByteArray canonClasses)
{
out.write("MAX_CANONICAL", maxCanon );
out.write("MAX_COMPAT", maxCompat );
out.write("DECOMP_MASK", DECOMP_MASK );
out.write("DECOMP_RECURSE", DECOMP_RECURSE );
out.write("BASE", BASE );
out.write("offsets", offsets );
out.write("contents", contents );
out.write("canonClass", canonClasses );
out.close();
}
//==========================================================================================
// Methods for generating and writing the composition data
//
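// Each entry in the "lookup" array packs a character type (the low bits
// selected by TYPE_MASK) together with an index into the other tables
// (the bits above INDEX_SHIFT).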
final int TYPE_MASK = 0x0007;
final int INDEX_MASK = 0xFFF8;
final int INDEX_SHIFT = 3;
// MAX_BASES is used to map a 2-dimensional (base,combining) index pair onto a
// one-dimensional CompactArray. We could just use baseCount, but making it a power
// of two allows slightly better compaction.
final int MAX_BASES = 1024; // Product must be <= 64K
final int MAX_COMBINE = 65536/MAX_BASES;
final char // for character types
IGNORE = 0,
BASE = 1,
EXPLODING_BASE = 2,
COMBINING = 3,
INITIAL_JAMO = 4,
MEDIAL_JAMO = 5,
FINAL_JAMO = 6,
HANGUL = 7;
// These variables actually hold the composition data.
short baseCount = 1; // Leave 0 as an invalid index
short combineCount = 1; // Leave 0 as an invalid index
short nccCount = 0;
int maxCompat = 0;
int maxCanon = 0;
// This array contains types (from the set above) and indices into the "replace"
// and "actions" arrays
CompactCharArray lookup = new CompactCharArray(IGNORE);
// We also need a place to store the strings that result from replacements,
// explosions, and combinations. Add a char at the front so that "0" won't
// be the index of any of the replacement strings.
StringBuffer replace = new StringBuffer().append(" ");
// We need to represent each canonical character class as a single bit
// so that we can OR together a mask of all combining char classes seen
// Build an array that maps from combining class to bit mask.
int[] classMap = new int[256];
int[] typeMask;
// Build a two-dimensional array of the action to take for each base/combining pair
CompactCharArray actions = new CompactCharArray((char)0);
char[] actionIndex;
/**
* Generate a new "ComposeData.java" that contains the CompactArray definitions
* used in the {@link Normalizer.COMPOSE} operation.
*/
void buildComposeData() throws IOException
{
out("\nGenerating ComposeData.java....");
BitSet usedIndices = new BitSet();
CharSet explodingBases = new CharSet();
// Find all characters that are both bases *and* have compatibility
// decompositions. These need to be handled specially.
for (char ch = 0; ch < 0xFFFF; ch++) {
if (bases.contains(ch) && uinfo.hasCompatibilityDecomposition(ch)) {
//
// Add this character's explosion to the replacement string list.
// We're going to make sure that its "base index", i.e. the
// index for it in the actions array, is the same as the
// explosion's index in the replace string. This lets
// us use the same index for the character's two behaviors
//
int index = put(replace, explodeCompat.get(ch), 0);
out(uinfo.hex(ch) + " is base and has compat explosion "
+ uinfo.hex(explodeCompat.get(ch)) );
addChar(lookup, ch, EXPLODING_BASE, index);
usedIndices.set(index);
explodingBases.add(ch);
}
}
// First add the base characters to the array.
// At the same time, compute their indices.
// Leave an empty base index of 0 as a placeholder for null operations.
//
for (char ch = 0; ch < 0xFFFF; ch++)
{
if (explodingBases.contains(ch)) {
continue;
}
short cclass = uinfo.getCanonicalClass(ch);
if (bases.contains(ch)) {
// Make sure that we don't use a base index that was already used
// for an exploding base character.
while (usedIndices.get(baseCount)) {
baseCount++;
}
// Now add the character to lookup as a base
addChar(lookup, ch, BASE, baseCount++);
}
if (combining.contains(ch)) {
classMap[cclass] = 1; // Mark this combining class as being used
addChar(lookup, ch, COMBINING, combineCount++);
}
if (ch >= '\u1100' && ch < '\u1160') {
addChar(lookup, ch, INITIAL_JAMO, 0);
}
if (ch >= '\u1161' && ch < '\u11a6') {
addChar(lookup, ch, MEDIAL_JAMO, 0);
}
if (ch >= '\u11a7' && ch < '\u11fa') {
addChar(lookup, ch, FINAL_JAMO, 0);
}
if (ch >= 0xac00 && ch <= 0xd7a3) {
addChar(lookup, ch, HANGUL, 0);
}
// Add explosions for all compatibility decompositions,
// including the Jamo --> Conjoining Jamo decomps.
if (explodeCompat.contains(ch))
{
maxCompat = put(replace, explodeCompat.get(ch), 0);
addExplosion(lookup, ch, maxCompat);
}
}
// Now add the explosions resulting from canonical decompositions
// These will all have indices greater than "maxCompat" so we can distinguish them.
//
for (char ch = 0; ch < 0xFFFF; ch++) {
short cclass = uinfo.getCanonicalClass(ch);
if (explodeOnly.contains(ch) && uinfo.hasCanonicalDecomposition(ch)) {
maxCanon = put(replace, explodeOnly.get(ch), maxCompat);
addExplosion(lookup, ch, maxCanon);
}
else if (!combining.contains(ch) && cclass != 0 && classMap[cclass] != 0) {
//
// If a combining character didn't happen to end up in one of
// the pairwise combinations or explosions we use but still has
// a combining class that is the same as a character we *do* use,
// we need to save its class so that we don't combine things "past" it.
//
// However, if the character has an explosion we *don't* need it, because
// we'll never see it, only the results of its explosion.
//
addChar(lookup, ch, COMBINING, 0);
nccCount++;
}
}
// Now run through the combining classes again and assign bitmasks
// in the same ascending order as the canonical classes
int maskShift = 0;
for (int i = 0; i < 256; i++) {
if (classMap[i] != 0) {
classMap[i] = (1 << (maskShift++));
}
}
if (maskShift > 32) {
err(Integer.toString(maskShift) + " combining classes; max is 32");
}
out("# of combining classes is " + maskShift);
out("baseCount=" + baseCount + ", combineCount=" + combineCount
+ ", nccCount=" + nccCount);
if (baseCount > MAX_BASES) {
err(Integer.toString(baseCount) + " bases, limit is " + MAX_BASES);
err(Integer.toString(combineCount) + " combining chars, limit is " + MAX_COMBINE);
}
// Now build the "actions" array that tells what to do when each base /
// combining pair is seen.
//
// First do character pairs that combine into a single character...
//
Iterator iter = binaryCompositions.keySet().iterator();
while (iter.hasNext()) {
String source = (String)iter.next();
char ch = binaryCompositions.get(source);
int baseIndex = lookup.elementAt(source.charAt(0)) >>> INDEX_SHIFT;
int combiningIndex = lookup.elementAt(source.charAt(1)) >>> INDEX_SHIFT;
actions.setElementAt((char)(baseIndex + MAX_BASES*combiningIndex), ch);
}
//
// Pair explosions: base/combining pairs that explode into something else
// We're squeezing the indices for these in between MAX_COMPOSED and 0xFFFF,
// which means they can't be indexes into the "replace" string; those are too big.
// Instead they're indexes into the "actionIndex" array, which in turn contains
// indices in "replace"
//
actionIndex = new char[ pairExplosions.size() ];
short index = 0;
iter = pairExplosions.keySet().iterator();
while (iter.hasNext()) {
String source = (String)iter.next();
char base = source.charAt(0);
char combining = source.charAt(1);
int strIndex = put(replace, (String)pairExplosions.get(source), 0);
actionIndex[index] = (char)strIndex;
int baseIndex = lookup.elementAt(base) >>> INDEX_SHIFT;
int combiningIndex = lookup.elementAt(combining) >>> INDEX_SHIFT;
actions.setElementAt((char)(baseIndex + MAX_BASES*combiningIndex),
(char)(index + largestChar));
index++;
}
// Fill in the "type mask" array that maps from combining character index
// to a bit mask representing the canonical combining class
typeMask = new int[combineCount + nccCount];
for (char ch = 0; ch < 0xFFFF; ch++) {
int value = lookup.elementAt(ch);
int type = value & TYPE_MASK;
if (type == COMBINING) {
int ind = value >>> INDEX_SHIFT;
int cclass = uinfo.getCanonicalClass(ch);
typeMask[ind] = classMap[cclass];
}
}
if (fJava) {
writeComposeData(new JavaWriter("../src/com/ibm/text/ComposeData"));
}
if (fCPP) {
writeComposeData(new CPPWriter("/intlwork/source/collate/CPP/compdata", "ComposeData"));
}
if (fShowSizes) {
int lookupSize = lookup.getIndexArray().length * 2 + lookup.getValueArray().length * 2;
int actionSize = actions.getIndexArray().length * 2 + actions.getValueArray().length * 2;
int actIndexSize = actionIndex.length * 2;
int replaceSize = replace.length();
int typeMaskSize = typeMask.length * 2;
out("Total runtime size of compose data is "
+ (lookupSize + actionSize + actIndexSize + replaceSize + typeMaskSize));
out(" lookup: " + lookupSize);
out(" actions: " + actionSize);
out(" actionIndex: " + actIndexSize);
out(" typeMask: " + typeMaskSize);
out(" replace: " + replaceSize);
}
}
void writeComposeData(SourceWriter out) {
out.write("BASE_COUNT", baseCount);
out.write("COMBINING_COUNT", combineCount);
out.write("MAX_COMPAT", maxCompat);
out.write("MAX_CANON", maxCanon);
out.writeHex("MAX_COMPOSED", largestChar);
int maxIndex = replace.length();
out.write("MAX_INDEX", maxIndex );
out.write("INITIAL_JAMO_INDEX", maxIndex + 1);
out.write("MEDIAL_JAMO_INDEX", maxIndex + 2);
out.write("MAX_BASES", MAX_BASES );
out.write("MAX_COMBINE", MAX_COMBINE);
out.writeHex("TYPE_MASK", TYPE_MASK);
out.write("INDEX_SHIFT", INDEX_SHIFT);
// The character types
out.write("IGNORE", (int)IGNORE);
out.write("BASE", (int)BASE);
out.write("EXPLODING_BASE", (int)EXPLODING_BASE);
out.write("COMBINING", (int)COMBINING);
out.write("INITIAL_JAMO", (int)INITIAL_JAMO);
out.write("MEDIAL_JAMO", (int)MEDIAL_JAMO);
out.write("FINAL_JAMO", (int)FINAL_JAMO);
out.write("HANGUL", (int)HANGUL);
out.write("lookup", lookup );
out.write("actions", actions );
out.write("actionIndex", actionIndex );
out.write("replace", replace );
out.write("typeMask", typeMask );
out.close();
}
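/**
 * Store the type and index for a character in the lookup array, reporting an
 * error if the character already has an entry or if the index is too large
 * to fit above INDEX_SHIFT.
 */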
void addChar(CompactCharArray lookup, char ch, int type, int index)
{
// First make sure it's not already present
if (lookup.elementAt(ch) != IGNORE)
{
char oldValue = lookup.elementAt(ch);
err(typeName(type) + " char is also "
+ typeName(oldValue & TYPE_MASK) + ": "
+ uinfo.hex(ch) + " " + uinfo.getName(ch,true));
}
else if ((index << INDEX_SHIFT) >= 65536) {
err("not enough bits: index " + index + " << INDEX_SHIFT = " + (index << INDEX_SHIFT));
} else {
lookup.setElementAt(ch, (char)(type | (index << INDEX_SHIFT)));
}
}
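/**
 * Store an explosion index in a character's lookup entry, keeping its
 * existing type bits. Errors are reported if the character already has a
 * type or a non-zero index, or if the index is too large to fit.
 */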
void addExplosion(CompactCharArray lookup, char ch, int index)
{
// First make sure it doesn't already have an index
char oldValue = lookup.elementAt(ch);
int oldIndex = oldValue >>> INDEX_SHIFT;
if (oldValue != IGNORE) {
err("Exploding char is already " + typeName(oldValue & TYPE_MASK)
+ " (index " + oldIndex + "): "
+ uinfo.hex(ch) + " " + uinfo.getName(ch,true));
}
if (oldIndex != 0) {
err("Exploding char is already " + typeName(oldValue & TYPE_MASK)
+ " (index " + oldIndex + "): "
+ uinfo.hex(ch) + " " + uinfo.getName(ch,true));
}
else if ((index << INDEX_SHIFT) >= 65536) {
err("not enough bits: index " + index + " << INDEX_SHIFT = " + (index << INDEX_SHIFT));
} else {
lookup.setElementAt(ch, (char)((oldValue & ~INDEX_MASK) | (index << INDEX_SHIFT)));
}
}
String typeName(int type) {
switch (type) {
case IGNORE: return "Ignored";
case BASE: return "Base";
case EXPLODING_BASE: return "Exploding Base";
case COMBINING: return "Combining";
case INITIAL_JAMO: return "Initial Jamo";
case MEDIAL_JAMO: return "Medial Jamo";
case FINAL_JAMO: return "Final Jamo";
case HANGUL: return "Hangul";
default: return "Unknown";
}
}
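/**
 * Append the null-terminated string to the buffer if it does not already
 * occur at an index greater than minIndex, and return the index at which it
 * starts.
 */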
static final int put(StringBuffer buf, String str, int minIndex)
{
str = str + '\u0000'; // Add trailing null
int index = buf.toString().indexOf(str);
if (index <= minIndex) {
index = buf.length();
buf.append(str);
}
return index;
}
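/**
 * Like put(), but the returned value packs the string's index (above
 * STR_INDEX_SHIFT) together with its length in the low bits; strings too
 * long for the length field are stored null-terminated with a length of 0.
 */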
static final int putLength(StringBuffer buf, String str, int minIndex) {
int length = str.length();
if (length >= (1 << STR_INDEX_SHIFT)) {
// There's no room to store the length in the index, so
// add a null terminator and use a 0 length to flag this
str = str + '\u0000';
length = 0;
}
int index = buf.toString().indexOf(str);
if (index <= minIndex) {
index = buf.length();
buf.append(str);
}
return (index << STR_INDEX_SHIFT) | length;
}
//--------------------------------------------------------------------------------
// Source file headers
//
static final String kCHeader =
"/*\n"
+" * (C) Copyright IBM Corp. 1997-1998 - All Rights Reserved\n"
+" *\n"
+" * The program is provided 'as is' without any warranty express or\n"
+" * implied, including the warranty of non-infringement and the implied\n"
+" * warranties of merchantibility and fitness for a particular purpose.\n"
+" * IBM will not be liable for any damages suffered by you as a result\n"
+" * of using the Program. In no event will IBM be liable for any\n"
+" * special, indirect or consequential damages or lost profits even if\n"
+" * IBM has been advised of the possibility of their occurrence. IBM\n"
+" * will not be liable for any third party claims against you.\n"
+" */\n"
+ "// This class is MACHINE GENERATED. Run NormalizerBuilder to regenerate.\n"
+"\n";
void out(String str) {
if (fVerbose) System.out.println(str);
}
void warn(String str) {
System.err.println("Warning: " + str);
}
void err(String str) {
System.err.println("ERROR: " + str);
}
}
//-----------------------------------------------------------------------------
// Utility classes
//-----------------------------------------------------------------------------
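// Small typed wrappers around HashMap and HashSet that let the builder use
// primitive chars as keys and values via MutableChar objects.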
class DecompMap extends HashMap {
public DecompMap() {
}
void put(char ch, String value) {
put(new MutableChar(ch), value);
}
String get(char ch) {
Object obj = get(probe.set(ch));
return (obj != null) ? (String)obj : null;
}
boolean contains(char ch) {
return containsKey(probe.set(ch));
}
MutableChar probe = new MutableChar(' ');
}
class CompMap extends HashMap {
public CompMap() {
}
void put(String key, char value) {
put(key, new MutableChar(value));
}
char get(String key) {
Object obj = get((Object)key);
return (obj != null) ? ((MutableChar)obj).value : 0;
}
}
class CharSet extends HashSet {
public CharSet() {
}
public void add(char ch) {
add(new MutableChar(ch));
}
public boolean contains(char ch) {
return contains(probe.set(ch));
}
MutableChar probe = new MutableChar(' ');
}