src/com/ibm/tools/normalizer/NormalizerBuilder.java - external/github.com/unicode-org/icu - Git at Google

 /*
  *******************************************************************************
  * Copyright (C) 1996-2000, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/tools/normalizer/Attic/NormalizerBuilder.java,v $
  * $Date: 2000/03/10 04:17:56 $
  * $Revision: 1.3 $
  *
  *****************************************************************************************
  */

 package com.ibm.tools.normalizer;

 import java.io.*;
 import java.util.*;
 import com.ibm.text.*;
 import com.ibm.util.CompactByteArray;
 import com.ibm.util.CompactCharArray;

 public final class NormalizerBuilder
 {
     /**
      * Command-line entry point.  The whole build runs inside the
      * {@code NormalizerBuilder} constructor; this method only reports failures.
      * After printing an error it blocks on standard input, so the message is
      * not lost when the program was started from a window that closes on exit.
      *
      * @param args command-line options, handed unchanged to the constructor
      * @throws IOException if reading standard input fails after an error
      */
     public static void main(String args[]) throws IOException {
         try {
             // Constructing the builder performs all of the work.
             new NormalizerBuilder(args);
         } catch (Throwable failure) {
             System.err.println(failure.getLocalizedMessage());
             failure.printStackTrace();
             System.in.read();
         }
     }

     // Source of the Unicode character properties used throughout the builder.
     // Created in the constructor from the file named by "-data", or from a
     // default UnicodeData.txt path when that option is absent.
     private UInfo uinfo;

     // Character -> full decomposition (put into canonical order) for every
     // character that has a canonical or compatibility decomposition.
     // Filled in by createDecompositions().
     private DecompMap decomps = new DecompMap();

     // explodeCompat: characters whose two full decompositions
     //   (getFullDecomposition(ch,true) vs. (ch,false)) differ; the value stored
     //   is the (ch,false) form.
     // explodeOnly:   characters that have a decomposition but were neither put
     //   in explodeCompat nor registered as a composition by the constructor.
     private DecompMap explodeCompat = new DecompMap();
     private DecompMap explodeOnly   = new DecompMap();

     // permutedCompositions: character sequence -> composed character, including
     //   the reordered variants that normalize back to the same decomposition.
     // binaryCompositions:   the subset reduced to sequences of at most two
     //   characters, which is what the pairwise composer works from.
     private CompMap permutedCompositions = new CompMap();
     private CompMap binaryCompositions = new CompMap();

     // First characters (bases) and following characters (combining) of the
     // sequences processed while building binaryCompositions.
     private CharSet bases = new CharSet();
     private CharSet combining = new CharSet();

     // Two-character source String -> the different String it turns into when
     // run through binaryComposition(); filled in by the constructor.
     private Map pairExplosions = new HashMap();

     // Command-line switches; see the option parsing in the constructor.
     // NOTE(review): fVerbose and fWriteData are set from the command line but
     // are not read anywhere in the part of the file reviewed here.
     private boolean fVerbose = false;
     private boolean fWriteData = false;
     private boolean fShowSizes = false;   // "-size": print table sizes
     private boolean fPrompt = false;      // "-prompt": wait for a key at the end
     private boolean fJava = true;         // "-java": emit Java source (default)
     private boolean fCPP = false;         // "-cpp": emit C++ source instead

     /**
      * The highest Unicode character that has a canonical decomposition.
      * (i.e. largest char that can result from a primary canonical composition.)
      * Set by the constructor while it collects the permuted compositions, and
      * later written out as MAX_COMPOSED.
      */
     char largestChar = 0;

     /**
      * Parses the command line and then runs the complete build:
      * <ol>
      *   <li>collect the full decomposition of every character;</li>
      *   <li>classify each decomposable character as a compatibility explosion,
      *       a composition (with all canonically equivalent permutations of its
      *       decomposition), or an explode-only character;</li>
      *   <li>reduce the compositions to pairwise (base, combining) steps;</li>
      *   <li>find "exploding pairs", i.e. composed char + combining char that
      *       must turn into a different pair;</li>
      *   <li>generate the decomposition and composition data files.</li>
      * </ol>
      *
      * Recognized options: {@code -data <file>}, {@code -write}, {@code -verbose},
      * {@code -size}, {@code -prompt}, {@code -java}, {@code -cpp}.  Unknown
      * options are ignored.
      *
      * @param args the command-line arguments
      * @throws IOException if reading the input or waiting for the key press fails
      * @throws IllegalArgumentException if {@code -data} is the last argument and
      *         therefore has no file name after it
      */
     public NormalizerBuilder(String[] args) throws IOException
     {
         // Parse my command line
         for (int i = 0; i < args.length; i++)
         {
             if (args[i].equals("-data")) {
                 // This option consumes the next argument; fail with a clear
                 // message instead of running off the end of the array.
                 if (i + 1 >= args.length) {
                     throw new IllegalArgumentException(
                             "-data must be followed by the path of the Unicode data file");
                 }
                 uinfo = new UInfo(args[++i]);
             }
             else if (args[i].equals("-write")) {
                 fWriteData = true;
             }
             else if (args[i].equals("-verbose")) {
                 fVerbose = true;
             }
             else if (args[i].equals("-size")) {
                 fShowSizes = true;
             }
             else if (args[i].equals("-prompt")) {
                 fPrompt = true;
             }
             else if (args[i].equals("-java")) {
                 fJava = true;
                 fCPP = false;
             }
             else if (args[i].equals("-cpp")) {
                 fCPP = true;
                 fJava = false;
             }
         }
         if (uinfo == null) {
             uinfo = new UInfo("../src/data/unicode/UnicodeData.txt");
         }

         createDecompositions();

         out("\nGenerating permuted compositions...");

         // Form the list of all the permuted sequences that are canonically
         // equivalent to the canonical decompositions.
         // As a by-product, find out which are not combining character sequences.

         // The loop stops before 0xFFFF: a char counter can never exceed 0xFFFF,
         // so "<= 0xFFFF" would not terminate.
         for (char ch = 0; ch < 0xFFFF; ch++) {
             String decomp = decomps.get(ch);

             if (decomp != null) {
                 boolean done = false;

                 // The two full decompositions differ: remember the (ch,false)
                 // form as this character's compatibility explosion.
                 if (!uinfo.getFullDecomposition(ch,true).equals(
                             uinfo.getFullDecomposition(ch,false)))
                 {
                     explodeCompat.put(ch, uinfo.getFullDecomposition(ch, false));
                     done = true;
                 }
                 if (uinfo.hasCanonicalDecomposition(ch) && decomp.length() > 1
                     && !uinfo.isExcludedComposition(ch) && uinfo.isCBS(decomp))
                 {
                     if (decomp.length() <= 2) {
                         // A pair has no other ordering worth recording.
                         permutedCompositions.put(decomp, ch);
                     }
                     else {
                         // Keep the first character fixed and try every ordering
                         // of the remaining ones.
                         List alternatives = concat(decomp.charAt(0),
                             jumble(decomp.substring(1, decomp.length())));

                         for (int i = 0; i < alternatives.size(); ++i)
                         {
                             String variant = (String)alternatives.get(i);
                             String normalized = uinfo.fixCanonical(variant);

                             // Only orderings that normalize back to the real
                             // decomposition are equivalent to it.
                             if (normalized.equals(decomp)) {
                                 permutedCompositions.put(variant, ch);
                             }
                         }
                     }
                     largestChar = ch;
                     done = true;
                 }
                 if (!done) {
                     explodeOnly.put(ch, decomp);    // Disparaged
                 }
             }
         }

         out("\nLargest composed char: " + uinfo.hex(largestChar));

         // Form the binary compositions
         out("\nGenerating pairwise compositions...");

         Iterator list = permutedCompositions.keySet().iterator();
         while (list.hasNext()) {
             String decomp = (String)list.next();
             char ch = permutedCompositions.get(decomp);

             if (decomp.length() > 2) {
                 //
                 // If this is a composition of more than two characters,
                 // see if its initial portion is also a composition.  If so, that lets
                 // us build up this composed character iteratively.
                 //
                 // Longest prefix first; a lookup result of 0 means "no such
                 // composition".
                 for (int i = decomp.length()-1; i > 1; --i) {
                     String partial = decomp.substring(0,i);
                     char partialMap = permutedCompositions.get(partial);
                     if (partialMap != 0) {
                         decomp = partialMap + decomp.substring(i);
                         break;
                     }
                 }
             }
             if (decomp.length() <= 2) {
                 binaryCompositions.put(decomp, ch);
             } else {
                 //
                 // The composition takes more than two characters, and there's
                 // no way to build it up from smaller ones.
                 //
                 if (decomp.equals(uinfo.fixCanonical(decomp)))
                 {
                     // If the decomp is in canonical order, we're in trouble,
                     // since that means there's no way to generate this composed
                     // character from its canonically decomposed equivalent.
                     err("No pairwise compose of " + uinfo.hex(decomp) +
                             " > " + uinfo.hex(ch) + " " + uinfo.getName(ch,true) );
                 }
                 else {
                     // If the decomp is *not* in canonical order, it's not as
                     // bad, so this is only a warning.  (The original comment
                     // was left unfinished; presumably composition still works
                     // as long as the text is canonically ordered first.)
                     warn("No pairwise compose of non-canon " + uinfo.hex(decomp) +
                             " > " + uinfo.hex(ch) + " " + uinfo.getName(ch,true) );
                 }
             }

             bases.add(decomp.charAt(0));

             // add to list of all combining characters in composites
             for (int q = 1; q < decomp.length(); ++q) {
                 combining.add(decomp.charAt(q));
             }
         }


         // Generate the pairwise explosions, where a composed char + combining char
         // transforms into a different pair of characters, usually because the
         // canonical combining classes are reversed.

         out("\nGenerating exploding pairs....");

         List binaryValues = new ArrayList(binaryCompositions.values());
         Collections.sort(binaryValues);

         for (char addOn = 0; addOn < 0xFFFF; addOn++) {
             if (combining.contains(addOn))
             {
                 list = binaryValues.iterator();

                 while (list.hasNext()) {
                     MutableChar unichar = (MutableChar)list.next();
                     String chStr = String.valueOf(unichar.value);
                     String source = chStr + addOn;

                     String comp = binaryComposition(source);

                     if (comp.length() == 1) continue; // don't care if combines
                     if (comp.charAt(0) == addOn || comp.charAt(1) == addOn) continue; // rearranges

                     if (!source.equals(comp)) {
                         pairExplosions.put(source,comp);
                         bases.add(unichar);
                     }
                 }
             }
         }

         buildDecompData();
         buildComposeData();
         out("Success!");

         if (fPrompt) {
             System.out.println("\nHit any key to continue...");
             System.in.read();
         }
     }

     /**
      * Returns the full decomposition of a string as a new String.
      *
      * @param s the text to decompose
      * @return the decomposition of every character of {@code s}, accumulated
      *         through {@link #fullDecomposition(String, StringBuffer)}
      */
     public String fullDecomposition(String s) {
         StringBuffer decomposed = new StringBuffer();
         fullDecomposition(s, decomposed);
         return decomposed.toString();
     }

     /**
      * Adds one character to {@code output} in decomposed form.  If the
      * character has an entry in the decomposition map, the characters of that
      * entry are added; otherwise the character itself is added.  Either way
      * the insertion goes through {@code bubbleAppend}, which places each
      * character according to its canonical class.
      *
      * @param ch     the character to decompose
      * @param output the buffer that receives the result
      * @return {@code output}, to allow chaining
      */
     public StringBuffer fullDecomposition(char ch, StringBuffer output) {
         String mapping = decomps.get(ch);
         if (mapping != null) {
             bubbleAppend(output, mapping);
         } else {
             bubbleAppend(output, ch);
         }
         return output;
     }

     /**
      * Decomposes every character of {@code s}, in order, into {@code output}.
      *
      * @param s      the text to decompose
      * @param output the buffer that receives the result
      * @return {@code output}, to allow chaining
      */
     public StringBuffer fullDecomposition(String s, StringBuffer output) {
         final int length = s.length();
         int index = 0;
         while (index < length) {
             fullDecomposition(s.charAt(index), output);
             index++;
         }
         return output;
     }

     /**
      * Composes a string using only the pairwise (base, combining) table.
      * The input is fully decomposed first; then each following character is
      * either merged into the current base (when the pair has an entry in
      * {@code binaryCompositions} and nothing with the same canonical class
      * stands in between), started as a new base (canonical class 0), or
      * inserted among the pending combining characters.
      *
      * @param sr the text to compose
      * @return the composed text; the empty string for empty input
      */
     public String binaryComposition(String sr) {
         StringBuffer composed = new StringBuffer();
         StringBuffer decomposed = new StringBuffer();

         if (sr.length() == 0) {
             return composed.toString();
         }

         // Work from the full decomposition of the input.
         fullDecomposition(sr, decomposed);

         // The first character always opens the output and acts as the base.
         int starterIndex = 0;
         char starter = decomposed.charAt(0);
         composed.append(starter);

         // handle degenerate case--no base character at start
         if (uinfo.getCanonicalClass(starter) != 0) {
             // later
         }

         // Try to fold each following character into the current base.
         final int total = decomposed.length();
         for (int i = 1; i < total; ++i) {
             char next = decomposed.charAt(i);
             short nextClass = uinfo.getCanonicalClass(next);

             // A lookup result of 0 means the pair does not compose.
             char pairResult = binaryCompositions.get(String.valueOf(starter) + next);
             boolean combines = pairResult != 0
                                && noObstructions(composed, starterIndex, nextClass);

             if (combines) {
                 // Replace the base in place with the composed character.
                 starter = pairResult;
                 composed.setCharAt(starterIndex, starter);
             } else if (nextClass == 0) {
                 // A class-0 character that does not combine becomes the new base.
                 starterIndex = composed.length();
                 starter = next;
                 composed.append(next);
             } else {
                 bubbleAppend(composed, next, nextClass);
             }
         }
         return composed.toString();
     }

     /**
      * Tells whether a character of canonical class {@code can} can still
      * interact with the character at {@code pos}: that is the case when no
      * character after {@code pos} in the buffer has the same canonical class.
      *
      * @param buffer the text built so far
      * @param pos    index of the base character; only later positions are examined
      * @param can    canonical class of the candidate character
      * @return {@code false} if a character of equal class follows {@code pos},
      *         {@code true} otherwise
      */
     public boolean noObstructions(StringBuffer buffer, int pos, short can) {
         int scan = buffer.length() - 1;
         while (scan > pos) {
             short seen = uinfo.getCanonicalClass(buffer.charAt(scan));
             if (seen == can) {
                 return false;
             }
             scan--;
         }
         return true;
     }

     /**
      * Adds {@code ch} to {@code buffer} while keeping the combining characters
      * at the end of the buffer sorted by canonical class.
      * <p>
      * A character of class 0 (a starter) is always appended at the end:
      * canonical ordering never moves a starter, so it must not be bubbled
      * backwards past combining marks that precede it.  A character of non-zero
      * class is moved backwards past every trailing character whose class is
      * strictly greater, and therefore ends up after any character of equal or
      * lower class (including the preceding starter).
      *
      * @param buffer the text built so far; modified in place
      * @param ch     the character to add
      * @param can    the canonical class of {@code ch}
      */
     public void bubbleAppend(StringBuffer buffer, char ch, short can) {
         if (can == 0) {
             // Starters are never reordered.
             buffer.append(ch);
             return;
         }
         for (int j = buffer.length()-1; j >= 0; --j) {
             if (can >= uinfo.getCanonicalClass(buffer.charAt(j))) {
                 buffer.insert(j + 1, ch);
                 return;
             }
         }
         // Every existing character has a higher class: ch goes first.
         buffer.insert(0, ch);
     }

     /**
      * Adds {@code ch} to {@code buffer} at the position determined by its
      * canonical class, which is looked up here.
      *
      * @param buffer the text built so far; modified in place
      * @param ch     the character to add
      */
     public void bubbleAppend(StringBuffer buffer, char ch) {
         short canonicalClass = uinfo.getCanonicalClass(ch);
         bubbleAppend(buffer, ch, canonicalClass);
     }

     /**
      * Adds each character of {@code s}, first to last, to {@code buffer} via
      * {@link #bubbleAppend(StringBuffer, char)}.
      *
      * @param buffer the text built so far; modified in place
      * @param s      the characters to add
      */
     public void bubbleAppend(StringBuffer buffer, String s) {
         char[] pending = s.toCharArray();
         for (int k = 0; k < pending.length; k++) {
             bubbleAppend(buffer, pending[k]);
         }
     }

     /**
      * Looks up the decomposition recorded for a character.
      *
      * @param ch the character to look up
      * @return the entry of the decomposition map for {@code ch}; other code in
      *         this class treats {@code null} as "no decomposition"
      */
     String getDecomposition(char ch) {
         String recorded = decomps.get(ch);
         return recorded;
     }


     /**
      * Generate a Map of all decompositions in Unicode.
      * The keys in the map are MutableChar objects, one for each character that has a decomposition.
      * The values are String objects containing the full decomposition for the character,
      * in canonical order.
      * <p>
      * The result is stored in the {@code decomps} field.  Characters from
      * U+4E00 through U+D7A3 are skipped, as are unassigned, control, format,
      * private-use and surrogate characters.  For a character with a canonical
      * decomposition the canonical form is stored; for one with only a
      * compatibility decomposition the compatibility form is stored.
      */
     private void createDecompositions()
     {
         out("\nGenerating Full decompositions...");
         StringBuffer temp = new StringBuffer();

         // Stops before 0xFFFF because a char counter cannot exceed that value.
         for (char ch = 0; ch < 0xFFFF; ++ch) {
             if (ch >= '\u4E00' && ch <= '\uD7A3') continue; // skip ideos

             short category = uinfo.getCategory(ch);

             if (category == uinfo.UNASSIGNED) continue; //skip reserved
             if (category == uinfo.CONTROL) continue;
             if (category == uinfo.FORMAT) continue;
             if (category == uinfo.PRIVATE_USE) continue;
             if (category == uinfo.SURROGATE) continue;

             // Look each property up once and reuse the answer.
             boolean canon = uinfo.hasCanonicalDecomposition(ch);
             boolean compat = uinfo.hasCompatibilityDecomposition(ch);

             if (canon || compat) {
                 String decomp = uinfo.getFullDecomposition(ch, canon);

                 // Reuse one buffer for every character.  fixCanonical is called
                 // for its effect on the buffer (its return value is not used),
                 // presumably reordering it in place -- confirm against UInfo.
                 temp.setLength(0);
                 temp.append(decomp);
                 uinfo.fixCanonical(temp);

                 decomps.put(ch, temp.toString() );
             }
         }
     }

     /**
      * Prefixes every String in a list with one character.  The list is
      * modified in place and the same list object is returned.
      *
      * @param ch the character to put in front of each element
      * @param a  a modifiable list of Strings
      * @return {@code a} itself, with every element replaced by {@code ch + element}
      */
     static List concat(char ch, List a) {
         ListIterator cursor = a.listIterator();
         while (cursor.hasNext()) {
             String tail = (String) cursor.next();
             cursor.set(ch + tail);
         }
         return a;
     }

     /**
      * Return a list of Strings for all possible permutations of the
      * characters in the input string.
      * <p>
      * The permutations are produced in order of the position of the character
      * chosen first, then recursively for the remaining characters.  Repeated
      * characters are not collapsed, so a string of length n always yields n!
      * entries.  The empty string yields a single entry, the empty string
      * itself, since that is its only permutation.
      *
      * @param source the characters to permute
      * @return a new, modifiable list of Strings
      */
     static List jumble (String source)
     {
         ArrayList result = new ArrayList();
         if (source.length() <= 1) {
             // Zero or one character: the string is its own only permutation.
             result.add(source);
             return result;
         }
         for (int i = 0; i < source.length(); ++i) {
             // Pick character i as the head and permute everything else.
             char head = source.charAt(i);
             String rest = source.substring(0, i) + source.substring(i + 1);
             List tails = jumble(rest);
             for (int j = 0; j < tails.size(); ++j) {
                 result.add(head + (String) tails.get(j));
             }
         }
         return result;
     }

     // NOTE(review): STR_INDEX_SHIFT and STR_LENGTH_MASK are not referenced in
     // the part of the file reviewed here; presumably they describe how a string
     // index and a length are packed together by the string-table helpers --
     // confirm against put()/putLength().
     static final int STR_INDEX_SHIFT = 2;
     static final int STR_LENGTH_MASK = 0x0003;

     // DECOMP_RECURSE is a flag bit OR-ed into a character's entry in the
     // decomposition offsets table by buildDecompData() when the character's two
     // full decompositions differ.  DECOMP_MASK is the complementary low 15 bits;
     // both values are written to the generated data by writeDecompData().
     static final int DECOMP_RECURSE = 0x00008000;
     static final int DECOMP_MASK  = 0x00007FFF;

     /**
      * Builds the tables behind {@code Normalizer.DECOMPOSE} and writes them
      * out as "DecompData" source (Java and/or C++, depending on the options).
      * <p>
      * Three tables are produced:
      * <ul>
      *   <li>a string buffer holding the decomposition of every decomposable
      *       character;</li>
      *   <li>a compact array mapping each character to the index returned for
      *       its decomposition (0 when it has none);</li>
      *   <li>a compact array holding every character's canonical class.</li>
      * </ul>
      * Compatibility decompositions are stored first and the last index
      * obtained for them is remembered; canonical decompositions follow, so an
      * index above that limit identifies a canonical decomposition.
      *
      * @throws IOException declared for the output step
      */
     void buildDecompData() throws IOException {

         out("\nGenerating DecompData.java....");

         int lastCanonical = 0;
         int lastCompat = 0;

         // Character -> index of its replacement string; 0 means "none".
         CompactCharArray offsetTable = new CompactCharArray((char)0);

         // Replacement strings.  One filler character up front keeps 0 from
         // ever being the index of a real string.
         StringBuffer strings = new StringBuffer().append("\uffff");

         // Pass 1: compatibility decompositions.
         for (char c = 0; c < 0xFFFF; c++) {
             if (!uinfo.hasCompatibilityDecomposition(c)) {
                 continue;
             }
             lastCompat = putLength(strings, decomps.get(c), 0);
             offsetTable.setElementAt(c, (char)lastCompat);
         }

         // Pass 2: canonical decompositions; their indices must be > lastCompat.
         for (char c = 0; c < 0xFFFF; c++) {
             if (!uinfo.hasCanonicalDecomposition(c)) {
                 continue;
             }

             // Diagnostic output for one specific character.
             if (c == 0x0f77) {
                 out("0F77: decomps.get() = " + uinfo.hex(decomps.get(c)));
                 out("0F77: fullDecomp = " + uinfo.hex(uinfo.getFullDecomposition(c,false)));
             }

             lastCanonical = putLength(strings, decomps.get(c), lastCompat);

             // When the two full decompositions differ, some character inside
             // the canonical decomposition has a compatibility decomposition of
             // its own; the flag tells the decomposer to recurse on it.
             boolean needsRecursion = !uinfo.getFullDecomposition(c,true)
                                           .equals(uinfo.getFullDecomposition(c,false));
             int entry = needsRecursion ? (lastCanonical | DECOMP_RECURSE)
                                        : lastCanonical;
             offsetTable.setElementAt(c, (char)entry);
         }

         // Canonical class of every character; class 0 is the array default,
         // so only non-zero classes are stored explicitly.
         final byte noClass = 0;
         CompactByteArray classTable = new CompactByteArray(noClass);

         for (char c = 0; c < 0xFFFF; c++) {
             short cls = uinfo.getCanonicalClass(c);
             if (cls == 0) {
                 continue;
             }
             classTable.setElementAt(c, (byte)cls);
         }

         // Emit the tables as compilable source.
         if (fJava) {
             SourceWriter javaOut = new JavaWriter("../src/com/ibm/text/DecompData");
             writeDecompData(javaOut, lastCanonical, lastCompat, noClass,
                             offsetTable, strings, classTable);
         }

         if (fCPP) {
             SourceWriter cppOut =
                 new CPPWriter("/intlwork/source/collate/CPP/dcmpdata", "DecompData");
             writeDecompData(cppOut, lastCanonical, lastCompat, noClass,
                             offsetTable, strings, classTable);
         }

         out("Decomp data: MAX_CANONICAL = " + lastCanonical + ", MAX_DECOMP = " + lastCompat);

         if (fShowSizes) {
             int offsetBytes = offsetTable.getIndexArray().length * 2
                             + offsetTable.getValueArray().length * 2;
             int classBytes = classTable.getIndexArray().length * 2
                            + classTable.getValueArray().length;
             int stringChars = strings.length();
             int grandTotal = offsetBytes + classBytes + stringChars;

             out("Total runtime size of decomp data is "
                 + grandTotal);

             out("  offsets:      " + offsetBytes);
             out("  canonClasses: " + classBytes);
             out("  replace:      " + stringChars);
         }
     }

     /**
      * Writes the decomposition constants and tables to a source writer and
      * then closes the writer.  The entries are written in a fixed order.
      *
      * Note that the parameter named {@code out} is the writer; inside this
      * method it hides nothing callable, but it is unrelated to the logging
      * helper of the same name used elsewhere in the class.  Likewise the
      * {@code BASE} parameter hides the instance constant of the same name.
      *
      * @param out          destination writer; closed before returning
      * @param maxCanon     value written as MAX_CANONICAL
      * @param maxCompat    value written as MAX_COMPAT
      * @param BASE         value written as BASE
      * @param offsets      table written as "offsets"
      * @param contents     string table written as "contents"
      * @param canonClasses table written as "canonClass"
      */
     void writeDecompData(SourceWriter out, int maxCanon, int maxCompat, short BASE,
                         CompactCharArray offsets, StringBuffer contents,
                         CompactByteArray canonClasses)
     {
         out.write("MAX_CANONICAL",  maxCanon        );
         out.write("MAX_COMPAT",     maxCompat       );
         out.write("DECOMP_MASK",    DECOMP_MASK     );
         out.write("DECOMP_RECURSE", DECOMP_RECURSE  );
         out.write("BASE",           BASE            );
         out.write("offsets",        offsets         );
         out.write("contents",       contents        );
         out.write("canonClass",     canonClasses    );
         out.close();
     }


     //==========================================================================================
     // Methods for generating and writing the composition data
     //
     // Layout of one entry of the "lookup" array: the low 3 bits hold the
     // character type (one of the constants below) and the remaining 13 bits
     // hold an index, stored shifted left by INDEX_SHIFT.
     final int TYPE_MASK   = 0x0007;
     final int INDEX_MASK  = 0xFFF8;
     final int INDEX_SHIFT = 3;

     // MAX_BASES is used to map a two-dimensional (base,combining) index pair onto a
     // one-dimensional CompactArray.  We could just use baseCount, but making it a power
     // of two allows slightly better compaction.
     // The position of a pair in "actions" is baseIndex + MAX_BASES*combiningIndex.

     final int MAX_BASES   = 1024;   // Product must be <= 64K
     final int MAX_COMBINE = 65536/MAX_BASES;

     // Character types stored in the low bits of a "lookup" entry.  IGNORE (0)
     // is also the default value of the lookup array, i.e. "no entry".
     final char                // for character types
         IGNORE = 0,
         BASE = 1,
         EXPLODING_BASE = 2,
         COMBINING = 3,
         INITIAL_JAMO = 4,
         MEDIAL_JAMO = 5,
         FINAL_JAMO = 6,
         HANGUL = 7;

     // These variables actually hold the composition data.
     short baseCount = 1;        // Leave 0 as an invalid index
     short combineCount = 1;     // Leave 0 as an invalid index
     short nccCount = 0;         // combining chars added with index 0 (no pair uses them)
     int   maxCompat = 0;        // last string index obtained for a compatibility explosion
     int   maxCanon = 0;         // last string index obtained for a canonical explosion

     // This array contains types (from the set above) and indices into the "replace"
     // and "actions" arrays
     CompactCharArray lookup = new CompactCharArray(IGNORE);

     // We also need a place to store the strings that result from replacements,
     // explosions, and combinations.  Add a char at the front so that "0" won't
     // be the index of any of the replacement strings.
     StringBuffer replace = new StringBuffer().append(" ");

     // We need to represent each canonical character class as a single bit
     // so that we can OR together a mask of all combining char classes seen
     // Build an array that maps from combining class to bit mask.
     // (classMap is indexed by canonical class; typeMask is indexed by
     // combining-character index and is allocated in buildComposeData().)
     int[] classMap = new int[256];
     int[] typeMask;

     // Build a two-dimensional array of the action to take for each base/combining pair
     CompactCharArray actions = new CompactCharArray((char)0);

     // Maps a pair-explosion number to the index of its replacement string;
     // allocated in buildComposeData().
     char[] actionIndex;

     /**
      * Generate a new "ComposeData.java" that contains the CompactArray definitions
      * used in the {@code Normalizer.COMPOSE} operation.
      * <p>
      * The method fills in the instance tables in this order:
      * <ol>
      *   <li>{@code lookup} entries for "exploding bases" (bases that also have
      *       a compatibility decomposition), then for ordinary bases, combining
      *       characters, Jamo/Hangul ranges and compatibility explosions;</li>
      *   <li>{@code lookup} entries for canonical explosions and for unused
      *       combining characters whose class matters;</li>
      *   <li>{@code classMap}: one bit per combining class in use;</li>
      *   <li>{@code actions}: the result of each (base, combining) pair, either
      *       a composed character or a reference into {@code actionIndex};</li>
      *   <li>{@code typeMask}: class bit for each combining-character index.</li>
      * </ol>
      * Finally the data is written out (Java and/or C++) and, if requested,
      * the table sizes are printed.  The order of the steps matters because
      * string-table indices are handed out sequentially.
      *
      * @throws IOException declared for the output step
      */
     void buildComposeData() throws IOException
     {
         out("\nGenerating ComposeData.java....");

         BitSet usedIndices = new BitSet();
         CharSet explodingBases = new CharSet();

         // Find all characters that are both bases *and* have compatibility
         // decompositions.  These are weird
         for (char ch = 0; ch < 0xFFFF; ch++) {
             if (bases.contains(ch) && uinfo.hasCompatibilityDecomposition(ch)) {
                 //
                 // Add this character's explosion to the replacement string list.
                 // We're going to make sure that its "base index", i.e. the
                 // index for it in the actions array, is the same as the
                 // explosion's index in the replace string.  This lets
                 // us use the same index for the character's two behaviors
                 //
                 int index = put(replace, explodeCompat.get(ch), 0);

                 out(uinfo.hex(ch) + " is base and has compat explosion "
                                   + uinfo.hex(explodeCompat.get(ch)) );

                 addChar(lookup, ch, EXPLODING_BASE, index);
                 usedIndices.set(index);
                 explodingBases.add(ch);
             }
         }

         // First add the base characters to the array.
         // At the same time, compute their indices.
         // Leave an empty base index of 0 as a placeholder for null operations.
         //

         for (char ch = 0; ch < 0xFFFF; ch++)
         {

             // Exploding bases already received their entry above.
             if (explodingBases.contains(ch)) {
                 continue;
             }

             short cclass = uinfo.getCanonicalClass(ch);

             if (bases.contains(ch)) {
                 // Make sure that we don't use a base index that was already used
                 // for an exploding base character.
                 while (usedIndices.get(baseCount)) {
                     baseCount++;
                 }
                 // Now add the character to lookup as a base
                 addChar(lookup, ch, BASE, baseCount++);
             }
             if (combining.contains(ch)) {
                 classMap[cclass] = 1;       // Mark this combining class as being used
                 addChar(lookup, ch, COMBINING, combineCount++);
             }

             // NOTE(review): the Jamo bounds below skip U+1160 and U+11A7 at the
             // start of the medial/final ranges, and the Hangul range ends at
             // U+D7A4 although createDecompositions() treats U+D7A3 as the last
             // skipped character.  Left unchanged because the generated data
             // must match what the runtime expects -- confirm against it.
             if (ch >= '\u1100' && ch < '\u1160') {
                 addChar(lookup, ch, INITIAL_JAMO, 0);
             }
             if (ch >= '\u1161' && ch < '\u11a6') {
                 addChar(lookup, ch, MEDIAL_JAMO, 0);
             }
             if (ch >= '\u11a7' && ch < '\u11fa') {
                 addChar(lookup, ch, FINAL_JAMO, 0);
             }
             if (ch >= 0xac00 && ch <= 0xd7a4) {
                 addChar(lookup, ch, HANGUL, 0);
             }

             // Add explosions for all compatibility decompositions,
             // including the Jamo --> Conjoining Jamo decomps.
             if (explodeCompat.contains(ch))
             {
                 maxCompat = put(replace, explodeCompat.get(ch), 0);
                 addExplosion(lookup, ch, maxCompat);
             }
         }

         // Now add the explosions resulting from canonical decompositions
         // These will all have indices greater than "maxCompat" so we can distinguish them.
         //
         for (char ch = 0; ch < 0xFFFF; ch++) {
             short cclass = uinfo.getCanonicalClass(ch);

             if (explodeOnly.contains(ch) && uinfo.hasCanonicalDecomposition(ch)) {
                 maxCanon = put(replace, explodeOnly.get(ch), maxCompat);
                 addExplosion(lookup, ch, maxCanon);
             }
             else if (!combining.contains(ch) && cclass != 0 && classMap[cclass] != 0) {
                 //
                 // If a combining character didn't happen to end up in one of
                 // the pairwise combinations or explosions we use but still has
                 // a combining class that is the same as a character we *do* use,
                 // we need to save its class so that we don't combine things "past" it.
                 //
                 // However, if the character has an explosion we *don't* need it, because
                 // we'll never see it, only the results of its explosion.
                 //
                 addChar(lookup, ch, COMBINING, 0);
                 nccCount++;
             }
         }

         // Now run through the combining classes again and assign bitmasks
         // in the same ascending order as the canonical classes
         int maskShift = 0;
         for (int i = 0; i < 256; i++) {
             if (classMap[i] != 0) {
                 classMap[i] = (1 << (maskShift++));
             }
         }
         // An int mask has room for 32 distinct classes only.
         if (maskShift > 32) {
             err(Integer.toString(maskShift) + " combining classes; max is 32");
         }
         out("# of combining classes is " + maskShift);

         out("baseCount=" + baseCount + ", combineCount=" + combineCount
                             + ", nccCount=" + nccCount);

         // Each limit is checked on its own: an index at or beyond its limit
         // makes baseIndex + MAX_BASES*combiningIndex alias another pair or
         // overflow the 16-bit position in "actions".
         if (baseCount > MAX_BASES) {
             err(Integer.toString(baseCount) + " bases, limit is " + MAX_BASES);
         }
         if (combineCount > MAX_COMBINE) {
             err(Integer.toString(combineCount) + " combining chars, limit is " + MAX_COMBINE);
         }

         // Now build the "actions" array that tells what to do when each base /
         // combining pair is seen.
         //
         // First do character pairs that combine into a single character...
         //
         Iterator iter = binaryCompositions.keySet().iterator();
         while (iter.hasNext()) {
             String source = (String)iter.next();
             char ch = binaryCompositions.get(source);

             int baseIndex = lookup.elementAt(source.charAt(0)) >>> INDEX_SHIFT;
             int combiningIndex = lookup.elementAt(source.charAt(1)) >>> INDEX_SHIFT;

             actions.setElementAt((char)(baseIndex + MAX_BASES*combiningIndex), ch);
         }


         //
         // Pair explosions: base/combining pairs that explode into something else
         // We're squeezing the indices for these in between MAX_COMPOSED and 0xFFFF,
         // which means they can't be indexes into the "replace" string; those are too big.
         // Instead they're indexes into the "actionIndex" array, which in turn contains
         // indices in "replace"
         //
         // NOTE(review): the first explosion is stored as largestChar + 0, the
         // same value a composition that yields largestChar itself would store.
         // Whether that is ambiguous depends on how the runtime compares the
         // action against MAX_COMPOSED -- confirm.
         //
         actionIndex = new char[ pairExplosions.size() ];
         short index = 0;

         iter = pairExplosions.keySet().iterator();
         while (iter.hasNext()) {
             String source = (String)iter.next();
             char baseChar = source.charAt(0);
             char combiningChar = source.charAt(1);

             int strIndex = put(replace, (String)pairExplosions.get(source), 0);
             actionIndex[index] = (char)strIndex;

             int baseIndex = lookup.elementAt(baseChar) >>> INDEX_SHIFT;
             int combiningIndex = lookup.elementAt(combiningChar) >>> INDEX_SHIFT;

             actions.setElementAt((char)(baseIndex + MAX_BASES*combiningIndex),
                                  (char)(index + largestChar));
             index++;
         }

         // Fill in the "type mask" array that maps from combining character index
         // to a bit mask representing the canonical combining class
         typeMask = new int[combineCount + nccCount];

         for (char ch = 0; ch < 0xFFFF; ch++) {
             int value = lookup.elementAt(ch);
             int type = value & TYPE_MASK;

             if (type == COMBINING) {
                 int ind = value >>> INDEX_SHIFT;
                 int cclass = uinfo.getCanonicalClass(ch);
                 typeMask[ind] = classMap[cclass];
             }
         }

         if (fJava) {
             writeComposeData(new JavaWriter("../src/com/ibm/text/ComposeData"));
         }
         if (fCPP) {
             writeComposeData(new CPPWriter("/intlwork/source/collate/CPP/compdata", "ComposeData"));
         }

         if (fShowSizes) {
             int lookupSize = lookup.getIndexArray().length * 2 + lookup.getValueArray().length * 2;
             int actionSize = actions.getIndexArray().length * 2 + actions.getValueArray().length * 2;
             int actIndexSize = actionIndex.length * 2;
             int replaceSize = replace.length();
             int typeMaskSize = typeMask.length * 2;

             out("Total runtime size of compose data is "
                 + (lookupSize + actionSize + actIndexSize + replaceSize + typeMaskSize));

             out("  lookup:       " + lookupSize);
             out("  actions:      " + actionSize);
             out("  actionIndex:  " + actIndexSize);
             out("  typeMask:     " + typeMaskSize);
             out("  replace:      " + replaceSize);
         }
     }

     /**
      * Writes the composition constants and tables held in this object's
      * fields to a source writer and then closes the writer.  The entries are
      * written in a fixed order.
      *
      * @param out destination writer; closed before returning
      */
     void writeComposeData(SourceWriter out) {
         out.write("BASE_COUNT",         baseCount);
         out.write("COMBINING_COUNT",    combineCount);
         out.write("MAX_COMPAT",         maxCompat);
         out.write("MAX_CANON",          maxCanon);

         out.writeHex("MAX_COMPOSED",    largestChar);

         // The two Jamo indices are placed just past the end of the string
         // table, so they cannot coincide with the index of a stored string.
         int maxIndex = replace.length();
         out.write("MAX_INDEX",          maxIndex    );
         out.write("INITIAL_JAMO_INDEX", maxIndex + 1);
         out.write("MEDIAL_JAMO_INDEX",  maxIndex + 2);

         out.write("MAX_BASES",          MAX_BASES  );
         out.write("MAX_COMBINE",        MAX_COMBINE);

         out.writeHex("TYPE_MASK",       TYPE_MASK);
         out.write("INDEX_SHIFT",        INDEX_SHIFT);

         // The character types
         // (declared as char; cast so they are passed to write() as int)
         out.write("IGNORE",             (int)IGNORE);
         out.write("BASE",               (int)BASE);
         out.write("EXPLODING_BASE",     (int)EXPLODING_BASE);
         out.write("COMBINING",          (int)COMBINING);
         out.write("INITIAL_JAMO",       (int)INITIAL_JAMO);
         out.write("MEDIAL_JAMO",        (int)MEDIAL_JAMO);
         out.write("FINAL_JAMO",         (int)FINAL_JAMO);
         out.write("HANGUL",             (int)HANGUL);

         // The tables themselves
         out.write("lookup",         lookup        );
         out.write("actions",        actions       );
         out.write("actionIndex",    actionIndex   );
         out.write("replace",        replace       );
         out.write("typeMask",       typeMask      );

         out.close();
     }

     /**
      * Stores a character's type and index in the lookup table, packed into
      * a single 16-bit value as <code>type | (index << INDEX_SHIFT)</code>.
      * <p>
      * Nothing is stored, and an error is reported, when the character
      * already has a non-IGNORE entry or when the shifted index does not
      * fit into 16 bits.
      *
      * @param lookup the table to update
      * @param ch     the character whose entry is being set
      * @param type   the character type code to store in the low bits
      * @param index  the index to store above INDEX_SHIFT
      */
     void addChar(CompactCharArray lookup, char ch, int type, int index)
     {
         // Read the existing entry once and reuse it for both the check and the message
         char oldValue = lookup.elementAt(ch);
         int shiftedIndex = index << INDEX_SHIFT;

         // First make sure it's not already present
         if (oldValue != IGNORE)
         {
             err(typeName(type) + " char is also "
                   + typeName(oldValue & TYPE_MASK) + ": "
                   + uinfo.hex(ch) + "  " + uinfo.getName(ch,true));
         }
         else if (shiftedIndex > 0xFFFF) {
             // The value is narrowed to a char below, which keeps only the low
             // 16 bits; anything above 0xFFFF (including exactly 65536) would
             // silently lose index bits.
             err("not enough bits: index " + index + " << INDEX_SHIFT = " + shiftedIndex);
         } else {
             lookup.setElementAt(ch, (char)(type | shiftedIndex));
         }
     }

     /**
      * Sets the index portion of a character's lookup entry while keeping
      * the bits outside INDEX_MASK unchanged.
      * <p>
      * An error is reported if the entry is not IGNORE, and again if it
      * already carries a non-zero index; in the latter case nothing is
      * stored.  An error is also reported, and nothing stored, when the
      * shifted index does not fit into 16 bits.
      *
      * @param lookup the table to update
      * @param ch     the character whose entry is being updated
      * @param index  the index to store above INDEX_SHIFT
      */
     void addExplosion(CompactCharArray lookup, char ch, int index)
     {
         // First make sure it doesn't already have an index
         char oldValue = lookup.elementAt(ch);
         int oldIndex = oldValue >>> INDEX_SHIFT;
         int shiftedIndex = index << INDEX_SHIFT;

         // NOTE(review): this check and the next one emit the same message, so a
         // character with an existing index is reported twice -- confirm whether
         // both checks are intended.
         if (oldValue != IGNORE) {
             err("Exploding char is already " + typeName(oldValue & TYPE_MASK)
                              + " (index " + oldIndex + "): "
                              + uinfo.hex(ch) + "  " + uinfo.getName(ch,true));
         }

         if (oldIndex != 0) {
             err("Exploding char is already " + typeName(oldValue & TYPE_MASK)
                              + " (index " + oldIndex + "): "
                              + uinfo.hex(ch) + "  " + uinfo.getName(ch,true));
         }
         else if (shiftedIndex > 0xFFFF) {
             // The value is narrowed to a char below, which keeps only the low
             // 16 bits; anything above 0xFFFF (including exactly 65536) would
             // silently lose index bits.
             err("not enough bits: index " + index + " << INDEX_SHIFT = " + shiftedIndex);
         } else {
             lookup.setElementAt(ch, (char)((oldValue & ~INDEX_MASK) | shiftedIndex));
         }
     }

     /**
      * Maps a character type code to the label used in diagnostic messages.
      *
      * @param type one of the character type constants
      * @return the label for that type, or "Unknown" for any other value
      */
     String typeName(int type) {
         String label;
         switch (type) {
             case IGNORE:
                 label = "Ignored";
                 break;
             case BASE:
                 label = "Base";
                 break;
             case EXPLODING_BASE:
                 label = "Exploding Base";
                 break;
             case COMBINING:
                 label = "Combining";
                 break;
             case INITIAL_JAMO:
                 label = "Initial Jamo";
                 break;
             case MEDIAL_JAMO:
                 label = "Medial Jamo";
                 break;
             case FINAL_JAMO:
                 label = "Final Jamo";
                 break;
             case HANGUL:
                 label = "Hangul";
                 break;
             default:
                 label = "Unknown";
                 break;
         }
         return label;
     }


     /**
      * Returns the position of a null-terminated copy of <code>str</code>
      * inside <code>buf</code>, appending one at the end if no usable copy
      * exists yet.
      * <p>
      * Only copies that start strictly after <code>minIndex</code> are
      * reused.  The search therefore begins at <code>minIndex + 1</code>, so
      * an earlier copy at or below <code>minIndex</code> does not hide a
      * later one that could be shared.
      *
      * @param buf      the buffer that accumulates the strings; may be appended to
      * @param str      the string to locate or add (without the terminator)
      * @param minIndex positions at or below this value are never reused
      * @return the index in <code>buf</code> at which the string starts
      */
     static final int put(StringBuffer buf, String str, int minIndex)
     {
         str = str + '\u0000';   // Add trailing null

         // A negative start position is treated as 0 by String.indexOf
         int index = buf.toString().indexOf(str, minIndex + 1);
         if (index < 0) {
             // No reusable copy: the new one starts at the current end of the buffer
             index = buf.length();
             buf.append(str);
         }
         return index;
     }

     /**
      * Locates <code>str</code> inside <code>buf</code>, appending it if no
      * usable copy exists, and returns its position and length packed as
      * <code>(index << STR_INDEX_SHIFT) | length</code>.
      * <p>
      * When the length is too large to fit below STR_INDEX_SHIFT, the
      * string is stored with a trailing null instead and the packed length
      * is 0.
      * <p>
      * Only copies that start strictly after <code>minIndex</code> are
      * reused.  The search therefore begins at <code>minIndex + 1</code>, so
      * an earlier copy at or below <code>minIndex</code> does not hide a
      * later one that could be shared.
      *
      * @param buf      the buffer that accumulates the strings; may be appended to
      * @param str      the string to locate or add
      * @param minIndex positions at or below this value are never reused
      * @return the start index shifted left by STR_INDEX_SHIFT, combined
      *         with the length (or 0 for a null-terminated entry)
      */
     static final int putLength(StringBuffer buf, String str, int minIndex) {
         int length = str.length();

         if (length >= (1 << STR_INDEX_SHIFT)) {
             // There's no room to store the length in the index, so
             // add a null terminator and use a 0 length to flag this
             str = str + '\u0000';
             length = 0;
         }

         // A negative start position is treated as 0 by String.indexOf
         int index = buf.toString().indexOf(str, minIndex + 1);
         if (index < 0) {
             // No reusable copy: the new one starts at the current end of the buffer
             index = buf.length();
             buf.append(str);
         }
         return (index << STR_INDEX_SHIFT) | length;
     }

     //--------------------------------------------------------------------------------
     // Source file headers
     //

     /**
      * Header text for a source file: an IBM copyright and warranty
      * disclaimer inside a block comment, followed by a line comment
      * stating that the class is machine generated and a blank line.
      * NOTE(review): presumably written at the top of each generated
      * file -- the code that uses it is not visible here; confirm.
      */
     static final String kCHeader =
          "/*\n"
         +" * (C) Copyright IBM Corp. 1997-1998 - All Rights Reserved\n"
         +" *\n"
         +" * The program is provided 'as is' without any warranty express or\n"
         +" * implied, including the warranty of non-infringement and the implied\n"
         +" * warranties of merchantibility and fitness for a particular purpose.\n"
         +" * IBM will not be liable for any damages suffered by you as a result\n"
         +" * of using the Program. In no event will IBM be liable for any\n"
         +" * special, indirect or consequential damages or lost profits even if\n"
         +" * IBM has been advised of the possibility of their occurrence. IBM\n"
         +" * will not be liable for any third party claims against you.\n"
         +" */\n"
         + "// This class is MACHINE GENERATED.  Run NormalizerBuilder to regenerate.\n"
         +"\n";

     /**
      * Prints a message to standard output, but only when the verbose flag
      * is set; otherwise does nothing.
      *
      * @param str the message to print
      */
     void out(String str) {
         if (!fVerbose) {
             return;
         }
         System.out.println(str);
     }
     /**
      * Prints a warning line to standard error.
      *
      * @param str the warning text, printed after a "Warning: " prefix
      */
     void warn(String str) {
         String line = "Warning: " + str;
         System.err.println(line);
     }
     /**
      * Prints an error line to standard error.
      *
      * @param str the error text, printed after an "ERROR:   " prefix
      */
     void err(String str) {
         String line = "ERROR:   " + str;
         System.err.println(line);
     }
 }

 //-----------------------------------------------------------------------------
 // Utility classes
 //-----------------------------------------------------------------------------

 /**
  * A hash map from single characters to strings.  Keys are stored as
  * MutableChar objects; the char-based overloads below wrap and unwrap
  * them for the caller.
  */
 class DecompMap extends HashMap {
     /**
      * Key object reused for every lookup: it is set to the character
      * being queried instead of creating a new key each time.
      */
     MutableChar probe = new MutableChar(' ');

     public DecompMap() {
     }

     /** Associates the given string with the given character. */
     void put(char ch, String value) {
         MutableChar key = new MutableChar(ch);
         put(key, value);
     }

     /** Returns the string stored for the character, or null if there is none. */
     String get(char ch) {
         Object key = probe.set(ch);
         return (String) get(key);
     }

     /** Reports whether an entry exists for the character. */
     boolean contains(char ch) {
         Object key = probe.set(ch);
         return containsKey(key);
     }
 }

 /**
  * A hash map from strings to single characters.  Values are stored as
  * MutableChar objects; the overloads below wrap and unwrap them for the
  * caller.
  */
 class CompMap extends HashMap {
     public CompMap() {
     }

     /** Associates the given character with the given string key. */
     void put(String key, char value) {
         MutableChar wrapped = new MutableChar(value);
         put(key, wrapped);
     }

     /** Returns the character stored for the key, or 0 if there is none. */
     char get(String key) {
         // The cast selects the inherited Object-keyed lookup rather than this overload
         Object found = get((Object)key);
         if (found == null) {
             return 0;
         }
         return ((MutableChar)found).value;
     }
 }

 /**
  * A hash set of single characters.  Members are stored as MutableChar
  * objects; the char-based overloads below wrap them for the caller.
  */
 class CharSet extends HashSet {
     /**
      * Key object reused for every membership test: it is set to the
      * character being queried instead of creating a new key each time.
      */
     MutableChar probe = new MutableChar(' ');

     public CharSet() {
     }

     /** Adds the character to the set. */
     public void add(char ch) {
         MutableChar member = new MutableChar(ch);
         add(member);
     }

     /** Reports whether the character is in the set. */
     public boolean contains(char ch) {
         Object key = probe.set(ch);
         return contains(key);
     }
 }