// src/com/ibm/icu/dev/test/normalizer/NormalizerBuilder.java - external/github.com/unicode-org/icu - Git at Google

 package com.ibm.icu.dev.test.normalizer;

 import java.io.BufferedReader;
 import java.util.BitSet;

 import com.ibm.icu.dev.test.TestUtil;
 import com.ibm.icu.dev.test.UTF16Util;


 /**
  * Builds the normalization tables. This is a separate class so that it
  * can be unloaded once not needed.<br>
  * Copyright (c) 1998-2002 Unicode, Inc. All Rights Reserved.<br>
  * The Unicode Consortium makes no expressed or implied warranty of any
  * kind, and assumes no liability for errors or omissions.
  * No liability is assumed for incidental and consequential damages
  * in connection with or arising out of the use of the information here.
  * @author Mark Davis
  * Updates for supplementary code points:
  * Vladimir Weinstein & Markus Scherer
  */
 class NormalizerBuilder {
     /** Copyright notice text. */
     static final String copyright = "Copyright ? 1998-1999 Unicode, Inc.";

     /**
      * Testing flags
      */

     // When true, the table-building methods print progress messages to System.out.
     private static final boolean DEBUG = false;
     // Not read by any live code in this file.
     private static final boolean GENERATING = false;

     /**
      * Constants for the data file version to use.
      */
     /*static final boolean NEW_VERSION = true;
     private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : "");

     static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9";
     static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1";

     public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt";
     public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt";
     */

     /**
      * Builds the static normalization data. Called exactly once by
      * NormalizerData.
      *
      * @param fullData true to load the composition exclusion list and the
      *        Unicode Character Database files; false to use the small
      *        built-in data set (for use in Applets)
      * @return the populated NormalizerData, or null if an IOException was
      *         raised while loading (the problem is reported on System.err)
      */
     static NormalizerData build(boolean fullData) {
         try {
             final IntHashtable classTable = new IntHashtable(0);
             final IntStringHashtable decompTable = new IntStringHashtable(null);
             final LongHashtable composeTable = new LongHashtable(NormalizerData.NOT_COMPOSITE);
             final BitSet compatibilitySet = new BitSet();
             final BitSet exclusionSet = new BitSet();

             if (!fullData) {
                 // abridged data, for use in Applets
                 setMinimalDecomp(classTable, decompTable, composeTable,
                   compatibilitySet, exclusionSet);
             } else {
                 // full data, read from the data files
                 readExclusionList(exclusionSet);
                 buildDecompositionTables(classTable, decompTable, composeTable,
                   compatibilitySet, exclusionSet);
             }

             NormalizerData data = new NormalizerData(classTable, decompTable, composeTable,
                   compatibilitySet, exclusionSet);
             return data;
         } catch (java.io.IOException ioe) {
             String report = "Can't load data file." + ioe + ", " + ioe.getMessage();
             System.err.println(report);
             return null;
         }
     }

 // =============================================================
 // Building Decomposition Tables
 // =============================================================

     /**
      * Reads the composition exclusion list and records every excluded code
      * point in the given bit set.
      * <p>
      * For each line, everything from the first '#' on is dropped as a
      * comment, surrounding whitespace is trimmed and blank lines are skipped.
      * The first whitespace-delimited token of what remains is parsed as a
      * hexadecimal code point.
      *
      * @param isExcluded bit set that receives one bit per excluded code point
      * @throws java.io.IOException if reading from the data file fails
      * @throws NumberFormatException if a data line does not start with a
      *         hexadecimal number
      */
     private static void readExclusionList(BitSet isExcluded) throws java.io.IOException {
         if (DEBUG) System.out.println("Reading Exclusions");
         BufferedReader in = null;
         try {
             in = TestUtil.getDataReader("unicode/CompositionExclusions.txt");
         } catch (Exception e) {
             // Preserves the existing behavior when the data file cannot be
             // opened: report the problem and terminate.
             System.err.println("Fail to read the file CompositionExclusions.txt!");
             System.exit(1);
         }

         try {
             String line;
             while ((line = in.readLine()) != null) {
                 int comment = line.indexOf('#');                // strip comments
                 if (comment != -1) line = line.substring(0, comment);

                 // Trim first so that lines holding only whitespace (for
                 // example an indented comment) count as blank instead of
                 // reaching parseInt as an empty string.
                 line = line.trim();
                 if (line.length() == 0) continue;               // ignore blanks

                 // keep only the leading token; it may be followed by a
                 // space or a tab
                 int tokenEnd = 0;
                 while (tokenEnd < line.length()
                         && !Character.isWhitespace(line.charAt(tokenEnd))) {
                     ++tokenEnd;
                 }

                 // set the bit in the excluded table for each character hit
                 int value = Integer.parseInt(line.substring(0, tokenEnd), 16);
                 isExcluded.set(value);
             }
         } finally {
             // release the reader even when a line fails to parse
             in.close();
         }
         if (DEBUG) System.out.println("Done reading Exclusions");
     }

     /**
      * Builds the canonical class, decomposition and composition tables from
      * the UnicodeData file, then adds the algorithmic Hangul mappings.
      * <p>
      * From each data line the code point (field 0), the canonical class
      * (field 3) and the decomposition mapping (field 5) are used. A mapping
      * whose text starts with an angle-bracket tag is recorded as a
      * compatibility mapping. Every other (canonical) mapping is also entered
      * into the composition table unless the code point is in the exclusion
      * set. Inconsistent data is reported on System.err and processing
      * continues.
      *
      * @param canonicalClass receives the canonical class of each code point
      * @param decompose receives the decomposition string of each code point
      *        that has one
      * @param compose receives the composite for each canonical pair; the key
      *        holds the first code point in the upper 32 bits and the second
      *        in the lower 32 bits
      * @param isCompatibility receives a bit for each code point whose
      *        decomposition is a compatibility mapping
      * @param isExcluded code points that must not be entered into the
      *        composition table (only read here)
      * @throws java.io.IOException if reading from the data file fails
      */
     private static void buildDecompositionTables(
       IntHashtable canonicalClass, IntStringHashtable decompose,
       LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
       throws java.io.IOException {
         if (DEBUG) System.out.println("Reading Unicode Character Database");
         BufferedReader in = null;
         try {
             in = TestUtil.getDataReader("unicode/UnicodeData.txt");
         } catch (Exception e) {
             // Preserves the existing behavior when the data file cannot be
             // opened: report the problem and terminate.
             System.err.println("Failed to read UnicodeData.txt");
             System.exit(1);
         }

         int value;
         long pair;
         int counter = 0;
         try {
             String line;
             while ((line = in.readLine()) != null) {

                 // discard comments and blank lines

                 int comment = line.indexOf('#');                // strip comments
                 if (comment != -1) line = line.substring(0, comment);
                 if (line.length() == 0) continue;
                 if (DEBUG) {
                     counter++;
                     if ((counter & 0xFF) == 0) System.out.println("At: " + line);
                 }

                 // find the values of the particular fields that we need
                 // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;

                 int start = 0;
                 int end = line.indexOf(';');                    // code
                 value = Integer.parseInt(line.substring(start, end), 16);
                 start = end + 1;
                 end = line.indexOf(';', start);                 // name (not needed)
                 start = end + 1;
                 end = line.indexOf(';', start);                 // general category (not needed)
                 start = end + 1;
                 end = line.indexOf(';', start);                 // canonical class

                 // check consistency: canonical classes must be from 0 to 255

                 int cc = Integer.parseInt(line.substring(start, end));
                 if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line);
                 canonicalClass.put(value, cc);
                 start = end + 1;
                 end = line.indexOf(';', start);                 // BIDI (not needed)
                 start = end + 1;
                 end = line.indexOf(';', start);                 // decomp

                 // an empty field means there is no decomposition mapping
                 if (start == end) continue;

                 // decomp requires more processing.
                 // store whether it is canonical or compatibility.
                 // store the decomp in one table, and the reverse mapping (from pairs) in another

                 String segment = line.substring(start, end);
                 boolean compat = segment.charAt(0) == '<';
                 if (compat) isCompatibility.set(value);
                 String decomp = fromHex(segment);

                 // check consistency: all canon decomps must be singles or pairs!
                 int decompLen = UTF16Util.countCodePoint(decomp);
                 if (decompLen < 1 || (decompLen > 2 && !compat)) {
                     System.err.println("Bad decomp at: " + line);
                 }
                 decompose.put(value, decomp);

                 // only compositions are canonical pairs
                 // skip if script exclusion

                 if (!compat && !isExcluded.get(value)) {
                     // a singleton mapping is keyed with 0 as its first element
                     int first = 0;
                     int second = UTF16Util.nextCodePoint(decomp, 0);
                     if (decompLen > 1) {
                         first = second;
                         second = UTF16Util.nextCodePoint(decomp,
                             UTF16Util.codePointLength(first));
                     }

                     // store composition pair in single long

                     pair = ((long)first << 32) | second;
                     if (DEBUG && value == 0x00C0) {
                         System.out.println("debug2: " + line);
                     }
                     compose.put(pair, value);
                 } else if (DEBUG) {
                     System.out.println("Excluding: " + decomp);
                 }
             }
         } finally {
             // release the reader even when a line fails to parse
             in.close();
         }
         if (DEBUG) System.out.println("Done reading Unicode Character Database");

         // add algorithmic Hangul decompositions
         // this is more compact if done at runtime, but for simplicity we
         // do it this way.

         if (DEBUG) System.out.println("Adding Hangul");

         for (int SIndex = 0; SIndex < SCount; ++SIndex) {
             int TIndex = SIndex % TCount;
             char first, second;
             if (TIndex != 0) { // triple
                 first = (char)(SBase + SIndex - TIndex);
                 second = (char)(TBase + TIndex);
             } else {
                 first = (char)(LBase + SIndex / NCount);
                 second = (char)(VBase + (SIndex % NCount) / TCount);
             }
             pair = ((long)first << 32) | second;
             value = SIndex + SBase;
             decompose.put(value, String.valueOf(first) + second);
             compose.put(pair, value);
         }
         if (DEBUG) System.out.println("Done adding Hangul");
     }

     /**
      * Hangul composition constants
      */
     // base code point of the syllables (syllable value = SBase + SIndex)
     static final int SBase = 0xAC00;
     // base code points added to the computed leading, vowel and trailing indexes
     static final int LBase = 0x1100;
     static final int VBase = 0x1161;
     static final int TBase = 0x11A7;
     // number of distinct leading, vowel and trailing index values
     static final int LCount = 19;
     static final int VCount = 21;
     static final int TCount = 28;
     // number of syllables per leading index
     static final int NCount = VCount * TCount;   // 588
     // total number of syllables
     static final int SCount = LCount * NCount;   // 11172

     /**
      * For use in an applet: just load a minimal set of data.
      * <p>
      * Fills the tables from two small built-in lists instead of the Unicode
      * data files: a list of (character, decomposition, "K" or "") triples and
      * a list of (code point, canonical class) pairs. Entries flagged "K" are
      * compatibility mappings; all others are canonical and are also entered
      * into the composition table.
      *
      * @param canonicalClass receives the canonical class of each listed
      *        code point
      * @param decompose receives the decomposition of each listed character
      * @param compose receives the composite for each canonical mapping; the
      *        key holds the first code point in the upper 32 bits and the
      *        second in the lower 32 bits
      * @param isCompatibility receives a bit for each character whose mapping
      *        is flagged "K"
      * @param isExcluded not used by this method
      */
     private static void setMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
       LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
         String[] decomposeData = {
             "\u005E", "\u0020\u0302", "K",
             "\u005F", "\u0020\u0332", "K",
             "\u0060", "\u0020\u0300", "K",
             "\u00A0", "\u0020", "K",
             "\u00A8", "\u0020\u0308", "K",
             "\u00AA", "\u0061", "K",
             "\u00AF", "\u0020\u0304", "K",
             "\u00B2", "\u0032", "K",
             "\u00B3", "\u0033", "K",
             "\u00B4", "\u0020\u0301", "K",
             "\u00B5", "\u03BC", "K",
             "\u00B8", "\u0020\u0327", "K",
             "\u00B9", "\u0031", "K",
             "\u00BA", "\u006F", "K",
             "\u00BC", "\u0031\u2044\u0034", "K",
             "\u00BD", "\u0031\u2044\u0032", "K",
             "\u00BE", "\u0033\u2044\u0034", "K",
             "\u00C0", "\u0041\u0300", "",
             "\u00C1", "\u0041\u0301", "",
             "\u00C2", "\u0041\u0302", "",
             "\u00C3", "\u0041\u0303", "",
             "\u00C4", "\u0041\u0308", "",
             "\u00C5", "\u0041\u030A", "",
             "\u00C7", "\u0043\u0327", "",
             "\u00C8", "\u0045\u0300", "",
             "\u00C9", "\u0045\u0301", "",
             "\u00CA", "\u0045\u0302", "",
             "\u00CB", "\u0045\u0308", "",
             "\u00CC", "\u0049\u0300", "",
             "\u00CD", "\u0049\u0301", "",
             "\u00CE", "\u0049\u0302", "",
             "\u00CF", "\u0049\u0308", "",
             "\u00D1", "\u004E\u0303", "",
             "\u00D2", "\u004F\u0300", "",
             "\u00D3", "\u004F\u0301", "",
             "\u00D4", "\u004F\u0302", "",
             "\u00D5", "\u004F\u0303", "",
             "\u00D6", "\u004F\u0308", "",
             "\u00D9", "\u0055\u0300", "",
             "\u00DA", "\u0055\u0301", "",
             "\u00DB", "\u0055\u0302", "",
             "\u00DC", "\u0055\u0308", "",
             "\u00DD", "\u0059\u0301", "",
             "\u00E0", "\u0061\u0300", "",
             "\u00E1", "\u0061\u0301", "",
             "\u00E2", "\u0061\u0302", "",
             "\u00E3", "\u0061\u0303", "",
             "\u00E4", "\u0061\u0308", "",
             "\u00E5", "\u0061\u030A", "",
             "\u00E7", "\u0063\u0327", "",
             "\u00E8", "\u0065\u0300", "",
             "\u00E9", "\u0065\u0301", "",
             "\u00EA", "\u0065\u0302", "",
             "\u00EB", "\u0065\u0308", "",
             "\u00EC", "\u0069\u0300", "",
             "\u00ED", "\u0069\u0301", "",
             "\u00EE", "\u0069\u0302", "",
             "\u00EF", "\u0069\u0308", "",
             "\u00F1", "\u006E\u0303", "",
             "\u00F2", "\u006F\u0300", "",
             "\u00F3", "\u006F\u0301", "",
             "\u00F4", "\u006F\u0302", "",
             "\u00F5", "\u006F\u0303", "",
             "\u00F6", "\u006F\u0308", "",
             "\u00F9", "\u0075\u0300", "",
             "\u00FA", "\u0075\u0301", "",
             "\u00FB", "\u0075\u0302", "",
             "\u00FC", "\u0075\u0308", "",
             "\u00FD", "\u0079\u0301", "",
 // EXTRAS, outside of Latin 1
             "\u1EA4", "\u00C2\u0301", "",
             "\u1EA5", "\u00E2\u0301", "",
             "\u1EA6", "\u00C2\u0300", "",
             "\u1EA7", "\u00E2\u0300", "",
         };

         int[] classData = {
             0x0300, 230,
             0x0301, 230,
             0x0302, 230,
             0x0303, 230,
             0x0304, 230,
             0x0305, 230,
             0x0306, 230,
             0x0307, 230,
             0x0308, 230,
             0x0309, 230,
             0x030A, 230,
             0x030B, 230,
             0x030C, 230,
             0x030D, 230,
             0x030E, 230,
             0x030F, 230,
             0x0310, 230,
             0x0311, 230,
             0x0312, 230,
             0x0313, 230,
             0x0314, 230,
             0x0315, 232,
             0x0316, 220,
             0x0317, 220,
             0x0318, 220,
             0x0319, 220,
             0x031A, 232,
             0x031B, 216,
             0x031C, 220,
             0x031D, 220,
             0x031E, 220,
             0x031F, 220,
             0x0320, 220,
             0x0321, 202,
             0x0322, 202,
             0x0323, 220,
             0x0324, 220,
             0x0325, 220,
             0x0326, 220,
             0x0327, 202,
             0x0328, 202,
             0x0329, 220,
             0x032A, 220,
             0x032B, 220,
             0x032C, 220,
             0x032D, 220,
             0x032E, 220,
             0x032F, 220,
             0x0330, 220,
             0x0331, 220,
             0x0332, 220,
             0x0333, 220,
             0x0334, 1,
             0x0335, 1,
             0x0336, 1,
             0x0337, 1,
             0x0338, 1,
             0x0339, 220,
             0x033A, 220,
             0x033B, 220,
             0x033C, 220,
             0x033D, 230,
             0x033E, 230,
             0x033F, 230,
             0x0340, 230,
             0x0341, 230,
             0x0342, 230,
             0x0343, 230,
             0x0344, 230,
             0x0345, 240,
             0x0360, 234,
             0x0361, 234
         };

         // build the same tables we would otherwise get from the
         // Unicode Character Database, just with limited data

         for (int i = 0; i < decomposeData.length; i += 3) {
             char value = decomposeData[i].charAt(0);
             String decomp = decomposeData[i + 1];
             boolean compat = decomposeData[i + 2].equals("K");
             if (compat) isCompatibility.set(value);
             decompose.put(value, decomp);
             if (!compat) {
                 // a singleton mapping is keyed with 0 as its first element
                 int first = 0;
                 int second = UTF16Util.nextCodePoint(decomp, 0);
                 if (decomp.length() > 1) {
                     first = second;
                     second = UTF16Util.nextCodePoint(decomp,
                         UTF16Util.codePointLength(first));
                 }
                 // The key layout must match the one buildDecompositionTables
                 // uses for the same table: widen to long before shifting and
                 // put the first code point in the upper 32 bits.
                 long pair = ((long)first << 32) | second;
                 compose.put(pair, value);
             }
         }

         // classData holds (code point, canonical class) pairs
         for (int i = 0; i < classData.length; i += 2) {
             canonicalClass.put(classData[i], classData[i + 1]);
         }
     }

     /**
      * Utility: Parses a sequence of hex Unicode characters separated by spaces.
      * <p>
      * Text enclosed in angle brackets is skipped. Each remaining
      * space-delimited token is parsed as a hexadecimal code point and
      * appended to the result.
      *
      * @param source space-separated hexadecimal code point values
      * @return string containing the parsed code points, in order
      * @throws IllegalArgumentException if source contains a character that
      *         is neither a space, a hex digit nor part of a bracketed tag,
      *         or a token that cannot be parsed as a hexadecimal number
      */
     static public String fromHex(String source) {
         StringBuffer result = new StringBuffer();
         int length = source.length();
         int i = 0;
         while (i < length) {
             char c = source.charAt(i);
             if (c == ' ') {
                 ++i;                                    // ignore
             } else if (c == '<') {
                 int close = source.indexOf('>', i);     // skip the bracketed tag
                 if (close < 0) {
                     throw new IllegalArgumentException("Bad hex value in " + source);
                 }
                 i = close + 1;
             } else if ((c >= '0' && c <= '9')
                     || (c >= 'A' && c <= 'F')
                     || (c >= 'a' && c <= 'f')) {
                 // the token runs up to the next space or the end of the string
                 int end = source.indexOf(' ', i);
                 if (end < 0) {
                     end = length;
                 }
                 int value;
                 try {
                     value = Integer.parseInt(source.substring(i, end), 16);
                 } catch (NumberFormatException e) {
                     // report bad tokens the same way as bad characters
                     // instead of terminating the JVM from a utility method
                     throw new IllegalArgumentException("Bad hex value in " + source
                         + " (i: " + i + ";end:" + end + "): " + e.getMessage());
                 }
                 UTF16Util.appendCodePoint(result, value);
                 i = end + 1;                            // also skips the delimiter
             } else {
                 throw new IllegalArgumentException("Bad hex value in " + source);
             }
         }
         return result.toString();
     }

     /**
      * Utility: Supplies a zero-padded, upper-case hex representation of an
      * integer (without 0x). The value is treated as unsigned and the result
      * is always 8 digits long.
      */
     static public String hex(int i) {
         String digits = Integer.toHexString(i).toUpperCase();
         StringBuffer padded = new StringBuffer(8);
         for (int k = digits.length(); k < 8; ++k) {
             padded.append('0');
         }
         return padded.append(digits).toString();
     }

     /**
      * Utility: Supplies a zero-padded, upper-case hex representation of a
      * Unicode character (without 0x or an escape prefix). The result is
      * always 4 digits long.
      */
     static public String hex(char i) {
         String digits = Integer.toHexString(i).toUpperCase();
         StringBuffer padded = new StringBuffer(4);
         for (int k = digits.length(); k < 4; ++k) {
             padded.append('0');
         }
         return padded.append(digits).toString();
     }

     /**
      * Utility: Supplies the 4-digit hex representation of every char of a
      * string, joined by the given separator. The separator is placed
      * between values only, not before the first or after the last.
      */
     public static String hex(String s, String sep) {
         StringBuffer joined = new StringBuffer();
         int length = s.length();
         if (length > 0) {
             joined.append(hex(s.charAt(0)));
             for (int k = 1; k < length; ++k) {
                 joined.append(sep);
                 joined.append(hex(s.charAt(k)));
             }
         }
         return joined.toString();
     }
 }
	package com.ibm.icu.dev.test.normalizer;

	import java.io.BufferedReader;
	import java.util.BitSet;

	import com.ibm.icu.dev.test.TestUtil;
	import com.ibm.icu.dev.test.UTF16Util;


	/**
	* Builds the normalization tables. This is a separate class so that it
	* can be unloaded once not needed.<br>
	* Copyright (c) 1998-2002 Unicode, Inc. All Rights Reserved.<br>
	* The Unicode Consortium makes no expressed or implied warranty of any
	* kind, and assumes no liability for errors or omissions.
	* No liability is assumed for incidental and consequential damages
	* in connection with or arising out of the use of the information here.
	* @author Mark Davis
	* Updates for supplementary code points:
	* Vladimir Weinstein & Markus Scherer
	*/
	class NormalizerBuilder {
	/** Copyright notice text. */
	static final String copyright = "Copyright ? 1998-1999 Unicode, Inc.";

	/**
	* Testing flags
	*/

	// When true, the table-building methods print progress messages to System.out.
	private static final boolean DEBUG = false;
	// Not read by any live code in this file.
	private static final boolean GENERATING = false;

	/**
	* Constants for the data file version to use.
	*/
	/*static final boolean NEW_VERSION = true;
	private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : "");

	static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9";
	static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1";

	public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt";
	public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt";
	*/

	/**
	 * Builds the static normalization data. Called exactly once by
	 * NormalizerData.
	 *
	 * @param fullData true to load the composition exclusion list and the
	 *        Unicode Character Database files; false to use the small
	 *        built-in data set (for use in Applets)
	 * @return the populated NormalizerData, or null if an IOException was
	 *         raised while loading (the problem is reported on System.err)
	 */
	static NormalizerData build(boolean fullData) {
	    try {
	        final IntHashtable classTable = new IntHashtable(0);
	        final IntStringHashtable decompTable = new IntStringHashtable(null);
	        final LongHashtable composeTable = new LongHashtable(NormalizerData.NOT_COMPOSITE);
	        final BitSet compatibilitySet = new BitSet();
	        final BitSet exclusionSet = new BitSet();

	        if (!fullData) {
	            // abridged data, for use in Applets
	            setMinimalDecomp(classTable, decompTable, composeTable,
	              compatibilitySet, exclusionSet);
	        } else {
	            // full data, read from the data files
	            readExclusionList(exclusionSet);
	            buildDecompositionTables(classTable, decompTable, composeTable,
	              compatibilitySet, exclusionSet);
	        }

	        NormalizerData data = new NormalizerData(classTable, decompTable, composeTable,
	              compatibilitySet, exclusionSet);
	        return data;
	    } catch (java.io.IOException ioe) {
	        String report = "Can't load data file." + ioe + ", " + ioe.getMessage();
	        System.err.println(report);
	        return null;
	    }
	}

	// =============================================================
	// Building Decomposition Tables
	// =============================================================

	/**
	 * Reads the composition exclusion list and records every excluded code
	 * point in the given bit set.
	 * <p>
	 * For each line, everything from the first '#' on is dropped as a
	 * comment, surrounding whitespace is trimmed and blank lines are skipped.
	 * The first whitespace-delimited token of what remains is parsed as a
	 * hexadecimal code point.
	 *
	 * @param isExcluded bit set that receives one bit per excluded code point
	 * @throws java.io.IOException if reading from the data file fails
	 * @throws NumberFormatException if a data line does not start with a
	 *         hexadecimal number
	 */
	private static void readExclusionList(BitSet isExcluded) throws java.io.IOException {
	    if (DEBUG) System.out.println("Reading Exclusions");
	    BufferedReader in = null;
	    try {
	        in = TestUtil.getDataReader("unicode/CompositionExclusions.txt");
	    } catch (Exception e) {
	        // Preserves the existing behavior when the data file cannot be
	        // opened: report the problem and terminate.
	        System.err.println("Fail to read the file CompositionExclusions.txt!");
	        System.exit(1);
	    }

	    try {
	        String line;
	        while ((line = in.readLine()) != null) {
	            int comment = line.indexOf('#');                // strip comments
	            if (comment != -1) line = line.substring(0, comment);

	            // Trim first so that lines holding only whitespace (for
	            // example an indented comment) count as blank instead of
	            // reaching parseInt as an empty string.
	            line = line.trim();
	            if (line.length() == 0) continue;               // ignore blanks

	            // keep only the leading token; it may be followed by a
	            // space or a tab
	            int tokenEnd = 0;
	            while (tokenEnd < line.length()
	                    && !Character.isWhitespace(line.charAt(tokenEnd))) {
	                ++tokenEnd;
	            }

	            // set the bit in the excluded table for each character hit
	            int value = Integer.parseInt(line.substring(0, tokenEnd), 16);
	            isExcluded.set(value);
	        }
	    } finally {
	        // release the reader even when a line fails to parse
	        in.close();
	    }
	    if (DEBUG) System.out.println("Done reading Exclusions");
	}

	/**
	* Builds a decomposition table from a UnicodeData file
	*/
	private static void buildDecompositionTables(
	IntHashtable canonicalClass, IntStringHashtable decompose,
	LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
	throws java.io.IOException {
	if (DEBUG) System.out.println("Reading Unicode Character Database");
	//BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64*1024);
	BufferedReader in = null;
	try {
	in = TestUtil.getDataReader("unicode/UnicodeData.txt");
	} catch (Exception e) {
	System.err.println("Failed to read UnicodeData.txt");
	System.exit(1);
	}

	int value;
	long pair;
	int counter = 0;
	while (true) {

	// read a line, discarding comments and blank lines

	String line = in.readLine();
	if (line == null) break;
	int comment = line.indexOf('#'); // strip comments
	if (comment != -1) line = line.substring(0,comment);
	if (line.length() == 0) continue;
	if (DEBUG) {
	counter++;
	if ((counter & 0xFF) == 0) System.out.println("At: " + line);
	}

	// find the values of the particular fields that we need
	// Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;

	int start = 0;
	int end = line.indexOf(';'); // code
	value = Integer.parseInt(line.substring(start,end),16);
	if (true && value == '\u00c0') {
	//System.out.println("debug: " + line);
	}
	end = line.indexOf(';',start=end+1); // name
	/String name = line.substring(start,end);/
	end = line.indexOf(';',start=end+1); // general category
	end = line.indexOf(';',start=end+1); // canonical class

	// check consistency: canonical classes must be from 0 to 255

	int cc = Integer.parseInt(line.substring(start,end));
	if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line);
	canonicalClass.put(value,cc);
	end = line.indexOf(';',start=end+1); // BIDI
	end = line.indexOf(';',start=end+1); // decomp

	// decomp requires more processing.
	// store whether it is canonical or compatibility.
	// store the decomp in one table, and the reverse mapping (from pairs) in another

	if (start != end) {
	String segment = line.substring(start, end);
	boolean compat = segment.charAt(0) == '<';
	if (compat) isCompatibility.set(value);
	String decomp = fromHex(segment);

	// a small snippet of code to generate the Applet data

	/*if (GENERATING) {
	if (value < 0xFF) {
	System.out.println(
	"\"\\u" + hex((char)value) + "\", "
	+ "\"\\u" + hex(decomp, "\\u") + "\", "
	+ (compat ? "\"K\"," : "\"\",")
	+ "// " + name);
	}
	}*/

	// check consistency: all canon decomps must be singles or pairs!
	int decompLen = UTF16Util.countCodePoint(decomp);
	if (decompLen < 1 \|\| decompLen > 2 && !compat) {
	System.err.println("Bad decomp at: " + line);
	}
	decompose.put(value, decomp);

	// only compositions are canonical pairs
	// skip if script exclusion

	if (!compat && !isExcluded.get(value)) {
	int first = '\u0000';
	int second = UTF16Util.nextCodePoint(decomp, 0);
	if (decompLen > 1) {
	first = second;
	second = UTF16Util.nextCodePoint(decomp,
	UTF16Util.codePointLength(first));
	}

	// store composition pair in single integer

	pair = ((long)first << 32) \| second;
	if (DEBUG && value == '\u00C0') {
	System.out.println("debug2: " + line);
	}
	compose.put(pair, value);
	} else if (DEBUG) {
	System.out.println("Excluding: " + decomp);
	}
	}
	}
	in.close();
	if (DEBUG) System.out.println("Done reading Unicode Character Database");

	// add algorithmic Hangul decompositions
	// this is more compact if done at runtime, but for simplicity we
	// do it this way.

	if (DEBUG) System.out.println("Adding Hangul");

	for (int SIndex = 0; SIndex < SCount; ++SIndex) {
	int TIndex = SIndex % TCount;
	char first, second;
	if (TIndex != 0) { // triple
	first = (char)(SBase + SIndex - TIndex);
	second = (char)(TBase + TIndex);
	} else {
	first = (char)(LBase + SIndex / NCount);
	second = (char)(VBase + (SIndex % NCount) / TCount);
	}
	pair = ((long)first << 32) \| second;
	value = SIndex + SBase;
	decompose.put(value, String.valueOf(first) + second);
	compose.put(pair, value);
	}
	if (DEBUG) System.out.println("Done adding Hangul");
	}

	/**
	 * Hangul composition constants
	 */
	// base code point of the syllables (syllable value = SBase + SIndex)
	static final int SBase = 0xAC00;
	// base code points added to the computed leading, vowel and trailing indexes
	static final int LBase = 0x1100;
	static final int VBase = 0x1161;
	static final int TBase = 0x11A7;
	// number of distinct leading, vowel and trailing index values
	static final int LCount = 19;
	static final int VCount = 21;
	static final int TCount = 28;
	// number of syllables per leading index
	static final int NCount = VCount * TCount;   // 588
	// total number of syllables
	static final int SCount = LCount * NCount;   // 11172

	/**
	* For use in an applet: just load a minimal set of data.
	*/
	private static void setMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
	LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
	String[] decomposeData = {
	"\u005E", "\u0020\u0302", "K",
	"\u005F", "\u0020\u0332", "K",
	"\u0060", "\u0020\u0300", "K",
	"\u00A0", "\u0020", "K",
	"\u00A8", "\u0020\u0308", "K",
	"\u00AA", "\u0061", "K",
	"\u00AF", "\u0020\u0304", "K",
	"\u00B2", "\u0032", "K",
	"\u00B3", "\u0033", "K",
	"\u00B4", "\u0020\u0301", "K",
	"\u00B5", "\u03BC", "K",
	"\u00B8", "\u0020\u0327", "K",
	"\u00B9", "\u0031", "K",
	"\u00BA", "\u006F", "K",
	"\u00BC", "\u0031\u2044\u0034", "K",
	"\u00BD", "\u0031\u2044\u0032", "K",
	"\u00BE", "\u0033\u2044\u0034", "K",
	"\u00C0", "\u0041\u0300", "",
	"\u00C1", "\u0041\u0301", "",
	"\u00C2", "\u0041\u0302", "",
	"\u00C3", "\u0041\u0303", "",
	"\u00C4", "\u0041\u0308", "",
	"\u00C5", "\u0041\u030A", "",
	"\u00C7", "\u0043\u0327", "",
	"\u00C8", "\u0045\u0300", "",
	"\u00C9", "\u0045\u0301", "",
	"\u00CA", "\u0045\u0302", "",
	"\u00CB", "\u0045\u0308", "",
	"\u00CC", "\u0049\u0300", "",
	"\u00CD", "\u0049\u0301", "",
	"\u00CE", "\u0049\u0302", "",
	"\u00CF", "\u0049\u0308", "",
	"\u00D1", "\u004E\u0303", "",
	"\u00D2", "\u004F\u0300", "",
	"\u00D3", "\u004F\u0301", "",
	"\u00D4", "\u004F\u0302", "",
	"\u00D5", "\u004F\u0303", "",
	"\u00D6", "\u004F\u0308", "",
	"\u00D9", "\u0055\u0300", "",
	"\u00DA", "\u0055\u0301", "",
	"\u00DB", "\u0055\u0302", "",
	"\u00DC", "\u0055\u0308", "",
	"\u00DD", "\u0059\u0301", "",
	"\u00E0", "\u0061\u0300", "",
	"\u00E1", "\u0061\u0301", "",
	"\u00E2", "\u0061\u0302", "",
	"\u00E3", "\u0061\u0303", "",
	"\u00E4", "\u0061\u0308", "",
	"\u00E5", "\u0061\u030A", "",
	"\u00E7", "\u0063\u0327", "",
	"\u00E8", "\u0065\u0300", "",
	"\u00E9", "\u0065\u0301", "",
	"\u00EA", "\u0065\u0302", "",
	"\u00EB", "\u0065\u0308", "",
	"\u00EC", "\u0069\u0300", "",
	"\u00ED", "\u0069\u0301", "",
	"\u00EE", "\u0069\u0302", "",
	"\u00EF", "\u0069\u0308", "",
	"\u00F1", "\u006E\u0303", "",
	"\u00F2", "\u006F\u0300", "",
	"\u00F3", "\u006F\u0301", "",
	"\u00F4", "\u006F\u0302", "",
	"\u00F5", "\u006F\u0303", "",
	"\u00F6", "\u006F\u0308", "",
	"\u00F9", "\u0075\u0300", "",
	"\u00FA", "\u0075\u0301", "",
	"\u00FB", "\u0075\u0302", "",
	"\u00FC", "\u0075\u0308", "",
	"\u00FD", "\u0079\u0301", "",
	// EXTRAS, outside of Latin 1
	"\u1EA4", "\u00C2\u0301", "",
	"\u1EA5", "\u00E2\u0301", "",
	"\u1EA6", "\u00C2\u0300", "",
	"\u1EA7", "\u00E2\u0300", "",
	};

	int[] classData = {
	0x0300, 230,
	0x0301, 230,
	0x0302, 230,
	0x0303, 230,
	0x0304, 230,
	0x0305, 230,
	0x0306, 230,
	0x0307, 230,
	0x0308, 230,
	0x0309, 230,
	0x030A, 230,
	0x030B, 230,
	0x030C, 230,
	0x030D, 230,
	0x030E, 230,
	0x030F, 230,
	0x0310, 230,
	0x0311, 230,
	0x0312, 230,
	0x0313, 230,
	0x0314, 230,
	0x0315, 232,
	0x0316, 220,
	0x0317, 220,
	0x0318, 220,
	0x0319, 220,
	0x031A, 232,
	0x031B, 216,
	0x031C, 220,
	0x031D, 220,
	0x031E, 220,
	0x031F, 220,
	0x0320, 220,
	0x0321, 202,
	0x0322, 202,
	0x0323, 220,
	0x0324, 220,
	0x0325, 220,
	0x0326, 220,
	0x0327, 202,
	0x0328, 202,
	0x0329, 220,
	0x032A, 220,
	0x032B, 220,
	0x032C, 220,
	0x032D, 220,
	0x032E, 220,
	0x032F, 220,
	0x0330, 220,
	0x0331, 220,
	0x0332, 220,
	0x0333, 220,
	0x0334, 1,
	0x0335, 1,
	0x0336, 1,
	0x0337, 1,
	0x0338, 1,
	0x0339, 220,
	0x033A, 220,
	0x033B, 220,
	0x033C, 220,
	0x033D, 230,
	0x033E, 230,
	0x033F, 230,
	0x0340, 230,
	0x0341, 230,
	0x0342, 230,
	0x0343, 230,
	0x0344, 230,
	0x0345, 240,
	0x0360, 234,
	0x0361, 234
	};

	// build the same tables we would otherwise get from the
	// Unicode Character Database, just with limited data

	for (int i = 0; i < decomposeData.length; i+=3) {
	char value = decomposeData[i].charAt(0);
	String decomp = decomposeData[i+1];
	boolean compat = decomposeData[i+2].equals("K");
	if (compat) isCompatibility.set(value);
	decompose.put(value, decomp);
	if (!compat) {
	int first = '\u0000';
	int second = UTF16Util.nextCodePoint(decomp, 0);
	if (decomp.length() > 1) {
	first = second;
	second = UTF16Util.nextCodePoint(decomp,
	UTF16Util.codePointLength(first));
	}
	long pair = (first << 16) \| second;
	compose.put(pair, value);
	}
	}

	for (int i = 0; i < classData.length;) {
	canonicalClass.put(classData[i++], classData[i++]);
	}
	}

	/**
	 * Utility: Parses a sequence of hex Unicode characters separated by spaces.
	 * <p>
	 * Text enclosed in angle brackets is skipped. Each remaining
	 * space-delimited token is parsed as a hexadecimal code point and
	 * appended to the result.
	 *
	 * @param source space-separated hexadecimal code point values
	 * @return string containing the parsed code points, in order
	 * @throws IllegalArgumentException if source contains a character that
	 *         is neither a space, a hex digit nor part of a bracketed tag,
	 *         or a token that cannot be parsed as a hexadecimal number
	 */
	static public String fromHex(String source) {
	    StringBuffer result = new StringBuffer();
	    int length = source.length();
	    int i = 0;
	    while (i < length) {
	        char c = source.charAt(i);
	        if (c == ' ') {
	            ++i;                                    // ignore
	        } else if (c == '<') {
	            int close = source.indexOf('>', i);     // skip the bracketed tag
	            if (close < 0) {
	                throw new IllegalArgumentException("Bad hex value in " + source);
	            }
	            i = close + 1;
	        } else if ((c >= '0' && c <= '9')
	                || (c >= 'A' && c <= 'F')
	                || (c >= 'a' && c <= 'f')) {
	            // the token runs up to the next space or the end of the string
	            int end = source.indexOf(' ', i);
	            if (end < 0) {
	                end = length;
	            }
	            int value;
	            try {
	                value = Integer.parseInt(source.substring(i, end), 16);
	            } catch (NumberFormatException e) {
	                // report bad tokens the same way as bad characters
	                // instead of terminating the JVM from a utility method
	                throw new IllegalArgumentException("Bad hex value in " + source
	                    + " (i: " + i + ";end:" + end + "): " + e.getMessage());
	            }
	            UTF16Util.appendCodePoint(result, value);
	            i = end + 1;                            // also skips the delimiter
	        } else {
	            throw new IllegalArgumentException("Bad hex value in " + source);
	        }
	    }
	    return result.toString();
	}

	/**
	 * Utility: Supplies a zero-padded, upper-case hex representation of an
	 * integer (without 0x). The value is treated as unsigned and the result
	 * is always 8 digits long.
	 */
	static public String hex(int i) {
	    String digits = Integer.toHexString(i).toUpperCase();
	    StringBuffer padded = new StringBuffer(8);
	    for (int k = digits.length(); k < 8; ++k) {
	        padded.append('0');
	    }
	    return padded.append(digits).toString();
	}

	/**
	 * Utility: Supplies a zero-padded, upper-case hex representation of a
	 * Unicode character (without 0x or an escape prefix). The result is
	 * always 4 digits long.
	 */
	static public String hex(char i) {
	    String digits = Integer.toHexString(i).toUpperCase();
	    StringBuffer padded = new StringBuffer(4);
	    for (int k = digits.length(); k < 4; ++k) {
	        padded.append('0');
	    }
	    return padded.append(digits).toString();
	}

	/**
	 * Utility: Supplies the 4-digit hex representation of every char of a
	 * string, joined by the given separator. The separator is placed
	 * between values only, not before the first or after the last.
	 */
	public static String hex(String s, String sep) {
	    StringBuffer joined = new StringBuffer();
	    int length = s.length();
	    if (length > 0) {
	        joined.append(hex(s.charAt(0)));
	        for (int k = 1; k < length; ++k) {
	            joined.append(sep);
	            joined.append(hex(s.charAt(k)));
	        }
	    }
	    return joined.toString();
	}
	}