blob: 40de2738740f4d23f836f68aad9838b1e56acfd3 [file] [log] [blame]
package com.ibm.icu.dev.test.normalizer;
import java.io.BufferedReader;
import java.util.BitSet;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.dev.test.UTF16Util;
/**
* Builds the normalization tables. This is a separate class so that it
* can be unloaded once not needed.<br>
* Copyright &copy; 1998-2002 Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
* @author Mark Davis
* Updates for supplementary code points:
* Vladimir Weinstein & Markus Scherer
*/
class NormalizerBuilder {
// Copyright notice kept as a string constant.
// NOTE(review): the '?' is presumably a mis-encoded copyright sign -- confirm
// before changing, since this is a runtime string.
static final String copyright = "Copyright ? 1998-1999 Unicode, Inc.";
/**
* Testing flags
*/
// DEBUG: when true, the table-building methods print progress messages to System.out.
private static final boolean DEBUG = false;
// GENERATING: only referenced from commented-out code in the visible part of this file.
private static final boolean GENERATING = false;
/**
* Constants for the data file version to use.
*/
/*static final boolean NEW_VERSION = true;
private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : "");
static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9";
static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1";
public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt";
public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt";
*/
/**
 * Builds the static normalization data. Intended to be called exactly once,
 * by NormalizerData.
 *
 * @param fullData if true, the tables are read from the Unicode data files;
 *                 otherwise a small built-in subset is used (for applets)
 * @return the populated NormalizerData, or null if a data file could not be read
 */
static NormalizerData build(boolean fullData) {
    try {
        final IntHashtable classTable = new IntHashtable(0);
        final IntStringHashtable decompTable = new IntStringHashtable(null);
        final LongHashtable composeTable = new LongHashtable(NormalizerData.NOT_COMPOSITE);
        final BitSet compatFlags = new BitSet();
        final BitSet exclusions = new BitSet();

        if (!fullData) {
            // abridged data, for use in applets
            setMinimalDecomp(classTable, decompTable, composeTable, compatFlags, exclusions);
        } else {
            // the exclusion list must be loaded first: it is consulted while
            // the composition table is being filled
            readExclusionList(exclusions);
            buildDecompositionTables(classTable, decompTable, composeTable,
                    compatFlags, exclusions);
        }

        NormalizerData data = new NormalizerData(classTable, decompTable, composeTable,
                compatFlags, exclusions);
        return data;
    } catch (java.io.IOException ioe) {
        System.err.println("Can't load data file." + ioe + ", " + ioe.getMessage());
        return null;
    }
}
// =============================================================
// Building Decomposition Tables
// =============================================================
/**
 * Reads the composition exclusion list and sets one bit in
 * <code>isExcluded</code> for every code point listed.
 * <p>
 * Each line has everything from '#' onward removed and is then trimmed;
 * lines that are empty after that are skipped. The first space-delimited
 * token of a remaining line is parsed as a hexadecimal code point.
 *
 * @param isExcluded receives one set bit per excluded code point
 * @throws java.io.IOException if reading the data file fails
 * @throws NumberFormatException if a data line does not start with a hex number
 */
private static void readExclusionList(BitSet isExcluded) throws java.io.IOException {
    if (DEBUG) System.out.println("Reading Exclusions");
    BufferedReader in = null;
    try {
        in = TestUtil.getDataReader("unicode/CompositionExclusions.txt");
    } catch (Exception e) {
        System.err.println("Fail to read the file CompositionExclusions.txt!");
        System.exit(1);
    }
    try {
        String line;
        while ((line = in.readLine()) != null) {
            // strip comments
            int comment = line.indexOf('#');
            if (comment != -1) {
                line = line.substring(0, comment);
            }
            // trim so that indented comments, whitespace-only lines and a
            // tab before the comment do not reach parseInt
            line = line.trim();
            if (line.length() == 0) {
                continue; // ignore blanks
            }
            // only the first token on the line is the code point
            int space = line.indexOf(' ');
            if (space != -1) {
                line = line.substring(0, space);
            }
            isExcluded.set(Integer.parseInt(line, 16));
        }
    } finally {
        // release the reader even when a line fails to parse
        in.close();
    }
    if (DEBUG) System.out.println("Done reading Exclusions");
}
/**
 * Builds the decomposition tables from the UnicodeData file and then adds
 * the algorithmically derived Hangul syllable mappings.
 * <p>
 * For every data line the canonical combining class is recorded. If the line
 * has a decomposition field, the decomposition string is recorded, the code
 * point is flagged as compatibility when the field starts with an
 * angle-bracket tag, and, for canonical decompositions that are not
 * excluded, the reverse (composition) mapping is recorded. The composition
 * key is <code>((long)first &lt;&lt; 32) | second</code>; a decomposition
 * consisting of a single code point is keyed with <code>first == 0</code>.
 *
 * @param canonicalClass receives code point to canonical class
 * @param decompose receives code point to decomposition string
 * @param compose receives composition key to composite code point
 * @param isCompatibility receives one set bit per code point with a tagged decomposition
 * @param isExcluded code points that must not get a composition entry
 * @throws java.io.IOException if reading the data file fails
 */
private static void buildDecompositionTables(
        IntHashtable canonicalClass, IntStringHashtable decompose,
        LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
        throws java.io.IOException {
    if (DEBUG) System.out.println("Reading Unicode Character Database");
    BufferedReader in = null;
    try {
        in = TestUtil.getDataReader("unicode/UnicodeData.txt");
    } catch (Exception e) {
        System.err.println("Failed to read UnicodeData.txt");
        System.exit(1);
    }
    try {
        int counter = 0;
        while (true) {
            // read a line, discarding comments and blank lines
            String line = in.readLine();
            if (line == null) break;
            int comment = line.indexOf('#'); // strip comments
            if (comment != -1) line = line.substring(0, comment);
            if (line.length() == 0) continue;
            if (DEBUG) {
                counter++;
                if ((counter & 0xFF) == 0) System.out.println("At: " + line);
            }

            // find the values of the particular fields that we need
            // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
            // After each step, [start, end) spans the field named in the
            // trailing comment.
            int start = 0;
            int end = line.indexOf(';'); // code
            int value = Integer.parseInt(line.substring(start, end), 16);
            end = line.indexOf(';', start = end + 1); // name
            end = line.indexOf(';', start = end + 1); // general category
            end = line.indexOf(';', start = end + 1); // canonical class

            // check consistency: canonical classes must be from 0 to 255
            int cc = Integer.parseInt(line.substring(start, end));
            if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line);
            canonicalClass.put(value, cc);

            end = line.indexOf(';', start = end + 1); // BIDI
            end = line.indexOf(';', start = end + 1); // decomp
            if (start == end) {
                continue; // empty decomposition field
            }

            // decomp requires more processing.
            // store whether it is canonical or compatibility.
            // store the decomp in one table, and the reverse mapping (from pairs) in another
            String segment = line.substring(start, end);
            boolean compat = segment.charAt(0) == '<';
            if (compat) isCompatibility.set(value);
            String decomp = fromHex(segment);

            // check consistency: all canon decomps must be singles or pairs!
            int decompLen = UTF16Util.countCodePoint(decomp);
            if (decompLen < 1 || (decompLen > 2 && !compat)) {
                System.err.println("Bad decomp at: " + line);
            }
            decompose.put(value, decomp);

            // only compositions are canonical pairs
            // skip if script exclusion
            if (!compat && !isExcluded.get(value)) {
                int first = 0;
                int second = UTF16Util.nextCodePoint(decomp, 0);
                if (decompLen > 1) {
                    first = second;
                    second = UTF16Util.nextCodePoint(decomp,
                            UTF16Util.codePointLength(first));
                }
                // store composition pair in single long
                long pair = ((long) first << 32) | second;
                if (DEBUG && value == 0x00C0) {
                    System.out.println("debug2: " + line);
                }
                compose.put(pair, value);
            } else if (DEBUG) {
                System.out.println("Excluding: " + decomp);
            }
        }
    } finally {
        // release the reader even when a line fails to parse
        in.close();
    }
    if (DEBUG) System.out.println("Done reading Unicode Character Database");

    addHangulDecompositions(decompose, compose);
}

/**
 * Adds the algorithmic Hangul decompositions and their reverse mappings.
 * This would be more compact if done at runtime, but for simplicity the
 * entries are materialized in the tables.
 */
private static void addHangulDecompositions(IntStringHashtable decompose,
        LongHashtable compose) {
    if (DEBUG) System.out.println("Adding Hangul");
    for (int SIndex = 0; SIndex < SCount; ++SIndex) {
        int TIndex = SIndex % TCount;
        char first, second;
        if (TIndex != 0) { // triple
            // syllable without its trailing part, plus the trailing part
            first = (char)(SBase + SIndex - TIndex);
            second = (char)(TBase + TIndex);
        } else {
            first = (char)(LBase + SIndex / NCount);
            second = (char)(VBase + (SIndex % NCount) / TCount);
        }
        long pair = ((long)first << 32) | second;
        int value = SIndex + SBase;
        decompose.put(value, String.valueOf(first) + second);
        compose.put(pair, value);
    }
    if (DEBUG) System.out.println("Done adding Hangul");
}
/**
 * Hangul composition constants, used when generating the algorithmic
 * Hangul syllable mappings.
 */
// base code points: syllable block, leading part, vowel part, trailing part
// (TBase is one below the first trailing value, since index 0 means "none")
static final int SBase = 0xAC00;
static final int LBase = 0x1100;
static final int VBase = 0x1161;
static final int TBase = 0x11A7;
// number of distinct leading, vowel and trailing values
static final int LCount = 19;
static final int VCount = 21;
static final int TCount = 28;
// derived counts
static final int NCount = VCount * TCount; // 588
static final int SCount = LCount * NCount; // 11172
/**
 * For use in an applet: just load a minimal set of data.
 * <p>
 * Fills the tables from two hard-coded lists instead of the Unicode data
 * files: a list of (character, decomposition, "K"-or-empty) triples, and a
 * list of (code point, canonical class) pairs. Canonical entries also get a
 * reverse mapping in <code>compose</code>, keyed as
 * <code>((long)first &lt;&lt; 32) | second</code>, the same key layout that
 * buildDecompositionTables uses.
 *
 * @param canonicalClass receives code point to canonical class
 * @param decompose receives code point to decomposition string
 * @param compose receives composition key to composite code point
 * @param isCompatibility receives one set bit per entry marked "K"
 * @param isExcluded not modified by this method
 */
private static void setMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
        LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
    // triples: character, decomposition, "K" for compatibility / "" for canonical
    String[] decomposeData = {
        "\u005E", "\u0020\u0302", "K",
        "\u005F", "\u0020\u0332", "K",
        "\u0060", "\u0020\u0300", "K",
        "\u00A0", "\u0020", "K",
        "\u00A8", "\u0020\u0308", "K",
        "\u00AA", "\u0061", "K",
        "\u00AF", "\u0020\u0304", "K",
        "\u00B2", "\u0032", "K",
        "\u00B3", "\u0033", "K",
        "\u00B4", "\u0020\u0301", "K",
        "\u00B5", "\u03BC", "K",
        "\u00B8", "\u0020\u0327", "K",
        "\u00B9", "\u0031", "K",
        "\u00BA", "\u006F", "K",
        "\u00BC", "\u0031\u2044\u0034", "K",
        "\u00BD", "\u0031\u2044\u0032", "K",
        "\u00BE", "\u0033\u2044\u0034", "K",
        "\u00C0", "\u0041\u0300", "",
        "\u00C1", "\u0041\u0301", "",
        "\u00C2", "\u0041\u0302", "",
        "\u00C3", "\u0041\u0303", "",
        "\u00C4", "\u0041\u0308", "",
        "\u00C5", "\u0041\u030A", "",
        "\u00C7", "\u0043\u0327", "",
        "\u00C8", "\u0045\u0300", "",
        "\u00C9", "\u0045\u0301", "",
        "\u00CA", "\u0045\u0302", "",
        "\u00CB", "\u0045\u0308", "",
        "\u00CC", "\u0049\u0300", "",
        "\u00CD", "\u0049\u0301", "",
        "\u00CE", "\u0049\u0302", "",
        "\u00CF", "\u0049\u0308", "",
        "\u00D1", "\u004E\u0303", "",
        "\u00D2", "\u004F\u0300", "",
        "\u00D3", "\u004F\u0301", "",
        "\u00D4", "\u004F\u0302", "",
        "\u00D5", "\u004F\u0303", "",
        "\u00D6", "\u004F\u0308", "",
        "\u00D9", "\u0055\u0300", "",
        "\u00DA", "\u0055\u0301", "",
        "\u00DB", "\u0055\u0302", "",
        "\u00DC", "\u0055\u0308", "",
        "\u00DD", "\u0059\u0301", "",
        "\u00E0", "\u0061\u0300", "",
        "\u00E1", "\u0061\u0301", "",
        "\u00E2", "\u0061\u0302", "",
        "\u00E3", "\u0061\u0303", "",
        "\u00E4", "\u0061\u0308", "",
        "\u00E5", "\u0061\u030A", "",
        "\u00E7", "\u0063\u0327", "",
        "\u00E8", "\u0065\u0300", "",
        "\u00E9", "\u0065\u0301", "",
        "\u00EA", "\u0065\u0302", "",
        "\u00EB", "\u0065\u0308", "",
        "\u00EC", "\u0069\u0300", "",
        "\u00ED", "\u0069\u0301", "",
        "\u00EE", "\u0069\u0302", "",
        "\u00EF", "\u0069\u0308", "",
        "\u00F1", "\u006E\u0303", "",
        "\u00F2", "\u006F\u0300", "",
        "\u00F3", "\u006F\u0301", "",
        "\u00F4", "\u006F\u0302", "",
        "\u00F5", "\u006F\u0303", "",
        "\u00F6", "\u006F\u0308", "",
        "\u00F9", "\u0075\u0300", "",
        "\u00FA", "\u0075\u0301", "",
        "\u00FB", "\u0075\u0302", "",
        "\u00FC", "\u0075\u0308", "",
        "\u00FD", "\u0079\u0301", "",
        // EXTRAS, outside of Latin 1
        "\u1EA4", "\u00C2\u0301", "",
        "\u1EA5", "\u00E2\u0301", "",
        "\u1EA6", "\u00C2\u0300", "",
        "\u1EA7", "\u00E2\u0300", "",
    };
    // pairs: code point, canonical class
    int[] classData = {
        0x0300, 230,
        0x0301, 230,
        0x0302, 230,
        0x0303, 230,
        0x0304, 230,
        0x0305, 230,
        0x0306, 230,
        0x0307, 230,
        0x0308, 230,
        0x0309, 230,
        0x030A, 230,
        0x030B, 230,
        0x030C, 230,
        0x030D, 230,
        0x030E, 230,
        0x030F, 230,
        0x0310, 230,
        0x0311, 230,
        0x0312, 230,
        0x0313, 230,
        0x0314, 230,
        0x0315, 232,
        0x0316, 220,
        0x0317, 220,
        0x0318, 220,
        0x0319, 220,
        0x031A, 232,
        0x031B, 216,
        0x031C, 220,
        0x031D, 220,
        0x031E, 220,
        0x031F, 220,
        0x0320, 220,
        0x0321, 202,
        0x0322, 202,
        0x0323, 220,
        0x0324, 220,
        0x0325, 220,
        0x0326, 220,
        0x0327, 202,
        0x0328, 202,
        0x0329, 220,
        0x032A, 220,
        0x032B, 220,
        0x032C, 220,
        0x032D, 220,
        0x032E, 220,
        0x032F, 220,
        0x0330, 220,
        0x0331, 220,
        0x0332, 220,
        0x0333, 220,
        0x0334, 1,
        0x0335, 1,
        0x0336, 1,
        0x0337, 1,
        0x0338, 1,
        0x0339, 220,
        0x033A, 220,
        0x033B, 220,
        0x033C, 220,
        0x033D, 230,
        0x033E, 230,
        0x033F, 230,
        0x0340, 230,
        0x0341, 230,
        0x0342, 230,
        0x0343, 230,
        0x0344, 230,
        0x0345, 240,
        0x0360, 234,
        0x0361, 234
    };
    // build the same tables we would otherwise get from the
    // Unicode Character Database, just with limited data
    for (int i = 0; i < decomposeData.length; i += 3) {
        char value = decomposeData[i].charAt(0);
        String decomp = decomposeData[i + 1];
        boolean compat = decomposeData[i + 2].equals("K");
        if (compat) isCompatibility.set(value);
        decompose.put(value, decomp);
        if (!compat) {
            int first = 0;
            int second = UTF16Util.nextCodePoint(decomp, 0);
            if (decomp.length() > 1) {
                first = second;
                second = UTF16Util.nextCodePoint(decomp,
                        UTF16Util.codePointLength(first));
            }
            // The key must use the same layout as the full-data path
            // (first in the upper 32 bits); a 16-bit shift would produce
            // keys that differ from those built from the data files.
            long pair = ((long) first << 32) | second;
            compose.put(pair, value);
        }
    }
    for (int i = 0; i + 1 < classData.length; i += 2) {
        canonicalClass.put(classData[i], classData[i + 1]);
    }
}
/**
 * Utility: Parses a sequence of hex Unicode code points separated by spaces.
 * <p>
 * A number starts at an ASCII hex digit and runs to the next space (or the
 * end of the string); its value is appended to the result as a code point.
 * A span in angle brackets, from '&lt;' through the next '&gt;', is skipped.
 *
 * @param source the text to parse
 * @return the string made of the parsed code points, in order
 * @throws IllegalArgumentException if the text contains an unexpected
 *         character, an unterminated angle-bracket span, or a number that
 *         is not valid hexadecimal
 */
static public String fromHex(String source) {
    StringBuffer result = new StringBuffer();
    for (int i = 0; i < source.length(); ++i) {
        char c = source.charAt(i);
        switch (c) {
        case ' ':
            break; // separator, ignore
        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
        case '8': case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': {
            int end = source.indexOf(' ', i);
            if (end < 0) {
                end = source.length();
            }
            int value;
            try {
                value = Integer.parseInt(source.substring(i, end), 16);
            } catch (NumberFormatException e) {
                // report malformed input the same way as the other error
                // path below, rather than terminating the JVM
                throw (IllegalArgumentException) new IllegalArgumentException(
                        "Bad hex value in " + source).initCause(e);
            }
            UTF16Util.appendCodePoint(result, value);
            // continue after the number; the loop increment then steps
            // over the separating space
            i = end;
            break;
        }
        case '<': {
            // skip <...>
            int close = source.indexOf('>', i);
            if (close < 0) {
                throw new IllegalArgumentException("Bad hex value in " + source);
            }
            i = close;
            break;
        }
        default:
            throw new IllegalArgumentException("Bad hex value in " + source);
        }
    }
    return result.toString();
}
/**
 * Utility: Formats an int as exactly eight upper-case hex digits, padded on
 * the left with zeros and without a 0x prefix. The value is treated as
 * unsigned, so -1 yields FFFFFFFF.
 *
 * @param i the value to format
 * @return the eight-digit hex string
 */
static public String hex(int i) {
    String digits = Integer.toHexString(i).toUpperCase();
    StringBuffer padded = new StringBuffer(8);
    for (int width = digits.length(); width < 8; ++width) {
        padded.append('0');
    }
    padded.append(digits);
    return padded.toString();
}
/**
 * Utility: Formats a single char as exactly four upper-case hex digits,
 * padded on the left with zeros, with no prefix of any kind.
 *
 * @param i the char to format
 * @return the four-digit hex string
 */
static public String hex(char i) {
    // a char needs at most four hex digits, so three zeros of padding
    // followed by keeping the last four characters is always enough
    String padded = "000" + Integer.toHexString(i).toUpperCase();
    return padded.substring(padded.length() - 4);
}
/**
 * Utility: Formats every char of a string as four zero-padded hex digits
 * and joins the results with the given separator. The separator appears
 * only between elements, never before the first or after the last.
 *
 * @param s the string whose chars are formatted
 * @param sep the text placed between consecutive elements
 * @return the joined hex representation; empty if <code>s</code> is empty
 */
public static String hex(String s, String sep) {
    StringBuffer joined = new StringBuffer();
    final int length = s.length();
    if (length > 0) {
        joined.append(hex(s.charAt(0)));
        for (int k = 1; k < length; ++k) {
            joined.append(sep);
            joined.append(hex(s.charAt(k)));
        }
    }
    return joined.toString();
}
}