blob: 083b9fa6e0d89559c82a032c1b1f493df16ac6df [file] [log] [blame]
/*
**********************************************************************
* Copyright (c) 2003-2012, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: February 11 2003
* Since: ICU 2.6
**********************************************************************
*/
package com.ibm.icu.dev.tool.translit;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.Vector;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* This class produces the data tables used by the closeOver() method
* of UnicodeSet.
*
* Whenever the Unicode database changes, this tool must be re-run
* (AFTER the data file(s) underlying ICU4J are udpated).
*
* The output of this tool should then be pasted into the appropriate
* files:
*
* ICU4J: com.ibm.icu.text.UnicodeSet.java
* ICU4C: /icu/source/common/uniset.cpp
*/
class UnicodeSetCloseOver {
// Our output files
static final String JAVA_OUT = "to_UnicodeSet.java";
static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
static final String C_SET_OUT = "to_uniset.cpp";
static final String C_UCHAR_OUT = "to_uchar.c";
// Source code "do not edit" warning
static final String WARNING = "MACHINE-GENERATED; Unicode version " +
UCharacter.getUnicodeVersion() +
"; DO NOT EDIT; See " +
UnicodeSetCloseOver.class.getName();
// Case folding options flag. This must correspond to the options
// used in UnicodeSet.closeOver() in Java and C++.
static final boolean DEFAULT_CASE_MAP = true; // false for Turkish
public static void main(String[] args) throws IOException {
System.out.println("This tool will generate several output files. Each is named according");
System.out.println("the target file. For example, the contents of to_UnicodeSet.java should");
System.out.println("be pasted into UnicodeSet.java.");
System.out.println();
generateCaseData();
}
/**
* Create a map of String => Set. The String in this case is a
* folded string for which
* UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
* The Set contains all single-character strings x for which
* UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
* well as folded itself.
*/
static Map createCaseFoldEquivalencyClasses() {
Map equivClasses = new HashMap();
for (int i = 0; i <= 0x10FFFF; ++i) {
int cat = UCharacter.getType(i);
if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE)
continue;
String cp = UTF16.valueOf(i);
String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
if (folded.equals(cp)) continue;
// At this point, have different case folding. Add
// the code point and its folded equivalent into the
// equivalency class.
TreeSet s = (TreeSet) equivClasses.get(folded);
if (s == null) {
s = new TreeSet();
s.add(folded); // add the case fold result itself
equivClasses.put(folded, s);
}
s.add(cp);
}
return equivClasses;
}
/**
* Analyze the case fold equivalency classes. Break them into two
* groups: 'pairs', and 'nonpairs'. Create a tally of the length
* configurations of the nonpairs.
*
* Length configurations of equivalency classes, as of Unicode
* 3.2. Most of the classes (83%) have two single codepoints.
* Here "112:28" means there are 28 equivalency classes with 2
* single codepoints and one string of length 2.
*
* 11:656
* 111:16
* 1111:3
* 112:28
* 113:2
* 12:31
* 13:12
* 22:38
*
* Note: This method does not count the frequencies of the
* different length configurations (as shown above after ':'); it
* merely records which configurations occur.
*
* @param pairs Accumulate equivalency classes that consist of
* exactly two codepoints here. This is 83+% of the classes.
* E.g., {"a", "A"}.
* @param nonpairs Accumulate other equivalency classes here, as
* lists of strings. E,g, {"st", "\uFB05", "\uFB06"}.
* @param lengths Accumulate a list of unique length structures,
* not including pairs. Each length structure is represented by a
* string of digits. The digit string "12" means the equivalency
* class contains a single code point and a string of length 2.
* Typical contents of 'lengths': { "111", "1111", "112",
* "113", "12", "13", "22" }. Note the absence of "11".
*/
static void analyzeCaseData(Map equivClasses,
StringBuffer pairs,
Vector nonpairs,
Vector lengths) {
Iterator i = new TreeSet(equivClasses.keySet()).iterator();
StringBuffer buf = new StringBuffer();
while (i.hasNext()) {
Object key = i.next();
Vector v = new Vector((Set) equivClasses.get(key));
if (v.size() == 2) {
String a = (String) v.elementAt(0);
String b = (String) v.elementAt(1);
if (a.length() == 1 && b.length() == 1) {
pairs.append(a).append(b);
continue;
// Note that pairs are included in 'lengths'
}
}
String[] a = new String[v.size()];
v.toArray(a);
nonpairs.add(a);
//int singleCount = 0;
//int stringCount = 0;
// Make a string of the lengths, e.g., "111" means 3
// single code points; "13" means a single code point
// and a string of length 3.
v.clear();
for (int j=0; j<a.length; ++j) {
v.add(new Integer(a[j].length()));
}
Collections.sort(v);
buf.setLength(0);
for (int j=0; j<v.size(); ++j) {
buf.append(String.valueOf(v.elementAt(j)));
}
if (!lengths.contains(buf.toString())) {
lengths.add(buf.toString());
}
}
}
@SuppressWarnings("resource")
static void generateCaseData() throws IOException {
Map equivClasses = createCaseFoldEquivalencyClasses();
// Accumulate equivalency classes that consist of exactly
// two codepoints here. This is 83+% of the classes.
// E.g., {"a", "A"}.
StringBuffer pairs = new StringBuffer();
// Accumulate other equivalency classes here, as lists
// of strings. E,g, {"st", "\uFB05", "\uFB06"}.
Vector nonpairs = new Vector(); // contains String[]
Vector lengths = new Vector(); // "111", "12", "22", etc.
analyzeCaseData(equivClasses, pairs, nonpairs, lengths);
//-------------------------------------------------------------
// Emit Java source
PrintStream out = new PrintStream(new FileOutputStream(JAVA_OUT));
System.out.println("Writing " + JAVA_OUT);
out.println(" // " + WARNING);
out.println(" private static final String CASE_PAIRS =");
out.println(Utility.formatForSource(pairs.toString()) + ";");
out.println();
out.println(" // " + WARNING);
out.println(" private static final String[][] CASE_NONPAIRS = {");
for (int j=0; j<nonpairs.size(); ++j) {
String[] a = (String[]) nonpairs.elementAt(j);
out.print(" {");
for (int k=0; k<a.length; ++k) {
if (k != 0) out.print(", ");
out.print(Utility.format1ForSource(a[k]));
}
out.println("},");
}
out.println(" };");
//-------------------------------------------------------------
// Emit C++ source
// In C++, the pairs are again emitted in an array, but this
// array is the final representation form -- it will not be
// reprocessed into a hash. It will be binary searched by
// looking at the even elements [0], [2], [4], etc., and
// ignoring the odd elements. The even elements must contain
// the folded members of the pairs. That is, in the pair
// {'A', 'a'}, the even element must be 'a', not 'A'. Then a
// code point to be located is first folded ('Y' => 'y') then
// it binary searched against [0]='A', [2]='B', etc. When a
// match is found at k, the pair is [k], [k+1].
out = new PrintStream(new FileOutputStream(C_SET_OUT));
System.out.println("Writing " + C_SET_OUT);
// Sort the pairs. They must be ordered by the folded element.
// Store these as two-character strings, with charAt(0) being
// the folded member of the pair.
TreeSet sortPairs = new TreeSet(new Comparator() {
public int compare(Object a, Object b) {
return ((int) ((String) a).charAt(0)) -
((int) ((String) b).charAt(0));
}
public boolean equals(Object obj) {
return false;
}
});
for (int i=0; i<pairs.length(); i+=2) {
String a = String.valueOf(pairs.charAt(i));
String b = String.valueOf(pairs.charAt(i+1));
String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
if (a.equals(folded)) {
sortPairs.add(a + b);
} else {
sortPairs.add(b + a);
}
}
// Emit the pairs
out.println("// " + WARNING);
out.println("static const UChar CASE_PAIRS[] = {");
Iterator it = sortPairs.iterator();
while (it.hasNext()) {
out.print(" ");
int n = 0;
while (n++ < 5 && it.hasNext()) {
String s = (String) it.next();
//out.print((int) s.charAt(0) + "," +
// (int) s.charAt(1) + ",");
out.print("0x" + Utility.hex(s.charAt(0)) + ",0x" +
Utility.hex(s.charAt(1)) + ",");
}
out.println();
}
out.println("};");
out.println();
// The non-pairs are encoded in the following way. All the
// single codepoints in each class are grouped together
// followed by a zero. Then each multi-character string is
// added, followed by a zero. Finally, another zero is added.
// Some examples:
// {"iQ", "R"} => [ 'R', 0, 'i', 'Q', 0, 0 ]
// {"S", "D", "F", "G"} => [ 'S', 'D', 'F', 'G', 0, 0 ]
// {"jW", "jY"} => [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
// The end-result is a short, flat array of UChar values that
// can be used to initialize a UChar[] array in C.
int maxLen = 0; // Maximum encoded length of any class, including zeros
out.println("// " + WARNING);
out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
for (int j=0; j<nonpairs.size(); ++j) {
int len = 0;
String[] a = (String[]) nonpairs.elementAt(j);
out.print(" {");
// Emit single code points
for (int k=0; k<a.length; ++k) {
if (a[k].length() != 1) continue;
//out.print((int) a[k].charAt(0) + ",");
out.print("0x"+Utility.hex(a[k].charAt(0)) + ",");
++len;
}
out.print("0, "); // End of single code points
++len;
// Emit multi-character strings
for (int k=0; k<a.length; ++k) {
if (a[k].length() == 1) continue;
for (int m=0; m<a[k].length(); ++m) {
//out.print((int) a[k].charAt(m) + ",");
out.print("0x"+Utility.hex(a[k].charAt(m)) + ",");
++len;
}
out.print("0, "); // End of string
++len;
}
out.println("0},"); // End of equivalency class
++len;
if (len > maxLen) maxLen = len;
}
out.println("};");
// Make sure the CaseEquivClass data can fit.
if (maxLen > 8) {
throw new RuntimeException("Must adjust CaseEquivClass to accomodate " + maxLen + " UChars");
}
// Also make sure that we can map into this array using a
// CompactByteArray. We could do this check above, but we
// keep it here, adjacent to the maxLen check. We use one
// value (-1 == 255) to indicate "no value."
if (nonpairs.size() > 255) {
throw new RuntimeException("Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
}
//-------------------------------------------------------------
// Case-unique set: All characters c for which closeOver(c)==c.
// UPDATE: Instead of using this, we're using the related
// notion of Case_Sensitive. See below. Note that
// Case_Sensitive != ^Case_Unique.
if (false) {
UnicodeSet caseUnique = new UnicodeSet();
for (int i = 0; i <= 0x10FFFF; ++i) {
String cp = UTF16.valueOf(i);
if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) {
caseUnique.add(i);
}
}
// out.println("caseUnique = " + caseUnique.toPattern(true));
}
UnicodeSet caseSensitive = getCaseSensitive();
//System.out.println("caseSensitive = " + caseSensitive.toPattern(true));
// Now for C, emit an array of ranges
out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
System.out.println("Writing " + C_UCHAR_OUT);
out.println("/* " + WARNING + " */");
emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES");
// For Java, emit a string with the ranges (each pair of chars
// in the string is a range).
out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
System.out.println("Writing " + JAVA_CHARPROP_OUT);
out.println(" // " + WARNING);
emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
}
/**
* Create the set of case-sensitive characters. These are characters
* that participate in any case mapping operation as a source or
* as a member of a target string.
*/
static UnicodeSet getCaseSensitive() {
UnicodeSet caseSensitive = new UnicodeSet();
Locale loc = Locale.US;
BreakIterator bi = BreakIterator.getTitleInstance(loc);
for (int c = 0; c <= 0x10FFFF; ++c) {
String cp = UTF16.valueOf(c);
for (int j=0; j<4; ++j) {
String s = null;
switch (j) {
case 0: s = UCharacter.toUpperCase(loc, cp); break;
case 1: s = UCharacter.toLowerCase(loc, cp); break;
case 2: s = UCharacter.toTitleCase(loc, cp, bi); break;
case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break;
}
if (!s.equals(cp)) {
int cc;
for (int k=0; k<s.length(); k+=UTF16.getCharCount(cc)) {
cc = UTF16.charAt(s, k);
caseSensitive.add(cc);
}
for (int k=0; k<cp.length(); k+=UTF16.getCharCount(cc)) {
cc = UTF16.charAt(cp, k);
caseSensitive.add(cc);
}
}
}
// Also do the single-codepoint API. This shouldn't add any
// code points, but we do it for completeness.
for (int j=0; j<4; ++j) {
int d = 0;
switch (j) {
case 0: d = UCharacter.toUpperCase(c); break;
case 1: d = UCharacter.toLowerCase(c); break;
case 2: d = UCharacter.toTitleCase(c); break;
case 3: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP); break;
}
if (d != c) {
if (!caseSensitive.contains(c) ||
!caseSensitive.contains(d)) {
System.out.println("Warning: code point " + c +
" => " + d + " created NEW MAPPING"+
" for Case_Sensitive");
}
caseSensitive.add(c);
caseSensitive.add(d);
}
}
}
return caseSensitive;
}
/**
* Given a UnicodeSet, emit it as an array of UChar pairs. Each
* pair will be the start/end of a range. Code points >= U+10000
* will be represented as surrogate pairs.
*/
static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) {
// Store the pairs in a StringBuffer. This handles surrogate
// representation.
StringBuffer buf = new StringBuffer();
for (int i=0; i<set.getRangeCount(); ++i) {
UTF16.append(buf, set.getRangeStart(i));
UTF16.append(buf, set.getRangeEnd(i));
}
// Emit the pairs
out.println("static const UChar " + id + "[] = {");
for (int i=0; i<buf.length(); ) {
out.print(" ");
for (int n=0; n++<10 && i<buf.length(); ++i) {
out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
}
out.println();
}
out.println("};");
out.println("#define " + id + "_LENGTH (sizeof(" + id +
")/sizeof(" + id + "[0]))");
}
/**
* Given a UnicodeSet, emit it as a Java string. The most economical
* format is not the pattern, but instead a pairs list, with each
* range pair represented as two adjacent characters.
*/
static void emitRangesString(PrintStream out, UnicodeSet set, String id) {
// Store the pairs in a StringBuffer. This handles surrogate
// representation.
StringBuffer buf = new StringBuffer();
for (int i=0; i<set.getRangeCount(); ++i) {
UTF16.append(buf, set.getRangeStart(i));
UTF16.append(buf, set.getRangeEnd(i));
}
// Emit the pairs
out.println(" private static final String " + id + " =");
out.println(Utility.formatForSource(buf.toString()) + ";");
}
}