blob: ec017e73b011657e279e80ea69480a786760ee51 [file] [log] [blame]
/*
**********************************************************************
* Copyright (c) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Mark Davis
**********************************************************************
*/
package com.ibm.icu.dev.tool.cldr;
import java.io.File;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.SimpleDateFormat;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.Currency;
import com.ibm.icu.util.LocaleData;
import com.ibm.icu.util.TimeZone;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.dev.test.util.Relation;
import com.ibm.icu.dev.test.util.SortedBag;
import com.ibm.icu.dev.tool.UOption;
import com.ibm.icu.dev.tool.cldr.ICUResourceWriter.Resource;
import com.ibm.icu.dev.tool.cldr.ICUResourceWriter.ResourceTable;
/**
* Generated tests for CLDR. Currently, these are driven off of a version of ICU4J, and just
* use the data from that.
* TODO Get the data directly from the CLDR tree.
* @author medavis
*/
public class GenerateCldrTests {
// Shared log writer. It is opened in main() before generation starts and closed in main()'s finally block.
static private PrintWriter log;
// Writer for the test file currently being produced; (re)assigned by generate(ULocale).
PrintWriter out;
// Indexes into the options array below. Each constant must equal the position of its entry there.
private static final int
HELP1 = 0,
HELP2 = 1,
DESTDIR = 2,
LOGDIR = 3,
SOURCEDIR =4,
MATCH = 5,
FULLY_RESOLVED = 6;
// Command-line options, in the order given by the index constants above.
// The directory defaults are hard-coded Windows paths; "match" defaults to a regex accepting every locale.
private static final UOption[] options = {
UOption.HELP_H(),
UOption.HELP_QUESTION_MARK(),
UOption.DESTDIR().setDefault("C:\\DATA\\GEN\\cldr\\test\\"),
UOption.create("log", 'l', UOption.REQUIRES_ARG).setDefault("C:\\DATA\\GEN\\cldr\\test\\"),
UOption.SOURCEDIR().setDefault("C:\\ICU4C\\locale\\common\\"),
UOption.create("match", 'm', UOption.REQUIRES_ARG).setDefault(".*"),
UOption.create("fullyresolved", 'f', UOption.NO_ARG),
};
// Collation data loaded from the CLDR source tree; assigned in generate(String).
CldrCollations cldrCollations;
// NOTE(review): logDir and destDir are not referenced by any visible code in this file.
static String logDir = null, destDir = null;
/**
 * Command-line entry point: parses the options, opens the log file and
 * generates test files for every locale accepted by the "match" option.
 * The log is always closed, and "Done" printed, even if generation fails.
 *
 * @param args command-line arguments, interpreted via the options table
 * @throws Exception if option handling or generation fails
 */
public static void main(String[] args) throws Exception {
    UOption.parseArgs(args, options);
    log = BagFormatter.openUTF8Writer(options[LOGDIR].value, "log.txt");
    try {
        GenerateCldrTests generator = new GenerateCldrTests();
        String localePattern = options[MATCH].value;
        generator.generate(localePattern);
    } finally {
        log.close();
        System.out.println("Done");
    }
}
/**
*
*/
/*
private static void compareAvailable() {
ULocale[] cols = Collator.getAvailableULocales();
Locale[] alocs = NumberFormat.getAvailableLocales();
Set sCols = filter(cols);
Set sLocs = filter(alocs);
Set oldSLocs = new TreeSet(sCols);
sLocs.removeAll(sCols);
log.println("main - collation");
showLocales(sLocs);
sCols.removeAll(oldSLocs);
log.println();
log.println("collation - main");
showLocales(sCols);
}
*/
/**
 * Writes one log line per locale ID: the ID, a tab, and the English
 * display name of its language.
 *
 * @param sLocs set of locale ID strings
 */
private static void showLocales(Set sLocs) {
    Iterator ids = sLocs.iterator();
    while (ids.hasNext()) {
        String localeId = (String) ids.next();
        String languageName = ULocale.getDisplayLanguage(localeId, "en");
        log.println(localeId + "\t" + languageName);
    }
}
/**
 * Collects the string forms of the given objects that contain no
 * underscore, i.e. identifiers without region/variant subtags.
 *
 * @param cols objects whose toString() yields a locale identifier
 * @return sorted set of the underscore-free identifiers
 */
private static Set filter(Object[] cols) {
    Set baseNames = new TreeSet();
    int index = 0;
    while (index < cols.length) {
        String name = cols[index++].toString();
        if (name.indexOf('_') < 0) {
            baseNames.add(name);
        }
    }
    return baseNames;
}
/**
 * Converts each object's string form to a ULocale and adds it to target.
 *
 * @param objects items whose toString() is a locale identifier
 * @param target set receiving the ULocales
 * @return target, for chaining
 */
Set addULocales(Object[] objects, Set target) {
    int index = 0;
    while (index < objects.length) {
        String id = objects[index].toString();
        target.add(new ULocale(id));
        index++;
    }
    return target;
}
/**
 * Records item under its language-only parent locale in parentToLocales.
 * Locales with an empty language code (root) are ignored.
 *
 * @param item locale to register
 */
private void addLocale(ULocale item) {
    String language = item.getLanguage();
    if (language.length() != 0) {
        ULocale languageOnly = new ULocale(language);
        parentToLocales.add(languageOnly, item);
    }
    // An older revision also indexed locales by collation rules
    // (rulesToLocales / localesToRules); that code is disabled.
}
// Locales that have collation data; (re)built by getLocaleList() from cldrCollations.
Set collationLocales; // = addULocales(Collator.getAvailableULocales(), new TreeSet(ULocaleComparator));
// Locales reported by NumberFormat.getAvailableLocales(), converted to ULocales.
Set numberLocales = addULocales(NumberFormat.getAvailableLocales(), new TreeSet(ULocaleComparator));
// Locales reported by DateFormat.getAvailableLocales(), converted to ULocales.
Set dateLocales = addULocales(DateFormat.getAvailableLocales(), new TreeSet(ULocaleComparator));
// Union of the collation, number and date locale sets; filled by getLocaleList().
Set allLocales = new TreeSet(ULocaleComparator);
// NOTE(review): only written by commented-out code in addLocale(); appears otherwise unused here.
Map localesToRules = new HashMap();
// Factory handed to the Relation instances below, built on ULocaleComparator.
Relation.CollectionFactory cm = new Relation.CollectionMaker(ULocaleComparator);
// NOTE(review): only referenced by commented-out code in addLocale().
Relation rulesToLocales = new Relation(new TreeMap(), cm);
// Maps a language-only locale to the locales registered under it by addLocale().
Relation parentToLocales = new Relation(new TreeMap(ULocaleComparator), cm);
/**
 * Rebuilds collationLocales from the loaded CLDR collations, merges the
 * collation, number and date locales into allLocales, and registers every
 * one of them under its language-only parent via addLocale().
 * Requires cldrCollations to have been assigned already.
 */
void getLocaleList() {
    collationLocales = new TreeSet(ULocaleComparator);
    collationLocales.addAll(cldrCollations.getAvailableSet());
    // (Disabled HACK: a hand-written list of extra "@collation=" locales
    // used to be appended to collationLocales here.)

    Set[] sources = { collationLocales, numberLocales, dateLocales };
    for (int i = 0; i < sources.length; ++i) {
        allLocales.addAll(sources[i]);
    }

    Iterator all = allLocales.iterator();
    while (all.hasNext()) {
        addLocale((ULocale) all.next());
    }
}
CldrOthers cldrOthers;

/**
 * Loads the exemplar and collation data for all locales whose file name
 * matches pat, then writes one test file per matching parent locale.
 *
 * @param pat regular expression selecting the locales to process
 * @throws Exception if loading the data or writing a test file fails
 */
void generate(String pat) throws Exception {
    final String sourceRoot = options[SOURCEDIR].value;

    cldrOthers = new CldrOthers(sourceRoot + "main" + File.separator, pat);
    cldrOthers.show();

    cldrCollations = new CldrCollations(sourceRoot + "collation" + File.separator, pat);
    cldrCollations.show();

    getLocaleList();

    Matcher localeMatcher = Pattern.compile(pat).matcher("");
    Iterator parents = parentToLocales.keySet().iterator();
    while (parents.hasNext()) {
        String parentName = parents.next().toString();
        if (localeMatcher.reset(parentName).matches()) {
            generate(new ULocale(parentName));
        }
    }
}
/**
 * Writes the test file "&lt;locale&gt;.xml" into the destination directory
 * and then asks GenerateSidewaysView to emit the companion batch entry.
 * Only the collation section is currently generated; the number and date
 * sections are disabled.
 *
 * The output writer is closed in a finally block so that a failure while
 * generating items does not leak the open file.
 *
 * @param locale language-level locale whose sublocales are written
 * @throws Exception if opening the file or generating items fails
 */
private void generate(ULocale locale) throws Exception {
    final String fileName = locale + ".xml";
    out = BagFormatter.openUTF8Writer(options[DESTDIR].value, fileName);
    try {
        out.println("<?xml version='1.0' encoding='UTF-8' ?>");
        // NOTE(review): the DOCTYPE names "ldml" but the root element below is "cldrTest" - confirm intent.
        out.println("<!DOCTYPE ldml SYSTEM 'http://www.unicode.org/cldr/dtd/1.2/beta/cldrTest.dtd'>");
        out.println("<!-- For information, see readme.html -->");
        out.println(" <cldrTest version='1.2' base='" + locale + "'>");
        out.println(" <!-- " + BagFormatter.toXML.transliterate(
                locale.getDisplayName(ULocale.ENGLISH) + " ["
                + locale.getDisplayName(locale))
                + "] -->");
        //generateItems(locale, numberLocales, NumberEquator, NumberShower);
        //generateItems(locale, dateLocales, DateEquator, DateShower);
        generateItems(locale, collationLocales, CollationEquator, CollationShower);
        out.println(" </cldrTest>");
    } finally {
        out.close();
    }
    GenerateSidewaysView.generateBat(options[SOURCEDIR].value + "test" + File.separator,
            fileName, options[DESTDIR].value, fileName);
}
/*
*
// first pass through and get all the functional equivalents
Map uniqueLocales = new TreeMap();
String[] keywords = Collator.getKeywords();
boolean [] isAvailable = new boolean[1];
for (int i = 0; i < locales.length; ++i) {
add(locales[i], uniqueLocales);
if (true) continue; // TODO restore once Vladimir fixes
for (int j = 0; j < keywords.length; ++j) {
String[] values = Collator.getKeywordValues(keywords[j]);
for (int k = 0; k < values.length; ++k) {
// TODO -- for a full job, would do all combinations of different keywords!
if (values[k].equals("standard")) continue;
add(new ULocale(locales[i] + "@" + keywords[j] + "=" + values[k]), uniqueLocales);
//ULocale other = Collator.getFunctionalEquivalent(keywords[j], locales[i], isAvailable);
}
}
}
for (int i = 0; i < extras.length; ++i) {
add(new ULocale(extras[i]), uniqueLocales);
}
// items are now sorted by rules. So resort by locale
Map toDo = new TreeMap(ULocaleComparator);
for (Iterator it = uniqueLocales.keySet().iterator(); it.hasNext();) {
Object rules = it.next();
Set s = (Set) uniqueLocales.get(rules);
ULocale ulocale = (ULocale) s.iterator().next(); // get first one
toDo.put(ulocale, s);
}
for (Iterator it = toDo.keySet().iterator(); it.hasNext();) {
ULocale ulocale = (ULocale) it.next();
Set s = (Set) toDo.get(ulocale);
generate(ulocale);
}
*/
/**
 * Files locale under a key made of its collation rules and its
 * case-closed exemplar set, so that locales sharing both end up in the
 * same sorted set. Any failure while computing the key is reported on
 * standard output and the locale is skipped.
 *
 * @param locale locale to classify
 * @param uniqueLocales map from key string to the set of locales sharing it
 */
void add(ULocale locale, Map uniqueLocales) {
    try {
        RuleBasedCollator collator = cldrCollations.getInstance(locale);
        // locales with different exemplars are kept apart even if the rules agree
        String key = collator.getRules() + "\uFFFF" + getExemplarSet(locale, 0);
        Set bucket = (Set) uniqueLocales.get(key);
        if (bucket == null) {
            bucket = new TreeSet(ULocaleComparator);
            uniqueLocales.put(key, bucket);
        }
        System.out.println("Adding " + locale);
        bucket.add(locale);
    } catch (Throwable e) {
        // best effort: a locale without usable data is simply left out
        System.out.println("skipped " + locale);
    }
}
/**
 * Work-around replacement for LocaleData.getExemplarSet: looks up the
 * exemplar set loaded from the CLDR files. Any "@keyword" suffix on the
 * locale is dropped before the lookup.
 *
 * When options is 0 the result is closed over case. The closure is applied
 * to a private copy: the set returned by cldrOthers is shared between
 * locales (and used as a hash key there), so it must not be modified in
 * place. For any other options value the stored set is returned as is.
 *
 * @param locale locale to look up, with or without keywords
 * @param options 0 to get a case-closed copy; any other value for the stored set
 * @return the exemplar set, or null if none is known and options is non-zero
 */
public UnicodeSet getExemplarSet(ULocale locale, int options) {
    String name = locale.toString();
    int keywordStart = name.indexOf('@');
    if (keywordStart >= 0) {
        locale = new ULocale(name.substring(0, keywordStart));
    }
    UnicodeSet result = cldrOthers.getExemplarSet(locale); // LocaleData.getExemplarSet(locale, options);
    if (options == 0) {
        // copy first so the case closure never leaks into the shared cached set
        result = new UnicodeSet(result);
        result.closeOver(UnicodeSet.CASE);
    }
    return result;
}
/** Orders arbitrary objects (in practice ULocales) by their toString() form. */
public static final Comparator ULocaleComparator = new Comparator() {
    public int compare(Object o1, Object o2) {
        String left = o1.toString();
        String right = o2.toString();
        return left.compareTo(right);
    }
};
/**
 * Pluggable equivalence test. Implementations in this file compare two
 * ULocales by the formatting or collation behavior they produce; extract()
 * uses it to pull equivalent items out of a collection.
 */
public interface Equator {
/**
 * @param o1 first object
 * @param o2 second object
 * @return true if the two objects are to be treated as equivalent
 */
public boolean equals(Object o1, Object o2);
}
/**
 * Tells whether the two collections share at least one element.
 *
 * @param a collection that is iterated
 * @param b collection that is probed with contains()
 * @return true if some element of a is contained in b
 */
static boolean intersects(Collection a, Collection b) {
    Iterator candidates = a.iterator();
    boolean shared = false;
    while (!shared && candidates.hasNext()) {
        shared = b.contains(candidates.next());
    }
    return shared;
}
/**
 * Moves every element of a that the equator considers equal to x into
 * output, removing those elements from a.
 *
 * @param x reference object
 * @param a collection to scan; matching elements are removed from it
 * @param e equivalence test
 * @param output collection receiving the matching elements
 * @return output, for chaining
 */
static Collection extract(Object x, Collection a, Equator e, Collection output) {
    // matches are removed after the scan, since the iterator may not support remove()
    List matches = new ArrayList();
    Iterator candidates = a.iterator();
    while (candidates.hasNext()) {
        Object candidate = candidates.next();
        if (!e.equals(x, candidate)) {
            continue;
        }
        matches.add(candidate);
        output.add(candidate);
    }
    a.removeAll(matches);
    return output;
}
/**
 * Writes &lt;result&gt; elements to the current output file. Attribute
 * values are remembered between calls, and an attribute is only written
 * when its value differs from the one most recently written.
 */
class ResultsPrinter {
    Map settings = new TreeMap();
    Map oldSettings = new TreeMap();

    /** Sets the attribute value to be used by following print() calls. */
    void set(String name, String value) {
        settings.put(name, value);
    }

    /**
     * Prints one result element with the changed attributes and the
     * XML-escaped result text.
     */
    void print(String result) {
        out.print(" <result");
        Iterator entries = settings.entrySet().iterator();
        while (entries.hasNext()) {
            Map.Entry entry = (Map.Entry) entries.next();
            Object key = entry.getKey();
            Object value = entry.getValue();
            if (value.equals(oldSettings.get(key))) {
                continue; // unchanged since the last element
            }
            String escaped = BagFormatter.toXML.transliterate(value.toString());
            out.print(" " + key + "='" + escaped + "'");
            oldSettings.put(key, value);
        }
        out.println(">" + BagFormatter.toXML.transliterate(result) + "</result>");
    }
}
/**
 * Callback used by generateItems() to write the test data for one group
 * of locales that were found equivalent.
 */
interface DataShower {
/**
 * @param first representative locale of the group
 * @param others the locales in the group, as collected by extract()
 * @throws Exception if producing the data fails
 */
void show(ULocale first, Collection others) throws Exception;
}
/**
 * Partitions the sublocales of locale (restricted to onlyLocales) into
 * groups that the equator considers equivalent, and lets the generator
 * write one section per group.
 *
 * @param locale parent locale whose sublocales are processed
 * @param onlyLocales locales that are eligible at all
 * @param test equivalence used to form the groups
 * @param generator writer for each group
 * @throws Exception propagated from the generator
 */
private void generateItems(ULocale locale, Collection onlyLocales, Equator test, DataShower generator) throws Exception {
    Collection pending = parentToLocales.get(locale, new ArrayList());
    pending.retainAll(onlyLocales);
    while (!pending.isEmpty()) {
        // take the first remaining locale and pull out everything equivalent to it
        ULocale representative = (ULocale) pending.iterator().next();
        Collection group = extract(representative, pending, test, new ArrayList());
        generator.show(representative, group);
    }
}
/**
 * Writes the opening tag of a section element, with a space-separated
 * "locales" attribute when others is non-empty, followed by an XML
 * comment listing the English and native display names of those locales.
 *
 * @param elementName name of the element to open
 * @param others ULocales covered by the section; may be null or empty
 */
private void showLocales(String elementName, Collection others) {
    out.println(" <" + elementName + " ");
    StringBuffer comment = new StringBuffer();
    if (others != null && others.size() != 0) {
        out.print("locales='");
        Iterator it = others.iterator();
        for (int count = 0; it.hasNext(); ++count) {
            if (count > 0) {
                out.print(" ");
                comment.append("; ");
            }
            ULocale loc = (ULocale) it.next();
            out.print(loc);
            String englishName = loc.getDisplayName(ULocale.ENGLISH);
            String nativeName = loc.getDisplayName(loc);
            comment.append(englishName + " [" + nativeName + "]");
        }
        out.print("'");
    }
    out.println(">");
    String escapedComment = BagFormatter.toXML.transliterate(comment.toString());
    out.println("<!-- " + escapedComment + " -->");
}
// ========== DATES ==========
/** GMT time zone; set on the ISO parser below and on the formatters created in DateShower. */
static TimeZone utc = TimeZone.getTimeZone("GMT");
/** Parser/formatter for the ISO-8601 style sample timestamps used by DateShower. */
static DateFormat iso = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
static {
    // Static state is configured once at class-initialization time. Previously this was an
    // instance initializer, so the zone was only set after (and each time) an instance was built.
    iso.setTimeZone(utc);
}
/** Date/time styles tried for each locale; -1 stands for "no date part" or "no time part". */
static int[] DateFormatValues = {-1, DateFormat.SHORT, DateFormat.MEDIUM, DateFormat.LONG, DateFormat.FULL};
/** Attribute values written for the styles in DateFormatValues, index for index. */
static String[] DateFormatNames = {"none", "short", "medium", "long", "full"};
/**
 * Creates the date, time or date-time formatter for a locale.
 * The assertion previously checked "olocale == null", which is the
 * opposite of the intended sanity check and fails on every call when
 * assertions are enabled.
 *
 * @param locale locale to format for
 * @param i index into DateFormatValues selecting the date style
 * @param k index into DateFormatValues selecting the time style
 * @return a date-only formatter if the time style is -1, otherwise a
 *         time-only formatter if the date style is -1, otherwise a
 *         combined formatter
 */
private static DateFormat getDateFormat(ULocale locale, int i, int k) {
    Locale olocale = locale.toLocale(); // TODO replace once fixed!!
    assert olocale != null;
    int dateStyle = DateFormatValues[i];
    int timeStyle = DateFormatValues[k];
    if (timeStyle == -1) {
        return DateFormat.getDateInstance(dateStyle, olocale);
    }
    if (dateStyle == -1) {
        return DateFormat.getTimeInstance(timeStyle, olocale);
    }
    return DateFormat.getDateTimeInstance(dateStyle, timeStyle, olocale);
}
/**
 * Treats two ULocales as equal when every date/time style combination
 * yields equal DateFormat objects, after the currency of each formatter's
 * number format has been set to NO_CURRENCY.
 * A leftover duplicate equals() call on the mismatch path has been
 * removed, and the currency-neutralising steps are shared in a helper.
 */
static Equator DateEquator = new Equator() {
    /**
     * Must both be ULocales
     */
    public boolean equals(Object o1, Object o2) {
        ULocale loc1 = (ULocale) o1;
        ULocale loc2 = (ULocale) o2;
        for (int i = 0; i < DateFormatValues.length; ++i) {
            for (int j = 0; j < DateFormatValues.length; ++j) {
                if (i == 0 && j == 0) continue; // skip null case
                DateFormat df1 = withoutCurrency(getDateFormat(loc1, i, j));
                DateFormat df2 = withoutCurrency(getDateFormat(loc2, i, j));
                if (!df1.equals(df2)) {
                    return false;
                }
            }
        }
        return true;
    }

    /** Replaces the currency of the formatter's number format with NO_CURRENCY. */
    private DateFormat withoutCurrency(DateFormat df) {
        NumberFormat nf = df.getNumberFormat();
        nf.setCurrency(NO_CURRENCY);
        df.setNumberFormat(nf);
        return df;
    }
};
/**
 * Writes a &lt;date&gt; section: each sample timestamp is formatted with
 * every date-style / time-style combination (except none/none) in GMT.
 */
DataShower DateShower = new DataShower() {
    public void show(ULocale locale, Collection others) throws ParseException {
        showLocales("date", others);
        final String[] isoSamples = {
            "1900-01-31T00:00:00Z",
            "1909-02-28T00:00:01Z",
            "1918-03-26T00:59:59Z",
            "1932-04-24T01:00:00Z",
            "1945-05-20T01:00:01Z",
            "1952-06-18T11:59:59Z",
            "1973-07-16T12:00:00Z",
            "1999-08-14T12:00:01Z",
            "2000-09-12T22:59:59Z",
            "2001-10-08T23:00:00Z",
            "2004-11-04T23:00:01Z",
            "2010-12-01T23:59:59Z",
        };
        ResultsPrinter printer = new ResultsPrinter();
        final int styleCount = DateFormatValues.length;
        for (int sample = 0; sample < isoSamples.length; ++sample) {
            Date moment = iso.parse(isoSamples[sample]);
            printer.set("input", iso.format(moment));
            for (int dateIdx = 0; dateIdx < styleCount; ++dateIdx) {
                printer.set("dateType", DateFormatNames[dateIdx]);
                for (int timeIdx = 0; timeIdx < styleCount; ++timeIdx) {
                    boolean nothingToFormat =
                            DateFormatValues[dateIdx] == -1 && DateFormatValues[timeIdx] == -1;
                    if (!nothingToFormat) {
                        printer.set("timeType", DateFormatNames[timeIdx]);
                        DateFormat formatter = getDateFormat(locale, dateIdx, timeIdx);
                        formatter.setTimeZone(utc);
                        printer.print(formatter.format(moment));
                    }
                }
            }
        }
        out.println(" </date>");
    }
};
// ========== NUMBERS ==========
/** Attribute values for the number format kinds; the index is the kind passed to getNumberFormat(). */
static String[] NumberNames = {"standard", "integer", "decimal", "percent", "scientific"};

/**
 * Creates the number formatter of the requested kind for a locale.
 *
 * @param ulocale locale to format for
 * @param i kind of format, an index into NumberNames
 * @return the formatter
 * @throws IllegalArgumentException if i is not a valid index
 */
private static NumberFormat getNumberFormat(ULocale ulocale, int i) {
    Locale javaLocale = ulocale.toLocale();
    switch (i) {
    case 0:
        return NumberFormat.getInstance(javaLocale);
    case 1:
        return NumberFormat.getIntegerInstance(javaLocale);
    case 2:
        return NumberFormat.getNumberInstance(javaLocale);
    case 3:
        return NumberFormat.getPercentInstance(javaLocale);
    case 4:
        return NumberFormat.getScientificInstance(javaLocale);
    default:
        throw new IllegalArgumentException("Unknown NumberFormat: " + i);
    }
}
/** Placeholder currency ("XXX") set on formatters before they are compared. */
static Currency NO_CURRENCY = Currency.getInstance("XXX");

/**
 * Treats two ULocales as equal when every kind of number format listed in
 * NumberNames compares equal once its currency is set to NO_CURRENCY.
 */
static Equator NumberEquator = new Equator() {
    /**
     * Must both be ULocales
     */
    public boolean equals(Object o1, Object o2) {
        ULocale left = (ULocale) o1;
        ULocale right = (ULocale) o2;
        int kind = 0;
        while (kind < NumberNames.length) {
            NumberFormat leftFormat = getNumberFormat(left, kind);
            leftFormat.setCurrency(NO_CURRENCY);
            NumberFormat rightFormat = getNumberFormat(right, kind);
            rightFormat.setCurrency(NO_CURRENCY);
            boolean same = leftFormat.equals(rightFormat);
            if (!same) {
                return false;
            }
            ++kind;
        }
        return true;
    }
};
/**
 * Writes a &lt;number&gt; section: each sample value is formatted with
 * every kind of number format listed in NumberNames.
 */
DataShower NumberShower = new DataShower() {
    public void show(ULocale locale, Collection others) throws ParseException {
        showLocales("number", others);
        final double[] values = {
            0,
            0.01, -0.01,
            1, -1,
            123.456, -123.456,
            123456.78, -123456.78,
            Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY,
            Double.NaN
        };
        ResultsPrinter printer = new ResultsPrinter();
        for (int v = 0; v < values.length; ++v) {
            printer.set("input", String.valueOf(values[v]));
            for (int kind = 0; kind < NumberNames.length; ++kind) {
                printer.set("numberType", NumberNames[kind]);
                NumberFormat formatter = getNumberFormat(locale, kind);
                String formatted = formatter.format(values[v]);
                printer.print(formatted);
            }
        }
        out.println(" </number>");
    }
};
// ========== COLLATION ==========
/**
 * Treats two ULocales as equal when the collators that cldrCollations
 * holds for them compare equal.
 */
Equator CollationEquator = new Equator() {
    /**
     * Must both be ULocales
     */
    public boolean equals(Object o1, Object o2) {
        ULocale left = (ULocale) o1;
        ULocale right = (ULocale) o2;
        RuleBasedCollator leftCollator = cldrCollations.getInstance(left);
        RuleBasedCollator rightCollator = cldrCollations.getInstance(right);
        return leftCollator.equals(rightCollator);
    }
};
/**
 * Writes a &lt;collation&gt; section for a group of locales. The strings to
 * sort are derived from the collator's tailored set plus the locale's
 * exemplar characters (both case-closed and NFC-normalized), plus a few
 * fixed extra characters, minus SKIP_COLLATION_SET.
 */
DataShower CollationShower = new DataShower() {
public void show(ULocale locale, Collection others) {
showLocales("collation", others);
Collator col = cldrCollations.getInstance(locale); // Collator.getInstance(locale);
UnicodeSet tailored = col.getTailoredSet();
// for Chinese, additionally include the Latin letters a-z except v
if (locale.getLanguage().equals("zh")) {
tailored.addAll(new UnicodeSet("[[a-z]-[v]]"));
log.println("HACK for Pinyin");
}
tailored = createCaseClosure(tailored);
tailored = nfc(tailored);
//System.out.println(tailored.toPattern(true));
// a non-zero option is passed, so getExemplarSet() does not case-close here;
// the closure is done explicitly via createCaseClosure() below
UnicodeSet exemplars = getExemplarSet(locale, UnicodeSet.CASE);
// add all the exemplars
// (disabled: would merge in the exemplars of every other locale in the group)
if (false) for (Iterator it = others.iterator(); it.hasNext(); ) {
exemplars.addAll(getExemplarSet((ULocale)it.next(), UnicodeSet.CASE));
}
exemplars = createCaseClosure(exemplars);
exemplars = nfc(exemplars);
//System.out.println(exemplars.toPattern(true));
tailored.addAll(exemplars);
//UnicodeSet tailoredMinusHan = new UnicodeSet(tailored).removeAll(SKIP_COLLATION_SET);
// log whatever is tailored but not covered by the exemplar characters
if (!exemplars.containsAll(tailored)) {
//BagFormatter bf = new BagFormatter();
log.println("In Tailored, but not Exemplar; Locale: " + locale + "\t" + locale.getDisplayName());
log.println(new UnicodeSet(tailored).removeAll(exemplars).toPattern(false));
//bf.(log,"tailored", tailored, "exemplars", exemplars);
log.flush();
}
// always include these extra characters and the string "12"
tailored.addAll(new UnicodeSet("[\\ .02{12}]"));
tailored.removeAll(SKIP_COLLATION_SET);
// the bag is constructed with the collator; doCollationResult() fills it and prints it
SortedBag bag = new SortedBag(col);
doCollationResult(col, tailored, bag);
out.println(" </collation>");
}};
// Characters removed from the collation test data: Han and Hangul script characters,
// except for the four sample ranges listed after the '-' in the pattern.
static final UnicodeSet SKIP_COLLATION_SET = new UnicodeSet(
"[[:script=han:][:script=hangul:]-[\u4e00-\u4eff \u9f00-\u9fff \uac00-\uacff \ud700-\ud7ff]]");
/**
 * Builds the sort-test strings for every element of tailored, adds them to
 * the bag, and prints the bag's contents inside a &lt;result&gt; element.
 * For each element s the strings "x"+s, "X"+s and "x"+s+"x" are added.
 *
 * @param col collator used to detect entries that compare equal
 * @param tailored characters and strings to build test strings from
 * @param bag bag receiving the test strings; iterated for output
 */
private void doCollationResult(Collator col, UnicodeSet tailored, SortedBag bag) {
    UnicodeSetIterator elements = new UnicodeSetIterator(tailored);
    while (elements.next()) {
        String element = elements.getString();
        bag.add('x' + element);
        bag.add('X' + element);
        bag.add('x' + element + 'x');
    }
    // (Disabled: the locale and its equivalents used to be listed here as
    // <set>/<other> elements.)
    out.println(" <result>");
    String previous = "";
    boolean sawEqual = false;
    Iterator sorted = bag.iterator();
    while (sorted.hasNext()) {
        String current = (String) sorted.next();
        boolean equalToPrevious = col.compare(current, previous) == 0;
        if (equalToPrevious) {
            sawEqual = true;
        } else {
            if (sawEqual) {
                out.println(previous);
            }
            sawEqual = false;
            previous = current;
        }
        out.println(BagFormatter.toXML.transliterate(current));
    }
    out.println(" </result>");
}
/**
 * Lists the locale names (file name without ".xml") of the XML files in a
 * directory. Files whose name starts with "supplementalData" are ignored.
 * "root" is always included when present; every other locale must match
 * localeRegex completely.
 *
 * @param sourceDir directory to scan
 * @param localeRegex regular expression a locale name has to match
 * @return sorted set of locale name strings
 * @throws IllegalArgumentException if sourceDir cannot be listed (it does
 *         not exist, is not a directory, or an I/O error occurred);
 *         previously this surfaced as a NullPointerException
 */
static public Set getMatchingXMLFiles(String sourceDir, String localeRegex) {
    Matcher m = Pattern.compile(localeRegex).matcher("");
    Set s = new TreeSet();
    File[] files = new File(sourceDir).listFiles();
    if (files == null) {
        // File.listFiles() returns null rather than throwing when the path is unusable
        throw new IllegalArgumentException("Cannot list XML files in directory: " + sourceDir);
    }
    final String suffix = ".xml";
    for (int i = 0; i < files.length; ++i) {
        String name = files[i].getName();
        if (!name.endsWith(suffix)) continue;
        if (name.startsWith("supplementalData")) continue;
        String locale = name.substring(0, name.length() - suffix.length());
        if (!locale.equals("root") && !m.reset(locale).matches()) continue;
        s.add(locale);
    }
    return s;
}
/**
 * Exemplar-character data read from the CLDR "main" XML files. Equal
 * exemplar sets are shared between locales, and locales without their own
 * data inherit the set of the nearest fallback locale that has one.
 */
static class CldrOthers {
    static final ULocale ROOT = new ULocale("root"); // since CLDR has different root.

    Map ulocale_exemplars = new TreeMap(ULocaleComparator);
    Map uniqueExemplars = new HashMap();
    String sourceDir;
    Set locales = new TreeSet(ULocaleComparator);

    /**
     * Reads every matching file in sourceDir and then resolves inheritance.
     *
     * @param sourceDir directory holding the locale XML files
     * @param localeRegex regular expression selecting the locales to read
     */
    CldrOthers(String sourceDir, String localeRegex) {
        this.sourceDir = sourceDir;
        Iterator names = getMatchingXMLFiles(sourceDir, localeRegex).iterator();
        while (names.hasNext()) {
            getInfo((String) names.next());
        }
        // now do inheritance manually
        Iterator known = locales.iterator();
        while (known.hasNext()) {
            ULocale locale = (ULocale) known.next();
            UnicodeSet own = (UnicodeSet) ulocale_exemplars.get(locale);
            if (own != null) {
                continue;
            }
            ULocale parent = locale.getFallback();
            while (parent != null) {
                // a fallback with an empty language stands for CLDR's "root"
                ULocale lookup = parent.getLanguage().length() == 0 ? ROOT : parent;
                UnicodeSet inherited = (UnicodeSet) ulocale_exemplars.get(lookup);
                if (inherited != null) {
                    ulocale_exemplars.put(locale, inherited);
                    break;
                }
                parent = parent.getFallback();
            }
        }
    }

    /** Returns the stored (shared) exemplar set for the locale, or null if none is known. */
    UnicodeSet getExemplarSet(ULocale locale) {
        return (UnicodeSet) ulocale_exemplars.get(locale);
    }

    /** Logs the number of distinct exemplar sets and the set of every locale. */
    void show() {
        log.println("Showing Locales");
        log.println("Unique Exemplars: " + uniqueExemplars.size());
        Iterator keys = ulocale_exemplars.keySet().iterator();
        while (keys.hasNext()) {
            ULocale locale = (ULocale) keys.next();
            log.println("\t" + locale + ", " + getExemplarSet(locale));
        }
    }

    /**
     * Parses one locale file (fully resolved if the "fullyresolved" option
     * was given) and stores its exemplar set, reusing an equal set that
     * was already seen.
     *
     * @param locale locale name, i.e. the file name without ".xml"
     */
    void getInfo(String locale) {
        System.out.println("Getting info for: " + locale);
        locales.add(new ULocale(locale));
        Document doc = options[FULLY_RESOLVED].doesOccur
                ? LDMLUtilities.getFullyResolvedLDML(sourceDir, locale, false, false, false)
                : LDMLUtilities.parse(sourceDir + locale + ".xml", false);
        Node node = LDMLUtilities.getNode(doc, "//ldml/characters/exemplarCharacters");
        if (node == null) {
            return;
        }
        // NOTE(review): the message says "Skipping", but the draft data is still used below - confirm intent.
        if (isDraft(node)) {
            System.out.println("Skipping draft: " + locale + ", " + getXPath(node));
        }
        UnicodeSet parsed = new UnicodeSet(LDMLUtilities.getNodeValue(node));
        UnicodeSet shared = (UnicodeSet) uniqueExemplars.get(parsed);
        if (shared == null) {
            uniqueExemplars.put(parsed, parsed);
            shared = parsed;
        }
        ulocale_exemplars.put(new ULocale(locale), shared);
    }
}
/**
 * Tells whether the node or one of its ancestors carries the attribute
 * draft="true".
 *
 * The walk stops at the owning Document, or when the parent chain ends
 * (a detached node has a null parent); the latter case used to cause a
 * NullPointerException.
 *
 * @param node node to start from; null yields false
 * @return true if a draft="true" attribute is found on the way up
 */
public static boolean isDraft(Node node) {
    for (Node current = node;
            current != null && current.getNodeType() != Node.DOCUMENT_NODE;
            current = current.getParentNode()) {
        NamedNodeMap attributes = current.getAttributes();
        if (attributes == null) continue; // only elements have attributes
        Node draft = attributes.getNamedItem("draft");
        if (draft != null && "true".equals(draft.getNodeValue())) return true;
    }
    return false;
}
/**
 * Builds an XPath-like description of a node for log messages: one
 * "/name" step per node from the outermost ancestor down to the node,
 * each followed by "[@attr=value]" for every attribute, with one more
 * leading "/" in front (e.g. "//ldml/characters/exemplarCharacters").
 * Attribute values are not quoted.
 *
 * The walk stops at the owning Document, or when the parent chain ends
 * (a detached node has a null parent); the latter case used to cause a
 * NullPointerException.
 *
 * @param node node to describe
 * @return the path string
 */
public static String getXPath(Node node) {
    StringBuffer xpath = new StringBuffer();
    for (Node current = node;
            current != null && current.getNodeType() != Node.DOCUMENT_NODE;
            current = current.getParentNode()) {
        StringBuffer step = new StringBuffer();
        step.append('/').append(current.getNodeName());
        NamedNodeMap attributes = current.getAttributes();
        if (attributes != null) {
            for (int i = 0; i < attributes.getLength(); ++i) {
                Node attribute = attributes.item(i);
                step.append("[@").append(attribute.getNodeName()).append('=')
                        .append(attribute.getNodeValue()).append(']');
            }
        }
        // ancestors are visited last, so each step goes in front
        xpath.insert(0, step.toString());
    }
    xpath.insert(0, '/');
    return xpath.toString();
}
/**
 * Returns the parent of a locale name: the name with its last
 * "_subtag" removed, or "root" for a name without underscore.
 *
 * @param locale locale name such as "en_US", "en" or "root"
 * @return the parent name, or null if locale is "root"
 */
public static String getParent(String locale) {
    int lastUnderscore = locale.lastIndexOf('_');
    if (lastUnderscore < 0) {
        return locale.equals("root") ? null : "root";
    }
    return locale.substring(0, lastUnderscore);
}
static class CldrCollations {
// Locale names collected from the "validSubLocales" attribute of the collation files.
Set validLocales = new TreeSet();
// ULocale (including "@collation=type" variants) -> RuleBasedCollator; filled by the constructor.
Map ulocale_rules = new TreeMap(ULocaleComparator);
// Locale name -> Map of collation type name -> RuleBasedCollator.
Map locale_types_rules = new TreeMap();
// Directory containing the collation XML files.
String sourceDir;
// Maps a collator to itself; used by generateCollator() to reuse an equal collator already built.
Map collation_collation = new HashMap();
// Collator returned by generateCollator() for an empty rule string.
RuleBasedCollator emptyCollator = (RuleBasedCollator) Collator.getInstance(new ULocale(""));
/**
 * @return the key set of ulocale_rules, i.e. the ULocales a collator is stored for
 * (this is the map's own key-set view, not a copy)
 */
public Set getAvailableSet() {
return ulocale_rules.keySet();
}
/**
 * @param locale locale to look up
 * @return the collator stored for locale, or null if there is none
 */
public RuleBasedCollator getInstance(ULocale locale) {
return (RuleBasedCollator) ulocale_rules.get(locale);
}
/** Logs the number of distinct collators and the rules stored for every locale. */
void show() {
    log.println("Showing Locales");
    log.println("Unique Collators: " + collation_collation.size());
    Iterator keys = ulocale_rules.keySet().iterator();
    while (keys.hasNext()) {
        ULocale locale = (ULocale) keys.next();
        RuleBasedCollator collator = (RuleBasedCollator) ulocale_rules.get(locale);
        String rules = collator.getRules();
        log.println("\t" + locale + ", " + rules);
    }
}
/**
 * Loads the collation rules of every matching locale file, lets each
 * valid sublocale inherit the rules of its nearest ancestor that has
 * some, and finally registers one ULocale per locale and collation type
 * ("standard" maps to the plain locale, others to "@collation=type").
 *
 * The ancestor walk previously advanced the wrong variable (it assigned
 * to the locale instead of the parent), so it never terminated when the
 * immediate parent had no rules.
 *
 * @param sourceDir directory holding the collation XML files
 * @param localeRegex regular expression selecting the locales to read
 * @throws Exception if reading a collation file fails
 */
CldrCollations(String sourceDir, String localeRegex) throws Exception {
    this.sourceDir = sourceDir;
    Set s = getMatchingXMLFiles(sourceDir, localeRegex);
    for (Iterator it = s.iterator(); it.hasNext();) {
        getCollationRules((String) it.next());
    }
    // now fixup the validLocales, adding in what they inherit
    // TODO, add check: validSubLocales are masked by intervening locales.
    for (Iterator it = validLocales.iterator(); it.hasNext(); ) {
        String locale = (String) it.next();
        if (locale_types_rules.get(locale) != null) {
            log.println("Weird: overlap in validLocales: " + locale);
            continue;
        }
        // climb the parent chain until an ancestor with rules is found
        for (String parentlocale = getParent(locale); parentlocale != null;
                parentlocale = getParent(parentlocale)) {
            Map inherited = (Map) locale_types_rules.get(parentlocale);
            if (inherited != null) {
                locale_types_rules.put(locale, inherited);
                break;
            }
        }
    }
    // now generate the @-style locales
    for (Iterator it = locale_types_rules.keySet().iterator(); it.hasNext(); ) {
        String locale = (String) it.next();
        Map types_rules = (Map) locale_types_rules.get(locale);
        for (Iterator it2 = types_rules.keySet().iterator(); it2.hasNext(); ) {
            String type = (String) it2.next();
            RuleBasedCollator col = (RuleBasedCollator) types_rules.get(type);
            String name = type.equals("standard") ? locale : locale + "@collation=" + type;
            ulocale_rules.put(new ULocale(name), col);
        }
    }
}
/**
 * Replaces every non-overlapping occurrence of pattern in source with
 * replacement, scanning left to right. Text that was inserted as a
 * replacement is never scanned again, so the method terminates even when
 * the replacement contains the pattern (the earlier implementation
 * looped forever in that case, and for an empty pattern).
 *
 * @param source text to process
 * @param pattern literal text to look for; an empty pattern matches nothing
 * @param replacement text substituted for each occurrence
 * @return the resulting string; source itself if nothing was replaced
 */
public static String replace(String source, String pattern, String replacement) {
    if (pattern.length() == 0) return source;
    int match = source.indexOf(pattern);
    if (match < 0) return source;
    StringBuffer result = new StringBuffer(source.length() + replacement.length());
    int copiedUpTo = 0;
    while (match >= 0) {
        result.append(source.substring(copiedUpTo, match)).append(replacement);
        copiedUpTo = match + pattern.length();
        match = source.indexOf(pattern, copiedUpTo);
    }
    result.append(source.substring(copiedUpTo));
    return result.toString();
}
/** Transliterator ("hex-any") applied to the rule text before a collator is built from it. */
static Transliterator fromHex = Transliterator.getInstance("hex-any");

/**
 * Reads the collations of one locale: builds a collator for every
 * "Sequence" rule string found in the converted resource tables, stores
 * them by collation type in locale_types_rules, and records the locale's
 * validSubLocales.
 *
 * The validSubLocales list is now split into an array sized from the
 * input, and empty tokens are skipped. Before, a fixed 100-slot array was
 * used and the first empty token (e.g. from a double space) ended the
 * loop, silently dropping the remaining sublocales.
 *
 * @param locale locale name, i.e. the file name without ".xml"
 * @throws Exception if reading or converting the file fails
 */
private void getCollationRules(String locale) throws Exception {
    System.out.println(locale);
    Document doc = LDMLUtilities.getFullyResolvedLDML(sourceDir, locale, false, false, false);
    Node node = LDMLUtilities.getNode(doc, "//ldml/collations");
    LDML2ICUConverter cnv = new LDML2ICUConverter();
    StringBuffer stringBuffer = new StringBuffer();
    ICUResourceWriter.ResourceTable resource = (ICUResourceWriter.ResourceTable) cnv.parseCollations(node, stringBuffer);
    Map types_rules = new TreeMap();
    locale_types_rules.put(locale, types_rules);
    for (Resource current = resource.first; current != null; current = current.next) {
        //System.out.println(current.name);
        if (current instanceof ICUResourceWriter.ResourceTable) {
            ICUResourceWriter.ResourceTable table = (ICUResourceWriter.ResourceTable) current;
            for (Resource current2 = table.first; current2 != null; current2 = current2.next) {
                if (current2 instanceof ICUResourceWriter.ResourceString) {
                    ICUResourceWriter.ResourceString foo = (ICUResourceWriter.ResourceString) current2;
                    //System.out.println("\t" + foo.name + ", " + foo.val);
                    // (validSubLocales from the converter are ignored: the utilities have the wrong value)
                    if (foo.name.equals("Sequence")) {
                        // remove the \ u's, because they blow up
                        String rules = fromHex.transliterate(foo.val);
                        RuleBasedCollator fixed = generateCollator(locale, current.name, foo.name, rules);
                        if (fixed != null) {
                            log.println("Rules for: " + locale + "," + current.name);
                            log.println(rules);
                            if (!rules.equals(foo.val)) {
                                log.println("Original Rules from Ram: ");
                                log.println(foo.val);
                            }
                            types_rules.put(current.name, fixed);
                        }
                    }
                }
            }
        }
        //current.write(System.out,0,false);
    }
    // now get the valid sublocales
    Document doc2 = LDMLUtilities.parse(sourceDir + locale + ".xml", false);
    Node colls = LDMLUtilities.getNode(doc2,"//ldml/collations");
    String validSubLocales = LDMLUtilities.getAttributeValue(colls, "validSubLocales");
    if (validSubLocales != null) {
        // one slot per token: number of separators plus one
        int capacity = 1;
        for (int i = 0; i < validSubLocales.length(); ++i) {
            if (validSubLocales.charAt(i) == ' ') ++capacity;
        }
        String[] items = new String[capacity];
        Utility.split(validSubLocales, ' ', items);
        for (int i = 0; i < items.length; ++i) {
            // adjacent, leading or trailing separators produce empty tokens
            if (items[i] == null || items[i].length() == 0) continue;
            log.println("Valid Sub Locale: " + items[i]);
            validLocales.add(items[i]);
        }
    }
}
/**
 * Builds the collator for a rule string, reusing an equal collator that
 * was built before. An empty rule string yields emptyCollator. If the
 * rules cannot be used, the failure and the rules of the ICU4J collator
 * for the locale are logged and null is returned.
 *
 * @param locale locale name, used for logging and the comparison collator
 * @param current name of the collation type, used for logging
 * @param foo name of the resource holding the rules, used for logging
 * @param rules collation rule text
 * @return the collator, or null if it could not be created
 */
private RuleBasedCollator generateCollator(String locale, String current, String foo, String rules) {
    try {
        if (rules.equals("")) {
            return emptyCollator;
        }
        rules = replace(rules, "[optimize[", "[optimize [");
        rules = replace(rules, "[suppressContractions[", "[suppressContractions [");
        RuleBasedCollator candidate = new RuleBasedCollator(rules);
        RuleBasedCollator canonical = (RuleBasedCollator) collation_collation.get(candidate);
        if (canonical != null) {
            return canonical;
        }
        collation_collation.put(candidate, candidate);
        return candidate;
    } catch (Exception e) {
        log.println("***Cannot create collator from: " + locale + ", " + current + ", " + foo + ", " + rules);
        e.printStackTrace(log);
        RuleBasedCollator icuCollator = (RuleBasedCollator) Collator.getInstance(new ULocale(locale));
        String oldrules = icuCollator.getRules();
        log.println("Old ICU4J: " + oldrules);
        log.println("Equal?: " + oldrules.equals(rules));
        return null;
    }
}
}
// ========== UNICODESET UTILITIES ==========
/**
 * String-to-string function applied to each element of a UnicodeSet by
 * apply(UnicodeSet, Apply).
 */
public static interface Apply {
/**
 * @param source element of the set, as a string
 * @return the string to put into the resulting set
 */
String apply(String source);
}
/**
 * Builds a new set holding the result of the function for every element
 * of source; source itself is not modified.
 *
 * @param source set whose elements are transformed
 * @param apply function to apply to each element
 * @return the new set
 */
static UnicodeSet apply(UnicodeSet source, Apply apply) {
    UnicodeSet mapped = new UnicodeSet();
    UnicodeSetIterator elements = new UnicodeSetIterator(source);
    while (elements.next()) {
        String transformed = apply.apply(elements.getString());
        mapped.add(transformed);
    }
    return mapped;
}
/**
 * Returns a new set in which every element of source has been passed
 * through Normalizer.compose(element, false).
 *
 * @param source set to normalize; not modified
 * @return the set of composed strings
 */
static UnicodeSet nfc(UnicodeSet source) {
    Apply composer = new Apply() {
        public String apply(String source) {
            return Normalizer.compose(source, false);
        }
    };
    return apply(source, composer);
}
/**
 * Computes a closure for a single code point, i.e. the set of characters
 * and strings to be treated as variants of it. Used by createClosure() to
 * expand each code point of a string.
 */
public static interface CloseCodePoint {
/**
* @param cp code point to get closure for
* @param toAddTo Unicode set for the closure
* @return toAddTo (for chaining)
*/
UnicodeSet close(int cp, UnicodeSet toAddTo);
}
/**
 * Returns a new set holding, for every element of source, all the case
 * variants that createClosure() produces with the CCCP closer.
 *
 * @param source set to expand; not modified
 * @return the expanded set
 */
public static UnicodeSet createCaseClosure(UnicodeSet source) {
    UnicodeSet closed = new UnicodeSet();
    UnicodeSetIterator elements = new UnicodeSetIterator(source);
    while (elements.next()) {
        String element = elements.getString();
        UnicodeSet variants = createClosure(element, CCCP);
        if (variants != null) {
            closed.addAll(variants);
        } else {
            closed.add(element);
        }
    }
    return closed;
}
/**
 * Case closer: adds a code point together with its lower-, upper- and
 * title-case forms (English locale). Results are cached per code point;
 * the NONE instance marks code points that have no variants.
 */
public static final CloseCodePoint CCCP = new CloseCodePoint() {
    Locale locale = Locale.ENGLISH;
    UnicodeSet NONE = new UnicodeSet();
    UnicodeMap map = new UnicodeMap();

    public UnicodeSet close(int cp, UnicodeSet toAddTo) {
        UnicodeSet cached = (UnicodeSet) map.getValue(cp);
        if (cached == null) {
            cached = computeVariants(cp);
            map.put(cp, cached);
        }
        if (cached == NONE) {
            toAddTo.add(cp);
        } else {
            toAddTo.addAll(cached);
        }
        return toAddTo;
    }

    /** Computes the variants of cp, or NONE if cp is its only form. */
    private UnicodeSet computeVariants(int cp) {
        UnicodeSet variants = new UnicodeSet();
        variants.add(cp);
        variants.add(UCharacter.toLowerCase(locale, UTF16.valueOf(cp)));
        variants.add(UCharacter.toUpperCase(locale, UTF16.valueOf(cp)));
        variants.add(UCharacter.toTitleCase(locale, UTF16.valueOf(cp), null));
        // special hack
        if (variants.contains("SS")) {
            variants.add("sS").add("ss");
        }
        return variants.size() == 1 ? NONE : variants;
    }
};
/**
 * Computes the closure of a whole string, starting at its first code point.
 * Delegates to createClosure(source, 0, closer).
 *
 * @param source string to expand
 * @param closer closure function applied to each code point
 * @return the set of expanded strings; an empty set for an empty string
 */
public static UnicodeSet createClosure(String source, CloseCodePoint closer) {
return createClosure(source, 0, closer);
}
/**
 * Computes the closure of the part of source that starts at position:
 * every combination of one variant per code point, in order.
 *
 * @param source string to expand
 * @param position index of the first code unit to process
 * @param closer closure function applied to each code point
 * @return the set of expanded strings; an empty set if position is at or
 *         past the end of source
 */
public static UnicodeSet createClosure(String source, int position, CloseCodePoint closer) {
    UnicodeSet result = new UnicodeSet();
    if (position < source.length()) {
        int cp = UTF16.charAt(source, position);
        int next = position + UTF16.getCharCount(cp);
        if (next < source.length()) {
            // more text follows: combine this code point's variants with the rest
            UnicodeSet tail = createClosure(source, next, closer);
            result = createAppend(closer.close(cp, result), tail);
        } else {
            // last code point: its variants are the answer
            result = closer.close(cp, result);
        }
    }
    return result;
}
/**
 * Produces the set of all concatenations of an element of a followed by
 * an element of b.
 * That is, [a{cd}] + [d{ef}] => [{ad}{aef}{cdd}{cdef}]
 *
 * @param a set supplying the first parts
 * @param b set supplying the second parts
 * @return a new set with every combination; empty if either set is empty
 */
public static UnicodeSet createAppend(UnicodeSet a, UnicodeSet b) {
    // read the suffixes once instead of re-iterating b for every prefix
    List suffixes = new ArrayList();
    UnicodeSetIterator second = new UnicodeSetIterator(b);
    while (second.next()) {
        suffixes.add(second.getString());
    }
    UnicodeSet combined = new UnicodeSet();
    UnicodeSetIterator first = new UnicodeSetIterator(a);
    while (first.next()) {
        String prefix = first.getString();
        for (int i = 0; i < suffixes.size(); ++i) {
            combined.add(prefix + (String) suffixes.get(i));
        }
    }
    return combined;
}
}