blob: b2707f4d3c52ca3fcb5c5efa0bf37500b99f4014 [file] [log] [blame]
/*
**********************************************************************
* Copyright (c) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Mark Davis
**********************************************************************
*/
package com.ibm.icu.dev.tool.cldr;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.xml.sax.SAXException;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.dev.test.util.Relation;
import com.ibm.icu.dev.test.util.SortedBag;
import com.ibm.icu.dev.tool.UOption;
/**
* Class to generate POSIX format from CLDR. This is just a prototype version,
* that is driven off of ICU4J. The eventual version will want to take CLDR
* data directly from the XML.
* TODO Get the data directly from the CLDR tree.
* @author medavis
*/
public class GeneratePOSIX {
// http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html
// Positions of the individual command-line options inside the options table below.
// The numeric values must match the order in which the table entries are listed.
private static final int HELP1 = 0;
private static final int HELP2 = 1;
private static final int SOURCEDIR = 2;
private static final int DESTDIR = 3;
private static final int MATCH = 4;
private static final int UNICODESET = 5;
private static final int CHARSET = 6;

// Command-line option table handed to UOption.parseArgs; every entry that takes an
// argument carries the default that is used when the option is not supplied.
private static final UOption[] options = new UOption[] {
    UOption.HELP_H(),
    UOption.HELP_QUESTION_MARK(),
    UOption.SOURCEDIR().setDefault("C:\\ICU4C\\locale\\common\\"),
    UOption.DESTDIR().setDefault("C:\\DATA\\GEN\\"),
    UOption.create("match", 'm', UOption.REQUIRES_ARG).setDefault("hu"),
    UOption.create("unicodeset", 'u', UOption.REQUIRES_ARG).setDefault("[\\u0000-\\U0010FFFF]"),
    UOption.create("charset", 'c', UOption.REQUIRES_ARG).setDefault("iso8859-2"),
};
/**
 * Command-line entry point. Parses the options, builds a generator for the
 * locale named by the "match" option (restricted to the requested Unicode set
 * and charset) and writes the result to "&lt;locale&gt;_posix.txt" in the
 * destination directory.
 *
 * @param args command-line arguments, parsed against the options table
 * @throws IOException if the output file cannot be opened
 */
public static void main(String[] args) throws IOException {
    UOption.parseArgs(args, options);
    //testSortedBag();
    // TODO change to walk through available locales
    String locale = options[MATCH].value;
    GeneratePOSIX gp = new GeneratePOSIX(
            new ULocale(locale),
            new UnicodeSet(options[UNICODESET].value),
            Charset.forName(options[CHARSET].value));
    PrintWriter out = BagFormatter.openUTF8Writer(options[DESTDIR].value, locale + "_posix.txt");
    try {
        gp.write(out);
    } finally {
        // Always release the file handle, even when generation fails part-way.
        out.close();
    }
}
/**
 * Ad-hoc debugging aid: puts two control characters into a SortedBag ordered
 * by the default collator and prints the hex value of the first UTF-16 unit
 * of each entry in iteration order.
 */
private static void testSortedBag() {
    SortedBag bag = new SortedBag(Collator.getInstance());
    bag.add("\u0001");
    bag.add("\u0002");
    Iterator entries = bag.iterator();
    while (entries.hasNext()) {
        String entry = (String) entries.next();
        System.out.println(Utility.hex(entry.charAt(0), 4));
    }
}
// Every string that gets its own line in the collation section: each element of
// chars plus every accepted contraction. Ordered by col.
SortedBag allItems;
// The multi-code-point strings accepted by getFilteredSet; writeDefinitions emits
// one collating-element declaration for each of them. Ordered by col.
SortedBag contractions;
//PrintWriter out;
//Map definedID = new HashMap();
// Collator for the target locale, obtained in the constructor.
RuleBasedCollator col;
// Characters covered by the generated file (already intersected with the charset
// repertoire when a charset was supplied).
UnicodeSet chars;
// Target charset as passed to the constructor; the constructor tolerates null.
Charset cs;
/**
 * Prepares the data for one locale: restricts the character repertoire to the
 * charset, obtains the locale's collator, and collects every single character
 * and every applicable contraction that will be written out.
 *
 * @param locale locale whose collation is used
 * @param chars  characters to cover; intersected with the charset repertoire
 *               when cs is not null (the caller's set is not modified)
 * @param cs     target charset, or null to apply no charset restriction
 */
public GeneratePOSIX(ULocale locale, UnicodeSet chars, Charset cs) {
    this.cs = cs;
    if (cs != null) {
        UnicodeSet csset = new SimpleConverter(cs).getCharset();
        // Work on a copy so the set passed in by the caller stays untouched.
        chars = new UnicodeSet(chars).retainAll(csset);
    }
    this.chars = chars;
    System.out.println("Generating: " + locale.getDisplayName());
    col = (RuleBasedCollator) RuleBasedCollator.getInstance(locale);
    allItems = new SortedBag(col);
    contractions = new SortedBag(col);
    // add all the chars
    for (UnicodeSetIterator it = new UnicodeSetIterator(chars); it.next();) {
        allItems.add(it.getString());
    }
    // get the tailored contractions
    // we need to filter only the ones in chars
    UnicodeSet tailored = col.getTailoredSet();
    getFilteredSet(chars, tailored);
    // Same filtering for the UCA contraction list. The arguments used to be
    // swapped (uca_contractions as the filter, tailored as the source); since
    // uca_contractions holds only strings, containsAll never matched and the
    // call added nothing.
    getFilteredSet(chars, uca_contractions);
}
/**
 * Scans the string elements of a set for contractions that can be expressed
 * with the permitted characters and records them in both contractions and
 * allItems. Each accepted string is reported on standard output.
 *
 * @param chars    characters a candidate must consist of entirely
 * @param tailored set whose string elements are the candidates
 */
private void getFilteredSet(UnicodeSet chars, UnicodeSet tailored) {
    UnicodeSetIterator it = new UnicodeSetIterator(tailored);
    while (it.next()) {
        // Only multi-character string elements can be contractions.
        if (it.codepoint != UnicodeSetIterator.IS_STRING) {
            continue;
        }
        // normalize to make sure
        String composed = Normalizer.compose(it.getString(), false);
        boolean usable = UTF16.hasMoreCodePointsThan(composed, 1)
                && chars.containsAll(composed);
        if (usable) {
            System.out.println("Contractions: " + it.getString());
            contractions.add(composed);
            allItems.add(composed);
        }
    }
}
/**
 * Writes the complete output: a comment header naming the charset and the
 * covered characters, then the LC_CTYPE section, then the LC_COLLATE section,
 * each followed by a separator line.
 *
 * @param out destination writer; it is not closed by this method
 */
public void write(PrintWriter out) {
    final String ruler = "######################";
    String[] header = {
        ruler,
        "# POSIX locale",
        "# Generated automatically from the Unicode Character Database and Common Locale Data Repository",
        "# see http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html",
        "# charset:\t" + cs,
        "# characters:\t" + chars.toPattern(true),
        ruler,
    };
    for (int i = 0; i < header.length; ++i) {
        out.println(header[i]);
    }
    out.println();

    doCtype(out);
    out.println(ruler);
    out.println();

    doCollate(out);
    out.println(ruler);
}
/**
 * Writes the LC_COLLATE section: the collating-element definitions, the
 * order_start line, the list of weight symbols in use, one weight-assignment
 * line per collected string, the UNDEFINED entry and the closing keywords.
 *
 * @param out destination writer
 */
private void doCollate(PrintWriter out) {
    out.println("LC_COLLATE");
    out.println();
    writeDefinitions(out);
    out.println();

    // The second level runs backward only when the collator uses French ordering.
    String secondLevel = col.isFrenchCollation() ? "backward" : "forward";
    out.println("order_start forward;" + secondLevel + ";forward");
    out.println();

    out.println("# weights used, in order");
    out.println();
    writeList(out);
    out.println();

    out.println("# assignment of characters to weights");
    out.println();
    Iterator entries = allItems.iterator();
    while (entries.hasNext()) {
        String entry = (String) entries.next();
        out.println(showLine(col, entry));
    }

    String undefinedSecondary = getID('X', 5);
    String undefinedTertiary = getID('X', 5);
    out.println("UNDEFINED \t...;" + undefinedSecondary + ";" + undefinedTertiary);
    out.println();
    out.println("order_end");
    out.println("END LC_COLLATE");
}
/**
 * Writes the LC_CTYPE section: one character-class list per POSIX class
 * keyword, followed by the toupper mapping and the END LC_CTYPE trailer.
 * Every list is restricted to the characters in chars. All entries of a list
 * except the last end with the ";\" continuation marker.
 *
 * @param out destination writer
 */
private void doCtype(PrintWriter out) {
    out.println("LC_CTYPE");
    // digit <zero>;\
    // .. <eight>;\
    // <nine>
    // Each row pairs a class keyword with the UnicodeSet pattern selecting its members.
    // NOTE(review): the "print" pattern only selects whitespace that is not a control
    // character; POSIX describes print as graph plus space - confirm the intent.
    String[][] types = { { "alpha", "[:Alphabetic:]" },
        { "upper", "[:Uppercase:]" },
        { "lower", "[:Lowercase:]" },
        { "space", "[:Whitespace:]" },
        { "cntrl", "[:Control:]" },
        { "graph", "[^[:Whitespace:][:Control:][:Format:][:Surrogate:][:Unassigned:]]" },
        { "print", "[[:Whitespace:]-[[:Control:][:Format:][:Surrogate:][:Unassigned:]]]" },
        { "punct", "[:Punctuation:]" },
        { "digit", "[:Decimal_Number:]" },
        // ASCII a-f / A-F plus the fullwidth forms U+FF41..U+FF46 and U+FF21..U+FF26.
        // The fullwidth ranges are written as escapes because the literal characters
        // had been corrupted; the separating commas were dropped because UnicodeSet
        // treats a comma as a set member, not as a separator.
        { "xdigit", "[[:Decimal_Number:]a-fA-F\\uFF41-\\uFF46\\uFF21-\\uFF26]" },
        { "blank", "[[:Whitespace:]-[\\u000A-\\u000D \\u0085 [:Line_Separator:][:Paragraph_Separator:]]]" } };

    // print character types, restricted to the charset
    for (int i = 0; i < types.length; ++i) {
        UnicodeSet members = new UnicodeSet(types[i][1]).retainAll(chars);
        int last = members.size() - 1;
        int item = 0;
        for (UnicodeSetIterator it = new UnicodeSetIterator(members); it.next(); ++item) {
            if (item == 0) out.print(types[i][0]);
            out.print("\t" + getID('U', it.codepoint));
            if (item != last) out.print(";\\");
            out.println(" \t# " + getName(it.getString()));
        }
        out.println();
    }

    /*
    toupper (<a>,<A>);(<b>,<B>);(<c>,<C>);(<d>,<D>);(<e>,<E>);\
    (<f>,<F>);(<g>,<G>);(<h>,<H>);(<i>,<I>);(<j>,<J>);\
    ...
    (<t-cedilla>,<T-cedilla>)
    */
    // Collect the characters whose simple uppercase mapping differs from themselves.
    UnicodeSet cased = new UnicodeSet();
    for (UnicodeSetIterator it = new UnicodeSetIterator(chars); it.next();) {
        int upper = UCharacter.toUpperCase(it.codepoint);
        if (upper != it.codepoint) cased.add(it.codepoint);
    }
    // The last index must come from the set being printed, otherwise the final
    // pair is followed by a dangling continuation marker.
    int last = cased.size() - 1;
    int item = 0;
    for (UnicodeSetIterator it = new UnicodeSetIterator(cased); it.next(); ++item) {
        if (item == 0) out.print("toupper");
        // getID already supplies the angle brackets around each symbol.
        out.print("\t(" + getID('U', it.codepoint) + ","
                + getID('U', UCharacter.toUpperCase(it.codepoint)) + ")");
        if (item != last) out.print(";\\");
        out.println(" \t# " + getName(it.getString()));
    }
    out.println();
    // A POSIX category definition is terminated by "END <category>".
    out.println("END LC_CTYPE");
}
/**
 * Emits one declaration per collected contraction, in the form
 * collating-element &lt;A-A&gt; from "&lt;U0041&gt;&lt;U0041&gt;".
 *
 * @param out destination writer
 */
private void writeDefinitions(PrintWriter out) {
    Iterator entries = contractions.iterator();
    while (entries.hasNext()) {
        String contraction = (String) entries.next();
        String symbol = getID(contraction, true);
        String sequence = getID(contraction, false);
        out.println("collating-element " + symbol + " from " + sequence);
    }
}
/**
 * A set of non-negative ints backed by a BitSet that additionally tracks its
 * smallest member. getLeast() yields Integer.MAX_VALUE while the set is empty.
 */
private class IntList {
    private BitSet stuff = new BitSet();
    private int leastItem = Integer.MAX_VALUE;

    /** Adds item and lowers the tracked minimum when item is smaller. */
    void add(int item) {
        stuff.set(item);
        leastItem = Math.min(leastItem, item);
    }

    /** Removes item; when it was the minimum, the next larger member takes over. */
    void remove(int item) {
        stuff.clear(item);
        if (item != leastItem) {
            return;
        }
        // search for new least; a negative result means the set is now empty
        int next = stuff.nextSetBit(item + 1);
        leastItem = (next >= 0) ? next : Integer.MAX_VALUE;
    }

    /** Returns the smallest member, or Integer.MAX_VALUE for an empty set. */
    int getLeast() {
        return leastItem;
    }
}
// Not referenced by any other code in this file.
IntList needToWritePrimary = new IntList();
// Weights objects that writeList saw for more than one string; showLine gives
// such strings a fourth-level weight derived from their decomposition.
Set nonUniqueWeights = new HashSet();
// Every distinct Weights object encountered by writeList.
Set allWeights = new HashSet();
// String -> Weights, filled by writeList and read back by showLine.
Map stringToWeights = new HashMap();
/**
 * Computes the collation weights of every collected string, remembers them
 * for showLine, notes which weight combinations occur more than once, and
 * prints the symbol of each 16-bit weight value in use in ascending order.
 *
 * @param out destination writer
 */
private void writeList(PrintWriter out) {
    BitSet usedWeights = new BitSet();
    usedWeights.set(1); // special weight for uniqueness

    Iterator entries = allItems.iterator();
    while (entries.hasNext()) {
        String text = (String) entries.next();
        Weights weights = new Weights(col.getCollationElementIterator(text));
        weights.primaries.setBits(usedWeights);
        weights.secondaries.setBits(usedWeights);
        weights.tertiaries.setBits(usedWeights);
        // add() reports false when an equal entry is already present
        if (!allWeights.add(weights)) {
            nonUniqueWeights.add(weights);
        }
        stringToWeights.put(text, weights);
    }

    for (int weight = usedWeights.nextSetBit(0); weight >= 0;
            weight = usedWeights.nextSetBit(weight + 1)) {
        out.println(getID('X', weight));
    }
}
/**
 * Formats the weight-assignment line for one string: its symbol, the primary,
 * secondary and tertiary weights, a fourth field, and the character names as
 * a trailing comment. The fourth field is the decomposed string's symbols
 * when another string shares the same weights, otherwise the fixed symbol
 * for weight 1. Requires writeList to have recorded the string's weights.
 *
 * @param col    not used by this method
 * @param string the string to describe
 * @return the formatted line
 */
private String showLine(RuleBasedCollator col, String string) {
    StringBuffer line = new StringBuffer(getID(string, true));
    line.append(" \t");

    // gather data
    Weights w = (Weights) stringToWeights.get(string);
    line.append(w.primaries).append(";");
    line.append(w.secondaries).append(";");
    line.append(w.tertiaries).append(";");

    if (nonUniqueWeights.contains(w)) {
        line.append(getID(Normalizer.decompose(string, false), false));
    } else {
        line.append(getID('X', 1));
    }

    line.append(" \t# ");
    line.append(getName(string));
    return line.toString();
}
/* primaries.size() == 0
&& secondaries.size() == 0
&& tertiaries.size() == 0
? "IGNORE"
: */
/**
 * Builds a comma-separated list of the extended Unicode names of the code
 * points in a string.
 *
 * @param s the string to describe
 * @return the names as a String, joined by ", "
 */
private Object getName(String s) {
    StringBuffer names = new StringBuffer();
    int offset = 0;
    while (offset < s.length()) {
        int cp = UTF16.charAt(s, offset);
        if (offset != 0) {
            names.append(", ");
        }
        names.append(UCharacter.getExtendedName(cp));
        // advance by one or two UTF-16 units depending on the code point
        offset += UTF16.getCharCount(cp);
    }
    return names.toString();
}
/**
 * Builds a symbol of the form &lt;Lhhhh&gt;: a lead character followed by the
 * value in hexadecimal (formatted by Utility.hex with a width argument of 4),
 * enclosed in angle brackets.
 *
 * @param leadChar character placed directly after the opening bracket
 * @param i        value to render in hexadecimal
 * @return the bracketed symbol
 */
private static String getID(char leadChar, int i) {
    StringBuffer id = new StringBuffer();
    id.append('<');
    id.append(leadChar);
    id.append(Utility.hex(i, 4));
    id.append('>');
    return id.toString();
}
/**
 * The collation weights of one string, split by level. Each collation element
 * delivered by the iterator contributes its primary, secondary and tertiary
 * order to the corresponding list. Two instances are equal when all three
 * lists are equal, which lets them serve as hash keys.
 */
private class Weights {
    WeightList primaries = new WeightList();
    WeightList secondaries = new WeightList();
    WeightList tertiaries = new WeightList();

    /**
     * Drains the iterator and records the three weight levels of every
     * collation element it yields.
     *
     * @param it iterator positioned at the start of the string
     */
    public Weights(CollationElementIterator it) {
        for (int ce = it.next(); ce != CollationElementIterator.NULLORDER; ce = it.next()) {
            primaries.append(CollationElementIterator.primaryOrder(ce));
            secondaries.append(CollationElementIterator.secondaryOrder(ce));
            tertiaries.append(CollationElementIterator.tertiaryOrder(ce));
        }
    }

    /**
     * Compares all three weight lists. Returns false (instead of throwing)
     * for null and for objects of another type.
     */
    public boolean equals(Object other) {
        if (this == other) return true;
        if (!(other instanceof Weights)) return false;
        Weights that = (Weights) other;
        return primaries.equals(that.primaries)
                && secondaries.equals(that.secondaries)
                && tertiaries.equals(that.tertiaries);
    }

    public int hashCode() {
        return (primaries.hashCode() * 37
                + secondaries.hashCode()) * 37
                + tertiaries.hashCode();
    }
}
/**
 * A growable sequence of non-zero 16-bit weight values. Appended ints are
 * split into their high and low 16-bit halves; halves equal to zero are
 * dropped.
 */
private class WeightList {
    // Backing store; only the first count entries are meaningful.
    char[] weights = new char[5];
    int count = 0;

    /**
     * Appends the non-zero 16-bit halves of i, high half first. The backing
     * array is enlarged on demand, so long weight sequences no longer
     * overflow the initial five slots.
     */
    public void append(int i) {
        // add each 16-bit quantity
        for (int shift = 16; shift >= 0; shift -= 16) {
            char unit = (char) (i >>> shift);
            if (unit == 0) continue;
            if (count == weights.length) {
                char[] grown = new char[weights.length * 2];
                System.arraycopy(weights, 0, grown, 0, count);
                weights = grown;
            }
            weights[count++] = unit;
        }
    }

    /** Sets, in s, the bit corresponding to every stored weight value. */
    public void setBits(BitSet s) {
        for (int j = 0; j < count; ++j) s.set(weights[j]);
    }

    /**
     * Renders the list as IGNORE when empty, as a single weight symbol for
     * one entry, and as a double-quoted run of symbols otherwise.
     */
    public String toString() {
        if (count == 0) return "IGNORE";
        if (count == 1) return getID('X', weights[0]);
        StringBuffer result = new StringBuffer("\"");
        for (int i = 0; i < count; ++i) {
            result.append(getID('X', weights[i]));
        }
        return result.append('"').toString();
    }

    /**
     * Two lists are equal when they hold the same number of entries with the
     * same values. The length check keeps the relation symmetric and in line
     * with hashCode; null and foreign types yield false instead of throwing.
     */
    public boolean equals(Object other) {
        if (this == other) return true;
        if (!(other instanceof WeightList)) return false;
        WeightList that = (WeightList) other;
        if (count != that.count) return false;
        for (int j = 0; j < count; ++j) {
            if (weights[j] != that.weights[j]) return false;
        }
        return true;
    }

    public int hashCode() {
        int result = count;
        for (int j = 0; j < count; ++j) result = result * 37 + weights[j];
        return result;
    }
}
/**
 * Builds the symbolic form of a string. A string of one code point becomes a
 * single bracketed symbol. A longer string becomes either one combined symbol
 * whose parts are joined by '-' (isSingleID true) or a double-quoted run of
 * individually bracketed symbols (isSingleID false).
 *
 * @param s          the string to convert; must not be empty
 * @param isSingleID selects the combined-symbol form for multi-code-point strings
 * @return the symbolic form
 */
private String getID(String s, boolean isSingleID) {
    StringBuffer id = new StringBuffer();

    if (!UTF16.hasMoreCodePointsThan(s, 1)) {
        // single code point
        appendID(UTF16.charAt(s, 0), id, false);
        return id.toString();
    }

    id.append(isSingleID ? '<' : '"');
    int offset = 0;
    while (offset < s.length()) {
        int cp = UTF16.charAt(s, offset);
        if (isSingleID && offset != 0) {
            id.append('-');
        }
        // inside a combined symbol the parts carry no brackets of their own
        appendID(cp, id, isSingleID);
        offset += UTF16.getCharCount(cp);
    }
    id.append(isSingleID ? '>' : '"');
    return id.toString();
}
/**
 * Appends the symbol for one code point: 'U' followed by the code point in
 * hexadecimal, wrapped in angle brackets unless nakedID is set.
 *
 * @param cp      the code point
 * @param result  buffer to append to
 * @param nakedID true to omit the surrounding angle brackets
 * @return the buffer that was passed in
 */
private StringBuffer appendID(int cp, StringBuffer result, boolean nakedID) {
    String body = "U" + Utility.hex(cp, 4);
    if (nakedID) {
        return result.append(body);
    }
    return result.append('<').append(body).append('>');
}
// Hard-coded set consisting only of multi-code-point strings; the constructor passes it
// to getFilteredSet. Presumably these are the contractions of the default UCA table at the
// time of writing - TODO confirm against the UCA version in use.
UnicodeSet uca_contractions = new UnicodeSet("[{\u0406\u0308}{\u0410\u0306}{\u0410\u0308}{\u0413\u0301}{\u0413\u0341}{\u0415\u0306}{\u0416\u0308}{\u0417\u0308}{\u0418\u0306}{\u0418\u0308}{\u041A\u0301}{\u041A\u0341}{\u041E\u0308}{\u0423\u0306}{\u0423\u0308}{\u0423\u030B}{\u0427\u0308}{\u042B\u0308}{\u042D\u0308}{\u0430\u0306}{\u0430\u0308}{\u0433\u0301}{\u0433\u0341}{\u0435\u0306}{\u0436\u0308}{\u0437\u0308}{\u0438\u0306}{\u0438\u0308}{\u043A\u0301}{\u043A\u0341}{\u043E\u0308}{\u0443\u0306}{\u0443\u0308}{\u0443\u030B}{\u0447\u0308}{\u044B\u0308}{\u044D\u0308}{\u0456\u0308}{\u0474\u030F}{\u0475\u030F}{\u04D8\u0308}{\u04D9\u0308}{\u04E8\u0308}{\u04E9\u0308}{\u0627\u0653}{\u0627\u0654}{\u0627\u0655}{\u0648\u0654}{\u064A\u0654}{\u09C7\u09BE}{\u09C7\u09D7}{\u0B47\u0B3E}{\u0B47\u0B56}{\u0B47\u0B57}{\u0B92\u0BD7}{\u0BC6\u0BBE}{\u0BC6\u0BD7}{\u0BC7\u0BBE}{\u0C46\u0C56}{\u0CBF\u0CD5}{\u0CC6\u0CC2}{\u0CC6\u0CC2\u0CD5}{\u0CC6\u0CD5}{\u0CC6\u0CD6}{\u0CCA\u0CD5}{\u0D46\u0D3E}{\u0D46\u0D57}{\u0D47\u0D3E}{\u0DD9\u0DCA}{\u0DD9\u0DCF}{\u0DD9\u0DCF\u0DCA}{\u0DD9\u0DDF}{\u0DDC\u0DCA}{\u0E4D\u0E32}{\u0ECD\u0EB2}{\u0F71\u0F72}{\u0F71\u0F74}{\u0F71\u0F80}{\u0FB2\u0F71}{\u0FB2\u0F71\u0F80}{\u0FB2\u0F80}{\u0FB2\u0F81}{\u0FB3\u0F71}{\u0FB3\u0F71\u0F80}{\u0FB3\u0F80}{\u0FB3\u0F81}{\u1025\u102E}]");
}