blob: b594b1e310224c72fd78ec19e8056115ac6cd29b [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
* $Date: 2006/11/27 23:15:21 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import org.unicode.cldr.util.Counter;
import com.ibm.icu.dev.demo.translit.CaseIterator;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.Tabber;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty.UnicodeMapProperty;
import com.ibm.icu.impl.PrettyPrinter;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
public class QuickTest implements UCD_Types {
public static void main(String[] args) throws IOException {
try {
String methodName = System.getProperty("method");
org.unicode.cldr.util.Utility.callMethod(methodName, QuickTest.class);
if (true) return;
getHangulDecomps();
showLeadingTrailingNonStarters();
//checkBufferStatus(true);
checkNormalization("NFC", Default.nfc());
//checkNormalization("NFKC", Default.nfkc());
if (true) return;
checkCaseChanges();
if (true) return;
checkCase();
getCaseFoldingUnstable();
getCaseLengths("Lower", UCD.LOWER);
getCaseLengths("Upper", UCD.UPPER);
getCaseLengths("Title", UCD.TITLE);
getCaseLengths("Fold", UCD.FOLD);
checkUnicodeSet();
getLengths("NFC", Default.nfc());
getLengths("NFD", Default.nfd());
getLengths("NFKC", Default.nfkc());
getLengths("NFKD", Default.nfkd());
if (true) return;
tem();
//checkPrettyPrint();
Collection l = new CaseVariantMaker().getVariants("abc");
for (Iterator it = l.iterator(); it.hasNext();) {
System.out.println(it.next());
}
String propName = UCharacter.getPropertyName(3, UProperty.NameChoice.LONG);
//testProps();
getBidiMirrored();
getHasAllNormalizations();
} finally {
System.out.println("Done");
}
}
private static void getHangulDecomps() {
//Normalizer nfkd500 = new Normalizer(Normalizer.NFKD, "5.0.0");
Normalizer nfkd218 = new Normalizer(Normalizer.NFKD, "2.1.8");
UnicodeMap diff = new UnicodeMap();
Map compose = new HashMap();
Map decompose = new HashMap();
// UnicodeSet applicable = // new UnicodeSet("[:HangulSyllable=NA:]");
UnicodeSet applicable = new UnicodeSet("[[\u1100-\u11FF \uAC00-\uD7FF]&[:assigned:]]");
for (UnicodeSetIterator it = new UnicodeSetIterator(applicable); it.next(); ) {
String source = it.getString();
String v218 = nfkd218.normalize(source);
//String v500 = nfkd500.normalize(source);
if (v218.equals(source)) continue;
decompose.put(source, v218);
compose.put(v218, source);
}
// now try recomposing
for (Iterator it = decompose.keySet().iterator(); it.hasNext();) {
String source = (String) it.next();
String decomposition = (String) decompose.get(source);
if (decomposition.length() > 2) {
String trial = decomposition.substring(0, decomposition.length() - 1);
String composition = (String) compose.get(trial);
if (composition != null) {
decomposition = composition + decomposition.substring(decomposition.length() - 1);
}
}
if (decomposition.length() != 2) System.out.println("Failed decomp: " + Default.ucd().getCodeAndName(source));
diff.put(source.charAt(0), com.ibm.text.utility.Utility.hex(decomposition, " "));
}
UnicodeMapProperty p = new UnicodeMapProperty().set(diff);
BagFormatter bf = new BagFormatter().setValueSource(p);
System.out.println(bf.showSetNames(diff.keySet()));
}
static void checkNormalization(String title, Normalizer nfx) {
UnicodeSet trailing = new UnicodeSet();
UnicodeSet leading = new UnicodeSet();
UnicodeSet starter = new UnicodeSet();
UnicodeSet nonStarter = new UnicodeSet();
UnicodeSet disallowed = new UnicodeSet();
for (int i = 0; i <= 0x10FFFF; ++i) {
if (!nfx.isNormalized(i)) {
disallowed.add(i);
continue;
}
if (nfx.isLeading(i)) leading.add(i);
if (nfx.isTrailing(i)) trailing.add(i);
if (Default.ucd().getCombiningClass(i) == 0) starter.add(i);
else nonStarter.add(i);
}
UnicodeSet allowed = new UnicodeSet(disallowed).complement();
UnicodeSet leadingOnly = new UnicodeSet(leading).removeAll(trailing);
UnicodeSet trailingOnly = new UnicodeSet(trailing).removeAll(leading);
UnicodeSet both = new UnicodeSet(trailing).retainAll(leading);
UnicodeSet stable = new UnicodeSet(allowed).removeAll(leading).removeAll(trailing);
UnicodeSet starterLeadingOnly = new UnicodeSet(starter).retainAll(leadingOnly);
UnicodeSet starterTrailingOnly = new UnicodeSet(starter).retainAll(trailingOnly);
UnicodeSet starterStable = new UnicodeSet(starter).retainAll(stable);
UnicodeSet starterBoth = new UnicodeSet(starter).retainAll(both);
UnicodeSet nonStarterTrailing = new UnicodeSet(nonStarter).retainAll(trailing);
UnicodeSet nonStarterNonTrailing = new UnicodeSet(nonStarter).removeAll(trailing);
System.out.println();
System.out.println(title);
System.out.println("Starter, CWF-Only: " + starterLeadingOnly.size());
System.out.println("Starter, CWP-Only: " + starterTrailingOnly.size());
System.out.println("Starter, Stable: " + starterStable.size());
System.out.println("Starter, Both: " + starterBoth.size());
System.out.println("Non-Starter, CWP: " + nonStarterTrailing.size());
System.out.println("Non-Starter, Non-CWP: " + nonStarterNonTrailing.size());
System.out.println("Disallowed: " + disallowed.size());
BagFormatter bf = new BagFormatter();
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("5.0.0");
bf.setUnicodePropertyFactory(ups);
System.out.println("Starter, CWF-Only: " + "\r\n" + bf.showSetNames(starterLeadingOnly));
System.out.println("Starter, CWP-Only: " + "\r\n" + bf.showSetNames(starterTrailingOnly));
System.out.println("Starter, Stable: " + "\r\n" + bf.showSetNames(starterStable));
System.out.println("Starter, Both: " + "\r\n" + bf.showSetNames(starterBoth));
System.out.println("Non-Starter, CWP: " + "\r\n" + bf.showSetNames(nonStarterTrailing));
System.out.println("Non-Starter, Non-CWP: " + "\r\n" + bf.showSetNames(nonStarterNonTrailing));
System.out.println("Disallowed: " + "\r\n" + bf.showSetNames(disallowed));
// System.out.println(bf.showSetDifferences("NFC CWP", leadingC, "NFC Trailing", trailingC));
}
private static void checkCaseChanges() {
String first = "3.0.0";
String last = "4.1.0";
UCD ucd30 = UCD.make(first);
UCD ucd50 = UCD.make(last);
UnicodeSet sameBehavior = new UnicodeSet();
UnicodeSet newIn50 = new UnicodeSet();
UnicodeSet differentBehavior = new UnicodeSet();
for (int i = 0; i < 0x10FFFF; ++i) {
int type = ucd50.getCategory(i);
if (type == UCD.UNASSIGNED || type == UCD.PRIVATE_USE || type == UCD.SURROGATE) continue;
String c1 = UTF16.valueOf(i);
String c3 = ucd30.getCase(i,UCD.FULL,UCD.FOLD);
String c5 = ucd50.getCase(i,UCD.FULL,UCD.FOLD);
if (c1.equals(c3) && c1.equals(c5)) continue;
if (!ucd30.isAssigned(i)) {
newIn50.add(i);
} else if (c3.equals(c5)) {
sameBehavior.add(i);
} else {
differentBehavior.add(i);
System.out.println(ucd50.getCodeAndName(i));
System.out.println("3.0=>" + ucd50.getCodeAndName(c3));
System.out.println("5.0=>" + ucd50.getCodeAndName(c5));
}
}
BagFormatter bf = new BagFormatter();
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(last);
bf.setUnicodePropertyFactory(ups);
System.out.println("In 5.0 but not 3.0: " + newIn50);
System.out.println(bf.showSetNames(newIn50));
System.out.println();
System.out.println("Same Behavior in 3.0 and 5.0: " + sameBehavior);
System.out.println(bf.showSetNames(sameBehavior));
System.out.println();
System.out.println("Different Behavior in 3.0 and 5.0: " + differentBehavior);
System.out.println(bf.showSetNames(differentBehavior));
}
private static void checkUnicodeSet() {
UnicodeSet uset = new UnicodeSet("[a{bc}{cd}pqr\u0000]");
System.out.println(uset + " ~ " + uset.getRegexEquivalent());
String[][] testStrings = {
{"x", "none"},
{"bc", "all"},
{"cdbca", "all"},
{"a", "all"},
{"bcx", "some"},
{"ab", "some"},
{"acb", "some"},
{"bcda", "some"},
{"dccbx", "none"},
};
for (int i = 0; i < testStrings.length; ++i) {
check(uset, testStrings[i][0], testStrings[i][1]);
}
}
private static void check(UnicodeSet uset, String string, String desiredStatus) {
boolean shouldContainAll = desiredStatus.equals("all");
boolean shouldContainNone = desiredStatus.equals("none");
System.out.println((uset.containsAll(string) == shouldContainAll ? "" : "FAILURE:") + "\tcontainsAll " + string + " = " + shouldContainAll);
System.out.println((uset.containsNone(string) == shouldContainNone ? "" : "FAILURE:") + "\tcontainsNone " + string + " = " + shouldContainNone);
}
private static void getCaseFoldingUnstable() {
for (int i = 3; i < com.ibm.text.utility.Utility.searchPath.length - 1; ++i) {
String newName = com.ibm.text.utility.Utility.searchPath[i];
String oldName = com.ibm.text.utility.Utility.searchPath[i+1];
showMemoryUsage();
UCD ucdNew = UCD.make(newName);
showMemoryUsage();
UCD ucdOld = UCD.make(oldName);
showMemoryUsage();
UnicodeMap differences = new UnicodeMap();
UnicodeSet differenceSet = new UnicodeSet();
for (int j = 0; j < 0x10FFFF; ++j) {
if (!ucdOld.isAssigned(j)) continue;
String oldString = ucdOld.getCase(j, UCD.FULL, UCD.FOLD);
String newString = ucdNew.getCase(j, UCD.FULL, UCD.FOLD);
if (!oldString.equals(newString)) {
differenceSet.add(j);
differences.put(j, new String[]{oldString, newString});
System.out.println(".");
}
}
if (differenceSet.size() != 0) {
System.out.println("Differences in " + com.ibm.text.utility.Utility.searchPath[i]);
for (UnicodeSetIterator it = new UnicodeSetIterator(differenceSet); it.next();) {
System.out.println(ucdNew.getCodeAndName(it.codepoint));
String[] strings = (String[]) differences.getValue(it.codepoint);
System.out.println("\t" + oldName + ": " + ucdNew.getCodeAndName(strings[0]));
System.out.println("\t" + newName + ": " + ucdNew.getCodeAndName(strings[1]));
}
}
}
}
static public void showMemoryUsage() {
System.gc(); System.gc(); System.gc(); System.gc();
System.gc(); System.gc(); System.gc(); System.gc();
System.gc(); System.gc(); System.gc(); System.gc();
System.gc(); System.gc(); System.gc(); System.gc();
System.out.println("total:\t" + Runtime.getRuntime().totalMemory() + ";\tfree:\t" +
Runtime.getRuntime().freeMemory());
}
private static void getHasAllNormalizations() {
UnicodeSet items = new UnicodeSet();
Set s = new LinkedHashSet();
for (int i = 0; i <= 0x10FFFF; ++i) {
if (!Default.ucd().isAssigned(i)) continue;
if (Default.ucd().getDecompositionType(i) == UCD.NONE) continue;
String source = UTF16.valueOf(i);
String nfc = Default.nfc().normalize(source);
String nfd = Default.nfd().normalize(source);
String nfkd = Default.nfkd().normalize(source);
String nfkc = Default.nfkc().normalize(source);
s.clear();
s.add(source);
s.add(nfc);
s.add(nfd);
s.add(nfkd);
s.add(nfkc);
if (s.size() > 3) {
System.out.println(Utility.hex(source) + "\t" + Utility.escape(source)
+ "\t" + Default.ucd().getName(source)
+ "\tnfd\t" + Utility.hex(nfd) + "\t" + Utility.escape(nfd)
+ "\tnfc\t" + Utility.hex(nfc) + "\t" + Utility.escape(nfc)
+ "\tnfkd\t" + Utility.hex(nfkd) + "\t" + Utility.escape(nfkd)
+ "\tnfkc\t" + Utility.hex(nfkc) + "\t" + Utility.escape(nfkc));
}
}
}
static UnicodeMap.Composer MyComposer = new UnicodeMap.Composer(){
public Object compose(int codePoint, Object a, Object b) {
if (a == null) return b;
if (b == null) return a;
return a + "; " + b;
}
};
static void add(UnicodeMap map, int cp, String s) {
String x = (String) map.getValue(cp);
if (x == null) map.put(cp, s);
else map.put(cp, x + "; " + s);
}
private static void getBidiMirrored() throws IOException {
//UnicodeMap.Composer composer;
//ToolUnicodePropertySource foo = ToolUnicodePropertySource.make("");
UnicodeSet proposed = new UnicodeSet("[\u0F3A-\u0F3D\u169B\u169C\u2018-\u201F\u301D-\u301F\uFD3E\uFD3F\uFE59-\uFE5E\uFE64\uFE65\\U0001D6DB\\U0001D715\\U0001D74F\\U0001D789\\U0001D7C3]");
//UnicodeSet proposed = new UnicodeSet("[\u0F3A-\u0F3D\u169B\u169C\u2018-\u201F\u301D-\u301F\uFD3E\uFD3F\uFE59-\uFE5E\uFE64\uFE65]");
UnicodeMap status = new UnicodeMap();
UCD ucd31 = UCD.make("3.1.0");
for (int cp = 0; cp < 0x10FFFF; ++cp) {
if (!Default.ucd().isAssigned(cp)) continue;
if (Default.ucd().isPUA(cp)) continue;
if (proposed.contains(cp)) {
add(status, cp, "***");
}
int type = Default.ucd().getCategory(cp);
if (type == UCD.Ps || type == Pe || type == Pi || type == Pf) {
add(status, cp, "Px");
}
String s = Default.ucd().getBidiMirror(cp);
if (!s.equals(UTF16.valueOf(cp))) add(status, cp, "bmg");
if (ucd31.getBinaryProperty(cp,BidiMirrored)) {
add(status, cp, "bmp3.1");
} else if (Default.ucd().getBinaryProperty(cp,BidiMirrored)) {
add(status, cp, "bmp5.0");
} else if (!Default.nfkc().isNormalized(cp)) {
String ss = Default.nfkc().normalize(cp);
if (isBidiMirrored(ss)) {
add(status, cp, "bmp(" + Utility.hex(ss) + ")");
String name = Default.ucd().getName(cp);
if (name.indexOf("VERTICAL") < 0) proposed.add(cp);
}
}
if (type == Sm) {
add(status, cp, "Sm");
}
else if (Default.ucd().getBinaryProperty(cp,Math_Property)) {
String ss = Default.nfkc().normalize(cp);
if (UTF16.countCodePoint(ss) == 1) {
int cp2 = UTF16.charAt(ss, 0);
int type2 = Default.ucd().getCategory(cp2);
if (type2 == UCD.Lu || type2 == Ll || type2 == Lo || type2 == Nd) {
//System.out.println("Skipping: " + Default.ucd().getCodeAndName(cp));
} else {
add(status, cp, "S-Math");
}
} else {
add(status, cp, "S-Math");
}
}
// temp = new UnicodeMap();
// UnicodeSet special = new UnicodeSet("[<>]");
// for (UnicodeSetIterator it = new UnicodeSetIterator(mathSet); it.next();) {
// String s = Default.nfkd().normalize(it.codepoint);
// if (special.containsSome(s)) temp.put(it.codepoint, "*special*");
// }
// status.composeWith(temp, MyComposer);
//showStatus(status);
// close under nfd
}
//proposed = status.getSet("Px");
System.out.println(proposed);
//showStatus(status);
PrintWriter pw = BagFormatter.openUTF8Writer(UCD.GEN_DIR, "bidimirroring_chars.txt");
showStatus(pw, status);
pw.close();
}
private static boolean isBidiMirrored(String ss) {
int cp;
for (int i = 0; i < ss.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(ss, i);
if (!Default.ucd().getBinaryProperty(cp,BidiMirrored)) return false;
}
return true;
}
static BagFormatter bf = new BagFormatter();
private static void showStatus(PrintWriter pw, UnicodeMap status) {
Collection list = new TreeSet(status.getAvailableValues());
for (Iterator it = list.iterator(); it.hasNext(); ) {
String value = (String) it.next();
if (value == null) continue;
UnicodeSet set = status.getSet(value);
for (UnicodeSetIterator umi = new UnicodeSetIterator(set); umi.next();) {
pw.println(Utility.hex(umi.codepoint)
//+ (value.startsWith("*") ? ";\tBidi_Mirrored" : "")
+ "\t# " + value
+ "\t\t( " + UTF16.valueOf(umi.codepoint) + " ) "
//+ ";\t" + (x.contains(umi.codepoint) ? "O" : "")
+ "\t" + Default.ucd().getName(umi.codepoint));
}
}
}
public static class Length {
String title;
int bytesPerCodeUnit;
int longestCodePoint = -1;
double longestLength = 0;
UnicodeMap longestSet = new UnicodeMap();
Length(String title, int bytesPerCodeUnit) {
this.title = title;
this.bytesPerCodeUnit = bytesPerCodeUnit;
}
void add(int codePoint, int cuLen, int processedUnitLength, String processedString) {
double codeUnitLength = processedUnitLength / (double) cuLen;
if (codeUnitLength > longestLength) {
longestCodePoint = codePoint;
longestLength = codeUnitLength;
longestSet.clear();
longestSet.put(codePoint, processedString);
System.out.println(title + " \t(" + codeUnitLength*bytesPerCodeUnit + " bytes, "
+ codeUnitLength + " code units) \t"
+ longestLength + " expansion) \t"
+ Default.ucd().getCodeAndName(codePoint)
+ "\r\n\t=> " + Default.ucd().getCodeAndName(processedString)
);
} else if (codeUnitLength == longestLength) {
longestSet.put(codePoint, processedString);
}
}
}
static final int skip = (1<<UCD.UNASSIGNED) | (1<<UCD.PRIVATE_USE) | (1<<UCD.SURROGATE);
/**
*
*/
private static void getLengths(String title, Normalizer normalizer) throws IOException {
System.out.println();
Length utf8Len = new Length(title + "\tUTF8", 1);
Length utf16Len = new Length(title + "\tUTF16", 1);
Length utf32Len = new Length(title + "\tUTF32", 1);
for (int i = 0; i <= 0x10FFFF; ++i) {
int type = Default.ucd().getCategoryMask(i);
if ((type & skip) != 0) continue;
String is = UTF16.valueOf(i);
String norm = normalizer.normalize(i);
utf8Len.add(i, getUTF8Length(is), getUTF8Length(norm), norm);
utf16Len.add(i, is.length(), norm.length(), norm);
utf32Len.add(i, 1, UTF16.countCodePoint(norm), norm);
}
UnicodeSet common = new UnicodeSet(utf8Len.longestSet.keySet())
.retainAll(utf16Len.longestSet.keySet())
.retainAll(utf32Len.longestSet.keySet());
if (common.size() > 0) {
UnicodeSetIterator it = new UnicodeSetIterator(common);
it.next();
System.out.println("Common Exemplar: " + Default.ucd().getCodeAndName(it.codepoint));
}
}
private static void getCaseLengths(String title, byte caseType) throws IOException {
System.out.println();
Length utf8Len = new Length(title + "\tUTF8", 1);
Length utf16Len = new Length(title + "\tUTF16", 1);
Length utf32Len = new Length(title + "\tUTF32", 1);
for (int i = 0; i <= 0x10FFFF; ++i) {
int type = Default.ucd().getCategoryMask(i);
if ((type & skip) != 0) continue;
String is = UTF16.valueOf(i);
String norm = Default.ucd().getCase(i, UCD.FULL, caseType);
utf8Len.add(i, getUTF8Length(is), getUTF8Length(norm), norm);
utf16Len.add(i, is.length(), norm.length(), norm);
utf32Len.add(i, 1, UTF16.countCodePoint(norm), norm);
}
UnicodeSet common = new UnicodeSet(utf8Len.longestSet.keySet())
.retainAll(utf16Len.longestSet.keySet())
.retainAll(utf32Len.longestSet.keySet());
if (common.size() > 0) {
UnicodeSetIterator it = new UnicodeSetIterator(common);
it.next();
System.out.println("Common Exemplar: " + Default.ucd().getCodeAndName(it.codepoint));
}
}
static ByteArrayOutputStream utf8baos;
static Writer utf8bw;
static int getUTF8Length(String source) throws IOException {
if (utf8bw == null) {
utf8baos = new ByteArrayOutputStream();
utf8bw = new OutputStreamWriter(utf8baos, "UTF-8");
}
utf8baos.reset();
utf8bw.write(source);
utf8bw.flush();
return utf8baos.size();
}
static final void test() {
String test2 = "ab\u263ac";
StringTokenizer st = new StringTokenizer(test2, "\u263a");
try {
while (true) {
String s = st.nextToken();
System.out.println(s);
}
} catch (Exception e) { }
StringReader r = new StringReader(test2);
StreamTokenizer s = new StreamTokenizer(r);
try {
while (true) {
int x = s.nextToken();
if (x == StreamTokenizer.TT_EOF) break;
System.out.println(s.sval);
}
} catch (Exception e) { }
String testString = "en-Arab-200-gaulish-a-abcd-def-x-abcd1234-12345678";
for (int i = testString.length() + 1; i > 0; --i) {
String trunc = truncateValidLanguageTag(testString, i);
System.out.println(i + "\t" + trunc + "\t" + trunc.length());
}
}
static String truncateValidLanguageTag(String tag, int limit) {
if (tag.length() <= limit) return tag;
// legit truncation point has - after, and two letters before
do {
if (tag.charAt(limit) == '-' && tag.charAt(limit-1) != '-' && tag.charAt(limit-2) != '-') break;
} while (--limit > 2);
return tag.substring(0,limit);
}
static final void test2() {
UnicodeSet format = new UnicodeSet("[:Cf:]");
/*
[4] NameStartChar := ":" | [A-Z] | "_" | [a-z] |
[#xC0-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
[#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
[#x3001-#xD7FF] | [#xF900-#xEFFFF]
[4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 |
[#x0300-#x036F] | [#x203F-#x2040]
*/
UnicodeSet nameStartChar = new UnicodeSet("[\\: A-Z \\_ a-z"
+ "\\u00c0-\\u02FF \\u0370-\\u037D \\u037F-\\u1FFF"
+ "\\u200C-\\u200D \\u2070-\\u218F \\u2C00-\\u2FEF"
+ "\\u3001-\\uD7FF \\uF900-\\U000EFFFF]");
UnicodeSet nameChar = new UnicodeSet("[\\- \\. 0-9 \\u00B7 "
+ "\\u0300-\\u036F \\u203F-\\u2040]")
.addAll(nameStartChar);
UnicodeSet nameAll = new UnicodeSet(nameChar).addAll(nameStartChar);
showSet("NameStartChar", nameStartChar);
showDiffs("NameChar", nameChar, "NameStartChar", nameStartChar);
UnicodeSet ID_Start = new UnicodeSet("[:ID_Start:]");
UnicodeSet ID_Continue = new UnicodeSet("[:ID_Continue:]").removeAll(format);
UnicodeSet ID_All = new UnicodeSet(ID_Start).addAll(ID_Continue);
showDiffs("ID_All", ID_All, "nameAll", nameAll);
showDiffs("ID_Start", ID_Start, "nameStartChar", nameStartChar);
UnicodeSet defaultIgnorable = UnifiedBinaryProperty.make(DERIVED | DefaultIgnorable).getSet();
UnicodeSet whitespace = UnifiedBinaryProperty.make(BINARY_PROPERTIES | White_space).getSet();
UnicodeSet notNFKC = new UnicodeSet();
UnicodeSet privateUse = new UnicodeSet();
UnicodeSet noncharacter = new UnicodeSet();
for (int i = 0; i <= 0x10FFFF; ++i) {
if (!Default.ucd().isAllocated(i)) continue;
if (!Default.nfkc().isNormalized(i)) notNFKC.add(i);
if (Default.ucd().isNoncharacter(i)) noncharacter.add(i);
if (Default.ucd().getCategory(i) == PRIVATE_USE) privateUse.add(i);
}
showSet("notNFKC in NameChar", new UnicodeSet(notNFKC).retainAll(nameChar));
showSet("notNFKC outside of NameChar", new UnicodeSet(notNFKC).removeAll(nameChar));
showSet("Whitespace in NameChar", new UnicodeSet(nameChar).retainAll(whitespace));
showSet("Whitespace not in NameChar", new UnicodeSet(whitespace).removeAll(nameChar));
showSet("Noncharacters in NameChar", new UnicodeSet(noncharacter).retainAll(noncharacter));
showSet("Noncharacters outside of NameChar", new UnicodeSet(noncharacter).removeAll(nameChar));
showSet("Format in NameChar", new UnicodeSet(nameChar).retainAll(format));
showSet("Other Default_Ignorables in NameChar", new UnicodeSet(defaultIgnorable).removeAll(format).retainAll(nameChar));
showSet("PrivateUse in NameChar", new UnicodeSet(defaultIgnorable).retainAll(privateUse));
UnicodeSet CID_Start = new UnicodeSet("[:ID_Start:]").removeAll(notNFKC);
UnicodeSet CID_Continue = new UnicodeSet("[:ID_Continue:]")
.removeAll(notNFKC).removeAll(format);
UnicodeSet CID_Continue_extras = new UnicodeSet(CID_Continue).removeAll(CID_Start);
showDiffs("NoK_ID_Start", CID_Start, "NameStartChar", nameStartChar);
showDiffs("NoK_ID_Continue_Extras", CID_Continue_extras, "NameChar", nameChar);
System.out.println("Removing canonical singletons");
}
static void showDiffs(String title1, UnicodeSet set1, String title2, UnicodeSet set2) {
showSet(title1 + " - " + title2, new UnicodeSet(set1).removeAll(set2));
}
static void showSet(String title1, UnicodeSet set1) {
System.out.println();
System.out.println(title1);
if (set1.size() == 0) {
System.out.println("\tNONE");
return;
}
System.out.println("\tCount:" + set1.size());
System.out.println("\tSet:" + set1.toPattern(true));
System.out.println("\tDetails:");
//Utility.showSetNames("", set1, false, Default.ucd());
}
private static void checkPrettyPrint() {
//System.out.println("Test: " + fixTransRule("\\u0061"));
UnicodeSet s = new UnicodeSet("[^[:script=common:][:script=inherited:]]");
UnicodeSet quoting = new UnicodeSet("[[:Mn:][:Me:]]");
String ss = new PrettyPrinter().setToQuote(quoting).toPattern(s);
System.out.println("test: " + ss);
}
static class CaseVariantMaker {
private ULocale locale = ULocale.ROOT;
private String string = null;
private Collection output;
private Collection getVariants(String string) {
return getVariants(string, null);
}
private Collection getVariants(String string, Collection output) {
this.string = string;
if (output == null) output = new ArrayList();
this.output = output;
getSimpleCaseVariants(0, "");
return output;
}
private void getSimpleCaseVariants(int i, String soFar) {
if (i == string.length()) {
output.add(soFar);
return;
}
// can optimize later
String s = UTF16.valueOf(string, i);
i += s.length();
getSimpleCaseVariants(i, soFar + s);
String upper = UCharacter.toUpperCase(locale, s);
if (!upper.equals(s)) {
getSimpleCaseVariants(i, soFar + upper);
}
String title = UCharacter.toTitleCase(locale, s, null);
if (!title.equals(s) && !title.equals(upper)) {
getSimpleCaseVariants(i, soFar + title);
}
String lower = UCharacter.toLowerCase(locale, s);
if (!lower.equals(s) && !lower.equals(upper) && !lower.equals(title)) {
getSimpleCaseVariants(i, soFar + lower);
}
}
public ULocale getLocale() {
return locale;
}
public void setLocale(ULocale locale) {
this.locale = locale;
}
}
private static void tem() {
PrintStream out = System.out;
String text = "\ufb03";
String BASE_RULES =
"'<' > '&lt;' ;" +
"'<' < '&'[lL][Tt]';' ;" +
"'&' > '&amp;' ;" +
"'&' < '&'[aA][mM][pP]';' ;" +
"'>' < '&'[gG][tT]';' ;" +
"'\"' < '&'[qQ][uU][oO][tT]';' ; " +
"'' < '&'[aA][pP][oO][sS]';' ; ";
String CONTENT_RULES =
"'>' > '&gt;' ;";
String HTML_RULES = BASE_RULES + CONTENT_RULES +
"'\"' > '&quot;' ; ";
String HTML_RULES_CONTROLS = HTML_RULES +
"([[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:][\\u0080-\\U0010FFFF]-[\\u0020]]) > &hex/xml($1) ; ";
Transliterator toHTML = Transliterator.createFromRules(
"any-xml", HTML_RULES_CONTROLS, Transliterator.FORWARD);
int[][] ranges = {{UProperty.BINARY_START, UProperty.BINARY_LIMIT},
{UProperty.INT_START, UProperty.INT_LIMIT},
{UProperty.DOUBLE_START, UProperty.DOUBLE_START},
{UProperty.STRING_START, UProperty.STRING_LIMIT},
};
Collator col = Collator.getInstance(ULocale.ROOT);
((RuleBasedCollator)col).setNumericCollation(true);
Map alpha = new TreeMap(col);
String HTML_INPUT = "::hex-any/xml10; ::hex-any/unicode; ::hex-any/java;";
Transliterator fromHTML = Transliterator.createFromRules(
"any-xml", HTML_INPUT, Transliterator.FORWARD);
text = fromHTML.transliterate(text);
int cp = UTF16.charAt(text, 0);
text = UTF16.valueOf(text,0);
for (int range = 0; range < ranges.length; ++range) {
for (int propIndex = ranges[range][0]; propIndex < ranges[range][1]; ++propIndex) {
String propName = UCharacter.getPropertyName(propIndex, UProperty.NameChoice.LONG);
String propValue = null;
int ival;
switch (range) {
default: propValue = "???"; break;
case 0: ival = UCharacter.getIntPropertyValue(cp, propIndex);
if (ival != 0) propValue = "True";
break;
case 2: propValue = String.valueOf(UCharacter.getNumericValue(cp)); break;
case 3:
propValue = UCharacter.getStringPropertyValue(propIndex, cp, UProperty.NameChoice.LONG);
if (text.equals(propValue)) propValue = null;
break;
case 1: ival = UCharacter.getIntPropertyValue(cp, propIndex);
if (ival != 0) {
propValue = UCharacter.getPropertyValueName(propIndex, ival, UProperty.NameChoice.LONG);
if (propValue == null) propValue = String.valueOf(ival);
}
break;
}
if (propValue != null) {
alpha.put(propName, propValue);
}
}
}
String x;
String upper = x = UCharacter.toUpperCase(ULocale.ENGLISH,text);
if (!text.equals(x)) alpha.put("Uppercase", x);
String lower = x = UCharacter.toLowerCase(ULocale.ENGLISH,text);
if (!text.equals(x)) alpha.put("Lowercase", x);
String title = x = UCharacter.toTitleCase(ULocale.ENGLISH,text,null);
if (!text.equals(x)) alpha.put("Titlecase", x);
String nfc = x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFC);
if (!text.equals(x)) alpha.put("NFC", x);
String nfd = x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFD);
if (!text.equals(x)) alpha.put("NFD", x);
x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFKD);
if (!text.equals(x)) alpha.put("NFKD", x);
x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFKC);
if (!text.equals(x)) alpha.put("NFKC", x);
CanonicalIterator ci = new CanonicalIterator(text);
int count = 0;
for (String item = ci.next(); item != null; item = ci.next()) {
if (item.equals(text)) continue;
if (item.equals(nfc)) continue;
if (item.equals(nfd)) continue;
alpha.put("Other_Canonical_Equivalent#" + (++count), item);
}
CaseIterator cai = new CaseIterator();
cai.reset(text);
count = 0;
for (String item = cai.next(); item != null; item = cai.next()) {
if (item.equals(text)) continue;
if (item.equals(upper)) continue;
if (item.equals(lower)) continue;
if (item.equals(title)) continue;
alpha.put("Other_Case_Equivalent#" + (++count), item);
}
out.println("<table>");
out.println("<tr><td><b>" + "Character" + "</b></td><td><b>" + toHTML.transliterate(text) + "</b></td></tr>");
out.println("<tr><td><b>" + "Code_Point" + "</b></td><td><b>" + com.ibm.icu.impl.Utility.hex(cp,4) + "</b></td></tr>");
out.println("<tr><td><b>" + "Name" + "</b></td><td><b>" + toHTML.transliterate((String)alpha.get("Name")) + "</b></td></tr>");
alpha.remove("Name");
for (Iterator it = alpha.keySet().iterator(); it.hasNext();) {
String propName = (String) it.next();
String propValue = (String) alpha.get(propName);
out.println("<tr><td>" + propName + "</td><td>" + toHTML.transliterate(propValue) + "</td></tr>");
}
out.println("</table>");
}
private static void checkCase() {
System.out.println("Getting Values1");
UnicodeSet hasFrom = new UnicodeSet();
UnicodeSet hasTo = new UnicodeSet();
UnicodeSet isLower = new UnicodeSet();
UnicodeSet isUpper = new UnicodeSet();
UnicodeSet isTitle = new UnicodeSet();
for (int i = 0; i < 0x10FFFF; ++i) {
String si = UTF16.valueOf(i);
String xx;
xx = UCharacter.toLowerCase(si);
if (si.equals(xx)) {
isLower.add(i);
} else {
hasFrom.add(i);
hasTo.add(xx);
}
xx = UCharacter.toUpperCase(si);
if (si.equals(xx)) {
isUpper.add(i);
} else {
hasFrom.add(i);
hasTo.add(xx);
}
xx = UCharacter.toTitleCase(si,null);
if (si.equals(xx)) {
isTitle.add(i);
} else {
hasFrom.add(i);
hasTo.add(xx);
}
}
PrettyPrinter pp = new PrettyPrinter();
showDifferences(pp, "hasFrom", hasFrom, "hasTo", hasTo, "xxx", new UnicodeSet());
System.out.println("Getting Values2");
isLower.retainAll(hasFrom);
isUpper.retainAll(hasFrom);
isTitle.retainAll(hasFrom);
hasFrom.removeAll(isLower).removeAll(isUpper).removeAll(isTitle);
UnicodeSet upperAndTitle = new UnicodeSet(isUpper).retainAll(isTitle);
isUpper.removeAll(upperAndTitle);
isTitle.removeAll(upperAndTitle);
System.out.println("isLower: " + isLower.size());
System.out.println(com.ibm.icu.impl.Utility.escape(pp.toPattern(isLower)));
System.out.println("isUpper (alone): " + isUpper.size());
System.out.println(com.ibm.icu.impl.Utility.escape(pp.toPattern(isUpper)));
System.out.println("isTitle (alone): " + isTitle.size());
System.out.println(com.ibm.icu.impl.Utility.escape(pp.toPattern(isTitle)));
System.out.println("isUpperAndTitle: " + upperAndTitle.size());
System.out.println(com.ibm.icu.impl.Utility.escape(pp.toPattern(upperAndTitle)));
System.out.println("other: " + hasFrom.size());
System.out.println(com.ibm.icu.impl.Utility.escape(pp.toPattern(hasFrom)));
UnicodeSet LowercaseProperty = new UnicodeSet("[:Lowercase:]");
UnicodeSet LowercaseCategory = new UnicodeSet("[:Lowercase_Letter:]");
//System.out.println(pp.toPattern(isLower));
showDifferences(pp, "Lowercase", LowercaseProperty,
"Functionally Lowercase", isLower,
"Lowercase_Letter", LowercaseCategory);
UnicodeSet TitlecaseProperty = new UnicodeSet();
UnicodeSet TitlecaseCategory = new UnicodeSet("[:Titlecase_Letter:]");
showDifferences(pp, "Titlecase", TitlecaseProperty,
"Functionally Titlecase", isTitle,
"Titlecase_Letter", TitlecaseCategory);
UnicodeSet UppercaseProperty = new UnicodeSet("[:Uppercase:]");
UnicodeSet UppercaseCategory = new UnicodeSet("[:Uppercase_Letter:]");
showDifferences(pp, "Uppercase", UppercaseProperty,
"Functionally Uppercase", new UnicodeSet(isUpper).addAll(upperAndTitle),
"Uppercase_Letter", UppercaseCategory);
// UnicodeMap compare = new UnicodeMap();
// compare.putAll(isLower,"isLowercase&isCased");
//
// compare.composeWith(new UnicodeMap().putAll(LowercaseProperty,"Lowercase"), new MyComposer());
// compare.composeWith(new UnicodeMap().putAll(LowercaseProperty,"Lowercase_Letter"), new MyComposer());
// for (Iterator it = compare.getAvailableValues().iterator(); it.hasNext();) {
// String value = (String) it.next();
// UnicodeSet chars = compare.getSet(value);
// System.out.println(value + ", size: " + chars.size());
// System.out.println(com.ibm.icu.impl.Utility.escape(pp.toPattern(chars)));
// }
}
private static void showDifferences(PrettyPrinter pp,
String lowercaseTitle, UnicodeSet LowercaseProperty,
String funcLowerTitle, UnicodeSet isLower,
String lowercaseCatTitle, UnicodeSet LowercaseCategory) {
System.out.println("Getting Values3");
UnicodeSet[] categories = new UnicodeSet[8];
for (int i = 0; i < categories.length; ++i) categories[i] = new UnicodeSet();
for (int i = 0; i < 0x10FFFF; ++i) {
int sum = 0;
if (isLower.contains(i)) sum |= 1;
if (LowercaseCategory.contains(i)) sum |= 2;
if (LowercaseProperty.contains(i)) sum |= 4;
categories[sum].add(i);
}
System.out.println("Printing Values");
for (int i = 1; i < categories.length; ++i) {
if (categories[i].size() == 0) continue;
String name = "";
if ((i & 4) != 0) name += " & " + lowercaseTitle;
if ((i & 1) != 0) name += " & " + funcLowerTitle;
if ((i & 2) != 0) name += " & " + lowercaseCatTitle;
name = name.substring(3); // skip " & "
System.out.println(name + ", size: " + categories[i].size());
System.out.println(com.ibm.icu.impl.Utility.escape(pp.toPattern(categories[i])));
}
}
static class MyComposer implements UnicodeMap.Composer {
public Object compose(int codePoint, Object a, Object b) {
if (a == null) return b;
if (b == null) return a;
return a + " & " + b;
}
}
static Counter bufferTypes = new Counter();
static class BufferData {
byte starterIsZero;
int initials;
int medials;
int finals;
int sample;
public boolean equals(Object other) {
BufferData that = (BufferData)other;
return starterIsZero == that.starterIsZero && initials == that.initials && medials == that.medials && finals == that.finals;
}
public int hashCode() {
return ((starterIsZero * 37 + initials)*37 + medials)*37 + finals;
}
public BufferData set(int codepoint) {
String s = Default.nfkd().normalize(codepoint);
int cp;
starterIsZero = (byte)(UCharacter.getCombiningClass(codepoint) == 0 ? 0 : 1);
boolean isInitial = true;
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
int ccc = UCharacter.getCombiningClass(cp);
if (ccc != 0) {
if (isInitial) {
++initials;
} else {
++finals;
}
} else {
isInitial = false;
medials += finals + 1;
finals = 0;
}
}
if (medials != 0) medials = 1;
sample = codepoint;
if (starterIsZero == 0 && medials == 0) {
System.out.println("WARNING: BAD CHARACTER");
cp = sample;
int ccc = UCharacter.getCombiningClass(cp);
System.out.println("U+" + Utility.hex(cp) + "\t" + UCharacter.getName(cp) + " (ccc=" + ccc + ")");
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
ccc = UCharacter.getCombiningClass(cp);
System.out.println("\tU+" + Utility.hex(cp) + "\t" + UCharacter.getName(cp) + " (ccc=" + ccc + ")");
}
}
return this;
}
public static String getHeader() {
return "Starter?" + "\t" + "initials" + "\t" + "Contains Starter?" + "\t" + "finals" + "\t" + "sample hex" + "\t" + "sample name";
}
public String toString() {
String result = (starterIsZero == 0 ? "Y" : "") + "\t" + initials + "\t" + (medials != 0 ? "Y" : "") + "\t" + finals + "\t";
if (sample == 0) {
return result + "-" + "\t" + "all others";
}
return result + Utility.hex(sample) + "\t" + UCharacter.getName(sample);
}
}
static class BufferDataComparator implements Comparator {
public int compare(Object arg0, Object arg1) {
BufferData a0 = (BufferData)arg0;
BufferData a1 = (BufferData)arg1;
int result;
if (0 != (result = a0.starterIsZero - a1.starterIsZero)) return result;
if (0 != (result = a0.initials - a1.initials)) return result;
if (0 != (result = a0.finals - a1.finals)) return result;
if (0 != (result = a0.medials - a1.medials)) return result;
return 0;
}
}
private static void showLeadingTrailingNonStarters() {
BufferData non = new BufferData().set(0);
Tabber tabber = new Tabber.HTMLTabber();
for (int i = 0; i <= 0x10ffff; ++i) {
int type = Default.ucd().getCategory(i);
if (type == UCD.UNASSIGNED || type == UCD.PRIVATE_USE || type == UCD.SURROGATE) {
bufferTypes.add(non,1);
continue;
}
bufferTypes.add(new BufferData().set(i),1);
}
Map m = bufferTypes.getMap();
TreeSet sorted = new TreeSet(new BufferDataComparator());
NumberFormat nf = NumberFormat.getInstance();
sorted.addAll(m.keySet());
System.out.println(tabber.process("total\t" + BufferData.getHeader()));
for (Iterator it = sorted.iterator(); it.hasNext();) {
Object key = it.next();
Object value = bufferTypes.getCount(key);
System.out.println(tabber.process(nf.format(value) + "\t" + key));
}
}
}