| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Compare14652.java,v $ |
| * $Date: 2004/02/07 01:01:16 $ |
| * $Revision: 1.3 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.text.UCD; |
| |
| import java.util.*; |
| import java.io.*; |
| |
| import com.ibm.text.utility.*; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| // quick and dirty function for grabbing contents of ISO 14652 file |
| |
| public class Compare14652 implements UCD_Types { |
| |
| static final boolean oldVersion = false; |
| |
| public static UnicodeSet getSet(int prop, byte propValue) { |
| return UnifiedBinaryProperty.make(prop | propValue).getSet(); |
| } |
| |
| static UnicodeSet |
| titleSet = getSet(CATEGORY, Lt), |
| combiningSet = getSet(CATEGORY, Mc) |
| .addAll(getSet(CATEGORY, Me)) |
| .addAll(getSet(CATEGORY, Mn)), |
| zSet = getSet(CATEGORY, Zs) |
| .addAll(getSet(CATEGORY, Zl)) |
| .addAll(getSet(CATEGORY, Zp)), |
| pSet = getSet(CATEGORY, Pd) |
| .addAll(getSet(CATEGORY, Ps)) |
| .addAll(getSet(CATEGORY, Pe)) |
| .addAll(getSet(CATEGORY, Pc)) |
| .addAll(getSet(CATEGORY, Po)) |
| .addAll(getSet(CATEGORY, Pi)) |
| .addAll(getSet(CATEGORY, Pf)), |
| sSet = getSet(CATEGORY, Sm) |
| .addAll(getSet(CATEGORY, Sc)) |
| .addAll(getSet(CATEGORY, Sk)) |
| .addAll(getSet(CATEGORY, So)), |
| noSet = getSet(CATEGORY, No), |
| csSet = getSet(CATEGORY, Cs), |
| cfSet = getSet(CATEGORY, Cf), |
| cnSet = getSet(CATEGORY, Cn), |
| circled = getSet(DECOMPOSITION_TYPE, COMPAT_CIRCLE), |
| whitespaceSet = getSet(BINARY_PROPERTIES, White_space), |
| alphaSet = getSet(DERIVED, PropAlphabetic).addAll(combiningSet), |
| lowerSet = getSet(DERIVED, PropLowercase).addAll(titleSet).removeAll(circled), |
| upperSet = getSet(DERIVED, PropUppercase).addAll(titleSet).removeAll(circled), |
| digitSet = getSet(CATEGORY, Nd), |
| xdigitSet = new UnicodeSet("[a-fA-F\uFF21-\uFF26\uFF41-\uFF46]").addAll(digitSet), |
| spaceSet = whitespaceSet.size() == 0 ? zSet : whitespaceSet, |
| controlSet = getSet(CATEGORY, Cc), |
| punctSet = new UnicodeSet(pSet).addAll(sSet), |
| graphSet = new UnicodeSet(0,0x10ffff) |
| .removeAll(controlSet) |
| //.removeAll(getSet(CATEGORY, Cf)) |
| .removeAll(csSet) |
| .removeAll(cnSet) |
| .removeAll(zSet), |
| // Cc, Cf, Cs, Cn, Z |
| blankSet = new UnicodeSet(spaceSet).removeAll(new UnicodeSet("[\\u000A-\\u000D\\u0085]")) |
| .removeAll(getSet(CATEGORY, Zl)) |
| .removeAll(getSet(CATEGORY, Zp)); |
| |
| |
| static class Prop { |
| String name; |
| UnicodeSet contents = new UnicodeSet(); |
| String guess = "???"; |
| UnicodeSet guessContents = new UnicodeSet(); |
| |
| String wsname = whitespaceSet.size() == 0 ? "gc=Z" : "Whitespace"; |
| |
| Prop(String name) { |
| this.name = name; |
| if (name.equals("alpha")) { |
| guess = "Alphabetic + gc=M"; |
| guessContents = alphaSet; |
| } else if (name.equals("lower")) { |
| guess = "Lowercase + gc=Lt - dt=circle"; |
| guessContents = lowerSet; |
| } else if (name.equals("upper")) { |
| guess = "Uppercase + gc=Lt - dt=circle"; |
| guessContents = upperSet; |
| } else if (name.equals("digit")) { |
| guess = "gc=Nd"; |
| guessContents = digitSet; |
| } else if (name.equals("xdigit")) { |
| guess = "gc=Nd+a..f (upper/lower,normal/fullwidth)"; |
| guessContents = xdigitSet; |
| } else if (name.equals("space")) { |
| guess = wsname; |
| guessContents = spaceSet; |
| //Utility.showSetNames("Whitespace", spaceSet, true, Default.ucd); |
| } else if (name.equals("cntrl")) { |
| guess = "gc=Cc"; |
| guessContents = controlSet; |
| } else if (name.equals("punct")) { |
| guess = "gc=P,S"; |
| guessContents = punctSet; |
| } else if (name.equals("graph")) { |
| guess = "All - gc=Cc, Cs, Cn, or Z"; |
| guessContents = graphSet; |
| } else if (name.equals("blank")) { |
| guess = wsname + " - (LF,VT,FF,CR,NEL + gc=Zl,Zp)"; |
| guessContents = blankSet; |
| } else if (name.equals("ISO_14652_class \"combining\"")) { |
| guess = "gc=M"; |
| guessContents = combiningSet; |
| } |
| |
| |
| /*upper |
| lower |
| alpha |
| digit |
| outdigit |
| space |
| cntrl |
| punct |
| graph |
| xdigit |
| blank |
| toupper |
| tolower |
| */ |
| } |
| |
| void show(PrintWriter pw) { |
| if (name.equals("ISO_14652_LC_CTYPE")) return; |
| if (name.equals("ISO_14652_toupper")) return; |
| if (name.equals("ISO_14652_tolower")) return; |
| if (name.equals("ISO_14652_outdigit")) return; |
| if (name.equals("ISO_14652_outdigit")) return; |
| if (name.startsWith("ISO_14652_class")) return; |
| |
| pw.println(); |
| pw.println("**************************************************"); |
| pw.println(name); |
| pw.println("**************************************************"); |
| Utility.showSetDifferences(pw, name, contents, guess, guessContents, false, true, null, Default.ucd()); |
| //pw.println(props[i].contents); |
| } |
| } |
| |
| static Prop[] props = new Prop[100]; |
| static int propCount = 0; |
| |
| public static void main(String[] args) throws IOException { |
| |
| String version = Default.ucd().getVersion(); |
| PrintWriter log = Utility.openPrintWriter("Diff14652_" + version + ".txt", Utility.UTF8_WINDOWS); |
| try { |
| log.write('\uFEFF'); |
| log.print("Version: " + version); |
| |
| if (false) { |
| UnicodeSet ID = getSet(DERIVED, ID_Start).addAll(getSet(DERIVED, ID_Continue_NO_Cf)); |
| UnicodeSet XID = getSet(DERIVED, Mod_ID_Start).addAll(getSet(DERIVED, Mod_ID_Continue_NO_Cf)); |
| UnicodeSet alphanumSet = new UnicodeSet(alphaSet).addAll(digitSet).addAll(getSet(CATEGORY, Pc)); |
| |
| Utility.showSetDifferences("ID", ID, "XID", XID, false, Default.ucd()); |
| Utility.showSetDifferences("ID", ID, "Alphabetic+Digit+Pc", alphanumSet, false, Default.ucd()); |
| } |
| |
| BufferedReader br = Utility.openReadFile("C:\\DATA\\ISO14652_CTYPE.txt", Utility.LATIN1); |
| while (true) { |
| String line = br.readLine(); |
| if (line == null) break; |
| line = line.trim(); |
| if (line.length() == 0) continue; |
| if (line.charAt(line.length() - 1) == '/') { |
| line = line.substring(0, line.length() - 1); |
| } |
| line = line.trim(); |
| if (line.length() == 0) continue; |
| |
| char ch = line.charAt(0); |
| if (ch == '%') continue; |
| if (ch == '(') continue; |
| if (ch == '<') { |
| addItems(line, props[propCount-1].contents); |
| } else { |
| // new property |
| System.out.println(line); |
| if (line.equals("width")) break; |
| props[propCount] = new Prop(line); |
| props[propCount].name = "ISO_14652_" + line; |
| props[propCount].contents = new UnicodeSet(); |
| propCount++; |
| } |
| } |
| |
| for (int i = 0; i < propCount; ++i) props[i].show(log); |
| |
| log.println(); |
| log.println("**************************************************"); |
| log.println("Checking POSIX requirements for inclusion and disjointness."); |
| log.println("**************************************************"); |
| log.println(); |
| /* |
| alpha, digit, punct, cntrl are all disjoint |
| space, cntrl, blank are pairwise disjoint with any of alpha, digit, xdigit |
| alpha includes upper, lower |
| graph includes alpha, digit, punct |
| print includes graph |
| xdigit includes digit |
| */ |
| Prop |
| alpha = getProp("ISO_14652_alpha"), |
| upper = getProp("ISO_14652_upper"), |
| lower = getProp("ISO_14652_lower"), |
| graph = getProp("ISO_14652_graph"), |
| //print = getProp("ISO_14652_print"), |
| punct = getProp("ISO_14652_punct"), |
| digit = getProp("ISO_14652_digit"), |
| xdigit = getProp("ISO_14652_xdigit"), |
| space = getProp("ISO_14652_space"), |
| blank = getProp("ISO_14652_blank"), |
| cntrl = getProp("ISO_14652_cntrl"); |
| |
| checkDisjoint(log, new Prop[] {alpha, digit, punct, cntrl}); |
| |
| Prop [] l1 = new Prop[] {space, cntrl, blank}; |
| Prop [] l2 = new Prop[] {alpha, digit, xdigit}; |
| for (int i = 0; i < l1.length; ++i) { |
| for (int j = i + 1; j < l2.length; ++j) { |
| checkDisjoint(log, l1[i], l2[j]); |
| } |
| } |
| checkIncludes(log, alpha, upper); |
| checkIncludes(log, alpha, lower); |
| checkIncludes(log, graph, alpha); |
| checkIncludes(log, graph, digit); |
| checkIncludes(log, graph, punct); |
| //checkIncludes(log, print, graph); |
| checkIncludes(log, xdigit, digit); |
| |
| |
| // possibly alpha, digit, punct, cntrl, space cover the !(Cn,Cs) |
| |
| UnicodeSet trRemainder = new UnicodeSet(cnSet) |
| .complement() |
| .removeAll(csSet) |
| .removeAll(digit.contents) |
| .removeAll(punct.contents) |
| .removeAll(alpha.contents) |
| .removeAll(cntrl.contents) |
| .removeAll(space.contents); |
| Utility.showSetNames(log, "TR Remainder: ", trRemainder, false, false, Default.ucd()); |
| |
| UnicodeSet propRemainder = new UnicodeSet(cnSet) |
| .complement() |
| .removeAll(csSet) |
| //.removeAll(noSet) |
| //.removeAll(cfSet) |
| .removeAll(digit.guessContents) |
| .removeAll(punct.guessContents) |
| .removeAll(alpha.guessContents) |
| .removeAll(cntrl.guessContents) |
| .removeAll(space.guessContents); |
| Utility.showSetNames(log, "Prop Remainder: ", propRemainder, false, false, Default.ucd()); |
| |
| /* |
| checkDisjoint(new Prop[] {alpha, digit, punct, cntrl}); |
| UnicodeSet remainder = cnSet.complement(); |
| UnicodeSet guessRemainder = new UnicodeSet(remainder); |
| for (int i = 0; i < list.length; ++i) { |
| for (int j = i + 1; j < list.length; ++j) { |
| compare(log, list[i].name, list[i].contents, list[j].name, list[j].contents); |
| compare(log, list[i].guess, list[i].guessContents, list[j].guess, list[j].guessContents); |
| } |
| remainder.removeAll(list[i].contents); |
| guessRemainder.removeAll(list[i].guessContents); |
| } |
| if (remainder.size() != 0) { |
| log.println(); |
| log.println("Incomplete (TR): " + remainder); |
| } |
| if (guessRemainder.size() != 0) { |
| log.println(); |
| log.println("Incomplete (Prop): " + guessRemainder); |
| } |
| */ |
| |
| } finally { |
| log.close(); |
| } |
| } |
| |
| static void checkDisjoint(PrintWriter log, Prop[] list) { |
| for (int i = 0; i < list.length; ++i) { |
| for (int j = i + 1; j < list.length; ++j) { |
| checkDisjoint(log, list[i], list[j]); |
| } |
| } |
| } |
| |
| static void checkDisjoint(PrintWriter log, Prop prop1, Prop prop2) { |
| checkDisjoint(log, prop1.name, prop1.contents, prop2.name, prop2.contents); |
| checkDisjoint(log, prop1.guess, prop1.guessContents, prop2.guess, prop2.guessContents); |
| } |
| |
| static void checkDisjoint(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) { |
| if (set.containsSome(set2)) { |
| log.println(); |
| log.println("Fails test: " + name + " disjoint-with " + name2); |
| UnicodeSet diff = new UnicodeSet(set).retainAll(set2); |
| Utility.showSetNames(log, "", diff, false, false, Default.ucd()); |
| } |
| } |
| |
| static void checkIncludes(PrintWriter log, Prop prop1, Prop prop2) { |
| checkIncludes(log, prop1.name, prop1.contents, prop2.name, prop2.contents); |
| checkIncludes(log, prop1.guess, prop1.guessContents, prop2.guess, prop2.guessContents); |
| } |
| |
| static void checkIncludes(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) { |
| if (!set.containsAll(set2)) { |
| log.println(); |
| log.println("Fails test:" + name + " includes " + name2); |
| UnicodeSet diff = new UnicodeSet(set2).removeAll(set); |
| Utility.showSetNames(log, "", diff, false, false, Default.ucd()); |
| } |
| } |
| |
| static String[] pieces = new String[100]; |
| |
| // example: <U1F48>..<U1F4D>;<U1F59>;<U1F5B>;<U1F5D>;<U1F5F>;<U1F68>..<U1F6F>;/ |
| static void addItems(String line, UnicodeSet contents) { |
| int len = Utility.split(line, ';', pieces); |
| for (int i = 0; i < len; ++i) { |
| String piece = pieces[i].trim(); |
| if (piece.length() == 0) continue; |
| if (piece.equals("<0>")) continue; |
| int start, end; |
| int rangePoint = piece.indexOf(".."); |
| if (rangePoint >= 0) { |
| start = parse(piece.substring(0,rangePoint)); |
| end = parse(piece.substring(rangePoint+2)); |
| } else { |
| start = end = parse(piece); |
| } |
| contents.add(start, end); |
| } |
| } |
| |
| static int parse(String piece) { |
| if (!piece.startsWith("<U") || !piece.endsWith(">")) { |
| throw new IllegalArgumentException("Bogus code point: " + piece); |
| } |
| return Integer.parseInt(piece.substring(2,piece.length()-1), 16); |
| } |
| |
| static Prop getProp(String name) { |
| //System.out.println("Searching for: " + name); |
| for (int i = 0; i < propCount; ++i) { |
| //System.out.println("Checking: " + props[i].name); |
| if (props[i].name.equals(name)) { |
| return props[i]; |
| } |
| } |
| //System.out.println("Missed"); |
| return null; |
| } |
| |
| // oddities: |
| // extra space after ';' <U0300>..<U036F>; <U20D0>..<U20FF>; <UFE20>..<UFE2F>;/ |
| // <0>?? <0>;<U0BE7>..<U0BEF>;/ |
| // <U202C>; <U202D>;<U202E>; <UFEFF> : 0;/ |
| // % "print" is by default "graph", and the <space> character |
| // print is odd, since it includes space but not other spaces. |
| // alnum not defined. |
| |
| } |