| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $ |
| * $Date: 2005/04/06 08:48:16 $ |
| * $Revision: 1.13 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.text.UCA; |
| |
| import java.util.*; |
| import java.io.*; |
| import com.ibm.text.UCD.*; |
| import com.ibm.text.utility.*; |
| import com.ibm.icu.text.UTF16; |
| |
| public class GenOverlap implements UCD_Types, UCA_Types { |
| |
| static Map completes = new TreeMap(); |
| static Map back = new HashMap(); |
| static Map initials = new HashMap(); |
| static int[] ces = new int[50]; |
| static UCA collator; |
| static UCD ucd; |
| static Normalizer nfd; |
| static Normalizer nfkd; |
| |
| public static void validateUCA(UCA collatorIn) throws Exception { |
| collator = collatorIn; |
| ucd = UCD.make(); |
| |
| nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion()); |
| nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion()); |
| |
| for (int cp = 0x0; cp <= 0x10FFFF; ++cp) { |
| Utility.dot(cp); |
| if (!ucd.isRepresented(cp)) continue; |
| byte decompType = ucd.getDecompositionType(cp); |
| if (decompType >= UCD.COMPATIBILITY) { |
| String decomp = nfkd.normalize(cp); |
| CEList celistDecomp = getCEList(cp, decomp, true, decompType); |
| CEList celistNormal = getCEList(UTF16.valueOf(cp), false); |
| if (!celistNormal.equals(celistDecomp)) { |
| Utility.fixDot(); |
| System.out.println(); |
| System.out.println(ucd.getCodeAndName(cp)); |
| System.out.println(celistNormal); |
| System.out.println(celistDecomp); |
| } |
| } |
| } |
| |
| } |
| |
| public static void test(UCA collatorIn) throws Exception { |
| collator = collatorIn; |
| |
| CEList.main(null); |
| |
| System.out.println("# Overlap"); |
| System.out.println("# Generated " + Default.getDate()); |
| |
| ucd = UCD.make(); |
| |
| nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion()); |
| nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion()); |
| |
| UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); |
| |
| // store data for faster lookup |
| |
| System.out.println("# Gathering Data"); |
| int counter = 0; |
| |
| int[] lenArray = new int[1]; |
| |
| while (true) { |
| |
| Utility.dot(counter++); |
| String s = cc.next(ces, lenArray); |
| if (s == null) break; |
| int len = lenArray[0]; |
| |
| CEList currCEList = new CEList(ces, 0, len); |
| addString(s, currCEList); |
| } |
| |
| /* |
| for (int cp = 0x10000; cp <= 0x10FFFF; ++cp) { |
| if (!ucd.isRepresented(cp)) continue; |
| byte decompType = ucd.getDecompositionType(cp); |
| if (decompType >= UCD.COMPATIBILITY) { |
| String decomp = nfkd.normalize(cp); |
| CEList celist = getCEList(cp, decomp, true, decompType); |
| addString(decomp, celist); |
| System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist); |
| } |
| } |
| */ |
| |
| Utility.fixDot(); |
| System.out.println("# Completes Count: " + completes.size()); |
| System.out.println("# Initials Count: " + initials.size()); |
| System.out.println("# Writing Overlaps"); |
| |
| // simpleList(); |
| fullCheck(); |
| } |
| |
| public static void addString(String s, CEList currCEList) { |
| back.put(s, currCEList); |
| completes.put(currCEList, s); |
| |
| for (int i = 1; i < currCEList.length(); ++i) { |
| CEList start = currCEList.start(i); |
| Set bag = (Set) initials.get(start); |
| if (bag == null) { |
| bag = new TreeSet(); |
| initials.put(start, bag); |
| } |
| bag.add(s); |
| } |
| } |
| |
| |
| static void simpleList() { |
| Iterator it = completes.keySet().iterator(); |
| int counter = 0; |
| int foundCount = 0; |
| |
| while (it.hasNext()) { |
| Utility.dot(counter++); |
| |
| // see if the ces for the current element are the start of something else |
| CEList key = (CEList) it.next(); |
| String val = (String) completes.get(key); |
| Set probe = (Set) initials.get(key); |
| |
| if (probe != null) { |
| Utility.fixDot(); |
| foundCount++; |
| System.out.println("Possible Overlap: "); |
| System.out.println(" " + ucd.getCodeAndName(val)); |
| System.out.println("\t" + key); |
| |
| Iterator it2 = probe.iterator(); |
| int count2 = 0; |
| while (it2.hasNext()) { |
| String match = (String) it2.next(); |
| CEList ceList = (CEList) back.get(match); |
| System.out.println((count2++) + ". " + ucd.getCodeAndName(match)); |
| System.out.println("\t" + ceList); |
| } |
| } |
| } |
| System.out.println("# Found Count: " + foundCount); |
| } |
| |
| static boolean PROGRESS = false; |
| |
| static void fullCheck() throws IOException { |
| PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.html", Utility.UTF8_WINDOWS); |
| PrintWriter simpleList = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.txt", Utility.UTF8_WINDOWS); |
| |
| Iterator it = completes.keySet().iterator(); |
| int counter = 0; |
| int foundCount = 0; |
| |
| String [] goalChars = new String[1]; |
| String [] matchChars = new String[1]; |
| |
| // CEList show = getCEList("\u2034"); |
| Utility.writeHtmlHeader(log, "Overlaps"); |
| log.print("<table>"); |
| |
| while (it.hasNext()) { |
| Utility.dot(counter++); |
| CEList key = (CEList) it.next(); |
| if (key.length() < 2) continue; |
| |
| String val = (String) completes.get(key); |
| goalChars[0] = ""; |
| matchChars[0] = ""; |
| if (matchWhole(val, key, 0, goalChars, matchChars)) { |
| |
| simpleList.println(ucd.getCodeAndName(val)); |
| |
| goalChars[0] = val + goalChars[0]; // fix first char |
| |
| if (!getCEList(goalChars[0]).equals(getCEList(matchChars[0]))) { |
| log.println("<tr><td colspan='6'>WARNING:" + getCEList(matchChars[0]) + "</td></tr>"); |
| } |
| foundCount++; |
| log.println("<tr><td>" + val + "</td>"); |
| log.println("<td>" + goalChars[0] + "</td>"); |
| log.println("<td>" + matchChars[0] + "</td>"); |
| log.println("<td>" + ucd.getCodeAndName(goalChars[0]) + "</td>"); |
| log.println("<td>" + ucd.getCodeAndName(matchChars[0]) + "</td>"); |
| log.println("<td>" + getCEList(goalChars[0]) + "</td></tr>"); |
| //log.println("\t" + ); |
| } |
| } |
| log.println("</tr></table>Number of Overlapping characters: " + foundCount + "</body>"); |
| log.close(); |
| simpleList.close(); |
| } |
| |
| static private CEList getCEList(String s) { |
| return getCEList(s, true); |
| } |
| |
| static private CEList getCEList(String s, boolean decomp) { |
| int len = collator.getCEs(s, decomp, ces); |
| return new CEList(ces, 0, len); |
| } |
| |
| static private CEList getCEList(int originalChar, String s, boolean decomp, byte type) { |
| int len = collator.getCEs(s, decomp, ces); |
| if (decomp) { |
| for (int i = 0; i < len; ++i) { |
| ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]), |
| UCA.getSecondary(ces[i]), |
| CEList.remap(originalChar, type, UCA.getTertiary(ces[i]))); |
| } |
| } |
| return new CEList(ces, 0, len); |
| } |
| |
| static boolean matchWhole(String goalStr, CEList goal, int depth, String[] goalChars, String[] otherChars) { |
| |
| if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Trying: " + ucd.getCodeAndName(goalStr) + ", " + goal); |
| |
| // to stop infinite loops, we limit the depth to 5 |
| if (depth > 5) { |
| if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "stack exhausted"); |
| return false; |
| } |
| |
| String match; |
| |
| // There are 3 possible conditions. Any of which work. |
| |
| // To eliminate double matches at the top level, we test depth > 0 |
| |
| if (depth > 0) { |
| |
| // Condition 1. |
| // we have an exact match |
| |
| match = (String) completes.get(goal); |
| if (match != null) { |
| if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Exactly: " + ucd.getCodeAndName(match)); |
| otherChars[0] = match + otherChars[0]; |
| if (PROGRESS) System.out.println(Utility.repeat(". ", depth) |
| + ucd.getCode(goalChars[0]) |
| + " / " + ucd.getCode(otherChars[0]) |
| ); |
| return true; |
| } |
| |
| |
| // Condition 2 |
| // this whole string matches some initial portion of another string |
| // AND the remainder of that other string also does a matchWhole. |
| // Example: if we get the following, we search for a match to "de" |
| // abc... |
| // abcde |
| // If we find a match, we append to the strings, the string for abc |
| // and the one for abcde |
| |
| Set probe = (Set) initials.get(goal); |
| if (probe != null) { |
| Iterator it2 = probe.iterator(); |
| while (it2.hasNext()) { |
| match = (String) it2.next(); |
| if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Longer: " + ucd.getCodeAndName(match) |
| + "\t\tswitching"); |
| CEList trail = ((CEList) back.get(match)).end(goal.length()); |
| boolean doesMatch = matchWhole(match, trail, depth+1, otherChars, goalChars); |
| if (doesMatch) { |
| otherChars[0] = match + otherChars[0]; |
| if (PROGRESS) System.out.println(Utility.repeat(". ", depth) |
| + ucd.getCode(goalChars[0]) |
| + " / " + ucd.getCode(otherChars[0]) |
| ); |
| return true; |
| } |
| } |
| } |
| } |
| |
| // Condition 3 |
| // the first part of this string matches a whole other string |
| // and the remainder of this string also does a matchWhole |
| // Example: if we get the following, we search for a match to "de" |
| // abcde.. |
| // abc.. |
| // if we find a match |
| |
| for (int i = goal.length() - 1; i > 0; --i) { |
| CEList first = goal.start(i); |
| match = (String) completes.get(first); |
| if (match != null) { |
| if (PROGRESS) System.out.println(Utility.repeat(". ", depth) + "Matches Shorter: " + ucd.getCodeAndName(match)); |
| boolean doesMatch = matchWhole("", goal.end(i), depth+1, goalChars, otherChars); |
| if (doesMatch) { |
| otherChars[0] = match + otherChars[0]; |
| if (PROGRESS) System.out.println(Utility.repeat(". ", depth) |
| + ucd.getCode(goalChars[0]) |
| + " / " + ucd.getCode(otherChars[0]) |
| ); |
| return true; |
| } |
| } |
| } |
| |
| // if we get this far, we failed. |
| |
| return false; |
| } |
| |
| public static void generateRevision (UCA collatorIn) throws Exception { |
| //generateRevision(collatorIn, false); |
| generateRevision(collatorIn, true); |
| } |
| |
| public static void generateRevision (UCA collatorIn, boolean doMax) throws Exception { |
| collator = collatorIn; |
| |
| CEList.main(null); |
| |
| System.out.println("# Generate"); |
| System.out.println("# Generated " + Default.getDate()); |
| |
| ucd = UCD.make(); |
| |
| nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion()); |
| nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion()); |
| |
| UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); |
| |
| // store data for faster lookup |
| |
| System.out.println("# Gathering Data"); |
| int counter = 0; |
| |
| int[] lenArray = new int[1]; |
| |
| Set list = new TreeSet(); |
| Map newCollisions = new HashMap(); |
| Map oldCollisions = new HashMap(); |
| Map newProblems = new TreeMap(); |
| Map oldProblems = new TreeMap(); |
| |
| CEList nullCEList = new CEList(new int[1]); |
| |
| while (true) { |
| Utility.dot(counter++); |
| String str = cc.next(ces, lenArray); |
| if (str == null) break; |
| int len = lenArray[0]; |
| |
| CEList oldList = new CEList(ces, 0, len); |
| |
| CEList newList = new CEList(ces,0,0); |
| int cp; |
| for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(str, i); |
| if (0xFF3F == cp) { |
| System.out.println("debug"); |
| } |
| boolean mashLast = false; |
| if (!nfkd.isNormalized(cp)) { |
| String decomp = nfkd.normalize(cp); |
| String canon = nfd.normalize(cp); |
| len = collator.getCEs(decomp, true, ces); |
| if (!decomp.equals(canon)) { |
| byte type = ucd.getDecompositionType(cp); |
| for (int j = 0; j < len; ++j) { |
| int p = (i == 0 && decomp.length() > 1 && decomp.charAt(0) == ' ' ? 0x20A : UCA.getPrimary(ces[j])); |
| int s = UCA.getSecondary(ces[j]); |
| boolean needsFix = (s != 0x20 && p != 0); |
| if (needsFix) ++len; |
| int t = (doMax && j > 0 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j]))); |
| if (needsFix) { |
| ces[j++] = UCA.makeKey(p, 0x20, t); // Set Extra |
| System.arraycopy(ces, j, ces, j+1, len - j); // Insert HOLE! |
| p = 0; |
| } |
| ces[j] = UCA.makeKey(p, s, t); |
| } |
| } |
| } else { |
| len = collator.getCEs(UTF16.valueOf(cp), true, ces); |
| } |
| CEList inc = new CEList(ces, 0, len); |
| |
| if (cp == 0xFF71 || cp == 0xFF67) { |
| System.out.println(" String: " + ucd.getCodeAndName(cp)); |
| System.out.println(" Type: " + ucd.getDecompositionTypeID(cp)); |
| System.out.println(" xxx: " + inc); |
| } |
| |
| newList = newList.append(inc); |
| |
| } |
| if (newList.length() == 0) newList = nullCEList; |
| if (oldList.length() == 0) oldList = nullCEList; |
| |
| if (!newList.equals(oldList)) { |
| /* |
| System.out.println("String: " + ucd.getCodeAndName(str)); |
| System.out.println("\tOld: " + oldList); |
| System.out.println("\tNew: " + newList); |
| */ |
| list.add(new Pair(newList, new Pair(str, oldList))); |
| } |
| |
| // check for collisions |
| if (str.equals("\u206F")) { |
| System.out.println("debug"); |
| } |
| Object probe = newCollisions.get(newList); |
| if (probe == null) { |
| newCollisions.put(newList, str); |
| } else { |
| newProblems.put(str, new Pair((String)probe, newList)); |
| } |
| |
| probe = oldCollisions.get(oldList); |
| if (probe == null) { |
| oldCollisions.put(oldList, str); |
| } else { |
| oldProblems.put(str, new Pair((String)probe, oldList)); |
| } |
| |
| } |
| |
| Set newKeys = new TreeSet(newProblems.keySet()); |
| Set oldKeys = new TreeSet(oldProblems.keySet()); |
| Set joint = new TreeSet(newKeys); |
| joint.retainAll(oldKeys); |
| newKeys.removeAll(joint); |
| oldKeys.removeAll(joint); |
| |
| PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS); |
| Iterator it = list.iterator(); |
| int last = -1; |
| while (it.hasNext()) { |
| Utility.dot(counter++); |
| Pair value = (Pair) it.next(); |
| CEList newList = (CEList)value.first; |
| int cur = UCA.getPrimary(newList.at(0)); |
| if (cur != last) { |
| log.println(); |
| last = cur; |
| } |
| Pair v2 = (Pair) value.second; |
| String ss = (String)v2.first; |
| log.println(ucd.getCodeAndName(ss) + "\t\t" + ucd.getDecompositionTypeID(ss.charAt(0))); |
| log.println("\tnew:\t" + value.first); |
| log.println("\told:\t" + v2.second); |
| } |
| |
| /* |
| log.println(); |
| log.println("New Collisions: " + newKeys.size()); |
| it = newKeys.iterator(); |
| while (it.hasNext()) { |
| String key = (String) it.next(); |
| CEList cel = (CEList) newProblems.get(key); |
| String other = (String) newCollisions.get(cel); |
| log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other)); |
| log.println("\t" + cel); |
| } |
| |
| log.println("Removed Collisions: " + oldKeys.size()); |
| it = oldKeys.iterator(); |
| while (it.hasNext()) { |
| String key = (String) it.next(); |
| CEList cel = (CEList) oldProblems.get(key); |
| String other = (String) oldCollisions.get(cel); |
| log.println(ucd.getCodeAndName(key) + " collides with " + ucd.getCodeAndName(other)); |
| log.println("\t" + cel); |
| } |
| */ |
| |
| showCollisions(log, "New Collisions:", newKeys, newProblems); |
| showCollisions(log, "Old Collisions:", oldKeys, oldProblems); |
| showCollisions(log, "In Both:", joint, oldProblems); |
| log.close(); |
| } |
| |
| static void showCollisions(PrintWriter log, String title, Set bad, Map probs) { |
| log.println(); |
| log.println(title + bad.size()); |
| Iterator it = bad.iterator(); |
| Set lister = new TreeSet(); |
| |
| while (it.hasNext()) { |
| String key = (String) it.next(); |
| Pair pair = (Pair) probs.get(key); |
| String other = (String) pair.first; |
| CEList cel = (CEList) pair.second; |
| if (key.equals("\u0001")) { |
| System.out.println("debug"); |
| } |
| lister.add(new Pair(cel, ucd.getCodeAndName(key) + ",\t" + ucd.getCodeAndName(other))); |
| } |
| |
| it = lister.iterator(); |
| int last = -1; |
| while (it.hasNext()) { |
| Pair pair = (Pair) it.next(); |
| CEList cel = (CEList) pair.first; |
| int curr = UCA.getPrimary(cel.at(0)); |
| if (curr != last) { |
| last = curr; |
| log.println(); |
| } |
| log.println("Collision between: " + pair.second); |
| log.println("\t" + pair.first); |
| } |
| log.flush(); |
| } |
| |
| public static void checkHash(UCA collatorIn) throws Exception { |
| collator = collatorIn; |
| |
| System.out.println("# Check Hash"); |
| System.out.println("# Generated " + Default.getDate()); |
| |
| ucd = UCD.make(); |
| |
| //nfd = new Normalizer(Normalizer.NFD); |
| //nfkd = new Normalizer(Normalizer.NFKD); |
| |
| UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); |
| nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion()); |
| nfkd = new Normalizer(Normalizer.NFKD, collatorIn.getUCDVersion()); |
| |
| |
| int tableLength = 257; |
| /* |
| 257 263 269 271 277 281 283 293 307 311 313 317 |
| 331 337 347 349 353 359 367 373 379 383 389 397 |
| 401 409 419 421 431 433 439 443 449 457 461 463 |
| 467 479 487 491 499 503 509 521 523 541 547 557 |
| 563 569 571 577 587 593 599 601 607 613 617 619 |
| 631 641 643 647 653 659 661 673 677 683 691 701 |
| 709 719 727 733 739 743 751 757 761 769 773 787 |
| 797 809 811 821 823 827 829 839 853 857 859 863 |
| 877 881 883 887 907 911 919 929 937 941 947 953 |
| 967 971 977 983 991 997 |
| |
| */ |
| int [][] collisions = new int[LIMIT_SCRIPT][]; |
| BitSet[] repeats = new BitSet[LIMIT_SCRIPT]; |
| for (int i = 0; i < collisions.length; ++i) { |
| collisions[i] = new int[tableLength]; |
| repeats[i] = new BitSet(); |
| } |
| |
| int counter = 0; |
| |
| int[] lenArray = new int[1]; |
| |
| if (false) while (true) { |
| |
| Utility.dot(counter++); |
| String s = cc.next(ces, lenArray); |
| if (s == null) break; |
| |
| if (UTF16.countCodePoint(s) != 1) continue; // skip ligatures |
| int cp = UTF16.charAt(s, 0); |
| if (!nfkd.isNormalized(cp)) continue; |
| |
| int script = ucd.getScript(cp); |
| int len = lenArray[0]; |
| for (int i = 0; i < len; ++i) { |
| int prim = UCA.getPrimary(ces[i]); |
| int hash = prim % tableLength; |
| if (!repeats[script].get(prim)) { |
| ++collisions[script][hash]; |
| repeats[script].set(prim); |
| } else { |
| System.out.println("Skipping: " + prim + " in " + ucd.getCodeAndName(cp)); |
| } |
| if (!repeats[UNUSED_SCRIPT].get(prim)) { |
| ++collisions[UNUSED_SCRIPT][hash]; |
| repeats[UNUSED_SCRIPT].set(prim); |
| } |
| } |
| } |
| |
| String [] latin = new String[tableLength]; |
| for (int i = 0; i < latin.length; ++i) { |
| latin[i] = ""; |
| } |
| |
| for (int cp = 0; cp < 0x10FFFF; ++cp) { |
| |
| Utility.dot(counter++); |
| if (!ucd.isAllocated(cp)) continue; |
| if (!nfkd.isNormalized(cp)) continue; |
| if (ucd.getCategory(cp) == Lu) continue; // don't count case |
| |
| String scp = UTF16.valueOf(cp); |
| int len = collator.getCEs(scp, true, ces); |
| int script = ucd.getScript(cp); |
| |
| for (int i = 0; i < len; ++i) { |
| int prim = UCA.getPrimary(ces[i]); |
| int hash = prim % tableLength; |
| if (!repeats[script].get(prim)) { |
| ++collisions[script][hash]; |
| repeats[script].set(prim); |
| if (script == LATIN_SCRIPT) latin[hash] += scp; |
| } |
| if (!repeats[UNUSED_SCRIPT].get(prim)) { |
| ++collisions[UNUSED_SCRIPT][hash]; |
| repeats[UNUSED_SCRIPT].set(prim); |
| } |
| } |
| } |
| |
| System.out.println("Data Gathered"); |
| |
| PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "checkstringsearchhash.html", Utility.UTF8_WINDOWS); |
| Utility.writeHtmlHeader(log, "Check Hash"); |
| log.println("<h1>Collisions</h1>"); |
| log.println("<p>Shows collisions among primary values when hashed to table size = " + tableLength + "."); |
| log.println("Note: All duplicate primarys are removed: all non-colliding values are removed.</p>"); |
| log.println("<table><tr><th>Script</th><th>Sum</th><th>Average</th><th>Std Dev.</th></tr>"); |
| |
| for (byte i = 0; i < collisions.length; ++i) { |
| if (i == UNUSED_SCRIPT) continue; |
| showCollisions(log, ucd.getScriptID_fromIndex(i), collisions[i]); |
| } |
| showCollisions(log, "All", collisions[UNUSED_SCRIPT]); |
| log.println("</table>"); |
| |
| log.println("<p>Details of collisions for Latin</p>"); |
| |
| for (int i = 0; i < latin.length; ++i) { |
| if (latin[i].length() < 2) continue; |
| //if (UTF16.countCodePoint(latin[i]) < 2) continue; |
| int cp2; |
| log.println("<table>"); |
| for (int j = 0; j < latin[i].length(); j += UTF16.getCharCount(cp2)) { |
| cp2 = UTF16.charAt(latin[i], j); |
| String scp2 = UTF16.valueOf(cp2); |
| CEList clist = collator.getCEList(scp2, true); |
| log.println("<tr><td>" + scp2 + "</td><td>" + clist + "</td><td>" + ucd.getCodeAndName(cp2) + "</td></tr>"); |
| } |
| log.println("</table><br>"); |
| } |
| |
| log.close(); |
| } |
| |
| static java.text.NumberFormat nf = new java.text.DecimalFormat("#,##0.00"); |
| static java.text.NumberFormat nf0 = new java.text.DecimalFormat("#,##0"); |
| |
| static void showCollisions(PrintWriter log, String title, int[] curr) { |
| |
| double sum = 0; |
| int count = 0; |
| for (int j = 0; j < curr.length; ++j) { |
| if (curr[j] == 0) continue; |
| sum += curr[j]; |
| ++count; |
| } |
| double average = sum / count; |
| |
| double sd = 0; |
| for (int j = 0; j < curr.length; ++j) { |
| if (curr[j] == 0) continue; |
| double deviation = curr[j] - average; |
| sd += deviation * deviation; |
| } |
| sd = Math.sqrt(sd / count); |
| |
| log.println("<tr><td>" + title |
| + "</td><td align='right'>" + nf0.format(sum) |
| + "</td><td align='right'>" + nf.format(average) |
| + "</td><td align='right'>" + nf.format(sd) |
| + "</td></tr>"); |
| } |
| |
| public static void listCyrillic(UCA collatorIn) throws IOException { |
| PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "ListCyrillic.txt", Utility.UTF8_WINDOWS); |
| Set set = new TreeSet(collatorIn); |
| Set set2 = new TreeSet(collatorIn); |
| ucd = UCD.make(); |
| |
| nfd = new Normalizer(Normalizer.NFD, collatorIn.getUCDVersion()); |
| |
| for (char i = 0; i < 0xFFFF; ++i) { |
| Utility.dot(i); |
| if (!ucd.isRepresented(i)) continue; |
| if (ucd.getScript(i) != CYRILLIC_SCRIPT) continue; |
| |
| String decomp = nfd.normalize(String.valueOf(i)); |
| String oldDecomp = decomp; |
| for (int j = 0; j < decomp.length(); ++j) { |
| if (ucd.getCategory(decomp.charAt(j)) == Mn) { |
| decomp = decomp.substring(0,j) + decomp.substring(j+1); |
| } |
| } |
| if (decomp.length() == 0) continue; |
| |
| set.add(decomp); |
| if (!decomp.equals(oldDecomp)) set2.add(oldDecomp); |
| } |
| |
| Iterator it = set.iterator(); |
| while (it.hasNext()) { |
| String s = (String) it.next(); |
| String name = ucd.getName(s.charAt(0)); |
| Utility.replace(name, "CYRILLIC ", ""); |
| log.println("# " + s + " <> XXX ; # " + name); |
| } |
| |
| it = set2.iterator(); |
| while (it.hasNext()) { |
| String s = (String) it.next(); |
| String name = ucd.getName(s.charAt(0)); |
| Utility.replace(name, "CYRILLIC ", ""); |
| log.println("### " + s + " <> XXX ; # " + name); |
| } |
| |
| log.close(); |
| } |
| |
| |
| } |