| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $ |
| * $Date: 2004/02/12 08:23:16 $ |
| * $Revision: 1.8 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.text.UCD; |
| |
| import java.util.*; |
| import java.io.*; |
| |
| import com.ibm.icu.dev.test.util.BagFormatter; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSetIterator; |
| import com.ibm.text.utility.*; |
| |
| public final class TestNormalization { |
| static final String DIR = "C:\\Documents and Settings\\Davis\\My Documents\\UnicodeData\\Update 3.0.1\\"; |
| static final boolean SKIP_FILE = true; |
| |
| static PrintWriter out = null; |
| static BufferedReader in = null; |
| |
| static BitSet charsListed = new BitSet(0x110000); |
| static int errorCount = 0; |
| static int lineErrorCount = 0; |
| static String originalLine = ""; |
| static String lastLine = ""; |
| |
| public static void main(String[] args) throws java.io.IOException { |
| System.out.println("Creating Normalizers"); |
| |
| |
| String[] testSet = {"a\u0304\u0328", "a\u0328\u0304"}; |
| for (int i = 0; i < testSet.length; ++i) { |
| String s = testSet[i]; |
| boolean test = Default.nfc().isFCD(s); |
| System.out.println(test + ": " + Default.ucd().getCodeAndName(s)); |
| } |
| |
| |
| String x = UTF32.valueOf32(0x10000); |
| check("NFC", Default.nfc(), x); |
| check("NFD", Default.nfd(), x); |
| check("NFKC", Default.nfkc(), x); |
| check("NFKD", Default.nfkd(), x); |
| |
| |
| out = new PrintWriter( |
| new BufferedWriter( |
| new OutputStreamWriter( |
| new FileOutputStream("NormalizationTestLog.txt"), |
| "UTF8"), |
| 32*1024)); |
| |
| in = new BufferedReader ( |
| new FileReader (DIR + "NormalizationTest.txt"), |
| 32*1024); |
| |
| try { |
| String[] parts = new String[10]; |
| |
| System.out.println("Checking files"); |
| |
| int count = 0; |
| |
| while (true) { |
| String line = in.readLine(); |
| if ((count++ & 0x3FF) == 0) System.out.println("#LINE: " + line); |
| if (line == null) break; |
| originalLine = line; |
| int pos = line.indexOf('#'); |
| if (pos >= 0) { |
| line = line.substring(0,pos); |
| } |
| line = line.trim(); |
| if (line.length() == 0) continue; |
| |
| |
| int splitCount = Utility.split(line, ';', parts); |
| // FIX check splitCount |
| for (int i = 0; i < splitCount; ++i) { |
| parts[i] = Utility.fromHex(parts[i]); |
| } |
| |
| if (UTF32.length32(parts[0]) == 1) { |
| int code = UTF32.char32At(parts[0],0); |
| charsListed.set(code); |
| if ((code & 0x3FF) == 0) System.out.println("# " + Utility.hex(code)); |
| } |
| |
| // c2 == NFC(c1) == NFC(c2) == NFC(c3) |
| errorCount += check("NFCa", Default.nfc(), parts[1], parts[0]); |
| errorCount += check("NFCb", Default.nfc(), parts[1], parts[1]); |
| errorCount += check("NFCc", Default.nfc(), parts[1], parts[2]); |
| |
| // c4 == NFC(c4) == NFC(c5) |
| errorCount += check("NFCd", Default.nfc(), parts[3], parts[3]); |
| errorCount += check("NFCe", Default.nfc(), parts[3], parts[4]); |
| |
| // c3 == NFD(c1) == NFD(c2) == NFD(c3) |
| errorCount += check("NFDa", Default.nfd(), parts[2], parts[0]); |
| errorCount += check("NFDb", Default.nfd(), parts[2], parts[1]); |
| errorCount += check("NFDc", Default.nfd(), parts[2], parts[2]); |
| |
| // c5 == NFD(c4) == NFD(c5) |
| errorCount += check("NFDd", Default.nfd(), parts[4], parts[3]); |
| errorCount += check("NFDe", Default.nfd(), parts[4], parts[4]); |
| |
| // c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5) |
| errorCount += check("NFKCa", Default.nfkc(), parts[3], parts[0]); |
| errorCount += check("NFKCb", Default.nfkc(), parts[3], parts[1]); |
| errorCount += check("NFKCc", Default.nfkc(), parts[3], parts[2]); |
| errorCount += check("NFKCd", Default.nfkc(), parts[3], parts[3]); |
| errorCount += check("NFKCe", Default.nfkc(), parts[3], parts[4]); |
| |
| // c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5) |
| errorCount += check("NFKDa", Default.nfkd(), parts[4], parts[0]); |
| errorCount += check("NFKDb", Default.nfkd(), parts[4], parts[1]); |
| errorCount += check("NFKDc", Default.nfkd(), parts[4], parts[2]); |
| errorCount += check("NFKDd", Default.nfkd(), parts[4], parts[3]); |
| errorCount += check("NFKDe", Default.nfkd(), parts[4], parts[4]); |
| } |
| System.out.println("Total errors in file: " + errorCount |
| + ", lines: " + lineErrorCount); |
| errorCount = lineErrorCount = 0; |
| |
| System.out.println("Checking Missing"); |
| checkMissing(); |
| System.out.println("Total errors in unlisted items: " + errorCount |
| + ", lines: " + lineErrorCount); |
| |
| } finally { |
| if (in != null) in.close(); |
| if (out != null) out.close(); |
| } |
| } |
| |
| static String lastBase = ""; |
| |
| public static int check(String type, Normalizer n, String base, String other) { |
| try { |
| String trans = n.normalize(other); |
| if (!trans.equals(base)) { |
| String temp = ""; |
| if (!lastLine.equals(originalLine)) { |
| temp = "// " + originalLine; |
| lastLine = originalLine; |
| } |
| if (!base.equals(lastBase)) { |
| lastBase = base; |
| lineErrorCount++; |
| } |
| String otherList = ""; |
| if (!base.equals(other)) { |
| otherList = "(" + Default.ucd().getCodeAndName(other) + ")"; |
| } |
| out.println("DIFF " + type + ": " |
| + Default.ucd().getCodeAndName(base) + " != " |
| + type |
| + otherList |
| + " == " + Default.ucd().getCodeAndName(trans) |
| + temp |
| ); |
| return 1; |
| } |
| } catch (Exception e) { |
| throw new ChainException("DIFF " + type + ": " |
| + Default.ucd().getCodeAndName(base) + " != " |
| + type + "(" + Default.ucd().getCodeAndName(other) + ")", new Object[]{}, e); |
| } |
| return 0; |
| } |
| |
| public static int check(String type, Normalizer n, String base) { |
| return check(type, n, base, base); |
| } |
| |
| static void checkMissing() { |
| for (int missing = 0; missing < 0x100000; ++missing) { |
| if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing)); |
| if (charsListed.get(missing)) continue; |
| String x = UTF32.valueOf32(missing); |
| errorCount += check("NFC", Default.nfc(), x); |
| errorCount += check("NFD", Default.nfd(), x); |
| errorCount += check("NFKC", Default.nfkc(), x); |
| errorCount += check("NFKD", Default.nfkd(), x); |
| } |
| } |
| |
| public static void checkStarters () { |
| System.out.println("Checking Starters"); |
| UnicodeSet leading = new UnicodeSet(); |
| UnicodeSet trailing = new UnicodeSet(); |
| for (int i = 0; i <= 0x10FFFF; ++i) { |
| if (Default.nfc().isLeading(i)) leading.add(i); |
| if (Default.ucd().getCombiningClass(i) != 0) continue; |
| if (Default.nfc().isTrailing(i)) trailing.add(i); |
| } |
| System.out.println("Leading: " + leading.size()); |
| System.out.println("Trailing Starters: " + trailing.size()); |
| UnicodeSetIterator lead = new UnicodeSetIterator(leading); |
| UnicodeSetIterator trail = new UnicodeSetIterator(trailing); |
| UnicodeSet followers = new UnicodeSet(); |
| Map map = new TreeMap(new CompareProperties.UnicodeSetComparator()); |
| while (lead.next()) { |
| trail.reset(); |
| followers.clear(); |
| while (trail.next()) { |
| if (Default.nfc().getComposition(lead.codepoint, trail.codepoint) != 0xFFFF) { |
| followers.add(trail.codepoint); |
| } |
| } |
| if (followers.size() == 0) continue; |
| System.out.println(Default.ucd().getCode(lead.codepoint) |
| + "\t" + followers.toPattern(true)); |
| UnicodeSet possLead = (UnicodeSet) map.get(followers); |
| if (possLead == null) { |
| possLead = new UnicodeSet(); |
| map.put(followers.clone(), possLead); |
| } |
| possLead.add(lead.codepoint); |
| } |
| Iterator it = map.keySet().iterator(); |
| BagFormatter bf = new BagFormatter(); |
| bf.setLineSeparator("<br>"); |
| bf.setLabelSource(null); |
| bf.setAbbreviated(true); |
| while (it.hasNext()) { |
| UnicodeSet t = (UnicodeSet) it.next(); |
| UnicodeSet l = (UnicodeSet) map.get(t); |
| System.out.println("<tr><td>" |
| + bf.showSetNames(l) |
| + "</td><td>" |
| + bf.showSetNames(t) |
| + "</td></tr>"); |
| } |
| } |
| } |