| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java,v $ |
| * $Date: 2004/10/14 17:54:56 $ |
| * $Revision: 1.3 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.text.UCD; |
| |
| import java.util.*; |
| import java.io.*; |
| import java.text.DateFormat; |
| import java.text.SimpleDateFormat; |
| |
| import com.ibm.text.utility.*; |
| import com.ibm.icu.dev.test.util.BagFormatter; |
| import com.ibm.icu.dev.test.util.UnicodeProperty; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| public class TestNameUniqueness implements UCD_Types { |
| |
| public static void main(String[] args) throws Exception { |
| checkNameList(); |
| // new TestNameUniqueness().checkNames(); |
| } |
| |
| Map names = new HashMap(); |
| int[] charCount = new int[128]; |
| int[] samples = new int[128]; |
| |
| public static class NameIterator { |
| int fileCount = -1; |
| String line; |
| BufferedReader br; |
| String[] pieces = new String[3]; |
| /** |
| * @return null when done |
| */ |
| static String[][] files = { |
| {"C:\\DATA\\", "pdam1040630.lst"}, |
| {"C:\\DATA\\UCD\\4.1.0-Update\\", "NamedCompositeEntities-4.1.0d2.txt"} |
| }; |
| |
| public String next() { |
| while (true) { |
| try { |
| if (br != null) line = br.readLine(); |
| if (line == null) { |
| fileCount++; |
| br = BagFormatter.openReader(files[fileCount][0], files[fileCount][1], "ISO-8859-1"); |
| line = br.readLine(); |
| } |
| } catch (IOException e) {} |
| if (line == null) return null; |
| if (line.length() == 0) continue; |
| if (fileCount == 0) { |
| char c = line.charAt(0); |
| // skip if doesn't start with hex digit |
| if (!(('0' <= c && c <= '9') || ('A' <= c && c <= 'F'))) continue; |
| Utility.split(line,'\t',pieces,true); |
| Utility.split(pieces[1],'(',pieces,true); |
| Utility.split(pieces[0],'*',pieces,true); |
| return pieces[0]; |
| } else { |
| Utility.split(line,';',pieces,true); |
| return pieces[1]; |
| } |
| //throw new IllegalArgumentException("Illegal file type"); |
| } |
| } |
| } |
| |
| public static void checkNameList() throws IOException { |
| Map map = new HashMap(); |
| NameIterator nameIterator = new NameIterator(); |
| int lineCount = 0; |
| while (true) { |
| String name = nameIterator.next(); |
| if (name == null) break; |
| String key; |
| try { |
| if (name.startsWith("<")) key = name; |
| else key = UnicodeProperty.toNameSkeleton(name); |
| } catch (RuntimeException e) { |
| System.out.println("Error on " + nameIterator.line); |
| throw e; |
| } |
| Object value = map.get(key); |
| if (value != null && !key.startsWith("<")) { |
| System.out.println("*!*!*!* Collision at " + key + " between: "); |
| System.out.println("\t" + value); |
| System.out.println("\t" + nameIterator.line); |
| //throw new IllegalArgumentException(); |
| } |
| map.put(key, nameIterator.line); |
| if (nameIterator.line.startsWith("116C") |
| || nameIterator.line.startsWith("1180") |
| || name.indexOf('-') >= 0 |
| || (lineCount++ % 1000) == 0) { |
| System.out.println("[" + lineCount + "]\t" + nameIterator.line + "\t" + name); |
| System.out.println("\t" + name); |
| System.out.println("\t" + key); |
| } |
| } |
| } |
| |
| void checkNames() throws IOException { |
| PrintWriter out = Utility.openPrintWriter("name_uniqueness.txt", Utility.LATIN1_WINDOWS); |
| try { |
| out.println("Collisions"); |
| out.println(); |
| for (int cp = 0; cp < 0x10FFFF; ++cp) { |
| Utility.dot(cp); |
| if (!Default.ucd().isAllocated(cp)) continue; |
| if (Default.ucd().hasComputableName(cp)) continue; |
| int cat = Default.ucd().getCategory(cp); |
| if (cat == Cc) continue; |
| |
| String name = Default.ucd().getName(cp); |
| String processedName = processName(cp, name); |
| Integer existing = (Integer) names.get(processedName); |
| if (existing != null) { |
| out.println("Collision between: " |
| + Default.ucd().getCodeAndName(existing.intValue()) |
| + ", " + Default.ucd().getCodeAndName(cp)); |
| } else { |
| names.put(processedName, new Integer(cp)); |
| } |
| } |
| out.println(); |
| out.println("Samples"); |
| out.println(); |
| for (int i = 0; i < charCount.length; ++i) { |
| int count = charCount[i]; |
| if (count == 0) continue; |
| String sampleName = Default.ucd().getCodeAndName(samples[i]); |
| out.println(count + "\t'" + ((char)i) |
| + "'\t" + Default.ucd().getCodeAndName(samples[i]) |
| + "\t=>\t" + processName(samples[i], Default.ucd().getName(samples[i]))); |
| } |
| out.println(); |
| out.println("Name Samples"); |
| out.println(); |
| for (int i = 0; i < 256; ++i) { |
| int cat = Default.ucd().getCategory(i); |
| if (cat == Cc) continue; |
| out.println(Default.ucd().getCodeAndName(i) |
| + "\t=>\t" + processName(i, Default.ucd().getName(i))); |
| } |
| } finally { |
| out.close(); |
| } |
| } |
| |
| static final String[][] replacements = { |
| //{"SMALL LETTER", ""}, |
| {"LETTER", ""}, |
| {"CHARACTER", ""}, |
| {"DIGIT", ""}, |
| {"SIGN", ""}, |
| //{"WITH", ""}, |
| }; |
| |
| StringBuffer processNamesBuffer = new StringBuffer(); |
| |
| String processName(int codePoint, String name) { |
| name = Utility.replace(name, replacements); |
| processNamesBuffer.setLength(0); |
| for (int i = 0; i < name.length(); ++i) { |
| char c = name.charAt(i); |
| ++charCount[c]; |
| if (samples[c] == 0) samples[c] = codePoint; |
| if ('A' <= c && c <= 'Z' |
| || '0' <= c && c <= '9') processNamesBuffer.append(c); |
| |
| } |
| if (processNamesBuffer.length() == name.length()) return name; |
| return processNamesBuffer.toString(); |
| } |
| } |
| |