| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $ |
| * $Date: 2006/09/24 23:32:44 $ |
| * $Revision: 1.12 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.text.UCD; |
| |
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.PrintWriter; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Comparator; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.TreeSet; |
| |
| import com.ibm.icu.dev.test.util.ArrayComparator; |
| import com.ibm.icu.dev.test.util.BagFormatter; |
| import com.ibm.icu.dev.test.util.TransliteratorUtilities; |
| import com.ibm.icu.dev.test.util.UnicodeLabel; |
| import com.ibm.icu.dev.test.util.UnicodeMap; |
| import com.ibm.icu.dev.test.util.UnicodeProperty; |
| import com.ibm.icu.dev.test.util.XEquivalenceClass; |
| import com.ibm.icu.impl.CollectionUtilities; |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.text.Collator; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSetIterator; |
| import com.ibm.icu.util.ULocale; |
| import com.ibm.text.utility.Utility; |
| |
| |
| public class GenerateConfusables { |
| public static String version = "2.0"; /* Version label of this generator. NOTE(review): no use is visible in this part of the file. */ |
| public static boolean EXCLUDE_CONFUSABLE_COMPAT = true; /* NOTE(review): presumably switches off compatibility-variant confusables; no use is visible in this part of the file — confirm. */ |
| |
| /** |
| * Command-line entry point. |
| * <p> |
| * Runs the ad-hoc smoke test, then every generator whose option is present: |
| * -b (IDN files), -c (confusables), -d (decomposition file), -s (source). |
| * Failures are printed and do not prevent the final "Done" message. |
| * |
| * @param args any combination of the options above; order is irrelevant |
| */ |
| public static void main(String[] args) throws IOException { |
|     quickTest(); |
| |
|     List options = Arrays.asList(args); |
|     try { |
|         if (options.contains("-b")) { |
|             generateIDN(); |
|         } |
|         if (options.contains("-c")) { |
|             generateConfusables(); |
|         } |
|         if (options.contains("-d")) { |
|             generateDecompFile(); |
|         } |
|         if (options.contains("-s")) { |
|             generateSource(); |
|         } |
|     } catch (Exception e) { |
|         // Best effort: report the failure and still print "Done". |
|         e.printStackTrace(); |
|     } finally { |
|         System.out.println("Done"); |
|     } |
| } |
| |
| /** |
| * Ad-hoc smoke test executed at the start of main. |
| * <p> |
| * It exercises getSingleScript, betterTargetIsLess and MyEquivalenceClass on a few |
| * fixed strings. All results are discarded; the calls are only useful under a debugger. |
| */ |
| private static void quickTest() { |
|     // Cyrillic a + Latin a, in both orders. |
|     int scriptOfPair = getSingleScript("\u0430\u0061"); |
|     scriptOfPair = getSingleScript("\u0061\u0430"); |
| |
|     // U+0323 against U+093C. |
|     String first = "\u0323"; |
|     String second = "\u093C"; |
|     int order = betterTargetIsLess.compare(first, second); |
| |
|     MyEquivalenceClass equivalences = new MyEquivalenceClass(); |
|     equivalences.add(first, second, "none"); |
|     Set members = equivalences.getEquivalences(first); |
|     String best = (String) CollectionUtilities.getBest(members, betterTargetIsLess, -1); |
| } |
| |
| /** |
| * Cache behind getNonIICore(); null until the first call has completed successfully. |
| */ |
| static UnicodeSet _Non_IICore; |
| |
| /** |
| * Returns the set of CJK ideographs that are treated as not being in IICore, |
| * building it on first use. |
| * <p> |
| * The set starts as the blocks CJK_Unified_Ideographs_Extension_A and |
| * CJK_Unified_Ideographs_Extension_B. From it are removed: UNASSIGNED, every code |
| * point whose kIICore value is "2.1" (U+34E4 and U+3007 are forced to that value), |
| * and every range listed in the file cjk_nic.txt under indir. |
| * <p> |
| * The cache is published only after the set is complete, so a failure while reading |
| * cjk_nic.txt cannot leave a half-built set behind for later callers, and the reader |
| * is closed on every path. |
| * |
| * @return the cached set; all callers share the same instance |
| * @throws RuntimeException wrapping any failure while reading cjk_nic.txt; the message |
| *         names the line being processed |
| */ |
| private static UnicodeSet getNonIICore() { |
|     //Main + IICore + (Ext-A intersect Chinese) |
|     //blk; n/a ; CJK_Unified_Ideographs |
|     //blk; n/a ; CJK_Unified_Ideographs_Extension_A |
|     //blk; n/a ; CJK_Unified_Ideographs_Extension_B |
|     if (_Non_IICore != null) { |
|         return _Non_IICore; |
|     } |
| |
|     // stuff to remove |
|     UnicodeSet nonIICore = ups.getSet("block=CJK_Unified_Ideographs_Extension_A"); |
|     nonIICore.addAll(ups.getSet("block=CJK_Unified_Ideographs_Extension_B")); |
|     nonIICore.removeAll(UNASSIGNED); // remove unassigned |
| |
|     // stuff to restore |
|     // NOTE(review): this writes into the map returned by getHanValue; if that map is |
|     // shared by other callers the two additions become visible to them too - confirm. |
|     UnicodeMap um = Default.ucd().getHanValue("kIICore"); |
|     um.put(0x34E4, "2.1"); |
|     um.put(0x3007, "2.1"); |
|     nonIICore.removeAll(um.getSet("2.1")); |
| |
|     // add Chinese: every range listed in cjk_nic.txt is restored as well |
|     UnicodeSet cjk_nic = new UnicodeSet(); |
|     String line = null; |
|     BufferedReader br = null; |
|     try { |
|         br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt"); |
|         while (true) { |
|             line = Utility.readDataLine(br); |
|             if (line == null) break; |
|             if (line.length() == 0) continue; |
|             String[] pieces = Utility.split(line, ';'); |
|             // part 0 is either a single hex code point or a range "start..end" |
|             String range = pieces[0].trim(); |
|             int rangeDivider = range.indexOf(".."); |
|             int start, end; |
|             if (rangeDivider < 0) { |
|                 start = end = Integer.parseInt(range, 16); |
|             } else { |
|                 start = Integer.parseInt(range.substring(0, rangeDivider), 16); |
|                 end = Integer.parseInt(range.substring(rangeDivider + 2), 16); |
|             } |
|             cjk_nic.add(start, end); |
|         } |
|     } catch (Exception e) { |
|         throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e); |
|     } finally { |
|         if (br != null) { |
|             try { |
|                 br.close(); |
|             } catch (IOException ignored) { |
|                 // Nothing useful can be done if closing a reader fails. |
|             } |
|         } |
|     } |
|     nonIICore.removeAll(cjk_nic); |
| |
|     _Non_IICore = nonIICore; |
|     return _Non_IICore; |
|     // for (Iterator it = um.getAvailableValues().iterator(); it.hasNext();) { |
|     // Object value = it.next(); |
|     // UnicodeSet set = um.getSet(value); |
|     // System.out.println(value + "\t" + set); |
|     // } |
| } |
| |
| static PrintWriter log; /* log file writer; opened and closed by generateConfusables() */ |
| static final String ARROW = "\u2192"; // printed between source and target by writeSourceTargetLine; alternative: \u2194 |
| static UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(""); // property source behind every ups.getSet(...) / ups.getProperty(...) call; alternative: ICUPropertyFactory.make(); |
| static UnicodeSet UNASSIGNED = ups.getSet("gc=Cn") |
| .addAll(ups.getSet("gc=Co")) |
| .addAll(ups.getSet("gc=Cs")); /* union of general categories Cn, Co and Cs */ |
| static UnicodeSet skipSet = ups.getSet("gc=Cc") |
| .addAll(ups.getSet("gc=Cf")) |
| .addAll(UNASSIGNED); /* general categories Cc and Cf plus everything in UNASSIGNED */ |
| static UnicodeSet whiteSpace = ups.getSet("Whitespace=TRUE"); |
| static UnicodeSet lowercase = ups.getSet("gc=Ll"); |
| static UnicodeSet _skipNFKD; /* cache behind getSkipNFKD(); null until first use */ |
| |
| static Map gatheredNFKD = new TreeMap(); /* String to String; only read by the disabled dump in generateConfusables() within this part of the file */ |
| static UnicodeMap nfcMap; /* created and filled by getSkipNFKD() */ |
| static UnicodeMap nfkcMap; /* created and filled by getSkipNFKD() */ |
| |
| static String indir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\source\\"; /* input directory; NOTE(review): hard-coded Windows path */ |
| static String outdir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\"; /* output directory; NOTE(review): hard-coded Windows path */ |
| |
| static Comparator codepointComparator = new UTF16.StringComparator(true,false,0); /* TODO confirm the meaning of the (true,false,0) arguments against the ICU documentation */ |
| static Comparator UCAComparator = new CollectionUtilities.MultiComparator(new Comparator[] {Collator.getInstance(ULocale.ROOT), codepointComparator}); /* root-locale collator combined with codepointComparator; presumably the second is the tie-breaker — confirm */ |
| |
| static UnicodeSet setsToAbbreviate = new UnicodeSet("[" + /* Code points for which FakeBreak and FakeBreak2 return an empty range-break label: the explicit ranges, the listed scripts and the default-ignorable code points. */ |
| "\\u3400-\\u4DB5" + |
| "\\u4E00-\\u9FA5" + |
| "\\uA000-\\uA48C" + |
| "\\uAC00-\\uD7A3" + |
| "\\u1100-\\u11FF" + |
| "\\uFB00-\\uFEFC" + |
| "\\u2460-\\u24FF" + |
| "\\u3251-\\u33FF" + |
| "\\u4DC0-\\u4DFF" + |
| "\\u3165-\\u318E" + |
| "\\uA490-\\uA4C6" + |
| "\\U00010140-\\U00010174" + |
| "\\U0001D300-\\U0001D356" + |
| "\\U0001D000-\\U0001D1DD" + |
| "\\U00020000-\\U0002A6D6" + |
| "\\U0001D400-\\U0001D7FF" + |
| "[:script=Canadian_Aboriginal:]" + |
| "[:script=ETHIOPIC:]" + |
| "[:script=Tagalog:]" + |
| "[:script=Hanunoo:]" + |
| "[:script=Buhid:]" + |
| "[:script=Tagbanwa:]" + |
| "[:script=Deseret:]" + |
| "[:script=Shavian:]" + |
| "[:script=Ogham:]" + |
| "[:script=Old Italic:]" + |
| "[:script=Runic:]" + |
| "[:script=Gothic:]" + |
| "[:script=Ugaritic:]" + |
| "[:script=Linear B:]" + |
| "[:script=Cypriot:]" + |
| "[:script=Coptic:]" + |
| "[:script=Syriac:]" + |
| "[:script=Glagolitic:]" + |
| "[:script=Glagolitic:]" + /* NOTE(review): repeats the previous entry; harmless in a set, but another script may have been intended */ |
| "[:script=Old Persian:]" + |
| "[:script=Kharoshthi:]" + |
| "[:script=Osmanya:]" + |
| "[:default ignorable code point:]" + |
| "]"); |
| |
| /** |
| * Handles the -b option: asks the shared IdentifierInfo to print all of its IDN output. |
| * |
| * @throws IOException if one of the output files cannot be written |
| */ |
| private static void generateIDN() throws IOException { |
|     IdentifierInfo.getIdentifierInfo().printIDNStuff(); |
| } |
| |
| private static class IdentifierInfo { |
| /** The single shared instance; created on the first call to getIdentifierInfo(). */ |
| private static IdentifierInfo info; |
| |
| /** |
| * Returns the shared instance, constructing it on first use. |
| * |
| * @return the shared instance, never null |
| * @throws IllegalArgumentException (with the original failure as cause) if the |
| *         instance cannot be constructed |
| */ |
| static IdentifierInfo getIdentifierInfo() { |
|     if (info != null) { |
|         return info; |
|     } |
|     try { |
|         info = new IdentifierInfo(); |
|     } catch (Exception e) { |
|         throw (RuntimeException) new IllegalArgumentException("Unable to access data").initCause(e); |
|     } |
|     return info; |
| } |
| |
| private boolean mergeRanges = true; /* NOTE(review): never read in this class as shown; the formatters are configured with a literal true instead */ |
| |
| private UnicodeSet removalSet, remainingOutputSet, inputSet_strict, inputSet_lenient, nonstarting; /* all assigned once, by the constructor */ |
| UnicodeSet propNFKCSet, notInXID, xidPlus; /* assigned by the constructor; xidPlus is also read by FakeBreak2 */ |
| |
| private UnicodeMap additions = new UnicodeMap(), remap = new UnicodeMap(), removals = new UnicodeMap(), |
| reviews, removals2, lowerIsBetter; /* additions, remap and removals are filled by loadFileData(); the other three are built by the constructor */ |
| |
| private UnicodeSet isCaseFolded; /* code points whose full case folding equals the code point itself (built by the constructor) */ |
| |
| private IdentifierInfo() throws IOException { /* |
| * Builds every set and map of this object in one pass and then freezes the maps: |
| * the case-folded set, the NFKC and XID+ sets, the removals (data files plus |
| * everything in IDNOutputSet that is not in XID+), the remaining output set, the |
| * strict and lenient input sets, the non-starting set and the review maps. |
| * Statement order matters: later steps read fields assigned by earlier ones. |
| */ |
| isCaseFolded = new UnicodeSet(); |
| for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
| Utility.dot(cp); |
| int cat = Default.ucd().getCategory(cp); |
| if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue; /* skip categories Cn, Co and Cs */ |
| String source = UTF16.valueOf(cp); |
| String cf = Default.ucd().getCase(source, UCD.FULL, UCD.FOLD); |
| if (cf.equals(source)) isCaseFolded.add(cp); |
| } |
| |
| propNFKCSet = ups.getSet("NFKC_QuickCheck=N").complement(); /* everything that is not NFKC_QuickCheck=N */ |
| UnicodeSet propXIDContinueSet = ups.getSet("XID_Continue=TRUE"); |
| |
| //removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant"); |
| loadFileData(); /* fills additions, remap and removals */ |
| xidPlus = new UnicodeSet(propXIDContinueSet).addAll(additions.keySet()).retainAll(propNFKCSet); |
| |
| getIdentifierSet(); /* called for its side effect: it fills IDNOutputSet and IDNInputSet */ |
| notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus); |
| removals.putAll(notInXID, PROHIBITED + NOT_IN_XID); |
| //UnicodeSet notNfkcXid = new UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet); |
| //removals.putAll(notNfkcXid, PROHIBITED + "compat variant"); |
| removalSet = removals.keySet(); |
| |
| remainingOutputSet = new UnicodeSet(IDNOutputSet).removeAll(removalSet); |
| |
| UnicodeSet remainingInputSet1 = new UnicodeSet(IDNInputSet) |
| .removeAll(removalSet).removeAll(remainingOutputSet); |
| UnicodeSet remainingInputSet = new UnicodeSet(); |
| UnicodeSet specialRemove = new UnicodeSet(); |
| // keep only the input characters whose form after modified NFKC, case folding and |
| // modified NFKC again lies entirely in the output set; the rest go to specialRemove |
| for (UnicodeSetIterator usi = new UnicodeSetIterator( |
| remainingInputSet1); usi.next();) { |
| String nss = getModifiedNKFC(usi.getString()); |
| String cf = Default.ucd().getCase(nss, UCD.FULL, UCD.FOLD); |
| String cf2 = getModifiedNKFC(cf); |
| if (remainingOutputSet.containsAll(cf2)) |
| remainingInputSet.add(usi.codepoint); |
| else |
| specialRemove.add(usi.codepoint); |
| } |
| // strict input: characters whose case folding is in the output set although their modified NFKC form is not |
| inputSet_strict = new UnicodeSet(); |
| for (UnicodeSetIterator usi = new UnicodeSetIterator( |
| remainingInputSet); usi.next();) { |
| String ss = usi.getString(); |
| String nss = getModifiedNKFC(ss); |
| String cf = Default.ucd().getCase(ss, UCD.FULL, UCD.FOLD); |
| if (usi.codepoint == 0x2126 || usi.codepoint == 0x212B) { |
| System.out.println("check"); /* debugging aid for the two code points quoted in the comment lines below */ |
| } |
| //> > 2126 ; retained-input-only-CF # (?) OHM SIGN |
| //> > 212B ; retained-input-only-CF # (?) ANGSTROM SIGN |
| |
| if (!remainingOutputSet.containsAll(nss) |
| && remainingOutputSet.containsAll(cf)) |
| inputSet_strict.add(ss); |
| } |
| // hack: force U+03F4, U+2126 and U+212B out of the strict input set |
| inputSet_strict.remove(0x03F4).remove(0x2126).remove(0x212B); |
| inputSet_lenient = new UnicodeSet(remainingInputSet) |
| .removeAll(inputSet_strict); |
| nonstarting = new UnicodeSet(remainingOutputSet).addAll( |
| remainingInputSet).retainAll(new UnicodeSet("[:M:]")); |
| reviews = new UnicodeMap().putAll(removals); |
| reviews.putAll(remainingOutputSet, "output"); |
| reviews.putAll(inputSet_strict, "input"); |
| reviews.putAll(inputSet_lenient, "input-lenient"); |
| reviews.putAll(specialRemove, PROHIBITED + "output-disallowed"); |
| |
| lowerIsBetter = new UnicodeMap(); |
| |
| lowerIsBetter.putAll(propNFKCSet, MARK_NFC); // nfkc is better than the alternative |
| lowerIsBetter.putAll(inputSet_lenient, MARK_INPUT_LENIENT); |
| lowerIsBetter.putAll(inputSet_strict, MARK_INPUT_STRICT); |
| lowerIsBetter.putAll(remainingOutputSet, MARK_OUTPUT); |
| lowerIsBetter.putAll(remainingOutputSet, MARK_ASCII); /* NOTE(review): same key set as the line above, so MARK_OUTPUT is presumably replaced by MARK_ASCII everywhere; an ASCII-only set may have been intended — confirm */ |
| lowerIsBetter.setMissing(MARK_NOT_NFC); |
| |
| lowerIsBetter.freeze(); |
| // add special values: |
| //lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0)); |
| |
| UnicodeMap nonstartingmap = new UnicodeMap().putAll(nonstarting, |
| "nonstarting"); |
| UnicodeMap.Composer composer = new UnicodeMap.Composer() { /* joins two values as a-b; a missing side yields the other side unchanged */ |
| public Object compose(int codePoint, Object a, Object b) { |
| if (a == null) |
| return b; |
| else if (b == null) |
| return a; |
| else |
| return a.toString() + "-" + b.toString(); |
| } |
| }; |
| reviews.composeWith(nonstartingmap, composer); |
| reviews.putAll(new UnicodeSet(IDNInputSet).complement(), ""); /* everything outside the IDN input set gets the empty value; writeIDReview() selects on that value */ |
| UnicodeMap.Composer composer2 = new UnicodeMap.Composer() { /* NOTE(review): only referenced by the commented-out composeWith call below */ |
| public Object compose(int codePoint, Object a, Object b) { |
| if (b == null) |
| return a; |
| return "remap-to-" + Utility.hex(b.toString()); |
| } |
| }; |
| //reviews.composeWith(remap, composer2); |
| removals2 = new UnicodeMap().putAll(removals); |
| removals2.putAll(ups.getSet("XID_Continue=TRUE").complement(), |
| PROHIBITED + NOT_IN_XID); |
| removals2.setMissing("future?"); |
| |
| additions.freeze(); |
| remap.freeze(); |
| removals.freeze(); |
| reviews.freeze(); |
| removals2.freeze(); |
| } |
| |
| /** |
| * Loads the hand-maintained data files under indir into additions, remap and removals, |
| * then marks every code point returned by getNonIICore() as restricted ("~IICore"). |
| * <p> |
| * Both readers are closed even when a line cannot be parsed. |
| * |
| * @throws IOException if a data file cannot be opened or closed |
| * @throws RuntimeException wrapping any failure while processing a line; the message |
| *         names the offending line |
| */ |
| private void loadFileData() throws IOException { |
|     loadWordChars(); |
|     loadRemovals(); |
|     removals.putAll(getNonIICore(), PROHIBITED + "~IICore"); |
| } |
| |
| /** |
| * Reads wordchars.txt: a "remap-to" line goes into remap, any other line into |
| * additions unless the code point is already in XIDContinueSet. |
| */ |
| private void loadWordChars() throws IOException { |
|     BufferedReader br = BagFormatter.openUTF8Reader(indir, "wordchars.txt"); |
|     String line = null; |
|     try { |
|         while (true) { |
|             line = Utility.readDataLine(br); |
|             if (line == null) { |
|                 break; |
|             } |
|             if (line.length() == 0) { |
|                 continue; |
|             } |
|             String[] pieces = Utility.split(line, ';'); |
|             int code = Integer.parseInt(pieces[0].trim(), 16); |
|             if (pieces[1].trim().equals("remap-to")) { |
|                 remap.put(code, UTF16.valueOf(Integer.parseInt(pieces[2].trim(), 16))); |
|             } else if (XIDContinueSet.contains(code)) { |
|                 System.out.println("Already in XID continue: " + line); |
|             } else { |
|                 additions.put(code, "addition"); |
|             } |
|         } |
|     } catch (Exception e) { |
|         throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e); |
|     } finally { |
|         br.close(); |
|     } |
| } |
| |
| /** |
| * Reads removals.txt: each line is "codes ; reason", where codes is either a |
| * UnicodeSet pattern in brackets or a space-separated list of hex code points |
| * and "start..end" ranges. Every listed code point is stored in removals. |
| */ |
| private void loadRemovals() throws IOException { |
|     UnicodeSet allocated = ups.getSet("generalcategory=cn").complement(); |
|     BufferedReader br = BagFormatter.openUTF8Reader(indir, "removals.txt"); |
|     UnicodeSet sources = new UnicodeSet(); |
|     String line = null; |
|     try { |
|         while (true) { |
|             line = Utility.readDataLine(br); |
|             if (line == null) { |
|                 break; |
|             } |
|             if (line.length() == 0) { |
|                 continue; |
|             } |
|             sources.clear(); |
|             String[] pieces = Utility.split(line, ';'); |
|             if (pieces.length < 2) { |
|                 System.out.println("Missing line " + line); |
|                 continue; |
|             } |
|             String codelist = pieces[0].trim(); |
|             String reasons = pieces[1].trim(); |
|             // Test the trimmed field: a pattern preceded by white space must not |
|             // fall through to the hex parser below. |
|             if (codelist.startsWith("[")) { |
|                 sources = new UnicodeSet(codelist).retainAll(allocated); |
|             } else { |
|                 String[] codes = Utility.split(codelist, ' '); |
|                 for (int i = 0; i < codes.length; ++i) { |
|                     if (codes[i].length() == 0) { |
|                         continue; |
|                     } |
|                     String[] range = codes[i].split("\\.\\."); |
|                     int start = Integer.parseInt(range[0], 16); |
|                     int end = start; |
|                     if (range.length > 1) { |
|                         end = Integer.parseInt(range[1], 16); |
|                     } |
|                     sources.add(start, end); |
|                 } |
|             } |
|             removals.putAll(sources, PROHIBITED + reasons); |
|         } |
|     } catch (Exception e) { |
|         throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e); |
|     } finally { |
|         br.close(); |
|     } |
| } |
| |
| /** |
| * Writes all IDN-related output: the identifier modification files, the allowed |
| * IDN characters, the review list and finally the decomposition file. |
| * |
| * @throws IOException if one of the output files cannot be written |
| */ |
| void printIDNStuff() throws IOException { |
|     printIDModifications(); |
|     writeIDChars(); |
|     writeIDReview(); |
|     generateDecompFile(); |
| } |
| |
| /** |
| * Writes review.txt in two sections: every code point whose value in reviews is |
| * not the empty string, labelled with that value, and then every code point |
| * outside IDNInputSet (minus UNASSIGNED), labelled with its value in removals2. |
| * <p> |
| * The writer is closed in a finally block so that a failure while formatting does |
| * not leak the file handle. |
| * |
| * @throws IOException if the file cannot be opened or written |
| */ |
| private void writeIDReview() throws IOException { |
|     BagFormatter bf = new BagFormatter(); |
|     bf.setUnicodePropertyFactory(ups); |
|     bf.setLabelSource(null); |
|     bf.setShowLiteral(TransliteratorUtilities.toHTMLControl); |
|     bf.setMergeRanges(true); |
| |
|     PrintWriter out = openAndWriteHeader("review.txt", "Review List for IDN"); |
|     try { |
|         // The constructor stored "" for everything outside the IDN input set. |
|         UnicodeSet fullSet = reviews.getSet("").complement(); |
| |
|         bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { |
|         }).set(reviews).setMain("Reviews", "GCB", |
|                 UnicodeProperty.ENUMERATED, "1.0")); |
| |
|         FakeBreak fakeBreak = new FakeBreak(); |
|         bf.setRangeBreakSource(fakeBreak); |
|         out.println(""); |
|         out.println("# Characters allowed in IDNA"); |
|         out.println(""); |
|         bf.showSetNames(out, new UnicodeSet(fullSet)); |
| |
|         out.println(""); |
|         out.println("# Characters disallowed in IDNA"); |
|         out.println("# The IDNA spec doesn't allow any of these characters,"); |
|         out.println("# so don't report any of them as being missing from the above list."); |
|         out.println("# Some possible future additions, once IDNA updates to Unicode 4.1, are given."); |
|         out.println(""); |
|         bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { |
|         }).set(removals2).setMain("Removals", "GCB", |
|                 UnicodeProperty.ENUMERATED, "1.0")); |
|         bf.showSetNames(out, new UnicodeSet(IDNInputSet).complement() |
|                 .removeAll(UNASSIGNED)); |
|     } finally { |
|         out.close(); |
|     } |
| } |
| |
| /** |
| * Writes idnchars.txt: the characters allowed as output, followed by the |
| * characters that are not allowed at the start of an identifier. |
| * <p> |
| * Output characters that are not letters, marks or decimal digits are also |
| * reported on System.out by showExtras. The sections for inputSet_strict and |
| * inputSet_lenient are currently not written. |
| * <p> |
| * The writer is closed in a finally block so that a failure while formatting does |
| * not leak the file handle. |
| * |
| * @throws IOException if the file cannot be opened or written |
| */ |
| private void writeIDChars() throws IOException { |
|     BagFormatter bf = new BagFormatter(); |
|     bf.setUnicodePropertyFactory(ups); |
|     bf.setLabelSource(null); |
|     bf.setShowLiteral(TransliteratorUtilities.toHTMLControl); |
|     bf.setMergeRanges(true); |
| |
|     UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]"); |
| |
|     PrintWriter out = openAndWriteHeader("idnchars.txt", "Recommended Identifier Profiles for IDN"); |
|     try { |
|         out.println("# Allowed as output characters"); |
|         out.println(""); |
|         bf.setValueSource("output"); |
|         bf.showSetNames(out, remainingOutputSet); |
|         showExtras(bf, remainingOutputSet, letters); |
| |
|         out.println(""); |
|         out.println("# Not allowed at start of identifier"); |
|         out.println(""); |
|         bf.setValueSource("nonstarting"); |
|         bf.showSetNames(out, nonstarting); |
|     } finally { |
|         out.close(); |
|     } |
| } |
| |
| |
| /** |
| * Prints to System.out the members of source that are outside letters and whose |
| * NFKD form is not entirely inside letters either. Prints nothing when every |
| * member of source is in letters. |
| * |
| * @param bf formatter used to render the resulting set |
| * @param source set to examine |
| * @param letters set of characters considered unremarkable |
| */ |
| private void showExtras(BagFormatter bf, UnicodeSet source, UnicodeSet letters) { |
|     UnicodeSet nonLetters = new UnicodeSet(source).removeAll(letters); |
|     if (nonLetters.size() == 0) { |
|         return; |
|     } |
|     UnicodeSet remarkable = new UnicodeSet(); |
|     UnicodeSetIterator it = new UnicodeSetIterator(nonLetters); |
|     while (it.next()) { |
|         String decomposed = Default.nfkd().normalize(it.getString()); |
|         boolean coveredByLetters = letters.containsAll(decomposed); |
|         if (!coveredByLetters) { |
|             remarkable.add(it.codepoint); |
|         } |
|     } |
|     System.out.println(bf.showSetNames(remarkable)); |
| } |
| |
| /** |
| * Writes the three identifier-restriction reports, in this order: |
| * xidmodifications.txt, xidAllowed.txt and draft-restrictions.txt. |
| * <p> |
| * Each writer is closed in a finally block so that a failure while formatting does |
| * not leak the file handle. One BagFormatter is shared by the three reports. |
| * |
| * @throws IOException if one of the files cannot be opened or written |
| */ |
| private void printIDModifications() throws IOException { |
|     BagFormatter bf = new BagFormatter(); |
|     bf.setUnicodePropertyFactory(ups); |
|     bf.setLabelSource(null); |
|     bf.setShowLiteral(TransliteratorUtilities.toHTMLControl); |
|     bf.setMergeRanges(true); |
| |
|     writeXidModificationsFile(bf); |
|     writeXidAllowedFile(bf); |
|     writeDraftRestrictionsFile(bf); |
| } |
| |
| /** Writes xidmodifications.txt: the restricted characters with their reasons, then the additions. */ |
| private void writeXidModificationsFile(BagFormatter bf) throws IOException { |
|     PrintWriter out = openAndWriteHeader("xidmodifications.txt", "Security Profile for General Identifiers"); |
|     try { |
|         out.println("# Characters restricted"); |
|         out.println(""); |
|         bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { |
|         }).set(removals).setMain("Removals", "GCB", |
|                 UnicodeProperty.ENUMERATED, "1.0")); |
|         bf.showSetNames(out, removalSet); |
| |
|         out.println(""); |
|         out.println("# Characters added"); |
|         out.println(""); |
|         bf.setValueSource("addition"); |
|         bf.showSetNames(out, additions.keySet()); |
|     } finally { |
|         out.close(); |
|     } |
| } |
| |
| /** Writes xidAllowed.txt: XID+ minus the removals, split into case-folded NFKC characters and the rest. */ |
| private void writeXidAllowedFile(BagFormatter bf) throws IOException { |
|     PrintWriter out = openAndWriteHeader("xidAllowed.txt", "Security Profile for General Identifiers"); |
|     try { |
|         UnicodeSet allowed = new UnicodeSet(xidPlus).removeAll(removals.keySet()); |
|         UnicodeSet cfAllowed = new UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet); |
|         allowed.removeAll(cfAllowed); |
|         bf.setValueSource("case_folded"); |
|         out.println("# XID characters allowed (no uppercase)"); |
|         out.println(""); |
|         bf.showSetNames(out, cfAllowed); |
|         bf.setValueSource("not_case_folded"); |
|         out.println(""); |
|         out.println("# XID characters allowed (uppercase)"); |
|         out.println(""); |
|         bf.showSetNames(out, allowed); |
|     } finally { |
|         out.close(); |
|     } |
| } |
| |
| /** Writes draft-restrictions.txt: one section per removal reason, plus ~IDNA and ~Unicode Identifier. */ |
| private void writeDraftRestrictionsFile(BagFormatter bf) throws IOException { |
|     UnicodeMap someRemovals = new UnicodeMap(); |
|     UnicodeMap.Composer myComposer = new UnicodeMap.Composer() { |
|         public Object compose(int codePoint, Object a, Object b) { |
|             if (b == null) return null; |
|             String x = (String) b; |
|             // Drop the "restricted ; " prefix so that only the reason is shown. |
|             if (x.startsWith(PROHIBITED)) x = x.substring(PROHIBITED.length()); |
|             if (lowercase.contains(codePoint)) { |
|                 String upper = Default.ucd().getCase(codePoint, UCD.FULL, UCD.UPPER); |
|                 if (upper.equals(UTF16.valueOf(codePoint)) |
|                         && x.equals("technical symbol (phonetic)")) { |
|                     x = "technical symbol (phonetic with no uppercase)"; |
|                 } |
|             } |
|             return x; |
|         } |
|     }; |
|     someRemovals.composeWith(removals, myComposer); |
|     UnicodeSet nonIDNA = new UnicodeSet(IDNOutputSet).addAll(IDNInputSet).complement(); |
|     someRemovals.putAll(nonIDNA, "~IDNA"); |
|     someRemovals.putAll(new UnicodeSet(xidPlus).complement(), "~Unicode Identifier"); |
|     someRemovals.putAll(UNASSIGNED, null); // clear extras |
| |
|     PrintWriter out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt"); |
|     try { |
|         out.println("# Characters restricted in domain names"); |
|         out.println("# $Revision: 1.12 $"); |
|         out.println("# $Date: 2006/09/24 23:32:44 $"); |
|         out.println("#"); |
|         out.println("# This file contains a draft list of characters for use in"); |
|         out.println("# UTR #36: Unicode Security Considerations"); |
|         out.println("# http://unicode.org/draft/reports/tr36/tr36.html"); |
|         out.println("# According to the recommendations in that document, these characters"); |
|         out.println("# would be restricted in domain names: people would only be able to use them"); |
|         out.println("# by using lenient security settings."); |
|         out.println("#"); |
|         out.println("# If you have any feedback on this list, please use the submission form at:"); |
|         out.println("# http://unicode.org/reporting.html."); |
|         out.println("#"); |
|         out.println("# Notes:"); |
|         out.println("# - Characters are listed along with a reason for their removal."); |
|         out.println("# - Characters listed as ~IDNA are excluded at this point in domain names,"); |
|         out.println("# in many cases because the international domain name specification does not contain"); |
|         out.println("# characters beyond Unicode 3.2. At this point in time, feedback on those characters"); |
|         out.println("# is not relevant."); |
|         out.println("# - Characters listed as ~Unicode Identifiers are restricted because they"); |
|         out.println("# do not fit the specification of identifiers given in"); |
|         out.println("# UAX #31: Identifier and Pattern Syntax"); |
|         out.println("# http://unicode.org/reports/tr31/"); |
|         out.println("# - Characters listed as ~IICore are restricted because they are Ideographic,"); |
|         out.println("# but not part of the IICore set defined by the IRG as the minimal set"); |
|         out.println("# of required ideographs for East Asian use."); |
|         out.println("# - The files in this directory are 'live', and may change at any time."); |
|         out.println("# Please include the above Revision number in your feedback."); |
| |
|         bf.setRangeBreakSource(new FakeBreak2()); |
|         // One section per reason, in sorted order of the reason strings. |
|         Set values = new TreeSet(someRemovals.getAvailableValues()); |
|         for (Iterator it = values.iterator(); it.hasNext();) { |
|             String reason1 = (String) it.next(); |
|             bf.setValueSource(reason1); |
|             out.println(""); |
|             bf.showSetNames(out, someRemovals.getSet(reason1)); |
|         } |
|     } finally { |
|         out.close(); |
|     } |
| } |
| } |
| |
| static final String PROHIBITED = "restricted ; "; /* prefix put in front of every removal reason; printIDModifications() strips it again for draft-restrictions.txt */ |
| static final String NOT_IN_XID = "not in XID+"; /* reason used for code points that are outside xidPlus or outside XID_Continue */ |
| public static final boolean suppress_NFKC = true; /* NOTE(review): no use is visible in this part of the file */ |
| /** |
| * |
| */ |
| |
| |
| /** |
| * Writes decomps.txt: for every Decomposition_Type value other than "none" and |
| * "canonical", one section listing each character of that type together with its |
| * modified NFKC form. |
| * <p> |
| * The writer is closed in a finally block so that a failure while formatting does |
| * not leak the file handle. |
| * |
| * @throws IOException if the file cannot be opened or written |
| */ |
| private static void generateDecompFile() throws IOException { |
|     PrintWriter out = BagFormatter.openUTF8Writer(outdir, "decomps.txt"); |
|     try { |
|         UnicodeProperty dt = ups.getProperty("Decomposition_Type"); |
|         for (Iterator it = dt.getAvailableValues().iterator(); it.hasNext();) { |
|             String value = (String) it.next(); |
|             if (value.equalsIgnoreCase("none") || value.equalsIgnoreCase("canonical")) { |
|                 continue; |
|             } |
|             UnicodeSet s = dt.getSet(value); |
|             out.println(""); |
|             out.println("# Decomposition_Type = " + value); |
|             out.println(""); |
|             for (UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) { |
|                 String source = usi.getString(); |
|                 String target = getModifiedNKFC(source); |
|                 writeSourceTargetLine(out, source, null, target, value); |
|             } |
|             out.flush(); |
|         } |
|     } finally { |
|         out.close(); |
|     } |
| } |
| |
| static class FakeBreak extends UnicodeLabel { |
| UnicodeSet nobreakSet = setsToAbbreviate; |
| public String getValue(int codepoint, boolean isShort) { |
| return nobreakSet.contains(codepoint) ? "" |
| : (codepoint & 1) == 0 ? "O" |
| : "E"; |
| } |
| } |
| |
| static class FakeBreak2 extends UnicodeLabel { |
| UnicodeSet nobreakSet = new UnicodeSet(setsToAbbreviate) |
| .addAll(new UnicodeSet(IDNOutputSet).complement()) |
| .addAll(new UnicodeSet(IdentifierInfo.getIdentifierInfo().xidPlus).complement()); |
| |
| public String getValue(int codepoint, boolean isShort) { |
| return nobreakSet.contains(codepoint) ? "" |
| : (codepoint & 1) == 0 ? "O" |
| : "E"; |
| } |
| } |
| |
| /** |
| * Prints a titled section with one "remap-to" line per key of the given map, |
| * followed by the number of keys. |
| * |
| * @param out destination |
| * @param title section title, printed after "# " |
| * @param remap map from code point to its replacement string |
| */ |
| private static void showRemapped(PrintWriter out, String title, UnicodeMap remap) { |
|     out.println(""); |
|     out.println("# " + title); |
|     out.println(""); |
| |
|     int total = 0; |
|     UnicodeSetIterator keys = new UnicodeSetIterator(remap.keySet()); |
|     while (keys.next()) { |
|         String from = keys.getString(); |
|         String to = (String) remap.getValue(keys.codepoint); |
|         writeSourceTargetLine(out, from, "remap-to", to, null); |
|         total++; |
|     } |
| |
|     out.println(""); |
|     out.println("# Total code points: " + total); |
| } |
| /** |
| * All code points matched by the pattern [:XID_Continue:]. Used by loadFileData() to |
| * skip additions that are already covered, and added to the preferred identifier |
| * set by getIdentifierSet(). |
| */ |
| static UnicodeSet XIDContinueSet = new UnicodeSet("[:XID_Continue:]"); |
| private static UnicodeSet IDNOutputSet, IDNInputSet, _preferredIDSet; /* all three are null until getIdentifierSet() has been called */ |
| |
| /** |
| * Returns the preferred identifier set, building it (and IDNOutputSet and |
| * IDNInputSet) on first use. |
| * <p> |
| * IDNOutputSet receives '-' plus every code point whose IDNA type is OK; |
| * IDNInputSet receives '-' plus every code point whose IDNA type is not ILLEGAL. |
| * Code points of category Cn, Co or Cs are not examined. The returned set is |
| * IDNOutputSet plus XIDContinueSet; U+2018 and U+2019 are added on every call. |
| * |
| * @return the cached set; all callers share the same instance |
| */ |
| static UnicodeSet getIdentifierSet() { |
|     if (_preferredIDSet == null) { |
|         IDNOutputSet = new UnicodeSet(); |
|         IDNInputSet = new UnicodeSet(); |
|         IDNOutputSet.add('-'); // HACK |
|         IDNInputSet.add('-'); |
|         for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint) { |
|             Utility.dot(codePoint); |
|             int category = Default.ucd().getCategory(codePoint); |
|             boolean skipped = category == UCD.Cn || category == UCD.Co || category == UCD.Cs; |
|             if (!skipped) { |
|                 // get IDNA |
|                 int idnaType = GenerateStringPrep.getIDNAType(codePoint); |
|                 if (idnaType == GenerateStringPrep.OK) { |
|                     IDNOutputSet.add(codePoint); |
|                 } |
|                 if (idnaType != GenerateStringPrep.ILLEGAL) { |
|                     IDNInputSet.add(codePoint); |
|                 } |
|             } |
|         } |
|         _preferredIDSet = new UnicodeSet(IDNOutputSet).addAll(XIDContinueSet); |
|     } |
|     _preferredIDSet.add(0x2018).add(0x2019); |
|     return _preferredIDSet; |
| } |
| |
| /** |
| * Returns the set of code points whose compatibility decomposition is to be |
| * skipped, building it together with nfcMap and nfkcMap on first use. |
| * <p> |
| * A code point is put in the set when its decomposition type is circle, super, |
| * sub, vertical, small, square or fraction; when it is in the identifier set but |
| * its NFKD form is not; or when it is not white space but its NFKD form contains |
| * white space. nfcMap receives the NFC form of every canonically decomposable |
| * code point; nfkcMap receives the modified NFKC form wherever that differs from |
| * both the code point and its NFC form. |
| * <p> |
| * The two maps are created, filled and frozen inside the same guard as the set. |
| * Previously they were replaced by new empty maps on every call but only filled on |
| * the first one, so a second call discarded their contents. |
| * |
| * @return the cached set; all callers share the same instance |
| */ |
| private static UnicodeSet getSkipNFKD() { |
|     if (_skipNFKD == null) { |
|         nfcMap = new UnicodeMap(); |
|         nfkcMap = new UnicodeMap(); |
|         _skipNFKD = new UnicodeSet(); |
|         UnicodeSet idSet = getIdentifierSet(); |
|         for (int cp = 0; cp <= 0x10FFFF; ++cp) { |
|             Utility.dot(cp); |
|             int cat = Default.ucd().getCategory(cp); |
|             if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue; |
|             int decompType = Default.ucd().getDecompositionType(cp); |
|             String nfc = Default.nfc().normalize(cp); |
|             if (decompType == UCD.CANONICAL) nfcMap.put(cp, nfc); |
|             if (decompType == UCD.COMPAT_CIRCLE |
|                     || decompType == UCD.COMPAT_SUPER |
|                     || decompType == UCD.COMPAT_SUB |
|                     || decompType == UCD.COMPAT_VERTICAL |
|                     || decompType == UCD.COMPAT_SMALL |
|                     || decompType == UCD.COMPAT_SQUARE |
|                     || decompType == UCD.COMPAT_FRACTION) { |
|                 _skipNFKD.add(cp); |
|                 continue; |
|             } |
|             String source = UTF16.valueOf(cp); |
|             String mapped = Default.nfkd().normalize(cp); |
|             String kmapped = getModifiedNKFC(source); |
|             if (!kmapped.equals(source) && !kmapped.equals(nfc)) { |
|                 if (kmapped.startsWith(" ") || kmapped.startsWith("\u0640")) { |
|                     System.out.println("?? " + Default.ucd().getCodeAndName(cp)); |
|                     System.out.println("\t" + Default.ucd().getCodeAndName(kmapped)); |
|                     kmapped = getModifiedNKFC(source); // for debugging |
|                 } |
|                 nfkcMap.put(cp, kmapped); |
|             } |
|             if (mapped.equals(source)) continue; |
|             if (idSet.contains(cp) && !idSet.contains(mapped)) _skipNFKD.add(cp); |
|             else if (!whiteSpace.contains(cp) && whiteSpace.containsSome(mapped)) _skipNFKD.add(cp); |
|         } |
|         nfcMap.setMissing(""); |
|         nfcMap.freeze(); |
|         nfkcMap.setMissing(""); |
|         nfkcMap.freeze(); |
|     } |
|     return _skipNFKD; |
| } |
| |
| /** |
| * Tells whether the text contains characters of more than one script, ignoring |
| * COMMON and INHERITED. |
| * |
| * @param source text to examine |
| * @return true when getSingleScript reports UScript.INVALID_CODE for the text |
| */ |
| private static boolean isMixedScript(String source) { |
|     int script = getSingleScript(source); |
|     return UScript.INVALID_CODE == script; |
| } |
| |
| /** |
| * Determines the single script used by the input text. Code points whose script |
| * is COMMON or INHERITED are ignored. |
| * |
| * @param source input text |
| * @return the script shared by all remaining code points; |
| * UScript.INVALID_CODE if more than one script is found; |
| * UScript.COMMON if the text is empty or contains only COMMON and INHERITED |
| * code points |
| */ |
| public static int getSingleScript(String source) { |
|     int found = UScript.COMMON; // COMMON doubles as "nothing found yet" |
|     int offset = 0; |
|     while (offset < source.length()) { |
|         int codePoint = UTF16.charAt(source, offset); |
|         offset += UTF16.getCharCount(codePoint); |
| |
|         int script = UScript.getScript(codePoint); |
|         boolean neutral = script == UScript.COMMON || script == UScript.INHERITED; |
|         if (neutral) { |
|             continue; |
|         } |
|         if (found == UScript.COMMON) { |
|             found = script; |
|         } else if (found != script) { |
|             return UScript.INVALID_CODE; |
|         } |
|     } |
|     return found; |
| } |
| |
| /** |
| * Handles the -c option: opens log.txt under outdir, runs the confusables |
| * generation from indir to outdir and closes the log again. |
| * <p> |
| * The log is closed in a finally block so that its contents are not lost and the |
| * file handle is not leaked when the generation fails. |
| * |
| * @throws IOException if the log or an output file cannot be written |
| */ |
| private static void generateConfusables() throws IOException { |
|     log = BagFormatter.openUTF8Writer(outdir, "log.txt"); |
|     try { |
|         //fixMichel(indir, outdir); |
|         generateConfusables(indir, outdir); |
|     } finally { |
|         log.close(); |
|     } |
|     // Disabled debugging dump of gatheredNFKD. |
|     if (false) for (Iterator it = gatheredNFKD.keySet().iterator(); it.hasNext();) { |
|         String source = (String) it.next(); |
|         System.out.println(Default.ucd().getCodeAndName(source) |
|                 + " => " + Default.ucd().getCodeAndName((String) gatheredNFKD.get(source))); |
|     } |
| } |
| |
| /* static class Data2 { |
| String source; |
| String target; |
| int count; |
| Data2(String target, int count) { |
| this.target = target; |
| this.count = count; |
| } |
| } |
| */ |
| /* static class Data implements Comparable { |
| String source; |
| String target; |
| String type; |
| Data(String source, String target, String type) { |
| this.source = source; |
| this.target = target; |
| this.type = type; |
| } |
| public int compareTo(Object o) { |
| int result; |
| Data that = (Data)o; |
| if (0 != (result = target.compareTo(that.target))) return result; |
| if (0 != (result = source.compareTo(that.source))) return result; |
| if (0 != (result = type.compareTo(that.type))) return result; |
| return 0; |
| } |
| } |
| */ |
| |
| /** |
| * |
| */ |
| static void writeSourceTargetLine(PrintWriter out, String source, String tag, String target, String reason) { |
| out.print( |
| Utility.hex(source) |
| + " ;\t" + Utility.hex(target) |
| + (tag == null ? "" : " ;\t" + tag) |
| //+ " ;\t" + (preferredID.contains(source) ? "ID" : "") |
| + "\t#" |
| + (isXid(source) ? "" : "*") |
| + " ( " + source + " " + ARROW + " " + target + " ) " |
| + Default.ucd().getName(source) + " " + ARROW + " " |
| + Default.ucd().getName(target) |
| ); |
| if (reason != null) out.print("\t# " + reason); |
| out.println(); |
| } |
| |
/**
 * Code points matched by the pattern [:Cc:] (General_Category = Control).
 * NOTE(review): no live code in the nearby part of the file reads this set;
 * confirm whether it is still needed.
 */
static UnicodeSet controls = new UnicodeSet("[:Cc:]");
| |
| static class MyEquivalenceClass extends XEquivalenceClass { |
| public MyEquivalenceClass() { |
| super("NONE"); |
| } |
| public boolean addCheck(String a, String b, String reason) { |
| // quick check for illegal containment, before changing object |
| if (checkForBad(a, b, reason) || checkForBad(b, a, reason)) { |
| return false; |
| } |
| super.add(a, b, reason); |
| // full check for any resulting illegal containment. |
| // illegal if for any x, y, x is a proper superstring of y |
| Set equivalences = getEquivalences(a); |
| for (Iterator it = equivalences.iterator(); it.hasNext();) { |
| String x = (String)it.next(); |
| if (!UTF16.hasMoreCodePointsThan(x,1)) continue; |
| for (Iterator it2 = equivalences.iterator(); it2.hasNext();) { |
| String y = (String)it2.next(); |
| if (x.equals(y)) continue; |
| if (x.indexOf(y) >= 0) throw new RuntimeException("Illegal containment: " |
| + Default.ucd().getCodeAndName(x) + " contains " |
| + Default.ucd().getCodeAndName(y) + " because " |
| + Default.ucd().getCodeAndName(a) + " ~ " |
| + Default.ucd().getCodeAndName(b) + " because of " |
| + reason); |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * |
| */ |
| private boolean checkForBad(String a, String b, String reason) { |
| Set equivalences = getEquivalences(b); |
| for (Iterator it = equivalences.iterator(); it.hasNext();) { |
| String b2 = (String)it.next(); |
| if (a.equals(b2)) continue; |
| if (b2.indexOf(a) >= 0 || a.indexOf(b2) >= 0) { |
| log.println("Illegal containment: " |
| + Default.ucd().getCodeAndName(a) |
| + " overlaps " |
| + Default.ucd().getCodeAndName(b2) |
| + "\r\n\tfrom " |
| + Default.ucd().getCodeAndName(b) |
| + "\r\n\twith reason " |
| + reason + " plus " |
| + getReasons(b2, b)); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| public XEquivalenceClass add(Object a1, Object b1, String reason) { |
| String a = (String)a1; |
| String b = (String)b1; |
| try { |
| addCheck(a, b, reason); |
| return this; |
| } catch (RuntimeException e) { |
| throw (RuntimeException) new RuntimeException("Failure adding " |
| + Default.ucd().getCodeAndName(a) + "; " |
| + Default.ucd().getCodeAndName(b) |
| + "; " + reason).initCause(e); |
| } |
| } |
| /** |
| * Only NFKD if the result doesn't cross from ID set to nonID set, and space is not added |
| */ |
| // private String specialNFKD(String item) { |
| // UnicodeSet skipSet = getSkipNFKD(); |
| // StringBuffer result = new StringBuffer(); |
| // int cp; |
| // for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) { |
| // cp = UTF16.charAt(item, i); |
| // if (skipSet.contains(cp)) { |
| // UTF16.append(result, cp); |
| // continue; |
| // } |
| // String cps = UTF16.valueOf(cp); |
| // String mapped = Default.nfkd().normalize(cps); |
| // if (cps.equals(mapped)) { |
| // UTF16.append(result, cp); |
| // continue; |
| // } |
| // result.append(mapped); |
| // gatheredNFKD.put(cps, mapped); |
| // } |
| // return result.toString(); |
| // } |
| |
| public void close(String reason) { |
| boolean addedItem; |
| StringBuffer reasons = new StringBuffer(); |
| do { |
| addedItem = false; |
| Set cloneForSafety = getOrderedExplicitItems(); |
| for (Iterator it = cloneForSafety.iterator(); it.hasNext();) { |
| String item = (String) it.next(); |
| if (!UTF16.hasMoreCodePointsThan(item,1)) continue; // just for speed |
| reasons.setLength(0); |
| String mapped = mapString(item, reasons); |
| if (!isEquivalent(item, mapped)) { |
| if (addCheck(item, mapped, reasons.toString())) { |
| // System.out.println("Closing: " + Default.ucd().getCodeAndName(item) + " => " + Default.ucd().getCodeAndName(mapped)); |
| addedItem = true; |
| } |
| } |
| } |
| } while (addedItem); |
| } |
| |
| /** |
| * |
| */ |
| private String mapString(String item, StringBuffer reasons) { |
| if (false && item.startsWith("\u03D2")) { |
| System.out.println("foo"); |
| } |
| StringBuffer result = new StringBuffer(); |
| int cp; |
| for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(item, i); |
| String cps = UTF16.valueOf(cp); |
| String mapped = getParadigm(cps, false, false); |
| if (mapped.indexOf(cps) >= 0) result.append(cps); |
| else { |
| result.append(mapped); |
| List x = getReasons(cps, mapped); |
| reasons.append(getBestForm(x)); |
| } |
| } |
| return result.toString(); |
| } |
| |
| private Object getBestForm(Collection x) { |
| if (x.size() != 1) return "[" + x + "]"; |
| Object item = x.iterator().next(); |
| if (!(item instanceof Collection)) return x.toString(); |
| return getBestForm((Collection)item); |
| } |
| |
| public String getParadigm(String item, boolean onlyLowercase, boolean onlySameScript) { |
| Set filteredSet; |
| if (onlyLowercase == false && onlySameScript == false) { |
| filteredSet = getEquivalences(item); |
| } else { |
| filteredSet = new HashSet(); |
| for (Iterator it = getEquivalences(item).iterator(); it.hasNext();) { |
| String other = (String) it.next(); |
| String combined = item + other; |
| if (onlyLowercase) { |
| boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD)); |
| if (!isLowercase) continue; |
| } |
| if (onlySameScript) { |
| boolean isMixed = isMixedScript(combined); |
| if (isMixed) continue; |
| } |
| filteredSet.add(other); |
| } |
| } |
| return (String) CollectionUtilities.getBest(filteredSet, betterTargetIsLess, -1); |
| } |
| |
| public Set getOrderedExplicitItems() { |
| Set cloneForSafety = new TreeSet(codepointComparator); |
| cloneForSafety.addAll(getExplicitItems()); |
| return cloneForSafety; |
| } |
| /** |
| * |
| */ |
| public void writeSource(PrintWriter out) { |
| Set items = getOrderedExplicitItems(); |
| for (Iterator it = items.iterator(); it.hasNext();) { |
| String item = (String) it.next(); |
| String paradigm = (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1); |
| if (item.equals(paradigm)) continue; |
| writeSourceTargetLine(out, item, null, paradigm, null); |
| } |
| } |
| } |
| |
| static class DataSet { |
| MyEquivalenceClass dataMixedLowercase = new MyEquivalenceClass(); |
| MyEquivalenceClass dataMixedAnycase = new MyEquivalenceClass(); |
| MyEquivalenceClass dataSingleLowercase = new MyEquivalenceClass(); |
| MyEquivalenceClass dataSingleAnycase = new MyEquivalenceClass(); |
| |
| public DataSet add(String source, String target, String type, int lineCount, String errorLine) { |
| if (skipSet.containsAll(source) || skipSet.containsAll(target)) return this; |
| String nsource = Default.nfd().normalize(source); |
| String ntarget = Default.nfd().normalize(target); |
| |
| // if it is just a compatibility match, return |
| //if (nsource.equals(ntarget)) return this; |
| if (type.indexOf("skip") >= 0) return this; |
| if (target.indexOf('\u203D') >= 0) return this; |
| |
| type = getReasonFromFilename(type); |
| |
| // if it is base + combining sequence => base2 + same combining sequence, do just the base |
| int nsourceFirst = UTF16.charAt(nsource,0); |
| String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst)); |
| int ntargetFirst = UTF16.charAt(ntarget,0); |
| String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst)); |
| |
| if (nsourceRest.length() != 0 && nsourceRest.equals(ntargetRest)) { |
| source = UTF16.valueOf(nsourceFirst); |
| target = UTF16.valueOf(ntargetFirst); |
| type += "-base"; |
| } |
| type += ":" + lineCount; |
| |
| String combined = source + target; |
| if (combined.indexOf("\u0430") >= 0) { |
| System.out.println(Default.ucd().getCodeAndName(combined)); |
| } |
| boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD)); |
| boolean isMixed = isMixedScript(combined); |
| dataMixedAnycase.add(source, target, type); |
| if (isLowercase) { |
| dataMixedLowercase.add(source, target, type); |
| } |
| if (!isMixed) { |
| dataSingleAnycase.add(source, target, type); |
| } |
| if (!isMixed && isLowercase) { |
| dataSingleLowercase.add(source, target, type); |
| } |
| return this; |
| } |
| |
| /* *//** |
| * @param errorLine TODO |
| * |
| *//* |
| private DataSet add(Data newData, String errorLine) { |
| if (controls.containsSome(newData.source) || controls.containsSome(newData.target)) { |
| System.out.println("Problem with " + errorLine); |
| System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target)); |
| } |
| String[] key = {newData.source, newData.target}; |
| Data old = (Data) dataMap.get(key); |
| if (old == null) { |
| dataSet.add(newData); |
| dataMap.put(key, newData); |
| }else { |
| old.type = old.type + "/" + newData.type; |
| } |
| return this; |
| } |
| */ // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt" |
| static final int NORMAL = 0, FOLDING = 1, OLD = 2; |
| |
| public DataSet addFile(String directory, String filename) throws IOException { |
| String line = null; |
| int count = 0; |
| try { |
| BufferedReader in = BagFormatter.openUTF8Reader(directory, filename); |
| int kind = NORMAL; |
| if (filename.indexOf("Folding") >= 0) kind = FOLDING; |
| else if (false && filename.indexOf("-old") >= 0) kind = OLD; |
| while (true) { |
| count++; |
| line = Utility.readDataLine(in); |
| if (line == null) break; |
| if (line.length() == 0) continue; |
| String[] pieces = Utility.split(line,';'); |
| if (pieces.length < 2) { |
| System.out.println("Error on: " + line); |
| continue; |
| } |
| String type = filename; |
| if (kind==FOLDING) { |
| String source = Utility.fromHex(pieces[0].trim(),true); |
| String target = Utility.fromHex(pieces[1].trim(),true); |
| String nsource = Default.nfkd().normalize(source); |
| String first = UTF16.valueOf(UTF16.charAt(nsource, 0)); |
| if (!first.equals(target)) { |
| add(source, target, type, count, line); |
| } |
| } else if (kind == OLD) { |
| String target = pieces[0].trim(); |
| for (int i = 1; i < pieces.length; ++i) { |
| add(pieces[i].trim(), target, type, count, line); |
| } |
| } else { |
| String source = Utility.fromHex(pieces[0].trim(),true); |
| String target = Utility.fromHex(pieces[1].trim(),true); |
| //if (pieces.length > 2) type = pieces[2].trim(); |
| String nfkdSource = Default.nfkd().normalize(source); |
| String nfkdTarget = Default.nfkd().normalize(target); |
| if (suppress_NFKC && nfkdSource.equals(nfkdTarget)) { |
| System.out.println("Suppressing nfkc for: " + Default.ucd().getCodeAndName(source)); |
| } else { |
| add(source, target, type, count, line); |
| } |
| } |
| } |
| in.close(); |
| return this; |
| } catch (Exception e) { |
| throw (RuntimeException) new RuntimeException("Failure with file: " |
| + directory + filename + " on line: " + count |
| + ": " + line).initCause(e); |
| } |
| } |
| |
| public void writeSource(String directory, String filename) throws IOException { |
| PrintWriter out = openAndWriteHeader(filename, "Source File for IDN Confusables"); |
| // PrintWriter out = BagFormatter.openUTF8Writer(directory, filename); |
| // out.println("# Source File for IDN Confusables"); |
| // out.println("# $Revision: 1.12 $"); |
| // out.println("# $Date: 2006/09/24 23:32:44 $"); |
| // out.println(""); |
| dataMixedAnycase.writeSource(out); |
| out.close(); |
| } |
| |
| public void writeSourceOrder(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException { |
| PrintWriter out = openAndWriteHeader(filename, "Recommended confusable mapping for IDN"); |
| // PrintWriter out = BagFormatter.openUTF8Writer(directory, filename); |
| // out.println("# Recommended confusable mapping for IDN"); |
| // out.println("# $Revision: 1.12 $"); |
| // out.println("# $Date: 2006/09/24 23:32:44 $"); |
| // out.println(""); |
| |
| if (appendFile) { |
| String[] replacements = {"%date%", Default.getDate()}; |
| Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt", |
| Utility.UTF8_WINDOWS, out, replacements); |
| } |
| if (true) { |
| writeSourceOrder(out, dataMixedAnycase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, true, true); |
| writeSourceOrder(out, dataMixedAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, true); |
| writeSourceOrder(out, dataMixedAnycase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, true, false); |
| writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false); |
| } else { |
| writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, false, false); |
| writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, false); |
| writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, false, false); |
| writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false); |
| } |
| out.close(); |
| } |
| /** |
| * @param skipNFKEquivs TODO |
| * @param onlyLowercase TODO |
| * @param onlySingleScript TODO |
| * |
| */ |
| private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs, boolean onlyLowercase, boolean onlySingleScript) { |
| // first get all the sets. Then get the best paradigm from each. Then sort. |
| // Set setOfSets = data.getEquivalenceSets(); |
| // Map orderedResults = new TreeMap(betterTargetIsLess); |
| // for (Iterator it = setOfSets.iterator(); it.hasNext();) { |
| // Set setOfEquivs = (Set) it.next(); |
| // Object item = CollectionUtilities.getBest(setOfEquivs, betterTargetIsLess, -1); |
| // |
| // } |
| //int c = codepointComparator.compare("\uFFFF", "\uD800\uDC00"); |
| //System.out.println("Code Point Compare: " + c); |
| Set items = data.getOrderedExplicitItems(); |
| out.println(); |
| out.println("# " + title); |
| out.println(); |
| int count = 0; |
| UnicodeSet preferredID = getIdentifierSet(); |
| ArrayComparator ac = new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator}); |
| Set orderedPairs = new TreeSet(ac); |
| for (Iterator it = items.iterator(); it.hasNext();) { |
| String source = (String) it.next(); |
| if (UTF16.hasMoreCodePointsThan(source,1)) continue; |
| String target = data.getParadigm(source, onlyLowercase, onlySingleScript); |
| if (target == null) continue; |
| if (source.equals(target)) continue; |
| if (skipNFKEquivs) { |
| if (!Default.nfkd().normalize(source).equals(source)) continue; |
| } |
| orderedPairs.add(new String[] {target, source}); |
| } |
| String lastTarget = null; |
| for (Iterator it = orderedPairs.iterator(); it.hasNext();) { |
| String[] pair = (String[]) it.next(); |
| String source = pair[1]; |
| String target = pair[0]; |
| String reason = fixReason(data.getReasons(source, target)); |
| if (lastTarget != null && !lastTarget.equals(target)) { |
| out.println(); |
| } |
| writeSourceTargetLine(out, source, tag, target, reason); |
| lastTarget = target; |
| count++; |
| } |
| out.println(); |
| out.println("# total for (" + tag + "): " + count); |
| out.println(); |
| } |
| |
| /** |
| * |
| */ |
| private String fixReason(List reasons) { |
| List first = (List)reasons.get(0); |
| String result = ""; |
| for (int i = 0; i < first.size(); ++i) { |
| if (i != 0) result += " "; |
| Object item = first.get(i); |
| if (item instanceof String) { |
| result += item; |
| } else { |
| String temp = ""; |
| for (Iterator it = ((Set)item).iterator(); it.hasNext();) { |
| if (temp.length() != 0) temp += "|"; |
| temp += it.next(); |
| } |
| result += "{" + temp + "}"; |
| } |
| } |
| return result.toString(); |
| } |
| |
| public void addAll(DataSet ds) { |
| dataMixedAnycase.addAll(ds.dataMixedAnycase); |
| dataMixedLowercase.addAll(ds.dataMixedLowercase); |
| dataSingleAnycase.addAll(ds.dataSingleAnycase); |
| dataSingleLowercase.addAll(ds.dataSingleLowercase); |
| } |
| /* *//** |
| * |
| *//* |
| public DataSet clean() { |
| // remove all skips |
| DataSet tempSet = new DataSet(); |
| Map m = new HashMap(); |
| for (Iterator it = dataSet.iterator(); it.hasNext();) { |
| Data d = (Data) it.next(); |
| if (d.type.indexOf("skip") >= 0) continue; |
| String newTarget = Default.nfkd().normalize(d.target); |
| String newSource = Default.nfkd().normalize(d.source); |
| String type = d.type; |
| if (!d.target.equals(newTarget) || !d.source.equals(newSource)) { |
| type += "-nf"; |
| log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource)); |
| log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type); |
| continue; |
| } |
| // swap order |
| if (preferSecondAsSource(newSource, newTarget)) { |
| String temp = newTarget; |
| newTarget = newSource; |
| newSource = temp; |
| } |
| |
| Data already = (Data) m.get(newSource); |
| if (already != null && !newTarget.equals(already.target)) { |
| log.println("X " + getCodeCharName(newSource) + " " + ARROW); |
| log.println("\t" + getCodeCharName(newTarget) + " \t" + type); |
| log.println("\t" + getCodeCharName(already.target) + " \t" + already.type); |
| if (preferSecondAsSource(already.target, newTarget)) { |
| // just fix new guy |
| type += "[" + newSource + "]" + already.type; |
| newSource = newTarget; |
| newTarget = already.target; |
| } else { |
| // need to fix new guy, AND fix old guy. |
| tempSet.remove(already); |
| type += "[" + newSource + "]" + already.type; |
| newSource = already.target; |
| already.type += "[" + already.target + "]" + type; |
| already.target = newTarget; |
| tempSet.add(already, ""); |
| } |
| } |
| Data newData = new Data(newSource, newTarget, type); |
| m.put(newSource, newData); |
| tempSet.add(newData, ""); |
| } |
| // now recursively apply |
| DataSet s = new DataSet(); |
| for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) { |
| Data d = (Data) it.next(); |
| int cp = 0; |
| StringBuffer result = new StringBuffer(); |
| for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(d.target, i); |
| String src = UTF16.valueOf(cp); |
| while (true) { |
| Data rep = (Data) m.get(src); |
| if (rep == null) break; |
| src = rep.target; |
| } |
| result.append(src); |
| } |
| String newTarget = result.toString(); |
| newTarget = Default.nfkd().normalize(newTarget); |
| s.add(d.source, newTarget, d.type + (newTarget.equals(newTarget) ? "" : "-rec"), ""); |
| } |
| return s; |
| } |
| *//** |
| * |
| *//* |
| private void remove(Data already) { |
| String[] key = {already.source, already.target}; |
| dataMap.remove(key); |
| dataSet.remove(already); |
| }*/ |
| /** |
| * |
| */ |
| public void close(String reason) { |
| dataMixedAnycase.close(reason); |
| dataMixedLowercase.close(reason); |
| dataSingleAnycase.close(reason); |
| dataSingleLowercase.close(reason); |
| } |
| /** |
| * |
| */ |
| public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) { |
| int count = 0; |
| for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.keySet()); it.next(); ) { |
| add(it.getString(), (String)decompMap.getValue(it.codepoint), type, ++count, errorLine); |
| } |
| } |
| |
| static class MyFilter implements XEquivalenceClass.Filter { |
| UnicodeSet output; |
| public boolean matches(Object o) { |
| return output.containsAll((String)o); |
| } |
| } |
| |
| static class MyCollectionFilter implements CollectionUtilities.ObjectMatcher { |
| UnicodeSet outputAllowed; |
| int minLength; |
| public boolean matches(Object o) { |
| String item = (String)o; |
| if (!outputAllowed.containsAll(item)) return false; |
| int len = UTF16.countCodePoint(item); |
| if (len < minLength) minLength = len; |
| return true; |
| } |
| }; |
| /** |
| * @param script TODO |
| * @throws IOException |
| * |
| */ |
| public void writeSummary(String outdir, String filename, boolean outputOnly, UnicodeSet script) throws IOException { |
| PrintWriter out = openAndWriteHeader(filename, "Summary: Recommended confusable mapping for IDN"); |
| // PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename); |
| // out.print('\uFEFF'); |
| // out.println("# Summary: Recommended confusable mapping for IDN"); |
| // out.println("# $Revision: 1.12 $"); |
| // out.println("# $Date: 2006/09/24 23:32:44 $"); |
| // out.println(""); |
| UnicodeSet representable = new UnicodeSet(); |
| MyEquivalenceClass data = dataMixedAnycase; |
| Set items = data.getOrderedExplicitItems(); |
| // for (Iterator it = items.iterator(); it.hasNext();) { |
| // System.out.println(Default.ucd().getCodeAndName((String)it.next())); |
| // } |
| int count = 0; |
| UnicodeSet preferredID = getIdentifierSet(); |
| String lastTarget = ""; |
| Set itemsSeen = new HashSet(); |
| Set equivalents = new TreeSet(betterTargetIsLess); |
| MyCollectionFilter myFilter = new MyCollectionFilter(); |
| myFilter.outputAllowed= new UnicodeSet("[[\u0021-\u007E]-[:letter:]]") |
| .addAll(IdentifierInfo.getIdentifierInfo().remainingOutputSet) |
| .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict); |
| |
| for (Iterator it = items.iterator(); it.hasNext();) { |
| String target = (String) it.next(); |
| if (itemsSeen.contains(target)) continue; |
| equivalents.clear(); |
| equivalents.addAll(data.getEquivalences(target)); |
| itemsSeen.addAll(equivalents); |
| if (outputOnly) { // remove non-output |
| myFilter.minLength = 1000; |
| CollectionUtilities.retainAll(equivalents, myFilter); |
| if (equivalents.size() <= 1) continue; |
| if (myFilter.minLength > 1) continue; |
| if (!equivalents.contains(target)) { // select new target if needed |
| target = (String) equivalents.iterator().next(); |
| } |
| } |
| scriptTest: |
| if (script != null) { |
| // see if at least one item contains the target script |
| for (Iterator it2 = equivalents.iterator(); it2.hasNext();) { |
| String item = (String) it2.next(); |
| if (script.containsAll(item)) { |
| target = item; |
| for (Iterator it3 = equivalents.iterator(); it3.hasNext();) { |
| representable.addAll((String)it3.next()); |
| } |
| break scriptTest; |
| } |
| } |
| continue; // skip this one |
| } |
| out.println(); |
| out.println(getStatus(target) + "\t" + "(\u200E " + target + " \u200E)\t" + Utility.hex(target) + "\t " + Default.ucd().getName(target)); |
| //if (UTF16.hasMoreCodePointsThan(source,1)) continue; |
| for (Iterator it2 = equivalents.iterator(); it2.hasNext();) { |
| String source = (String) it2.next(); |
| if (source.equals(target)) continue; |
| //boolean compatEqual = Default.nfkd().normalize(source).equals(Default.nfkd().normalize(target)); |
| //if (EXCLUDE_CONFUSABLE_COMPAT && compatEqual) continue; |
| String reason = fixReason(data.getReasons(source, target)); |
| //if (!outputAllowed.containsAll(source)) continue; |
| // if (compatEqual) { |
| // out.print("\u21D0"); |
| // } else { |
| // out.print("\u2190"); |
| // } |
| out.println("\u2190" + getStatus(source) + "\t" + "(\u200E " + source + " \u200E)\t" + Utility.hex(source) + "\t " + Default.ucd().getName(source) |
| + "\t# " + reason); |
| count++; |
| } |
| } |
| out.println(); |
| out.println("# total : " + count); |
| out.println(); |
| if (script != null) { |
| out.println(); |
| out.println("# Base Letters Representable with Script"); |
| out.println(); |
| representable.removeAll(script); |
| BagFormatter bf = new BagFormatter(); |
| bf.setValueSource(ups.getProperty("script")); |
| bf.setShowLiteral(TransliteratorUtilities.toHTMLControl); |
| bf.showSetNames(out, representable); |
| } |
| out.close(); |
| } |
| |
| |
| |
| public void writeWholeScripts(String outdir, String filename) throws IOException { |
| UnicodeSet commonAndInherited = new UnicodeSet( |
| "[[:script=common:][:script=inherited:]]"); |
| |
| WholeScript wsLower = new WholeScript( |
| new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) |
| .removeAll(new UnicodeSet("[A-Z]")), "L"); |
| WholeScript wsAny = new WholeScript( |
| new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) |
| .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict), "A"); |
| |
| MyEquivalenceClass data = new MyEquivalenceClass(); |
| for (Iterator it = dataMixedAnycase.getSamples().iterator(); it.hasNext();) { |
| String target = (String) it.next(); |
| Set equivalents = dataMixedAnycase.getEquivalences(target); |
| boolean first = true; |
| for (Iterator it2 = equivalents.iterator(); it2.hasNext();) { |
| String cleaned = CollectionUtilities.remove((String)it2.next(), commonAndInherited); |
| if (cleaned.length() == 0) continue; |
| if (first) { |
| target = cleaned; |
| first = false; |
| } else { |
| data.add(target, cleaned); |
| } |
| } |
| } |
| Set itemsSeen = new HashSet(); |
| for (Iterator it = data.getOrderedExplicitItems().iterator(); it.hasNext();) { |
| String target = (String) it.next(); |
| if (itemsSeen.contains(target)) continue; |
| Set equivalents = data.getEquivalences(target); |
| itemsSeen.addAll(equivalents); |
| wsAny.addEquivalents(equivalents); |
| wsLower.addEquivalents(equivalents); |
| } |
| PrintWriter out = openAndWriteHeader(filename, "Summary: Whole-Script Confusables"); |
| // PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename); |
| // out.print('\uFEFF'); |
| // out.println("# Summary: Whole-Script Confusables"); |
| // out.println("# $Revision: 1.12 $"); |
| // out.println("# $Date: 2006/09/24 23:32:44 $"); |
| out.println("# This data is used for determining whether a strings is a"); |
| out.println("# whole-script or mixed-script confusable."); |
| out.println("# The mappings here ignore common and inherited script characters,"); |
| out.println("# such as accents."); |
| out.println(""); |
| out.println("# Lowercase Only"); |
| out.println(""); |
| wsLower.write(out); |
| out.println(""); |
| out.println("# Any-Case"); |
| out.println(""); |
| wsAny.write(out); |
| out.close(); |
| } |
| /** |
| * |
| */ |
| private String getStatus(String source) { |
| // TODO Auto-generated method stub |
| int val = betterTargetIsLess.getValue(source); |
| if (val == MARK_NOT_NFC.intValue()) return "[x]"; |
| if (val == MARK_NFC.intValue()) return "[x]"; |
| if (val == MARK_INPUT_LENIENT.intValue()) return "[L]"; |
| if (val == MARK_INPUT_STRICT.intValue()) return "[I]"; |
| if (val == MARK_OUTPUT.intValue()) return "[O]"; |
| if (val == MARK_ASCII.intValue()) return "[A]"; |
| |
| return "?"; |
| } |
| } |
| |
| static class WholeScript { |
| private static UnicodeSet commonAndInherited = new UnicodeSet("[[:script=common:][:script=inherited:]]"); |
| private UnicodeSet filterSet; |
| private UnicodeSet[] script_representables = new UnicodeSet[UScript.CODE_LIMIT]; |
| private UnicodeSet[] script_set = new UnicodeSet[UScript.CODE_LIMIT]; |
| private BagFormatter bf = new BagFormatter(); |
| private String label; |
| { |
| for (int i = 0; i < UScript.CODE_LIMIT; ++i) { |
| script_representables[i] = new UnicodeSet(); |
| script_set[i] = new UnicodeSet("[:script=" + UScript.getName(i) + ":]"); // ugly hack |
| } |
| bf.setValueSource(ups.getProperty("script")); |
| bf.setShowLiteral(TransliteratorUtilities.toHTMLControl); |
| bf.setLabelSource(UnicodeLabel.NULL); |
| } |
| WholeScript(UnicodeSet filterSet, String label) { |
| this.filterSet = filterSet; |
| this.label = label; |
| finished = false; |
| } |
| void addEquivalents(Set set) { |
| finished = false; |
| // if we have y ~ x, and both are single scripts |
| // that means that x can be represented in script(y), |
| // and y can be represented in script(x). |
| for (Iterator it = set.iterator(); it.hasNext();) { |
| String item1 = (String)it.next(); |
| if (!filterSet.containsAll(item1)) continue; |
| int script1 = getSingleScript(item1); |
| if (script1 == UScript.INVALID_CODE) continue; |
| for (Iterator it2 = set.iterator(); it2.hasNext();) { |
| String item2 = (String)it2.next(); |
| if (!filterSet.containsAll(item2)) continue; |
| int script2 = getSingleScript(item2); |
| if (script2 == UScript.INVALID_CODE || script2 == script1) continue; |
| script_representables[script1].addAll(item2).removeAll(commonAndInherited); |
| } |
| } |
| } |
| |
| public static class UnicodeSetToScript { |
| public int getScript() { |
| return script; |
| } |
| public UnicodeSetToScript setScript(int script) { |
| this.script = script; |
| return this; |
| } |
| public UnicodeSet getSet() { |
| return set; |
| } |
| public UnicodeSetToScript setSet(UnicodeSet set) { |
| this.set = set; |
| return this; |
| } |
| private UnicodeSet set; |
| private int script; |
| } |
| |
| UnicodeSetToScript[][] scriptToUnicodeSetToScript = new UnicodeSetToScript[UScript.CODE_LIMIT][]; |
| UnicodeSet[] fastReject = new UnicodeSet[UScript.CODE_LIMIT]; |
| boolean finished = false; |
| |
| void finish() { |
| if (finished) return; |
| for (int j = 0; j < UScript.CODE_LIMIT; ++j) { |
| if (j == UScript.COMMON || j == UScript.INHERITED) continue; |
| if (script_representables[j].size() == 0) continue; |
| UnicodeSet accept = new UnicodeSet(); |
| List curr = new ArrayList(); |
| for (int k = 0; k < UScript.CODE_LIMIT; ++k) { |
| if (k == UScript.COMMON || k == UScript.INHERITED) continue; |
| if (script_representables[k].size() == 0) continue; |
| |
| if (script_set[j].containsNone(script_representables[k])) continue; |
| UnicodeSet items = new UnicodeSet(script_set[j]).retainAll(script_representables[k]); |
| UnicodeSetToScript uss = new UnicodeSetToScript().setScript(k).setSet(items); |
| curr.add(uss); |
| } |
| scriptToUnicodeSetToScript[j] = (UnicodeSetToScript[]) curr.toArray(new UnicodeSetToScript[curr.size()]); |
| fastReject[j] = accept.complement(); |
| } |
| finished = true; |
| } |
| |
| void write(PrintWriter out) throws IOException { |
| finish(); |
| for (int j = 0; j < UScript.CODE_LIMIT; ++j) { |
| if (scriptToUnicodeSetToScript[j] == null) continue; |
| for (int q = 0; q < scriptToUnicodeSetToScript[j].length; ++q) { |
| UnicodeSetToScript uss = scriptToUnicodeSetToScript[j][q]; |
| int k = uss.getScript(); |
| UnicodeSet items = uss.getSet(); |
| String sname = UScript.getShortName(j) + "; " + UScript.getShortName(k) + "; " + label; |
| String name = UScript.getName(j) + "; " + UScript.getName(k); |
| out.println("# " + name + ": " + items.toPattern(false)); |
| out.println(""); |
| bf.setValueSource(sname); |
| bf.showSetNames(out, items); |
| out.println(""); |
| } |
| } |
| } |
| |
| } |
| |
| /** |
| * @throws IOException |
| * |
| */ |
| private static void fixMichel(String indir, String outdir) throws IOException { |
| BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt"); |
| PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt"); |
| while (true) { |
| String line = Utility.readDataLine(in); |
| if (line == null) break; |
| String[] pieces = Utility.split(line,'\t'); |
| if (pieces.length < 2) { |
| out.println(line); |
| continue; |
| } |
| String source = Utility.fromHex(pieces[0].trim()); |
| if (Default.nfkd().isNormalized(source)) { |
| out.println(line); |
| } |
| } |
| in.close(); |
| out.close(); |
| } |
| /** |
| * |
| */ |
| |
| private static void generateSource() throws IOException { |
| File dir = new File(indir); |
| String[] names = dir.list(); |
| Set sources = new TreeSet(new ArrayComparator( |
| new Comparator[] {codepointComparator, codepointComparator})); |
| |
| int[] count = new int[1]; |
| for (int i = 0; i < names.length; ++i) { |
| if (new File(indir + names[i]).isDirectory()) continue; |
| if (!names[i].startsWith("confusables")) continue; |
| String reason = getReasonFromFilename(names[i]); |
| System.out.println(names[i]); |
| BufferedReader in = BagFormatter.openUTF8Reader(indir, names[i]); |
| String line; |
| count[0] = 0; |
| while (true) { |
| line = Utility.readDataLine(in, count); |
| if (line == null) break; |
| if (line.length() == 0) continue; |
| String[] pieces = Utility.split(line,';'); |
| if (pieces.length < 2) { |
| System.out.println("Error on: " + line); |
| continue; |
| } |
| String source = Utility.fromHex(pieces[0].trim(),true); |
| String target = Utility.fromHex(pieces[1].trim(),true); |
| |
| if (source.length() == 0 || target.length() == 0) { |
| throw new IllegalArgumentException("zero-length item: " + count[0] + ":\t" + line); |
| } |
| |
| // check for identical combining sequences |
| String nsource = Default.nfc().normalize(source); |
| String ntarget = Default.nfc().normalize(target); |
| if (nsource.equals(ntarget)) continue; |
| |
| if (true) { |
| int nsourceFirst = UTF16.charAt(nsource,0); |
| String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst)); |
| int ntargetFirst = UTF16.charAt(ntarget,0); |
| String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst)); |
| if (nsourceRest.equals(ntargetRest)) { |
| source = UTF16.valueOf(nsourceFirst); |
| target = UTF16.valueOf(ntargetFirst); |
| } |
| } |
| |
| if (betterTargetIsLess.compare(source, target) < 0) { |
| String temp = source; |
| source = target; |
| target = temp; |
| } |
| sources.add(new String[] {source, target}); |
| } |
| in.close(); |
| } |
| PrintWriter out = BagFormatter.openUTF8Writer(outdir, "confusableSource.txt"); |
| for (Iterator it = sources.iterator(); it.hasNext();) { |
| String[] sourceItem = (String[]) it.next(); |
| writeSourceTargetLine(out, sourceItem[0], null, sourceItem[1], null); |
| } |
| out.close(); |
| } |
| |
| private static void generateConfusables(String indir, String outdir) throws IOException { |
| File dir = new File(indir); |
| String[] names = dir.list(); |
| DataSet total = new DataSet(); |
| for (int i = 0; i < names.length; ++i) { |
| if (new File(indir + names[i]).isDirectory()) continue; |
| if (!names[i].startsWith("confusables")) continue; |
| System.out.println(names[i]); |
| DataSet ds = new DataSet(); |
| ds.addFile(indir, names[i]); |
| ds.writeSource(outdir, "new-" + names[i]); |
| ds.close("*"); |
| total.addAll(ds); |
| total.close("t*" + names[i]); |
| } |
| // add normalized data |
| // for (int i = 0; i <= 0x10FFFF; ++i) { |
| // if (Default.nfkc().isNormalized(i)) continue; |
| // String result = getModifiedNKFC(UTF16.valueOf(i)); |
| // ds.foo(); |
| // } |
| getSkipNFKD(); |
| DataSet ds = new DataSet(); |
| ds.addUnicodeMap(nfcMap, "nfc", "nfc"); |
| ds.close("*"); |
| total.addAll(ds); |
| total.close("*"); |
| |
| ds = new DataSet(); |
| ds.addUnicodeMap(nfkcMap, "nfkc", "nfkc"); |
| ds.close("*"); |
| //ds.write(outdir, "new-decomp.txt", false, false); |
| total.addAll(ds); |
| total.close("*"); |
| |
| total.writeSummary(outdir, "confusablesSummary.txt", false, null); |
| total.writeSummary(outdir, "confusablesSummaryIdentifier.txt", true, null); |
| //total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true, |
| // new UnicodeSet("[[:script=Cyrillic:][:script=common:][:script=inherited:]]")); |
| total.writeWholeScripts(outdir, "confusablesWholeScript.txt"); |
| total.writeSourceOrder(outdir, "confusables.txt", false, false); |
| //DataSet clean = total.clean(); |
| //clean.write(outdir, "confusables.txt", true); |
| } |
| /* |
| BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"); |
| Set set = new TreeSet(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(), |
| new UTF16.StringComparator()})); |
| while (true) { |
| String line = Utility.readDataLine(in); |
| if (line == null) break; |
| if (line.length() == 0) continue; |
| String[] pieces = Utility.split(line,';'); |
| if (pieces.length < 2) { |
| System.out.println("Error on: " + line); |
| continue; |
| } |
| String source = Utility.fromHex(pieces[0].trim()); |
| String target = Utility.fromHex(pieces[1].trim()); |
| String nsource = Default.nfkd().normalize(source); |
| String first = UTF16.valueOf(UTF16.charAt(nsource, 0)); |
| if (!first.equals(target)) { |
| set.add(new String[]{source, target}); |
| } |
| } |
| in.close(); |
| |
| } |
| public static void gen() throws IOException { |
| Map m = new TreeMap(); |
| BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt"); |
| while (true) { |
| String line = in.readLine(); |
| if (line == null) break; |
| String[] pieces = Utility.split(line,';'); |
| if (pieces.length < 3) { |
| System.out.println("Error on: " + line); |
| continue; |
| } |
| int codepoint = Integer.parseInt(pieces[1], 16); |
| int cat = Default.ucd().getCategory(codepoint); |
| if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use |
| if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC |
| String result = Utility.fromHex(pieces[0]); |
| if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC |
| int count = Integer.parseInt(pieces[2]); |
| String source = UTF16.valueOf(codepoint); |
| add(m, source, result, count); |
| } |
| in.close(); |
| |
| in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt"); |
| while (true) { |
| String line = in.readLine(); |
| if (line == null) break; |
| line = line.trim(); |
| int pos = line.indexOf("#"); |
| if (pos >= 0) line = line.substring(0,pos).trim(); |
| if (line.length() == 0) continue; |
| if (line.startsWith("@")) continue; |
| String[] pieces = Utility.split(line,';'); |
| if (pieces.length < 2) { |
| System.out.println("Error on: " + line); |
| continue; |
| } |
| String source = pieces[0].trim(); |
| for (int i = 1; i < pieces.length; ++i) { |
| add(m, source, pieces[i].trim(), -1); |
| } |
| } |
| in.close(); |
| |
| boolean gotOne; |
| // close the set |
| do { |
| gotOne = false; |
| for (Iterator it = m.keySet().iterator(); it.hasNext();) { |
| String source = (String) it.next(); |
| Data2 data = (Data2) m.get(source); |
| Data2 data2 = (Data2) m.get(data.target); |
| if (data2 == null) continue; |
| data.target = data2.target; |
| gotOne = true; |
| break; |
| } |
| } while (gotOne); |
| // put into different sorting order |
| Set s = new TreeSet(); |
| for (Iterator it = m.keySet().iterator(); it.hasNext();) { |
| String source = (String) it.next(); |
| Data2 data = (Data2) m.get(source); |
| s.add(new Data(source, data.target, data.count)); |
| } |
| // write it out |
| PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt"); |
| String[] replacements = {"%date%", Default.getDate()}; |
| Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt", |
| Utility.UTF8_WINDOWS, out, replacements); |
| for (Iterator it = s.iterator(); it.hasNext();) { |
| Data d = (Data) it.next(); |
| if (d == null) continue; |
| out.println(formatLine(d.source, d.target, d.count)); |
| } |
| |
| out.close(); |
| System.out.println("Done"); |
| } |
| /** |
| * |
| */ |
| private static String formatLine(String source, String target, int count) { |
| return Utility.hex(source) + " ; " + Utility.hex(target," ") |
| + " ; " + count |
| + " # " |
| + "(" + source + " " + ARROW + " " + target + ") " |
| + Default.ucd().getName(source) |
| + " " + ARROW + " " + Default.ucd().getName(target); |
| } |
| /** |
| * |
| */ |
| /* private static void add(Map m, String source, String target, int count) { |
| if (source.length() == 0 || target.length() == 0) return; |
| if (preferSecondAsSource(source, target)) { |
| String temp = target; |
| target = source; |
| source = temp; |
| } |
| Data2 other = (Data2) m.get(source); |
| if (other != null) { |
| if (target.equals(other.target)) return; |
| System.out.println("conflict"); |
| System.out.println(formatLine(source, target, count)); |
| System.out.println(formatLine(source, other.target, other.count)); |
| // skip adding this, and instead add result -> other.target |
| add(m, target, other.target, count); |
| } else { |
| m.put(source, new Data2(target, count)); |
| } |
| }; |
| */ |
| |
| static Integer |
| MARK_NOT_NFC = new Integer(50), |
| MARK_NFC = new Integer(40), |
| MARK_INPUT_LENIENT = new Integer(30), |
| MARK_INPUT_STRICT = new Integer(20), |
| MARK_OUTPUT = new Integer(10), |
| MARK_ASCII = new Integer(10); |
| |
| static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess(); |
| |
| static UnicodeSet XID = new UnicodeSet("[:xidcontinue:]"); |
| |
| static boolean isXid(String x) { |
| return XID.containsAll(x); |
| } |
| |
| static class _BetterTargetIsLess implements Comparator { |
| IdentifierInfo info = IdentifierInfo.getIdentifierInfo(); |
| |
| public int compare(Object o1, Object o2) { |
| String a = (String)o1; |
| String b = (String)o2; |
| // longer is better (less) |
| int ca = UTF16.countCodePoint(a); |
| int cb = UTF16.countCodePoint(b); |
| if (ca != cb) { |
| return ca > cb ? -1 : 1; |
| } |
| |
| // is Identifier is better |
| boolean ba = isXid(a); |
| boolean bb = isXid(b); |
| if (ba != bb) { |
| return ba ? -1 : 1; |
| } |
| |
| int aok = getValue(a); |
| int bok = getValue(b); |
| if (aok != bok) return aok < bok ? -1 : 1; |
| return codepointComparator.compare(a, b); |
| } |
| static final int BAD = 1000; |
| |
| private int getValue(String a) { // lower is better |
| int cp; |
| int lastValue = 0; |
| for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(a, i); |
| Object objValue = info.lowerIsBetter.getValue(cp); |
| int value = ((Integer) objValue).intValue(); |
| if (value > lastValue) lastValue = value; |
| } |
| return lastValue; |
| } |
| }; |
| |
| /* static private boolean preferSecondAsSource(String a, String b) { |
| // if first is longer, prefer second |
| int ca = UTF16.countCodePoint(a); |
| int cb = UTF16.countCodePoint(b); |
| if (ca != cb) { |
| return ca > cb; |
| } |
| // if first is lower, prefer second |
| return a.compareTo(b) < 0; |
| } |
| */ |
| static String getCodeCharName(String a) { |
| return Default.ucd().getCode(a) + "( " + a + " ) " + Default.ucd().getName(a); |
| } |
| /** |
| * Returns the part between - and . |
| */ |
| public static String getReasonFromFilename(String type) { |
| int period = type.lastIndexOf('.'); |
| if (period < 0) period = type.length(); |
| int dash = type.lastIndexOf('-', period); |
| return type.substring(dash+1,period); |
| } |
| |
| static Normalizer modNFKC ; |
| |
| private static String getModifiedNKFC(String cf) { |
| if (modNFKC == null) { |
| modNFKC = new Normalizer(Normalizer.NFKC, Default.ucdVersion()); |
| modNFKC.setSpacingSubstitute(); |
| } |
| return modNFKC.normalize(cf); |
| } |
| |
| private static PrintWriter openAndWriteHeader(String filename, String title) throws IOException { |
| PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename); |
| out.print('\uFEFF'); |
| out.println("# " + title); |
| out.println("# File: " + filename); |
| out.println("# Version: " + version); |
| out.println("# Generated: " + Default.getDate()); |
| out.println("# Checkin: $Revision: 1.12 $"); |
| out.println("#"); |
| out.println("# For documentation and usage, see http://www.unicode.org/reports/tr39/"); |
| out.println("#"); |
| return out; |
| } |
| |
| } |