// unicodetools/com/ibm/text/UCD/GenerateConfusables.java - external/github.com/unicode-org/icu - Git at Google

 /**
 *******************************************************************************
 * Copyright (C) 1996-2001, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
 * $Date: 2006/09/24 23:32:44 $
 * $Revision: 1.12 $
 *
 *******************************************************************************
 */

 package com.ibm.text.UCD;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;

 import com.ibm.icu.dev.test.util.ArrayComparator;
 import com.ibm.icu.dev.test.util.BagFormatter;
 import com.ibm.icu.dev.test.util.TransliteratorUtilities;
 import com.ibm.icu.dev.test.util.UnicodeLabel;
 import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.dev.test.util.UnicodeProperty;
 import com.ibm.icu.dev.test.util.XEquivalenceClass;
 import com.ibm.icu.impl.CollectionUtilities;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSetIterator;
 import com.ibm.icu.util.ULocale;
 import com.ibm.text.utility.Utility;


 public class GenerateConfusables {
     /** Version label of this generator; no use of it is visible in this part of the file. */
     public static String version = "2.0";
 	/**
 	 * Switch that, judging by its name, excludes compatibility confusables.
 	 * NOTE(review): no reader of this flag is visible in this part of the file - confirm usage.
 	 */
 	public static boolean EXCLUDE_CONFUSABLE_COMPAT = true;

 	/**
 	 * Command-line entry point. Runs the quick self-test first, then every generator
 	 * whose flag is present among the arguments:
 	 * -b (IDN files), -c (confusables), -d (decomposition file), -s (source).
 	 * An exception thrown by a generator is printed rather than propagated, and
 	 * "Done" is written to standard output in every case.
 	 *
 	 * @param args command-line flags; order and repetition do not matter
 	 * @throws IOException declared for callers; generator failures are caught and printed
 	 */
 	public static void main(String[] args) throws IOException {
 		quickTest();

 		Set options = new HashSet(Arrays.asList(args));
 		try {
 			if (options.contains("-b")) {
 				generateIDN();
 			}
 			if (options.contains("-c")) {
 				generateConfusables();
 			}
 			if (options.contains("-d")) {
 				generateDecompFile();
 			}
 			if (options.contains("-s")) {
 				generateSource();
 			}
 		} catch (Exception failure) {
 			failure.printStackTrace();
 		} finally {
 			System.out.println("Done");
 		}
 	}

     /**
      * Ad-hoc smoke test executed at the start of main(). It calls the script
      * detector, the target-preference comparator and the equivalence-class lookup
      * with a few fixed strings. All results are discarded, so the method only has
      * an observable effect if one of the calls throws.
      */
     private static void quickTest() {
         // Two-character strings mixing a Cyrillic and a Latin letter, in both orders.
         getSingleScript("\u0430\u0061");
         getSingleScript("\u0061\u0430");

         String first = "\u0323";
         String second = "\u093C";
         betterTargetIsLess.compare(first, second);

         MyEquivalenceClass equivalences = new MyEquivalenceClass();
         equivalences.add(first, second, "none");
         Set members = equivalences.getEquivalences(first);
         String best = (String) CollectionUtilities.getBest(members, betterTargetIsLess, -1);
     }

 	/**
 	 * Cache for getNonIICore(); stays null until the set has been built completely.
 	 */
 	static UnicodeSet _Non_IICore;

 	/**
 	 * Returns the ideographs that are treated as "not IICore". The set is built once
 	 * and cached:
 	 * <ol>
 	 * <li>start with the blocks CJK Unified Ideographs Extension A and Extension B,</li>
 	 * <li>remove everything in UNASSIGNED,</li>
 	 * <li>remove the code points whose kIICore Han value is "2.1" (U+34E4 and U+3007
 	 *     are forced to that value first),</li>
 	 * <li>remove every code point or range listed in the first field of
 	 *     cjk_nic.txt (read from indir).</li>
 	 * </ol>
 	 * The cache field is assigned only after all steps succeeded, so a failure while
 	 * reading the file cannot leave a half-built set behind for later callers.
 	 *
 	 * @return the cached set; callers share the same instance
 	 * @throws RuntimeException wrapping the cause if cjk_nic.txt cannot be opened,
 	 *         read or parsed; the message names the line being processed
 	 */
 	private static UnicodeSet getNonIICore() {
 		if (_Non_IICore != null) {
 			return _Non_IICore;
 		}

 		// stuff to remove
 		UnicodeSet result = ups.getSet("block=CJK_Unified_Ideographs_Extension_A");
 		result.addAll(ups.getSet("block=CJK_Unified_Ideographs_Extension_B"));
 		result.removeAll(UNASSIGNED); // remove unassigned

 		// stuff to restore
 		UnicodeMap um = Default.ucd().getHanValue("kIICore");
 		um.put(0x34E4, "2.1");
 		um.put(0x3007, "2.1");
 		result.removeAll(um.getSet("2.1"));

 		// Also restore the code points listed in cjk_nic.txt.
 		UnicodeSet cjk_nic = new UnicodeSet();
 		String line = null;
 		BufferedReader br = null;
 		try {
 			br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt");
 			while (true) {
 				line = Utility.readDataLine(br);
 				if (line == null) break;
 				if (line.length() == 0) continue;
 				String[] pieces = Utility.split(line, ';');
 				// part 0 is either a single hex code point or "start..end"
 				String range = pieces[0].trim();
 				int rangeDivider = range.indexOf("..");
 				int start, end;
 				if (rangeDivider < 0) {
 					start = end = Integer.parseInt(range, 16);
 				} else {
 					start = Integer.parseInt(range.substring(0, rangeDivider), 16);
 					end = Integer.parseInt(range.substring(rangeDivider + 2), 16);
 				}
 				cjk_nic.add(start, end);
 			}
 		} catch (Exception e) {
 			throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
 		} finally {
 			// Always release the file, also when parsing failed.
 			if (br != null) {
 				try {
 					br.close();
 				} catch (IOException ignored) {
 					// Nothing useful can be done about a failed close of a read-only file.
 				}
 			}
 		}
 		result.removeAll(cjk_nic);

 		_Non_IICore = result;
 		return result;
 	}

 	// Log file writer; opened and closed by generateConfusables().
 	static PrintWriter log;
 	// Arrow printed between source and target in the comment part of output lines.
 	static final String ARROW = "\u2192"; // \u2194
 	// Property source used throughout this file to look up sets and properties by name.
 	static UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(""); // ICUPropertyFactory.make();
 	// Despite the name this is the union of gc=Cn, gc=Co and gc=Cs
 	// (unassigned, private use and surrogate code points).
 	static UnicodeSet UNASSIGNED = ups.getSet("gc=Cn")
 		.addAll(ups.getSet("gc=Co"))
 		.addAll(ups.getSet("gc=Cs"));
 	// gc=Cc and gc=Cf plus everything in UNASSIGNED.
 	static UnicodeSet skipSet = ups.getSet("gc=Cc")
 		.addAll(ups.getSet("gc=Cf"))
 		.addAll(UNASSIGNED);
 	static UnicodeSet whiteSpace = ups.getSet("Whitespace=TRUE");
 	static UnicodeSet lowercase = ups.getSet("gc=Ll");
 	// Cache for getSkipNFKD(); null until that method has run.
 	static UnicodeSet _skipNFKD;

 	// Only read by a disabled debug dump in generateConfusables(); writers are not visible here.
 	static Map gatheredNFKD = new TreeMap();
     // Both maps are created, filled and frozen by getSkipNFKD().
     static UnicodeMap nfcMap;
     static UnicodeMap nfkcMap;

 	// Hard-coded Windows locations of the input data files and of the generated files.
 	static String indir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\source\\";
 	static String outdir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\";

 	static Comparator codepointComparator = new UTF16.StringComparator(true,false,0);
     // Root-locale collation first, with codepointComparator as the second comparator.
     static Comparator UCAComparator = new CollectionUtilities.MultiComparator(new Comparator[] {Collator.getInstance(ULocale.ROOT), codepointComparator});

 	// Code points for which FakeBreak and FakeBreak2 return an empty label.
 	static UnicodeSet setsToAbbreviate = new UnicodeSet("[" +
 			"\\u3400-\\u4DB5" +
 			"\\u4E00-\\u9FA5" +
 			"\\uA000-\\uA48C" +
 			"\\uAC00-\\uD7A3" +
 			"\\u1100-\\u11FF" +
 			"\\uFB00-\\uFEFC" +
 			"\\u2460-\\u24FF" +
 			"\\u3251-\\u33FF" +
 			"\\u4DC0-\\u4DFF" +
 			"\\u3165-\\u318E" +
 			"\\uA490-\\uA4C6" +
 			"\\U00010140-\\U00010174" +
 			"\\U0001D300-\\U0001D356" +
 			"\\U0001D000-\\U0001D1DD" +
 			"\\U00020000-\\U0002A6D6" +
 			"\\U0001D400-\\U0001D7FF" +
 			"[:script=Canadian_Aboriginal:]" +
 			"[:script=ETHIOPIC:]" +
 				"[:script=Tagalog:]" +
 				"[:script=Hanunoo:]" +
 				"[:script=Buhid:]" +
 				"[:script=Tagbanwa:]" +
 				"[:script=Deseret:]" +
 				"[:script=Shavian:]" +
 				"[:script=Ogham:]" +
 				"[:script=Old Italic:]" +
 				"[:script=Runic:]" +
 				"[:script=Gothic:]" +
 				"[:script=Ugaritic:]" +
 				"[:script=Linear B:]" +
 				"[:script=Cypriot:]" +
 				"[:script=Coptic:]" +
 				"[:script=Syriac:]" +
 				"[:script=Glagolitic:]" +
 				// NOTE(review): Glagolitic is listed twice; the repeat adds nothing to the set.
 				"[:script=Glagolitic:]" +
 				"[:script=Old Persian:]" +
 				"[:script=Kharoshthi:]" +
 				"[:script=Osmanya:]" +
 				"[:default ignorable code point:]" +
 				"]");

 /**
  * Produces the IDN-related output files by handing off to the shared
  * IdentifierInfo instance.
  *
  * @throws IOException if one of the output files cannot be written
  */
 	private static void generateIDN() throws IOException {
 		IdentifierInfo.getIdentifierInfo().printIDNStuff();
 	}

 	private static class IdentifierInfo {
 		/** The single shared instance; created on first request. */
 		static private IdentifierInfo info;

 		/**
 		 * Returns the shared instance, constructing it on the first call.
 		 *
 		 * @return the shared IdentifierInfo
 		 * @throws IllegalArgumentException (with the original exception as cause) if
 		 *         construction fails for any reason
 		 */
 		static IdentifierInfo getIdentifierInfo() {
 			if (info != null) {
 				return info;
 			}
 			try {
 				info = new IdentifierInfo();
 			} catch (Exception e) {
 				throw (RuntimeException) new IllegalArgumentException("Unable to access data").initCause(e);
 			}
 			return info;
 		}

 		// Not read anywhere in this class; the writers pass a literal true to setMergeRanges.
 		private boolean mergeRanges = true;

 		// All of these sets are computed once in the constructor.
 		private UnicodeSet removalSet, remainingOutputSet, inputSet_strict, inputSet_lenient, nonstarting;
 		// Package-visible: xidPlus is also read by FakeBreak2.
 		UnicodeSet propNFKCSet, notInXID, xidPlus;

 		// additions, remap and removals are filled by loadFileData(); the remaining maps
 		// are built in the constructor. All are frozen at the end of the constructor.
 		private UnicodeMap additions = new UnicodeMap(), remap = new UnicodeMap(), removals = new UnicodeMap(),
 		reviews, removals2, lowerIsBetter;

         // Assigned code points that are equal to their own full case folding.
         private UnicodeSet isCaseFolded;

 		/**
 		 * Builds every derived set and map used by the report writers of this class.
 		 * The steps depend on each other and run in this order: case-folded set,
 		 * NFKC set, file data (additions / remap / removals), xidPlus, removal set,
 		 * remaining output set, input sets, review map, lowerIsBetter ranking and
 		 * removals2. All maps are frozen at the end.
 		 *
 		 * @throws IOException declared because loadFileData() reads the input files
 		 */
 		private IdentifierInfo() throws IOException {
             // Collect each code point that is not Cn/Co/Cs and equals its own full case folding.
             isCaseFolded = new UnicodeSet();
             for (int cp = 0; cp <= 0x10FFFF; ++cp) {
                 Utility.dot(cp);
                 int cat = Default.ucd().getCategory(cp);
                 if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
                 String source = UTF16.valueOf(cp);
                 String cf = Default.ucd().getCase(source, UCD.FULL, UCD.FOLD);
                 if (cf.equals(source)) isCaseFolded.add(cp);
             }

 			// Everything except the code points with NFKC_QuickCheck=N.
 			propNFKCSet = ups.getSet("NFKC_QuickCheck=N").complement();
 			UnicodeSet propXIDContinueSet = ups.getSet("XID_Continue=TRUE");

             //removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant");
 			// Fills additions, remap and removals from the input files.
 			loadFileData();
 			// (XID_Continue plus the file additions) restricted to propNFKCSet.
 			xidPlus = new UnicodeSet(propXIDContinueSet).addAll(additions.keySet()).retainAll(propNFKCSet);

 			// Called for its side effect: it initializes IDNOutputSet and IDNInputSet.
 			getIdentifierSet();
 			// IDN output characters outside xidPlus become removals as well.
 			notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus);
 			removals.putAll(notInXID, PROHIBITED + NOT_IN_XID);
             //UnicodeSet notNfkcXid = new UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet);
             //removals.putAll(notNfkcXid, PROHIBITED + "compat variant");
 			removalSet = removals.keySet();

 			// IDN output characters that survive the removals.
 			remainingOutputSet = new UnicodeSet(IDNOutputSet).removeAll(removalSet);

 			// Candidates that are valid IDN input only: not removed and not already output.
 			UnicodeSet remainingInputSet1 = new UnicodeSet(IDNInputSet)
 					.removeAll(removalSet).removeAll(remainingOutputSet);
 			UnicodeSet remainingInputSet = new UnicodeSet();
 			UnicodeSet specialRemove = new UnicodeSet();
 			// remove any others that don't normalize/case fold to something in
 			// the output set
 			for (UnicodeSetIterator usi = new UnicodeSetIterator(
 					remainingInputSet1); usi.next();) {
 				String nss = getModifiedNKFC(usi.getString());
 				String cf = Default.ucd().getCase(nss, UCD.FULL, UCD.FOLD);
 				String cf2 = getModifiedNKFC(cf);
 				if (remainingOutputSet.containsAll(cf2))
 					remainingInputSet.add(usi.codepoint);
 				else
 					specialRemove.add(usi.codepoint);
 			}
 			// filter out the items that are case foldings of items in output
 			inputSet_strict = new UnicodeSet();
 			for (UnicodeSetIterator usi = new UnicodeSetIterator(
 					remainingInputSet); usi.next();) {
 				String ss = usi.getString();
 				String nss = getModifiedNKFC(ss);
 				String cf = Default.ucd().getCase(ss, UCD.FULL, UCD.FOLD);
 				// Debug trace for OHM SIGN and ANGSTROM SIGN; prints to standard output.
 				if (usi.codepoint == 0x2126 || usi.codepoint == 0x212B) {
 					System.out.println("check");
 				}
 				//> > 2126 ; retained-input-only-CF # (?) OHM SIGN
 				//> > 212B ; retained-input-only-CF # (?) ANGSTROM SIGN

 				// Strict input: its normalized form is not wholly in the output set,
 				// but its plain case folding is.
 				if (!remainingOutputSet.containsAll(nss)
 						&& remainingOutputSet.containsAll(cf))
 					inputSet_strict.add(ss);
 			}
 			// hack: these three code points are never treated as strict input
 			inputSet_strict.remove(0x03F4).remove(0x2126).remove(0x212B);
 			inputSet_lenient = new UnicodeSet(remainingInputSet)
 					.removeAll(inputSet_strict);
 			// Marks ([:M:]) among the remaining output and input characters.
 			nonstarting = new UnicodeSet(remainingOutputSet).addAll(
 					remainingInputSet).retainAll(new UnicodeSet("[:M:]"));
 			// Review map: starts as a copy of the removals, then later puts overwrite earlier ones.
 			reviews = new UnicodeMap().putAll(removals);
 			reviews.putAll(remainingOutputSet, "output");
 			reviews.putAll(inputSet_strict, "input");
 			reviews.putAll(inputSet_lenient, "input-lenient");
 			reviews.putAll(specialRemove, PROHIBITED + "output-disallowed");

 			lowerIsBetter = new UnicodeMap();

 			lowerIsBetter.putAll(propNFKCSet, MARK_NFC); // nfkc is better than the alternative
 			lowerIsBetter.putAll(inputSet_lenient, MARK_INPUT_LENIENT);
 			lowerIsBetter.putAll(inputSet_strict, MARK_INPUT_STRICT);
 			// NOTE(review): the next two calls use the same set, so MARK_ASCII presumably
 			// replaces MARK_OUTPUT everywhere - confirm whether an ASCII-only set was intended.
 			lowerIsBetter.putAll(remainingOutputSet, MARK_OUTPUT);
 			lowerIsBetter.putAll(remainingOutputSet, MARK_ASCII);
 			lowerIsBetter.setMissing(MARK_NOT_NFC);

 			lowerIsBetter.freeze();
 			// add special values:
 			//lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0));

 			UnicodeMap nonstartingmap = new UnicodeMap().putAll(nonstarting,
 					"nonstarting");
 			// Joins two labels with "-"; when only one side is present it is returned unchanged.
 			UnicodeMap.Composer composer = new UnicodeMap.Composer() {
 				public Object compose(int codePoint, Object a, Object b) {
 					if (a == null)
 						return b;
 					else if (b == null)
 						return a;
 					else
 						return a.toString() + "-" + b.toString();
 				}
 			};
 			reviews.composeWith(nonstartingmap, composer);
 			// Code points outside IDNInputSet get the empty label; writeIDReview() excludes that label.
 			reviews.putAll(new UnicodeSet(IDNInputSet).complement(), "");
 			// Currently unused: the only call that would use it is commented out below.
 			UnicodeMap.Composer composer2 = new UnicodeMap.Composer() {
 				public Object compose(int codePoint, Object a, Object b) {
 					if (b == null)
 						return a;
 					return "remap-to-" + Utility.hex(b.toString());
 				}
 			};
 			//reviews.composeWith(remap, composer2);
 			// removals2: the removals plus everything outside XID_Continue; other code points read "future?".
 			removals2 = new UnicodeMap().putAll(removals);
 			removals2.putAll(ups.getSet("XID_Continue=TRUE").complement(),
 					PROHIBITED + NOT_IN_XID);
 			removals2.setMissing("future?");

 			// Freeze all maps now that construction is complete.
 			additions.freeze();
 			remap.freeze();
 			removals.freeze();
 			reviews.freeze();
 			removals2.freeze();
 		}

 		/**
 		 * Loads the hand-maintained input files from indir into the additions,
 		 * remap and removals maps, then marks every code point returned by
 		 * getNonIICore() as restricted with the reason "~IICore".
 		 *
 		 * @throws IOException if an input file cannot be opened or closed
 		 * @throws RuntimeException wrapping the cause if a line cannot be parsed;
 		 *         the message names the offending line
 		 */
 		private void loadFileData() throws IOException {
 			loadWordChars();
 			loadRemovals();
 			removals.putAll(getNonIICore(), PROHIBITED + "~IICore");
 		}

 		/**
 		 * Reads wordchars.txt. Each data line is "code ; type [; target]" with hex code
 		 * points. A "remap-to" line adds code -> target to remap; any other line adds
 		 * the code to additions unless it is already in XIDContinueSet.
 		 */
 		private void loadWordChars() throws IOException {
 			BufferedReader br = BagFormatter.openUTF8Reader(indir, "wordchars.txt");
 			String line = null;
 			try {
 				while (true) {
 					line = Utility.readDataLine(br);
 					if (line == null) {
 						break;
 					}
 					if (line.length() == 0) {
 						continue;
 					}
 					String[] pieces = Utility.split(line, ';');
 					int code = Integer.parseInt(pieces[0].trim(), 16);
 					if (pieces[1].trim().equals("remap-to")) {
 						remap.put(code, UTF16.valueOf(Integer.parseInt(pieces[2].trim(), 16)));
 					} else {
 						if (XIDContinueSet.contains(code)) {
 							System.out.println("Already in XID continue: " + line);
 							continue;
 						}
 						additions.put(code, "addition");
 					}
 				}
 			} catch (Exception e) {
 				throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
 			} finally {
 				// Release the file even when a line could not be parsed.
 				br.close();
 			}
 		}

 		/**
 		 * Reads removals.txt. Each data line is "codes ; reason", where codes is either
 		 * a UnicodeSet pattern starting with '[' (limited to allocated code points) or a
 		 * space-separated list of hex code points and "start..end" ranges. Every code
 		 * point is stored in removals with the value PROHIBITED + reason. Lines with
 		 * fewer than two fields are reported on standard output and skipped.
 		 */
 		private void loadRemovals() throws IOException {
 			BufferedReader br = BagFormatter.openUTF8Reader(indir, "removals.txt");
 			UnicodeSet allocated = ups.getSet("generalcategory=cn").complement();
 			String line = null;
 			try {
 				while (true) {
 					line = Utility.readDataLine(br);
 					if (line == null) {
 						break;
 					}
 					if (line.length() == 0) {
 						continue;
 					}
 					String[] pieces = Utility.split(line, ';');
 					if (pieces.length < 2) {
 						System.out.println("Missing line " + line);
 						continue;
 					}
 					String codelist = pieces[0].trim();
 					String reasons = pieces[1].trim();
 					UnicodeSet sources;
 					// Test the trimmed field: a pattern preceded by whitespace must still
 					// be parsed as a pattern, not as a list of hex numbers.
 					if (codelist.startsWith("[")) {
 						sources = new UnicodeSet(codelist).retainAll(allocated);
 					} else {
 						sources = new UnicodeSet();
 						String[] codes = Utility.split(codelist, ' ');
 						for (int i = 0; i < codes.length; ++i) {
 							if (codes[i].length() == 0) {
 								continue;
 							}
 							String[] range = codes[i].split("\\.\\.");
 							int start = Integer.parseInt(range[0], 16);
 							int end = start;
 							if (range.length > 1) {
 								end = Integer.parseInt(range[1], 16);
 							}
 							sources.add(start, end);
 						}
 					}
 					removals.putAll(sources, PROHIBITED + reasons);
 				}
 			} catch (Exception e) {
 				throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
 			} finally {
 				// Release the file even when a line could not be parsed.
 				br.close();
 			}
 		}

 		/**
 		 * Writes all IDN-related output files, in this order: the identifier
 		 * modification reports, the IDN character profile, the review list and
 		 * finally the decomposition file.
 		 *
 		 * @throws IOException if one of the files cannot be written
 		 */
 		void printIDNStuff() throws IOException {
 			printIDModifications();
 			writeIDChars();
 			writeIDReview();
 			generateDecompFile();
 		}

 		/**
 		 * Writes review.txt. The first section lists every code point whose review
 		 * label is not empty, labelled from the reviews map. The second section lists
 		 * the code points outside IDNInputSet (minus UNASSIGNED), labelled from
 		 * removals2. The writer is closed even if formatting fails.
 		 *
 		 * @throws IOException if the file cannot be opened or written
 		 */
 		private void writeIDReview() throws IOException {
 			BagFormatter bf = new BagFormatter();
 			bf.setUnicodePropertyFactory(ups);
 			bf.setLabelSource(null);
 			bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
 			bf.setMergeRanges(true);

 			PrintWriter out = openAndWriteHeader("review.txt", "Review List for IDN");
 			try {
 				// The empty label marks code points that are not IDN input at all.
 				UnicodeSet fullSet = reviews.getSet("").complement();

 				bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
 				}).set(reviews).setMain("Reviews", "GCB",
 						UnicodeProperty.ENUMERATED, "1.0"));
 				bf.setRangeBreakSource(new FakeBreak());

 				out.println("");
 				out.println("# Characters allowed in IDNA");
 				out.println("");
 				bf.showSetNames(out, new UnicodeSet(fullSet));

 				out.println("");
 				out.println("# Characters disallowed in IDNA");
 				out.println("# The IDNA spec doesn't allow any of these characters,");
 				out.println("# so don't report any of them as being missing from the above list.");
 				out.println("# Some possible future additions, once IDNA updates to Unicode 4.1, are given.");
 				out.println("");
 				bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
 				}).set(removals2).setMain("Removals", "GCB",
 						UnicodeProperty.ENUMERATED, "1.0"));
 				bf.showSetNames(out, new UnicodeSet(IDNInputSet).complement()
 						.removeAll(UNASSIGNED));
 			} finally {
 				out.close();
 			}
 		}

 		/**
 		 * Writes idnchars.txt: the characters allowed as output (remainingOutputSet)
 		 * followed by those not allowed at the start of an identifier (nonstarting).
 		 * Output characters that are neither letters, marks nor decimal digits are
 		 * additionally reported on standard output through showExtras(). The writer
 		 * is closed even if formatting fails.
 		 *
 		 * @throws IOException if the file cannot be opened or written
 		 */
 		private void writeIDChars() throws IOException {
 			BagFormatter bf = new BagFormatter();
 			bf.setUnicodePropertyFactory(ups);
 			bf.setLabelSource(null);
 			bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
 			bf.setMergeRanges(true);

 			UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]");

 			PrintWriter out = openAndWriteHeader("idnchars.txt", "Recommended Identifier Profiles for IDN");
 			try {
 				out.println("# Allowed as output characters");
 				out.println("");
 				bf.setValueSource("output");
 				bf.showSetNames(out, remainingOutputSet);
 				showExtras(bf, remainingOutputSet, letters);

 				// The strict and lenient input sets are intentionally not written to this file.

 				out.println("");
 				out.println("# Not allowed at start of identifier");
 				out.println("");
 				bf.setValueSource("nonstarting");
 				bf.showSetNames(out, nonstarting);
 			} finally {
 				out.close();
 			}
 		}


 		/**
 		 * Reports, on standard output, the members of source that are not in letters
 		 * and whose NFKD form is not made up entirely of letters either. Nothing is
 		 * printed when every member of source is already in letters.
 		 *
 		 * @param bf      formatter used to render the reported set
 		 * @param source  set to inspect
 		 * @param letters set of acceptable characters
 		 */
 		private void showExtras(BagFormatter bf, UnicodeSet source, UnicodeSet letters) {
 			UnicodeSet extra = new UnicodeSet(source).removeAll(letters);
 			if (extra.size() == 0) {
 				return;
 			}
 			UnicodeSet fixed = new UnicodeSet();
 			UnicodeSetIterator it = new UnicodeSetIterator(extra);
 			while (it.next()) {
 				String decomposed = Default.nfkd().normalize(it.getString());
 				if (letters.containsAll(decomposed)) {
 					continue;
 				}
 				fixed.add(it.codepoint);
 			}
 			System.out.println(bf.showSetNames(fixed));
 		}

 		/**
 		 * Writes the three identifier-restriction reports into outdir, sharing one
 		 * formatter: xidmodifications.txt, xidAllowed.txt and draft-restrictions.txt.
 		 * Each file is closed even if writing it fails.
 		 *
 		 * @throws IOException if a file cannot be opened or written
 		 */
 		private void printIDModifications() throws IOException {
 			BagFormatter bf = new BagFormatter();
 			bf.setUnicodePropertyFactory(ups);
 			bf.setLabelSource(null);
 			bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
 			bf.setMergeRanges(true);

 			writeXidModifications(bf);
 			writeXidAllowed(bf);
 			writeDraftRestrictions(bf);
 		}

 		/** Writes xidmodifications.txt: the restricted characters with their reasons, then the additions. */
 		private void writeXidModifications(BagFormatter bf) throws IOException {
 			PrintWriter out = openAndWriteHeader("xidmodifications.txt", "Security Profile for General Identifiers");
 			try {
 				out.println("# Characters restricted");
 				out.println("");
 				bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
 				}).set(removals).setMain("Removals", "GCB",
 						UnicodeProperty.ENUMERATED, "1.0"));
 				bf.showSetNames(out, removalSet);

 				out.println("");
 				out.println("# Characters added");
 				out.println("");
 				bf.setValueSource("addition");
 				bf.showSetNames(out, additions.keySet());
 			} finally {
 				out.close();
 			}
 		}

 		/**
 		 * Writes xidAllowed.txt: xidPlus minus the removals, split into the characters
 		 * that are case-folded and in propNFKCSet, and the rest.
 		 */
 		private void writeXidAllowed(BagFormatter bf) throws IOException {
 			PrintWriter out = openAndWriteHeader("xidAllowed.txt", "Security Profile for General Identifiers");
 			try {
 				UnicodeSet allowed = new UnicodeSet(xidPlus).removeAll(removals.keySet());
 				UnicodeSet cfAllowed = new UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet);
 				allowed.removeAll(cfAllowed);

 				bf.setValueSource("case_folded");
 				out.println("# XID characters allowed (no uppercase)");
 				out.println("");
 				bf.showSetNames(out, cfAllowed);

 				bf.setValueSource("not_case_folded");
 				out.println("");
 				out.println("# XID characters allowed (uppercase)");
 				out.println("");
 				bf.showSetNames(out, allowed);
 			} finally {
 				out.close();
 			}
 		}

 		/**
 		 * Writes draft-restrictions.txt: one section per distinct reason, in sorted
 		 * order of the reason text. The reasons come from the removals map with the
 		 * PROHIBITED prefix stripped; characters outside IDNA or outside xidPlus are
 		 * relabelled "~IDNA" / "~Unicode Identifier", and UNASSIGNED is cleared.
 		 */
 		private void writeDraftRestrictions(BagFormatter bf) throws IOException {
 			UnicodeMap someRemovals = new UnicodeMap();
 			UnicodeMap.Composer myComposer = new UnicodeMap.Composer() {
 				public Object compose(int codePoint, Object a, Object b) {
 					if (b == null) return null;
 					String x = (String) b;
 					if (x.startsWith(PROHIBITED)) x = x.substring(PROHIBITED.length());
 					// Lowercase letters that have no distinct uppercase get a more specific reason.
 					if (lowercase.contains(codePoint)) {
 						String upper = Default.ucd().getCase(codePoint, UCD.FULL, UCD.UPPER);
 						if (upper.equals(UTF16.valueOf(codePoint))
 								&& x.equals("technical symbol (phonetic)")) {
 							x = "technical symbol (phonetic with no uppercase)";
 						}
 					}
 					return x;
 				}
 			};
 			someRemovals.composeWith(removals, myComposer);
 			UnicodeSet nonIDNA = new UnicodeSet(IDNOutputSet).addAll(IDNInputSet).complement();
 			someRemovals.putAll(nonIDNA, "~IDNA");
 			someRemovals.putAll(new UnicodeSet(xidPlus).complement(), "~Unicode Identifier");
 			someRemovals.putAll(UNASSIGNED, null); // clear extras

 			PrintWriter out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
 			try {
 				out.println("# Characters restricted in domain names");
 				out.println("# $Revision: 1.12 $");
 				out.println("# $Date: 2006/09/24 23:32:44 $");
 				out.println("#");
 				out.println("# This file contains a draft list of characters for use in");
 				out.println("#     UTR #36: Unicode Security Considerations");
 				out.println("#     http://unicode.org/draft/reports/tr36/tr36.html");
 				out.println("# According to the recommendations in that document, these characters");
 				out.println("# would be restricted in domain names: people would only be able to use them");
 				out.println("# by using lenient security settings.");
 				out.println("#");
 				out.println("# If you have any feedback on this list, please use the submission form at:");
 				out.println("#     http://unicode.org/reporting.html.");
 				out.println("#");
 				out.println("# Notes:");
 				out.println("# - Characters are listed along with a reason for their removal.");
 				out.println("# - Characters listed as ~IDNA are excluded at this point in domain names,");
 				out.println("#   in many cases because the international domain name specification does not contain");
 				out.println("#   characters beyond Unicode 3.2. At this point in time, feedback on those characters");
 				out.println("#   is not relevant.");
 				out.println("# - Characters listed as ~Unicode Identifiers are restricted because they");
 				out.println("#   do not fit the specification of identifiers given in");
 				out.println("#      UAX #31: Identifier and Pattern Syntax");
 				out.println("#      http://unicode.org/reports/tr31/");
 				out.println("# - Characters listed as ~IICore are restricted because they are Ideographic,");
 				out.println("#   but not part of the IICore set defined by the IRG as the minimal set");
 				out.println("#   of required ideographs for East Asian use.");
 				out.println("# - The files in this directory are 'live', and may change at any time.");
 				out.println("#   Please include the above Revision number in your feedback.");

 				bf.setRangeBreakSource(new FakeBreak2());
 				Set values = new TreeSet(someRemovals.getAvailableValues());
 				for (Iterator it = values.iterator(); it.hasNext();) {
 					String reason1 = (String) it.next();
 					bf.setValueSource(reason1);
 					out.println("");
 					bf.showSetNames(out, someRemovals.getSet(reason1));
 				}
 			} finally {
 				out.close();
 			}
 		}
 	}

 	/** Prefix put in front of each reason stored in the removals maps. */
 	static final String PROHIBITED = "restricted ; ";
 	/** Reason text (appended to PROHIBITED) for characters outside the XID-based sets. */
 	static final String NOT_IN_XID = "not in XID+";
     /** NOTE(review): no reader of this flag is visible in this part of the file - confirm usage. */
     public static final boolean suppress_NFKC = true;
 	/**
  *
  */


 	/**
 	 * Writes decomps.txt into outdir. For every Decomposition_Type value other than
 	 * "none" and "canonical" it writes a section header followed by one
 	 * source/target line per code point of that type, where the target is
 	 * getModifiedNKFC(source). The writer is closed even if generation fails.
 	 *
 	 * @throws IOException if the file cannot be opened
 	 */
 	private static void generateDecompFile() throws IOException {
 		PrintWriter out = BagFormatter.openUTF8Writer(outdir, "decomps.txt");
 		try {
 			UnicodeProperty dt = ups.getProperty("Decomposition_Type");
 			for (Iterator it = dt.getAvailableValues().iterator(); it.hasNext();) {
 				String value = (String) it.next();
 				if (value.equalsIgnoreCase("none") || value.equalsIgnoreCase("canonical")) {
 					continue;
 				}
 				UnicodeSet s = dt.getSet(value);
 				out.println("");
 				out.println("# Decomposition_Type = " + value);
 				out.println("");
 				for (UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) {
 					String source = usi.getString();
 					String target = getModifiedNKFC(source);
 					writeSourceTargetLine(out, source, null, target, value);
 				}
 				out.flush();
 			}
 		} finally {
 			out.close();
 		}
 	}

 	/**
 	 * Label source that returns "" for members of setsToAbbreviate and otherwise
 	 * alternates between "O" (even code point) and "E" (odd code point).
 	 */
 	static class FakeBreak extends UnicodeLabel {
 		UnicodeSet nobreakSet = setsToAbbreviate;

 		public String getValue(int codepoint, boolean isShort) {
 			if (nobreakSet.contains(codepoint)) {
 				return "";
 			}
 			boolean even = (codepoint & 1) == 0;
 			return even ? "O" : "E";
 		}
 	}

 	/**
 	 * Like FakeBreak, but the set that yields "" also contains everything outside
 	 * IDNOutputSet and everything outside the xidPlus set of the shared
 	 * IdentifierInfo. Other code points alternate between "O" (even) and "E" (odd).
 	 */
 	static class FakeBreak2 extends UnicodeLabel {
 		UnicodeSet nobreakSet = new UnicodeSet(setsToAbbreviate)
 			.addAll(new UnicodeSet(IDNOutputSet).complement())
 			.addAll(new UnicodeSet(IdentifierInfo.getIdentifierInfo().xidPlus).complement());

 		public String getValue(int codepoint, boolean isShort) {
 			if (nobreakSet.contains(codepoint)) {
 				return "";
 			}
 			boolean even = (codepoint & 1) == 0;
 			return even ? "O" : "E";
 		}
 	}

 	/**
 	 * Writes a titled section with one "remap-to" line per key of the given map,
 	 * followed by the number of lines written.
 	 *
 	 * @param out   destination writer
 	 * @param title text printed after "# " as the section heading
 	 * @param remap map from code point to its replacement string
 	 */
 	private static void showRemapped(PrintWriter out, String title, UnicodeMap remap) {
 		out.println("");
 		out.println("# " + title);
 		out.println("");

 		int total = 0;
 		UnicodeSetIterator keys = new UnicodeSetIterator(remap.keySet());
 		while (keys.next()) {
 			String replacement = (String) remap.getValue(keys.codepoint);
 			writeSourceTargetLine(out, keys.getString(), "remap-to", replacement, null);
 			total++;
 		}

 		out.println("");
 		out.println("# Total code points: " + total);
 	}
 	/**
 	 * The set matching the pattern [:XID_Continue:].
 	 */
 	static UnicodeSet XIDContinueSet = new UnicodeSet("[:XID_Continue:]");
 	// All three are null until getIdentifierSet() has been called once.
 	private static UnicodeSet IDNOutputSet, IDNInputSet, _preferredIDSet;

 	/**
 	 * Returns the preferred identifier set, building it together with IDNOutputSet
 	 * and IDNInputSet on the first call. For every code point that is not
 	 * Cn/Co/Cs, an IDNA type of OK puts it in the output set and any type other
 	 * than ILLEGAL puts it in the input set; '-' is added to both up front. The
 	 * result is the output set plus XIDContinueSet, and U+2018 and U+2019 are
 	 * added on every call.
 	 *
 	 * @return the shared set instance
 	 */
 	static UnicodeSet getIdentifierSet() {
 		if (_preferredIDSet == null) {
 			IDNOutputSet = new UnicodeSet();
 			IDNInputSet = new UnicodeSet();
 			// HACK: the hyphen is always accepted
 			IDNOutputSet.add('-');
 			IDNInputSet.add('-');
 			for (int cp = 0; cp <= 0x10FFFF; ++cp) {
 				Utility.dot(cp);
 				int cat = Default.ucd().getCategory(cp);
 				boolean usable = cat != UCD.Cn && cat != UCD.Co && cat != UCD.Cs;
 				if (usable) {
 					int idnaType = GenerateStringPrep.getIDNAType(cp);
 					if (idnaType == GenerateStringPrep.OK) {
 						IDNOutputSet.add(cp);
 					}
 					if (idnaType != GenerateStringPrep.ILLEGAL) {
 						IDNInputSet.add(cp);
 					}
 				}
 			}
 			_preferredIDSet = new UnicodeSet(IDNOutputSet).addAll(XIDContinueSet);
 		}
 		_preferredIDSet.add(0x2018);
 		_preferredIDSet.add(0x2019);
 		return _preferredIDSet;
 	}

 	/**
 	 * Returns the code points whose compatibility decomposition should be skipped,
 	 * building the set (and the nfcMap / nfkcMap side tables) on the first call only.
 	 * A code point is skipped when its decomposition type is circle, super, sub,
 	 * vertical, small, square or fraction; when it is in the identifier set but its
 	 * NFKD form is not; or when it is not white space but its NFKD form contains
 	 * white space.
 	 * <p>
 	 * While scanning, nfcMap receives cp -> NFC(cp) for canonical decompositions and
 	 * nfkcMap receives cp -> getModifiedNKFC(cp) whenever that differs from both the
 	 * code point itself and its NFC form. Both maps get "" as missing value and are
 	 * frozen afterwards.
 	 * <p>
 	 * The maps are created inside the first-call branch. Creating them on every call
 	 * would replace the populated maps with empty ones from the second call on.
 	 *
 	 * @return the cached set; callers share the same instance
 	 */
 	private static UnicodeSet getSkipNFKD() {
 		if (_skipNFKD != null) {
 			return _skipNFKD;
 		}
 		nfcMap = new UnicodeMap();
 		nfkcMap = new UnicodeMap();
 		_skipNFKD = new UnicodeSet();
 		UnicodeSet idSet = getIdentifierSet();
 		for (int cp = 0; cp <= 0x10FFFF; ++cp) {
 			Utility.dot(cp);
 			int cat = Default.ucd().getCategory(cp);
 			if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
 			int decompType = Default.ucd().getDecompositionType(cp);
 			String nfc = Default.nfc().normalize(cp);
 			if (decompType == UCD.CANONICAL) nfcMap.put(cp, nfc);
 			if (decompType == UCD.COMPAT_CIRCLE
 					|| decompType == UCD.COMPAT_SUPER
 					|| decompType == UCD.COMPAT_SUB
 					|| decompType == UCD.COMPAT_VERTICAL
 					|| decompType == UCD.COMPAT_SMALL
 					|| decompType == UCD.COMPAT_SQUARE
 					|| decompType == UCD.COMPAT_FRACTION) {
 				_skipNFKD.add(cp);
 				continue;
 			}
 			String source = UTF16.valueOf(cp);
 			String mapped = Default.nfkd().normalize(cp);
 			String kmapped = getModifiedNKFC(source);
 			if (!kmapped.equals(source) && !kmapped.equals(nfc)) {
 				// Report mappings that start with a space or U+0640.
 				if (kmapped.startsWith(" ") || kmapped.startsWith("\u0640")) {
 					System.out.println("?? " + Default.ucd().getCodeAndName(cp));
 					System.out.println("\t" + Default.ucd().getCodeAndName(kmapped));
 					kmapped = getModifiedNKFC(source); // for debugging
 				}
 				nfkcMap.put(cp, kmapped);
 			}
 			if (mapped.equals(source)) continue;
 			if (idSet.contains(cp) && !idSet.contains(mapped)) _skipNFKD.add(cp);
 			else if (!whiteSpace.contains(cp) && whiteSpace.containsSome(mapped)) _skipNFKD.add(cp);
 		}
 		nfcMap.setMissing("");
 		nfcMap.freeze();
 		nfkcMap.setMissing("");
 		nfkcMap.freeze();
 		return _skipNFKD;
 	}

 	/**
 	 * Tells whether the text contains characters from more than one script,
 	 * ignoring COMMON and INHERITED.
 	 *
 	 * @param source text to inspect
 	 * @return true if getSingleScript(source) reports UScript.INVALID_CODE
 	 */
 	private static boolean isMixedScript(String source) {
 		int script = getSingleScript(source);
 		return script == UScript.INVALID_CODE;
 	}

 /**
  * Determines the single script used by the text, skipping characters whose
  * script is COMMON or INHERITED.
  *
  * @param source text to inspect
  * @return the one script found; UScript.INVALID_CODE as soon as a second,
  *         different script is seen; UScript.COMMON if the text is empty or
  *         contains only COMMON / INHERITED characters
  */
 public static int getSingleScript(String source) {
     int found = UScript.COMMON; // COMMON doubles as "nothing found yet"
     int length = source.length();
     int index = 0;
     while (index < length) {
         int cp = UTF16.charAt(source, index);
         index += UTF16.getCharCount(cp);
         int script = UScript.getScript(cp);
         if (script == UScript.COMMON || script == UScript.INHERITED) {
             continue;
         }
         if (found == UScript.COMMON) {
             found = script;
         } else if (found != script) {
             return UScript.INVALID_CODE;
         }
     }
     return found;
 }

 	/**
 	 * Generates the confusables data from indir into outdir while log.txt (in
 	 * outdir) is open as the shared log writer. The log is closed even if
 	 * generation fails, so that whatever was logged before the failure reaches
 	 * the file.
 	 *
 	 * @throws IOException if the log cannot be opened or generation fails with an I/O error
 	 */
 	private static void generateConfusables() throws IOException {
 		log = BagFormatter.openUTF8Writer(outdir, "log.txt");
 		try {
 			generateConfusables(indir, outdir);
 		} finally {
 			log.close();
 		}
 	}

 	/*	static class Data2 {
 		String source;
 		String target;
 		int count;
 		Data2(String target, int count) {
 			this.target = target;
 			this.count = count;
 		}
 	}
 */
 /*	static class Data implements Comparable {
 		String source;
 		String target;
 		String type;
 		Data(String source, String target, String type) {
 			this.source = source;
 			this.target = target;
 			this.type = type;
 		}
 		public int compareTo(Object o) {
 			int result;
 			Data that = (Data)o;
 			if (0 != (result = target.compareTo(that.target))) return result;
 			if (0 != (result = source.compareTo(that.source))) return result;
 			if (0 != (result = type.compareTo(that.type))) return result;
 			return 0;
 		}
 	}
 */

 	/**
 	 * Prints one data line of the form
 	 * <code>hex(source) ;	hex(target) [;	tag]	#[*] ( source ARROW target ) names [	# reason]</code>.
 	 * @param out destination writer
 	 * @param source source string of the mapping
 	 * @param tag optional third field; omitted entirely when null
 	 * @param target target string of the mapping
 	 * @param reason optional trailing comment; omitted when null
 	 */
 	static void writeSourceTargetLine(PrintWriter out, String source, String tag, String target, String reason) {
 		StringBuffer line = new StringBuffer();
 		line.append(Utility.hex(source));
 		line.append(" ;\t").append(Utility.hex(target));
 		if (tag != null) {
 			line.append(" ;\t").append(tag);
 		}
 		//+ " ;\t" + (preferredID.contains(source) ? "ID" : "")
 		line.append("\t#");
 		if (!isXid(source)) {
 			line.append("*"); // flags sources for which isXid() is false
 		}
 		line.append(" ( ").append(source).append(" ").append(ARROW).append(" ").append(target).append(" ) ");
 		line.append(Default.ucd().getName(source)).append(" ").append(ARROW).append(" ");
 		line.append(Default.ucd().getName(target));
 		if (reason != null) {
 			line.append("\t# ").append(reason);
 		}
 		out.print(line.toString());
 		out.println();
 	}

 	/**
 	 * Set built from the pattern [:Cc:] (General_Category = Control).
 	 * NOTE(review): in this part of the file it is referenced only from
 	 * commented-out code - confirm whether it is still needed.
 	 */
 	static UnicodeSet controls = new UnicodeSet("[:Cc:]");

 	static class MyEquivalenceClass extends XEquivalenceClass {
 		/**
 		 * Creates an empty equivalence class, handing the literal "NONE" to the
 		 * XEquivalenceClass constructor (presumably its default reason - TODO confirm).
 		 */
 		public MyEquivalenceClass() {
 			super("NONE");
 		}
 		/**
 		 * Adds the equivalence a ~ b unless doing so would create an illegal
 		 * containment between members of one equivalence set.
 		 * @param a first string
 		 * @param b second string
 		 * @param reason reason recorded for the equivalence
 		 * @return false if the cheap pre-check (checkForBad in either direction)
 		 * rejected the pair, in which case nothing was added; true otherwise
 		 * @throws RuntimeException if, after adding, some multi-code-point member
 		 * of a's equivalence set contains another member as a substring
 		 */
 		public boolean addCheck(String a, String b, String reason) {
 			// Cheap rejection first, before this object is modified.
 			final boolean rejected = checkForBad(a, b, reason) || checkForBad(b, a, reason);
 			if (rejected) {
 				return false;
 			}
 			super.add(a, b, reason);

 			// Exhaustive verification of the merged set: no member longer than one
 			// code point may contain a different member.
 			Set members = getEquivalences(a);
 			Iterator outer = members.iterator();
 			while (outer.hasNext()) {
 				String container = (String) outer.next();
 				if (UTF16.hasMoreCodePointsThan(container, 1)) {
 					Iterator inner = members.iterator();
 					while (inner.hasNext()) {
 						String contained = (String) inner.next();
 						if (!container.equals(contained) && container.indexOf(contained) >= 0) {
 							throw new RuntimeException("Illegal containment: "
 									+ Default.ucd().getCodeAndName(container) + " contains "
 									+ Default.ucd().getCodeAndName(contained) + " because "
 									+ Default.ucd().getCodeAndName(a) + " ~ "
 									+ Default.ucd().getCodeAndName(b) + " because of "
 									+ reason);
 						}
 					}
 				}
 			}
 			return true;
 		}

 		/**
 		 * Pre-check used by addCheck: looks for a member of b's equivalence set
 		 * (other than a itself) that contains a or is contained in a.
 		 * The first such overlap is written to the log.
 		 * @param a string about to be made equivalent to b
 		 * @param b string whose current equivalence set is inspected
 		 * @param reason reason of the pending addition, used only in the log message
 		 * @return true if an overlap was found and logged, false otherwise
 		 */
 		private boolean checkForBad(String a, String b, String reason) {
 			Iterator members = getEquivalences(b).iterator();
 			while (members.hasNext()) {
 				String member = (String) members.next();
 				if (a.equals(member)) {
 					continue;
 				}
 				final boolean overlaps = member.indexOf(a) >= 0 || a.indexOf(member) >= 0;
 				if (!overlaps) {
 					continue;
 				}
 				log.println("Illegal containment: "
 						+ Default.ucd().getCodeAndName(a)
 						+ " overlaps "
 						+ Default.ucd().getCodeAndName(member)
 						+ "\r\n\tfrom "
 						+ Default.ucd().getCodeAndName(b)
 						+ "\r\n\twith reason "
 						+ reason + " plus "
 						+ getReasons(member, b));
 				return true;
 			}
 			return false;
 		}

 		/**
 		 * Adds a1 ~ b1 through addCheck. Both arguments must be Strings.
 		 * A pair rejected by addCheck's pre-check is silently not added.
 		 * @return this object
 		 * @throws RuntimeException wrapping (as cause) any RuntimeException raised
 		 * by addCheck, with the offending pair and reason in the message
 		 */
 		public XEquivalenceClass add(Object a1, Object b1, String reason) {
 			final String first = (String) a1;
 			final String second = (String) b1;
 			try {
 				addCheck(first, second, reason);
 			} catch (RuntimeException e) {
 				RuntimeException wrapped = new RuntimeException("Failure adding "
 						+ Default.ucd().getCodeAndName(first) + "; "
 						+ Default.ucd().getCodeAndName(second)
 						+ "; " + reason);
 				wrapped.initCause(e);
 				throw wrapped;
 			}
 			return this;
 		}
 		/**
 		 * Only NFKD if the result doesn't cross from ID set to nonID set, and space is not added
 		 */
 //		private String specialNFKD(String item) {
 //			UnicodeSet skipSet = getSkipNFKD();
 //			StringBuffer result = new StringBuffer();
 //			int cp;
 //			for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) {
 //				cp = UTF16.charAt(item, i);
 //				if (skipSet.contains(cp)) {
 //					UTF16.append(result, cp);
 //					continue;
 //				}
 //				String cps = UTF16.valueOf(cp);
 //				String mapped = Default.nfkd().normalize(cps);
 //				if (cps.equals(mapped)) {
 //					UTF16.append(result, cp);
 //					continue;
 //				}
 //				result.append(mapped);
 //				gatheredNFKD.put(cps, mapped);
 //			}
 //			return result.toString();
 //		}

 		/**
 		 * Repeatedly maps every multi-code-point explicit item through mapString
 		 * and adds item ~ mapped when the two are not yet equivalent, until a
 		 * full pass adds nothing.
 		 * @param reason not used by this method; the recorded reasons are those
 		 * collected by mapString
 		 */
 		public void close(String reason) {
 			boolean changed = true;
 			while (changed) {
 				changed = false;
 				// Iterate over a copy; addCheck may modify the underlying items.
 				Iterator items = getOrderedExplicitItems().iterator();
 				while (items.hasNext()) {
 					String item = (String) items.next();
 					if (!UTF16.hasMoreCodePointsThan(item, 1)) {
 						continue; // just for speed
 					}
 					StringBuffer why = new StringBuffer();
 					String mapped = mapString(item, why);
 					if (!isEquivalent(item, mapped) && addCheck(item, mapped, why.toString())) {
 						changed = true;
 					}
 				}
 			}
 		}

 		/**
 		 * Maps a string code point by code point to the unfiltered paradigm of
 		 * each code point.
 		 * A code point is kept as is when its paradigm contains it; otherwise the
 		 * paradigm is substituted and the best form of the reasons linking the two
 		 * is appended to <code>reasons</code>.
 		 * @param item string to map
 		 * @param reasons buffer receiving the reasons for every substitution made
 		 * @return the mapped string
 		 */
 		private String mapString(String item, StringBuffer reasons) {
 			StringBuffer mappedText = new StringBuffer();
 			final int length = item.length();
 			int index = 0;
 			while (index < length) {
 				final int codePoint = UTF16.charAt(item, index);
 				index += UTF16.getCharCount(codePoint);
 				final String original = UTF16.valueOf(codePoint);
 				final String paradigm = getParadigm(original, false, false);
 				if (paradigm.indexOf(original) < 0) {
 					mappedText.append(paradigm);
 					List because = getReasons(original, paradigm);
 					reasons.append(getBestForm(because));
 				} else {
 					mappedText.append(original);
 				}
 			}
 			return mappedText.toString();
 		}

 		/**
 		 * Unwraps nested single-element collections to get a compact display form.
 		 * @param x collection to render
 		 * @return "[" + collection + "]" for the first collection reached whose
 		 * size is not 1; otherwise the toString() of the innermost single-element
 		 * collection whose element is not itself a Collection
 		 */
 		private Object getBestForm(Collection x) {
 			Collection current = x;
 			while (true) {
 				if (current.size() != 1) {
 					return "[" +  current + "]";
 				}
 				Object only = current.iterator().next();
 				if (only instanceof Collection) {
 					current = (Collection) only; // descend one level
 				} else {
 					return current.toString();
 				}
 			}
 		}

         /**
          * Picks the best representative (per betterTargetIsLess) among the
          * equivalents of an item, optionally restricted by case and script.
          * @param item string whose equivalents are considered
          * @param onlyLowercase if true, keep only equivalents for which
          * item + equivalent is unchanged by full case folding
          * @param onlySameScript if true, keep only equivalents for which
          * item + equivalent is not mixed-script
          * @return whatever CollectionUtilities.getBest returns for the remaining candidates
          */
         public String getParadigm(String item, boolean onlyLowercase, boolean onlySameScript) {
             final Set candidates;
             if (!onlyLowercase && !onlySameScript) {
                 candidates = getEquivalences(item);
             } else {
                 candidates = new HashSet();
                 Iterator equivalents = getEquivalences(item).iterator();
                 while (equivalents.hasNext()) {
                     String other = (String) equivalents.next();
                     String combined = item + other;
                     if (onlyLowercase
                             && !combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD))) {
                         continue;
                     }
                     if (onlySameScript && isMixedScript(combined)) {
                         continue;
                     }
                     candidates.add(other);
                 }
             }
             return (String) CollectionUtilities.getBest(candidates, betterTargetIsLess, -1);
         }

 		/**
 		 * @return a fresh TreeSet, ordered by codepointComparator, holding a copy
 		 * of getExplicitItems(); callers may iterate it while this object changes
 		 */
 		public Set getOrderedExplicitItems() {
 			Set ordered = new TreeSet(codepointComparator);
 			Collection explicit = getExplicitItems();
 			ordered.addAll(explicit);
 			return ordered;
 		}
 		/**
 		 * Writes one source/target line (without tag or reason) for every
 		 * explicit item that differs from the best member of its equivalence set.
 		 * @param out destination writer
 		 */
 		public void writeSource(PrintWriter out) {
 			Iterator items = getOrderedExplicitItems().iterator();
 			while (items.hasNext()) {
 				String item = (String) items.next();
 				String paradigm = (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1);
 				if (!item.equals(paradigm)) {
 					writeSourceTargetLine(out, item, null, paradigm, null);
 				}
 			}
 		}
 	}

 	static class DataSet {
 		// The four equivalence classes filled by add(). Which pairs each one
 		// receives depends on two tests applied to source + target:
 		// "lowercase" = unchanged by full case folding, "single" = not mixed-script.

 		/** Pairs that are lowercase; script mix is not restricted. */
 		MyEquivalenceClass dataMixedLowercase = new MyEquivalenceClass();
 		/** Every accepted pair, with no case or script restriction. */
 		MyEquivalenceClass dataMixedAnycase = new MyEquivalenceClass();
 		/** Pairs that are both single-script and lowercase. */
 		MyEquivalenceClass dataSingleLowercase = new MyEquivalenceClass();
 		/** Pairs that are single-script; case is not restricted. */
 		MyEquivalenceClass dataSingleAnycase = new MyEquivalenceClass();

 		/**
 		 * Records one confusable pair in whichever of the four equivalence
 		 * classes it qualifies for.
 		 * <p>The pair is ignored when source or target lies entirely inside
 		 * skipSet, when the type contains "skip", or when the target contains
 		 * U+203D. If the NFD forms of source and target share the same non-empty
 		 * tail after their first code points, only those first code points are
 		 * recorded and "-base" is appended to the reason.
 		 * @param source source string
 		 * @param target target string
 		 * @param type raw reason (typically a file name), passed through getReasonFromFilename
 		 * @param lineCount line number appended to the reason after a colon
 		 * @param errorLine not used by this method
 		 * @return this object
 		 */
 		public DataSet add(String source, String target, String type, int lineCount, String errorLine) {
 			if (skipSet.containsAll(source) || skipSet.containsAll(target)) {
 				return this;
 			}
 			final String decomposedSource = Default.nfd().normalize(source);
 			final String decomposedTarget = Default.nfd().normalize(target);

 			// if it is just a compatibility match, return
 			//if (nsource.equals(ntarget)) return this;
 			if (type.indexOf("skip") >= 0 || target.indexOf('\u203D') >= 0) {
 				return this;
 			}

 			String reason = getReasonFromFilename(type);

 			// base + combining sequence => base2 + same combining sequence: keep just the bases
 			final int sourceBase = UTF16.charAt(decomposedSource, 0);
 			final String sourceTail = decomposedSource.substring(UTF16.getCharCount(sourceBase));
 			final int targetBase = UTF16.charAt(decomposedTarget, 0);
 			final String targetTail = decomposedTarget.substring(UTF16.getCharCount(targetBase));
 			final boolean sameTail = sourceTail.length() != 0 && sourceTail.equals(targetTail);
 			if (sameTail) {
 				source = UTF16.valueOf(sourceBase);
 				target = UTF16.valueOf(targetBase);
 				reason += "-base";
 			}
 			reason += ":" + lineCount;

 			final String combined = source + target;
 			// Diagnostic trace for any pair involving U+0430.
 			if (combined.indexOf("\u0430") >= 0) {
 				System.out.println(Default.ucd().getCodeAndName(combined));
 			}
 			final boolean lowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD));
 			final boolean singleScript = !isMixedScript(combined);

 			dataMixedAnycase.add(source, target, reason);
 			if (lowercase) {
 				dataMixedLowercase.add(source, target, reason);
 			}
 			if (singleScript) {
 				dataSingleAnycase.add(source, target, reason);
 				if (lowercase) {
 					dataSingleLowercase.add(source, target, reason);
 				}
 			}
 			return this;
 		}

 /*		*//**
 		 * @param errorLine TODO
 		 *
 		 *//*
 		private DataSet add(Data newData, String errorLine) {
 			if (controls.containsSome(newData.source) || controls.containsSome(newData.target)) {
 				System.out.println("Problem with " + errorLine);
 				System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target));
 			}
 			String[] key = {newData.source, newData.target};
 			Data old = (Data) dataMap.get(key);
 			if (old == null) {
 				dataSet.add(newData);
 				dataMap.put(key, newData);
 			}else {
 				old.type = old.type + "/" + newData.type;
 			}
 			return this;
 		}
 */		// Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"
 		/**
 		 * Input-file kinds distinguished by addFile. FOLDING is selected when the
 		 * file name contains "Folding"; the test that would select OLD is guarded
 		 * by <code>false &amp;&amp;</code>, so OLD is never selected; every other
 		 * file is NORMAL.
 		 */
 		static final int NORMAL = 0, FOLDING = 1, OLD = 2;

 		/**
 		 * Reads a semicolon-separated data file and adds each of its pairs.
 		 * <p>Lines with fewer than two fields are reported on System.out and
 		 * skipped. For "Folding" files a pair is added only when the target differs
 		 * from the first code point of the source's NFKD form. For normal files a
 		 * pair whose source and target have the same NFKD form is suppressed when
 		 * suppress_NFKC is set.
 		 * <p>The reader is now closed on every path; previously it was left open
 		 * whenever reading or parsing threw.
 		 * @param directory directory of the file
 		 * @param filename name of the file; also used as the reason type
 		 * @return this object
 		 * @throws IOException declared for callers; failures inside the method are
 		 * rethrown as RuntimeException
 		 * @throws RuntimeException wrapping any exception raised while reading,
 		 * with file name, line count and current line in the message
 		 */
 		public DataSet addFile(String directory, String filename) throws IOException {
 			String line = null;
 			int count = 0;
 			BufferedReader in = null;
 			try {
 				in = BagFormatter.openUTF8Reader(directory, filename);
 				int kind = NORMAL;
 				if (filename.indexOf("Folding") >= 0) kind = FOLDING;
 				else if (false && filename.indexOf("-old") >= 0) kind = OLD;
 				while (true) {
 					count++;
 					line = Utility.readDataLine(in);
 					if (line == null) break;
 					if (line.length() == 0) continue;
 					String[] pieces = Utility.split(line,';');
 					if (pieces.length < 2) {
 						System.out.println("Error on: " + line);
 						continue;
 					}
 					String type = filename;
 					if (kind==FOLDING) {
 						String source = Utility.fromHex(pieces[0].trim(),true);
 						String target = Utility.fromHex(pieces[1].trim(),true);
 						String nsource = Default.nfkd().normalize(source);
 						String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
 						if (!first.equals(target)) {
 							add(source, target, type, count, line);
 						}
 					} else if (kind == OLD) {
 						// first field is the target, all following fields are sources
 						String target = pieces[0].trim();
 						for (int i = 1; i < pieces.length; ++i) {
 							add(pieces[i].trim(), target, type, count, line);
 						}
 					} else {
 						String source = Utility.fromHex(pieces[0].trim(),true);
 						String target = Utility.fromHex(pieces[1].trim(),true);
 						//if (pieces.length > 2) type = pieces[2].trim();
 						String nfkdSource = Default.nfkd().normalize(source);
 						String nfkdTarget = Default.nfkd().normalize(target);
 						if (suppress_NFKC && nfkdSource.equals(nfkdTarget)) {
 							System.out.println("Suppressing nfkc for: " + Default.ucd().getCodeAndName(source));
 						} else {
 							add(source, target, type, count, line);
 						}
 					}
 				}
 				// Normal completion: a failing close is still reported via the catch below.
 				in.close();
 				in = null;
 				return this;
 			} catch (Exception e) {
 				throw (RuntimeException) new RuntimeException("Failure with file: "
 						+ directory + filename + " on line: " + count
 						+ ": " + line).initCause(e);
 			} finally {
 				// Only reached with a live reader when something above failed.
 				if (in != null) {
 					try {
 						in.close();
 					} catch (IOException ignored) {
 						// the original failure is already being reported
 					}
 				}
 			}
 		}

 		/**
 		 * Writes the source form of dataMixedAnycase to a file opened through
 		 * openAndWriteHeader. The writer is closed in a finally clause so it is
 		 * not leaked when writing fails.
 		 * @param directory not used by this method
 		 * @param filename name handed to openAndWriteHeader
 		 * @throws IOException if the file cannot be opened
 		 */
 		public void writeSource(String directory, String filename) throws IOException {
 			PrintWriter out = openAndWriteHeader(filename, "Source File for IDN Confusables");
 			try {
 				dataMixedAnycase.writeSource(out);
 			} finally {
 				out.close();
 			}
 		}

 		/**
 		 * Writes the recommended confusable mappings in four sections (SL, SA,
 		 * ML, MA), all derived from dataMixedAnycase with different
 		 * lowercase/single-script filters. The writer is closed in a finally
 		 * clause so it is not leaked when writing fails.
 		 * @param directory not used by this method
 		 * @param filename name handed to openAndWriteHeader
 		 * @param appendFile if true, confusablesHeader.txt is copied in first with
 		 * %date% replaced by Default.getDate()
 		 * @param skipNFKEquivs forwarded to each section writer
 		 * @throws IOException if the file cannot be opened or the header cannot be appended
 		 */
 		public void writeSourceOrder(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException {
 			PrintWriter out = openAndWriteHeader(filename, "Recommended confusable mapping for IDN");
 			try {
 				if (appendFile) {
 					String[] replacements = {"%date%", Default.getDate()};
 					Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
 							Utility.UTF8_WINDOWS, out, replacements);
 				}
 				if (true) {
 					writeSourceOrder(out, dataMixedAnycase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, true, true);
 					writeSourceOrder(out, dataMixedAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, true);
 					writeSourceOrder(out, dataMixedAnycase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, true, false);
 					writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false);
 				} else {
 					// Disabled alternative: one pre-filtered equivalence class per section.
 					writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, false, false);
 					writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, false);
 					writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, false, false);
 					writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false);
 				}
 			} finally {
 				out.close();
 			}
 		}
 		/**
 		 * Writes one titled section of source-to-target mappings, grouped by
 		 * target, followed by a total line.
 		 * <p>Only single-code-point explicit items are used as sources. A source
 		 * is dropped when it has no paradigm under the given filters or is its own
 		 * paradigm. Pairs are sorted by (target, source) using UCAComparator, and
 		 * a blank line separates groups with different targets.
 		 * @param out destination writer
 		 * @param data equivalence class to read from
 		 * @param tag section tag written on every line and in the total
 		 * @param title section heading
 		 * @param skipNFKEquivs if true, drop sources that differ from their own NFKD form
 		 * @param onlyLowercase passed to getParadigm as its lowercase filter
 		 * @param onlySingleScript passed to getParadigm as its same-script filter
 		 */
 		private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs, boolean onlyLowercase, boolean onlySingleScript) {
 			Set items = data.getOrderedExplicitItems();
 			out.println();
 			out.println("# " + title);
 			out.println();
 			// Result unused here; call kept in case it initializes shared state - TODO confirm.
 			getIdentifierSet();
 			ArrayComparator pairOrder = new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator});
 			Set orderedPairs = new TreeSet(pairOrder);

 			// Pass 1: collect {target, source} pairs in sorted order.
 			Iterator candidates = items.iterator();
 			while (candidates.hasNext()) {
 				String source = (String) candidates.next();
 				if (UTF16.hasMoreCodePointsThan(source,1)) {
 					continue;
 				}
 				String target = data.getParadigm(source, onlyLowercase, onlySingleScript);
 				if (target == null || source.equals(target)) {
 					continue;
 				}
 				if (skipNFKEquivs && !Default.nfkd().normalize(source).equals(source)) {
 					continue;
 				}
 				orderedPairs.add(new String[] {target, source});
 			}

 			// Pass 2: emit the pairs, separating target groups with a blank line.
 			int written = 0;
 			String previousTarget = null;
 			Iterator pairs = orderedPairs.iterator();
 			while (pairs.hasNext()) {
 				String[] pair = (String[]) pairs.next();
 				String target = pair[0];
 				String source = pair[1];
 				String reason = fixReason(data.getReasons(source, target));
 				if (previousTarget != null && !previousTarget.equals(target)) {
 					out.println();
 				}
 				writeSourceTargetLine(out, source, tag, target, reason);
 				previousTarget = target;
 				written++;
 			}
 			out.println();
 			out.println("# total for (" + tag + "): " + written);
 			out.println();
 		}

 		/**
 		 * Renders the first entry of a reasons list as one display string.
 		 * <p>The first entry must be a List. Its String elements are written as
 		 * is; every other element is treated as a Collection and written as
 		 * <code>{a|b|c}</code>. Elements are separated by single spaces.
 		 * <p>Uses a StringBuffer instead of repeated String concatenation, and
 		 * returns an empty string instead of throwing when there are no reasons.
 		 * @param reasons list whose first element is the List to render; may be null or empty
 		 * @return the rendered reason, or "" when reasons is null or empty
 		 */
 		private String fixReason(List reasons) {
 			if (reasons == null || reasons.isEmpty()) {
 				return "";
 			}
 			List first = (List)reasons.get(0);
 			StringBuffer result = new StringBuffer();
 			for (int i = 0; i < first.size(); ++i) {
 				if (i != 0) result.append(' ');
 				Object item = first.get(i);
 				if (item instanceof String) {
 					result.append((String) item);
 				} else {
 					StringBuffer alternatives = new StringBuffer();
 					for (Iterator it = ((Collection)item).iterator(); it.hasNext();) {
 						// separator only once something has been written, as before
 						if (alternatives.length() != 0) alternatives.append('|');
 						alternatives.append(it.next());
 					}
 					result.append('{').append(alternatives.toString()).append('}');
 				}
 			}
 			return result.toString();
 		}

 		/**
 		 * Merges each of the four equivalence classes of another DataSet into the
 		 * corresponding class of this one.
 		 * @param ds data set to merge in
 		 */
 		public void addAll(DataSet ds) {
 			MyEquivalenceClass[] mine = {
 					dataMixedAnycase, dataMixedLowercase, dataSingleAnycase, dataSingleLowercase};
 			MyEquivalenceClass[] theirs = {
 					ds.dataMixedAnycase, ds.dataMixedLowercase, ds.dataSingleAnycase, ds.dataSingleLowercase};
 			for (int i = 0; i < mine.length; ++i) {
 				mine[i].addAll(theirs[i]);
 			}
 		}
 		/*		*//**
 		 *
 		 *//*
 		public DataSet clean() {
 			// remove all skips
 			DataSet tempSet = new DataSet();
 			Map m = new HashMap();
 			for (Iterator it = dataSet.iterator(); it.hasNext();) {
 				Data d = (Data) it.next();
 				if (d.type.indexOf("skip") >= 0) continue;
 				String newTarget = Default.nfkd().normalize(d.target);
 				String newSource = Default.nfkd().normalize(d.source);
 				String type = d.type;
 				if (!d.target.equals(newTarget) || !d.source.equals(newSource)) {
 					type += "-nf";
 					log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource));
 					log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type);
 					continue;
 				}
 				// swap order
 				if (preferSecondAsSource(newSource, newTarget)) {
 					String temp = newTarget;
 					newTarget = newSource;
 					newSource = temp;
 				}

 				Data already = (Data) m.get(newSource);
 				if (already != null && !newTarget.equals(already.target)) {
 					log.println("X " + getCodeCharName(newSource) + " " + ARROW);
 					log.println("\t" + getCodeCharName(newTarget) + " \t" + type);
 					log.println("\t" + getCodeCharName(already.target) + " \t" + already.type);
 					if (preferSecondAsSource(already.target, newTarget)) {
 						// just fix new guy
 						type += "[" + newSource + "]" + already.type;
 						newSource = newTarget;
 						newTarget = already.target;
 					} else {
 						// need to fix new guy, AND fix old guy.
 						tempSet.remove(already);
 						type += "[" + newSource + "]" + already.type;
 						newSource = already.target;
 						already.type += "[" + already.target + "]" + type;
 						already.target = newTarget;
 						tempSet.add(already, "");
 					}
 				}
 				Data newData = new Data(newSource, newTarget, type);
 				m.put(newSource, newData);
 				tempSet.add(newData, "");
 			}
 			// now recursively apply
 			DataSet s = new DataSet();
 			for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) {
 				Data d = (Data) it.next();
 				int cp = 0;
 				StringBuffer result = new StringBuffer();
 				for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) {
 					cp = UTF16.charAt(d.target, i);
 					String src = UTF16.valueOf(cp);
 					while (true) {
 						Data rep = (Data) m.get(src);
 						if (rep == null) break;
 						src = rep.target;
 					}
 					result.append(src);
 				}
 				String newTarget = result.toString();
 				newTarget = Default.nfkd().normalize(newTarget);
 				s.add(d.source, newTarget, d.type + (newTarget.equals(newTarget) ? "" : "-rec"), "");
 			}
 			return s;
 		}
 		*//**
 		 *
 		 *//*
 		private void remove(Data already) {
 			String[] key = {already.source, already.target};
 			dataMap.remove(key);
 			dataSet.remove(already);
 		}*/
 		/**
 		 * Closes all four equivalence classes, in the order mixed-anycase,
 		 * mixed-lowercase, single-anycase, single-lowercase.
 		 * @param reason forwarded unchanged to each MyEquivalenceClass.close call
 		 */
 		public void close(String reason) {
 			MyEquivalenceClass[] all = {
 					dataMixedAnycase, dataMixedLowercase, dataSingleAnycase, dataSingleLowercase};
 			for (int i = 0; i < all.length; ++i) {
 				all[i].close(reason);
 			}
 		}
 		/**
 		 * Adds every key of a UnicodeMap as a source, with its mapped value as
 		 * the target.
 		 * @param decompMap map whose values are Strings
 		 * @param type reason type passed to add
 		 * @param errorLine passed to add unchanged
 		 */
 		public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) {
 			UnicodeSetIterator keys = new UnicodeSetIterator(decompMap.keySet());
 			int ordinal = 0; // 1-based position, used where a file would supply a line number
 			while (keys.next()) {
 				ordinal++;
 				String source = keys.getString();
 				String target = (String) decompMap.getValue(keys.codepoint);
 				add(source, target, type, ordinal, errorLine);
 			}
 		}

 		/**
 		 * Filter accepting the strings made up entirely of characters in
 		 * <code>output</code>. The field must be assigned before use; nothing in
 		 * this class initializes it.
 		 */
 		static class MyFilter implements XEquivalenceClass.Filter {
 			UnicodeSet output;

 			/** @return true if every character of the String o is in output */
 			public boolean matches(Object o) {
 				final String candidate = (String) o;
 				return output.containsAll(candidate);
 			}
 		}

 		/**
 		 * Matcher accepting the strings made up entirely of characters in
 		 * <code>outputAllowed</code>. As a side effect it lowers
 		 * <code>minLength</code> to the smallest code point count among the
 		 * strings it has accepted; callers reset minLength before each use.
 		 */
 		static class MyCollectionFilter implements CollectionUtilities.ObjectMatcher {
 			UnicodeSet outputAllowed;
 			int minLength;

 			public boolean matches(Object o) {
 				final String candidate = (String) o;
 				if (outputAllowed.containsAll(candidate)) {
 					minLength = Math.min(minLength, UTF16.countCodePoint(candidate));
 					return true;
 				}
 				return false;
 			}
 		}
 		/**
 		 * Writes a summary of the equivalence sets in dataMixedAnycase: one
 		 * header line per set (its chosen target) followed by one line per other
 		 * member, and a final total.
 		 * <p>The writer is closed in a finally clause so it is not leaked when
 		 * writing fails.
 		 * @param outdir not used by this method
 		 * @param filename name handed to openAndWriteHeader
 		 * @param outputOnly if true, each set is first reduced to the members
 		 * allowed by the output filter; sets left with at most one member, or
 		 * with no single-code-point member, are skipped
 		 * @param script if non-null, only sets having a member entirely inside
 		 * this set are written; that member becomes the target, and a trailing
 		 * section lists the characters of those sets that lie outside the script
 		 * @throws IOException if the file cannot be opened
 		 */
 		public void writeSummary(String outdir, String filename, boolean outputOnly, UnicodeSet script) throws IOException {
 			PrintWriter out = openAndWriteHeader(filename, "Summary: Recommended confusable mapping for IDN");
 			try {
 				UnicodeSet representable = new UnicodeSet();
 				MyEquivalenceClass data = dataMixedAnycase;
 				Set items = data.getOrderedExplicitItems();
 				int count = 0;
 				// Result unused here; call kept in case it initializes shared state - TODO confirm.
 				getIdentifierSet();
 				Set itemsSeen = new HashSet();
 				Set equivalents = new TreeSet(betterTargetIsLess);
 				MyCollectionFilter myFilter = new MyCollectionFilter();
 				myFilter.outputAllowed= new UnicodeSet("[[\u0021-\u007E]-[:letter:]]")
 				.addAll(IdentifierInfo.getIdentifierInfo().remainingOutputSet)
 				.addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict);

 				for (Iterator it = items.iterator(); it.hasNext();) {
 					String target = (String) it.next();
 					if (itemsSeen.contains(target)) continue; // set already handled
 					equivalents.clear();
 					equivalents.addAll(data.getEquivalences(target));
 					itemsSeen.addAll(equivalents);
 					if (outputOnly) { // remove non-output
 						myFilter.minLength = 1000;
 						CollectionUtilities.retainAll(equivalents, myFilter);
 						if (equivalents.size() <= 1) continue;
 						if (myFilter.minLength > 1) continue;
 						if (!equivalents.contains(target)) { // select new target if needed
 							target = (String) equivalents.iterator().next();
 						}
 					}
 					if (script != null) {
 						// find the first member lying entirely inside the requested script
 						String inScript = null;
 						for (Iterator it2 = equivalents.iterator(); it2.hasNext();) {
 							String item = (String) it2.next();
 							if (script.containsAll(item)) {
 								inScript = item;
 								break;
 							}
 						}
 						if (inScript == null) continue; // skip this one
 						target = inScript;
 						for (Iterator it3 = equivalents.iterator(); it3.hasNext();) {
 							representable.addAll((String)it3.next());
 						}
 					}
 					out.println();
 					out.println(getStatus(target) + "\t" + "(\u200E " + target + " \u200E)\t" + Utility.hex(target) + "\t " + Default.ucd().getName(target));
 					for (Iterator it2 = equivalents.iterator(); it2.hasNext();) {
 						String source = (String) it2.next();
 						if (source.equals(target)) continue;
 						String reason = fixReason(data.getReasons(source, target));
 						out.println("\u2190" + getStatus(source) + "\t" + "(\u200E " + source + " \u200E)\t" + Utility.hex(source) + "\t " + Default.ucd().getName(source)
 								+ "\t# " + reason);
 						count++;
 					}
 				}
 				out.println();
 				out.println("# total : " + count);
 				out.println();
 				if (script != null) {
 					out.println();
 					out.println("# Base Letters Representable with Script");
 					out.println();
 					representable.removeAll(script);
 					BagFormatter bf = new BagFormatter();
 					bf.setValueSource(ups.getProperty("script"));
 					bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
 					bf.showSetNames(out, representable);
 				}
 			} finally {
 				out.close();
 			}
 		}


 		/**
 		 * Writes the whole-script confusables summary, once for lowercase-only
 		 * and once for any-case data.
 		 * <p>Every member of every equivalence set in dataMixedAnycase is first
 		 * stripped of Common and Inherited characters; the stripped forms of one
 		 * set are made equivalent in a temporary equivalence class, whose sets are
 		 * then fed to two WholeScript collectors.
 		 * <p>The writer is closed in a finally clause so it is not leaked when
 		 * writing fails.
 		 * @param outdir not used by this method
 		 * @param filename name handed to openAndWriteHeader
 		 * @throws IOException if the file cannot be opened or written
 		 */
 		public void writeWholeScripts(String outdir, String filename) throws IOException {
 			UnicodeSet commonAndInherited = new UnicodeSet(
 			"[[:script=common:][:script=inherited:]]");

 			WholeScript wsLower = new WholeScript(
 					new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet)
 					.removeAll(new UnicodeSet("[A-Z]")), "L");
 			WholeScript wsAny = new WholeScript(
 					new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet)
 					.addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict), "A");

 			MyEquivalenceClass data = new MyEquivalenceClass();
 			for (Iterator it = dataMixedAnycase.getSamples().iterator(); it.hasNext();) {
 				String target = (String) it.next();
 				Set equivalents = dataMixedAnycase.getEquivalences(target);
 				boolean first = true;
 				for (Iterator it2 = equivalents.iterator(); it2.hasNext();) {
 					String cleaned = CollectionUtilities.remove((String)it2.next(), commonAndInherited);
 					if (cleaned.length() == 0) continue;
 					if (first) {
 						// the first non-empty stripped form anchors the rest of the set
 						target = cleaned;
 						first = false;
 					} else {
 						data.add(target, cleaned);
 					}
 				}
 			}
 			Set itemsSeen = new HashSet();
 			for (Iterator it = data.getOrderedExplicitItems().iterator(); it.hasNext();) {
 				String target = (String) it.next();
 				if (itemsSeen.contains(target)) continue;
 				Set equivalents = data.getEquivalences(target);
 				itemsSeen.addAll(equivalents);
 				wsAny.addEquivalents(equivalents);
 				wsLower.addEquivalents(equivalents);
 			}
 			PrintWriter out = openAndWriteHeader(filename, "Summary: Whole-Script Confusables");
 			try {
 				out.println("# This data is used for determining whether a strings is a");
 				out.println("# whole-script or mixed-script confusable.");
 				out.println("# The mappings here ignore common and inherited script characters,");
 				out.println("# such as accents.");
 				out.println("");
 				out.println("# Lowercase Only");
 				out.println("");
 				wsLower.write(out);
 				out.println("");
 				out.println("# Any-Case");
 				out.println("");
 				wsAny.write(out);
 			} finally {
 				out.close();
 			}
 		}
 		/**
 		 * Maps the betterTargetIsLess value of a string to a short status label.
 		 * @param source string to classify
 		 * @return "[x]" for MARK_NOT_NFC and MARK_NFC, "[L]" for
 		 * MARK_INPUT_LENIENT, "[I]" for MARK_INPUT_STRICT, "[O]" for MARK_OUTPUT,
 		 * "[A]" for MARK_ASCII, and "?" for any other value
 		 */
 		private String getStatus(String source) {
 			final int mark = betterTargetIsLess.getValue(source);
 			if (mark == MARK_NOT_NFC.intValue() || mark == MARK_NFC.intValue()) {
 				return "[x]";
 			}
 			if (mark == MARK_INPUT_LENIENT.intValue()) {
 				return "[L]";
 			}
 			if (mark == MARK_INPUT_STRICT.intValue()) {
 				return "[I]";
 			}
 			if (mark == MARK_OUTPUT.intValue()) {
 				return "[O]";
 			}
 			return mark == MARK_ASCII.intValue() ? "[A]" : "?";
 		}
 	}

 	static class WholeScript {
 		/** Characters whose script is Common or Inherited; removed from every representable set. */
 		private static UnicodeSet commonAndInherited = new UnicodeSet("[[:script=common:][:script=inherited:]]");
 		/** addEquivalents ignores any string not entirely contained in this set. */
 		private UnicodeSet filterSet;
 		/**
 		 * Indexed by script code. Entry s collects the characters of strings in
 		 * other scripts that were found equivalent to a string of script s.
 		 */
 		private UnicodeSet[] script_representables = new UnicodeSet[UScript.CODE_LIMIT];
 		/** Indexed by script code. Entry i is the set built from the pattern [:script=NAME(i):]. */
 		private UnicodeSet[] script_set = new UnicodeSet[UScript.CODE_LIMIT];
 		/** Formatter used by write() to list set contents. */
 		private BagFormatter bf = new BagFormatter();
 		/** Third field of the per-line value written by write(). */
 		private String label;
 		// Instance initializer: allocates both per-script arrays and configures the formatter.
 		{
 			for (int i = 0; i < UScript.CODE_LIMIT; ++i) {
 				script_representables[i] = new UnicodeSet();
 				script_set[i] = new UnicodeSet("[:script=" + UScript.getName(i) + ":]"); // ugly hack
 			}
 			bf.setValueSource(ups.getProperty("script"));
 			bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
 			bf.setLabelSource(UnicodeLabel.NULL);
 		}
 		/**
 		 * @param filterSet only strings entirely inside this set are considered
 		 * @param label label written as the last field of each value by write()
 		 */
 		WholeScript(UnicodeSet filterSet, String label) {
 			this.finished = false;
 			this.label = label;
 			this.filterSet = filterSet;
 		}
 		/**
 		 * Records cross-script representability for one set of equivalent strings.
 		 * For every ordered pair (x, y) of members that both pass the filter, are
 		 * each single-script, and differ in script, the characters of y (minus
 		 * Common/Inherited) are added to the representables of script(x).
 		 * Marks the derived tables as stale.
 		 * @param set strings that are all equivalent to one another
 		 */
 		void addEquivalents(Set set) {
 			finished = false;
 			Iterator outer = set.iterator();
 			while (outer.hasNext()) {
 				String anchor = (String) outer.next();
 				if (!filterSet.containsAll(anchor)) {
 					continue;
 				}
 				final int anchorScript = getSingleScript(anchor);
 				if (anchorScript == UScript.INVALID_CODE) {
 					continue;
 				}
 				Iterator inner = set.iterator();
 				while (inner.hasNext()) {
 					String other = (String) inner.next();
 					if (!filterSet.containsAll(other)) {
 						continue;
 					}
 					final int otherScript = getSingleScript(other);
 					final boolean usable = otherScript != UScript.INVALID_CODE && otherScript != anchorScript;
 					if (usable) {
 						script_representables[anchorScript].addAll(other).removeAll(commonAndInherited);
 					}
 				}
 			}
 		}

 		/**
 		 * Simple pairing of a UnicodeSet with a script code. Both setters return
 		 * the object itself so calls can be chained.
 		 */
 		public static class UnicodeSetToScript {
 			private int script;
 			private UnicodeSet set;

 			/** @return the script code */
 			public int getScript() {
 				return this.script;
 			}

 			/** @return the set */
 			public UnicodeSet getSet() {
 				return this.set;
 			}

 			/** @return this object */
 			public UnicodeSetToScript setScript(int script) {
 				this.script = script;
 				return this;
 			}

 			/** @return this object */
 			public UnicodeSetToScript setSet(UnicodeSet set) {
 				this.set = set;
 				return this;
 			}
 		}

 		/**
 		 * Built by finish(), indexed by script code j. Each entry pairs a script k
 		 * with the characters of script j found in script_representables[k].
 		 * Rows that finish() does not fill stay null.
 		 */
 		UnicodeSetToScript[][] scriptToUnicodeSetToScript = new UnicodeSetToScript[UScript.CODE_LIMIT][];
 		/** Filled by finish(), indexed by script code; not read anywhere in this class. */
 		UnicodeSet[] fastReject = new UnicodeSet[UScript.CODE_LIMIT];
 		/** True once finish() has built the tables; reset to false by addEquivalents(). */
 		boolean finished = false;

 		/**
 		 * Builds scriptToUnicodeSetToScript and fastReject from the collected
 		 * representables, unless that has already been done since the last
 		 * addEquivalents call. Common, Inherited and scripts without
 		 * representables are skipped on both axes.
 		 * <p>NOTE(review): <code>accept</code> is never added to, so every
 		 * fastReject entry is the complement of an empty set. It looks as if the
 		 * matched items were meant to be accumulated into it - confirm intent.
 		 */
 		void finish() {
 			if (finished) {
 				return;
 			}
 			for (int j = 0; j < UScript.CODE_LIMIT; ++j) {
 				if (j == UScript.COMMON || j == UScript.INHERITED
 						|| script_representables[j].size() == 0) {
 					continue;
 				}
 				UnicodeSet accept = new UnicodeSet();
 				List row = new ArrayList();
 				for (int k = 0; k < UScript.CODE_LIMIT; ++k) {
 					if (k == UScript.COMMON || k == UScript.INHERITED
 							|| script_representables[k].size() == 0) {
 						continue;
 					}
 					if (script_set[j].containsNone(script_representables[k])) {
 						continue;
 					}
 					// characters of script j that appear among k's representables
 					UnicodeSet shared = new UnicodeSet(script_set[j]).retainAll(script_representables[k]);
 					row.add(new UnicodeSetToScript().setScript(k).setSet(shared));
 				}
 				scriptToUnicodeSetToScript[j] = (UnicodeSetToScript[]) row.toArray(new UnicodeSetToScript[row.size()]);
 				fastReject[j] = accept.complement();
 			}
 			finished = true;
 		}

 		/**
 		 * Writes every (script, other script, characters) entry built by finish().
 		 *
 		 * Each entry produces a "# long names: pattern" comment line, a blank line,
 		 * the listing generated by bf.showSetNames, and another blank line.
 		 * The value source of bf is overwritten for every entry and is not
 		 * restored afterwards.
 		 *
 		 * @param out destination; it is not closed by this method
 		 * @throws IOException declared by the signature
 		 */
 		void write(PrintWriter out) throws IOException {
 			finish();
 			for (int rowScript = 0; rowScript < UScript.CODE_LIMIT; ++rowScript) {
 				UnicodeSetToScript[] row = scriptToUnicodeSetToScript[rowScript];
 				if (row == null) {
 					continue;
 				}
 				for (int index = 0; index < row.length; ++index) {
 					UnicodeSetToScript entry = row[index];
 					int otherScript = entry.getScript();
 					UnicodeSet chars = entry.getSet();
 					// short names plus label become the value shown on each listed line
 					String valueText = UScript.getShortName(rowScript) + "; "
 							+ UScript.getShortName(otherScript) + "; " + label;
 					String heading = UScript.getName(rowScript) + "; " + UScript.getName(otherScript);
 					out.println("# " + heading + ": " + chars.toPattern(false));
 					out.println("");
 					bf.setValueSource(valueText);
 					bf.showSetNames(out, chars);
 					out.println("");
 				}
 			}
 		}

 	}

 	/**
 	 * Filters michel/tr36comments-annex.txt (under <code>indir</code>) into
 	 * new-tr36comments-annex.txt (under <code>outdir</code>).
 	 *
 	 * A data line is copied when it has fewer than two tab-separated fields, or
 	 * when its first field, read as hex, is already normalized according to
 	 * Default.nfkd(). All other lines are dropped.
 	 *
 	 * Both files are closed even if reading or parsing fails part-way.
 	 *
 	 * @param indir input directory prefix; "michel/" is appended to it
 	 * @param outdir output directory
 	 * @throws IOException if a file cannot be opened or read
 	 */
 	private static void fixMichel(String indir, String outdir) throws IOException {
 		BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt");
 		try {
 			PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt");
 			try {
 				while (true) {
 					String line = Utility.readDataLine(in);
 					if (line == null) break;
 					String[] pieces = Utility.split(line,'\t');
 					if (pieces.length < 2) {
 						// not a source/target record: pass it through unchanged
 						out.println(line);
 						continue;
 					}
 					String source = Utility.fromHex(pieces[0].trim());
 					if (Default.nfkd().isNormalized(source)) {
 						out.println(line);
 					}
 				}
 			} finally {
 				out.close();
 			}
 		} finally {
 			in.close();
 		}
 	}
 		/**
 	 *
 	 */

 	/**
 	 * Merges all confusables input files into one sorted list of
 	 * (source, target) pairs and writes it to confusableSource.txt.
 	 *
 	 * Every non-directory entry of the static <code>indir</code> whose name
 	 * starts with "confusables" is read. For each data line the first two
 	 * ';'-separated fields are converted from hex. Pairs whose NFC forms are
 	 * equal are dropped. If the NFC forms differ only in their first code point,
 	 * the pair is reduced to those two code points. The two strings are then
 	 * ordered so that betterTargetIsLess does not rank the source before the
 	 * target.
 	 *
 	 * @throws IOException if the input directory cannot be listed or a file
 	 *             cannot be read or written
 	 * @throws IllegalArgumentException if a line yields an empty source or target
 	 */
 	private static void generateSource() throws IOException {
 		File dir = new File(indir);
 		String[] names = dir.list();
 		if (names == null) {
 			// File.list() returns null when the path is not a directory or an I/O error occurs
 			throw new IOException("Cannot list input directory: " + indir);
 		}
 		Set sources = new TreeSet(new ArrayComparator(
 				new Comparator[] {codepointComparator, codepointComparator}));

 		// passed to readDataLine and reported in the error message below
 		// (presumably a line counter -- confirm against Utility.readDataLine)
 		int[] count = new int[1];
 		for (int i = 0; i < names.length; ++i) {
 			if (new File(indir + names[i]).isDirectory()) continue;
 			if (!names[i].startsWith("confusables")) continue;
 			System.out.println(names[i]);
 			BufferedReader in = BagFormatter.openUTF8Reader(indir, names[i]);
 			try {
 				count[0] = 0;
 				while (true) {
 					String line = Utility.readDataLine(in, count);
 					if (line == null) break;
 					if (line.length() == 0) continue;
 					String[] pieces = Utility.split(line,';');
 					if (pieces.length < 2) {
 						System.out.println("Error on: " + line);
 						continue;
 					}
 					String source = Utility.fromHex(pieces[0].trim(),true);
 					String target = Utility.fromHex(pieces[1].trim(),true);

 					if (source.length() == 0 || target.length() == 0) {
 						throw new IllegalArgumentException("zero-length item: " + count[0] + ":\t" + line);
 					}

 					// check for identical combining sequences
 					String nsource = Default.nfc().normalize(source);
 					String ntarget = Default.nfc().normalize(target);
 					if (nsource.equals(ntarget)) continue;

 					// same tail after the first code point: only the first code points are confusable
 					int nsourceFirst = UTF16.charAt(nsource,0);
 					String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst));
 					int ntargetFirst = UTF16.charAt(ntarget,0);
 					String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst));
 					if (nsourceRest.equals(ntargetRest)) {
 						source = UTF16.valueOf(nsourceFirst);
 						target = UTF16.valueOf(ntargetFirst);
 					}

 					// keep the better (lesser) string as the target
 					if (betterTargetIsLess.compare(source, target) < 0) {
 						String temp = source;
 						source = target;
 						target = temp;
 					}
 					sources.add(new String[] {source, target});
 				}
 			} finally {
 				in.close();
 			}
 		}
 		PrintWriter out = BagFormatter.openUTF8Writer(outdir, "confusableSource.txt");
 		try {
 			for (Iterator it = sources.iterator(); it.hasNext();) {
 				String[] sourceItem = (String[]) it.next();
 				writeSourceTargetLine(out, sourceItem[0], null, sourceItem[1], null);
 			}
 		} finally {
 			out.close();
 		}
 	}

 	/**
 	 * Builds the combined confusables data set and writes the output files.
 	 *
 	 * Every non-directory entry of <code>indir</code> whose name starts with
 	 * "confusables" is loaded into its own DataSet, rewritten as "new-" + name,
 	 * closed, and merged into the running total. The nfcMap and nfkcMap data
 	 * are merged in afterwards, and then the summary, whole-script and
 	 * confusables.txt files are written.
 	 *
 	 * Note that the parameters shadow the static fields of the same names.
 	 *
 	 * @param indir directory containing the confusables input files
 	 * @param outdir directory receiving the generated files
 	 * @throws IOException if the input directory cannot be listed or a file
 	 *             cannot be read or written
 	 */
 	private static void generateConfusables(String indir, String outdir) throws IOException {
 		File dir = new File(indir);
 		String[] names = dir.list();
 		if (names == null) {
 			// File.list() returns null when the path is not a directory or an I/O error occurs
 			throw new IOException("Cannot list input directory: " + indir);
 		}
 		DataSet total = new DataSet();
 		for (int i = 0; i < names.length; ++i) {
 			if (new File(indir + names[i]).isDirectory()) continue;
 			if (!names[i].startsWith("confusables")) continue;
 			System.out.println(names[i]);
 			DataSet ds = new DataSet();
 			ds.addFile(indir, names[i]);
 			ds.writeSource(outdir, "new-" + names[i]);
 			ds.close("*");
 			total.addAll(ds);
 			total.close("t*" + names[i]);
 		}
         // add normalized data
 //        for (int i = 0; i <= 0x10FFFF; ++i) {
 //            if (Default.nfkc().isNormalized(i)) continue;
 //            String result = getModifiedNKFC(UTF16.valueOf(i));
 //            ds.foo();
 //        }
         // return value unused; presumably called to populate nfcMap / nfkcMap -- confirm
         getSkipNFKD();
 		DataSet ds = new DataSet();
 		ds.addUnicodeMap(nfcMap, "nfc", "nfc");
 		ds.close("*");
         total.addAll(ds);
         total.close("*");

         ds = new DataSet();
         ds.addUnicodeMap(nfkcMap, "nfkc", "nfkc");
         ds.close("*");
 		//ds.write(outdir, "new-decomp.txt", false, false);
 		total.addAll(ds);
 		total.close("*");

 		total.writeSummary(outdir, "confusablesSummary.txt", false, null);
 		total.writeSummary(outdir, "confusablesSummaryIdentifier.txt", true, null);
 		//total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true,
 		//		new UnicodeSet("[[:script=Cyrillic:][:script=common:][:script=inherited:]]"));
 		total.writeWholeScripts(outdir, "confusablesWholeScript.txt");
 		total.writeSourceOrder(outdir, "confusables.txt", false, false);
 		//DataSet clean = total.clean();
 		//clean.write(outdir, "confusables.txt", true);
 	}
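 	/* Disabled legacy code: an old DiacriticFolding.txt reader followed by gen().
 	 * NOTE: this block has no terminator of its own. It is closed by the terminator
 	 * of the empty doc comment that follows gen(), so formatLine below is live code.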
 		BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt");
 		Set set = new TreeSet(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(),
 				new UTF16.StringComparator()}));
 		while (true) {
 			String line = Utility.readDataLine(in);
 			if (line == null) break;
 			if (line.length() == 0) continue;
 			String[] pieces = Utility.split(line,';');
 			if (pieces.length < 2) {
 				System.out.println("Error on: " + line);
 				continue;
 			}
 			String source = Utility.fromHex(pieces[0].trim());
 			String target = Utility.fromHex(pieces[1].trim());
 			String nsource = Default.nfkd().normalize(source);
 			String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
 			if (!first.equals(target)) {
 				set.add(new String[]{source, target});
 			}
 		}
 		in.close();

 	}
 	public static void gen() throws IOException {
 		Map m = new TreeMap();
 		BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt");
 		while (true) {
 			String line = in.readLine();
 			if (line == null) break;
 			String[] pieces = Utility.split(line,';');
 			if (pieces.length < 3) {
 				System.out.println("Error on: " + line);
 				continue;
 			}
 			int codepoint = Integer.parseInt(pieces[1], 16);
 			int cat = Default.ucd().getCategory(codepoint);
 			if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use
 			if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC
 			String result = Utility.fromHex(pieces[0]);
 			if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC
 			int count = Integer.parseInt(pieces[2]);
 			String source = UTF16.valueOf(codepoint);
 			add(m, source, result, count);
 		}
 		in.close();

 		in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt");
 		while (true) {
 			String line = in.readLine();
 			if (line == null) break;
 			line = line.trim();
 			int pos = line.indexOf("#");
 			if (pos >= 0) line = line.substring(0,pos).trim();
 			if (line.length() == 0) continue;
 			if (line.startsWith("@")) continue;
 			String[] pieces = Utility.split(line,';');
 			if (pieces.length < 2) {
 				System.out.println("Error on: " + line);
 				continue;
 			}
 			String source = pieces[0].trim();
 			for (int i = 1; i < pieces.length; ++i) {
 				add(m, source, pieces[i].trim(), -1);
 			}
 		}
 		in.close();

 		boolean gotOne;
 		// close the set
 		do {
 			gotOne = false;
 			for (Iterator it = m.keySet().iterator(); it.hasNext();) {
 				String source = (String) it.next();
 				Data2 data = (Data2) m.get(source);
 				Data2 data2 = (Data2) m.get(data.target);
 				if (data2 == null) continue;
 				data.target = data2.target;
 				gotOne = true;
 				break;
 			}
 		} while (gotOne);
 		// put into different sorting order
 		Set s = new TreeSet();
 		for (Iterator it = m.keySet().iterator(); it.hasNext();) {
 			String source = (String) it.next();
 			Data2 data = (Data2) m.get(source);
 			s.add(new Data(source, data.target, data.count));
 		}
 		// write it out
 		PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt");
 		String[] replacements = {"%date%", Default.getDate()};
 		Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
 				Utility.UTF8_WINDOWS, out, replacements);
 		for (Iterator it = s.iterator(); it.hasNext();) {
 			Data d = (Data) it.next();
 			if (d == null) continue;
 			out.println(formatLine(d.source, d.target, d.count));
 		}

 		out.close();
 		System.out.println("Done");
 	}
 	/**
 	 *
 	 */
 	/**
 	 * Formats one data line of the form
 	 * <code>hex(source) ; hex(target) ; count # (source ARROW target) name ARROW name</code>.
 	 *
 	 * @param source the confusable string
 	 * @param target the string it maps to; hexed with " " as the second argument
 	 *            of Utility.hex (presumably the separator -- confirm)
 	 * @param count number printed in the third field
 	 * @return the formatted line, without a line terminator
 	 */
 	private static String formatLine(String source, String target, int count) {
 		StringBuffer line = new StringBuffer();
 		// data fields
 		line.append(Utility.hex(source)).append(" ; ").append(Utility.hex(target," "));
 		line.append(" ; ").append(count);
 		// trailing human-readable comment
 		line.append(" # ");
 		line.append("(").append(source).append(" ").append(ARROW).append(" ").append(target).append(") ");
 		line.append(Default.ucd().getName(source));
 		line.append(" ").append(ARROW).append(" ").append(Default.ucd().getName(target));
 		return line.toString();
 	}
 	/**
 	 *
 	 */
 /*	private static void add(Map m, String source, String target, int count) {
 		if (source.length() == 0 || target.length() == 0) return;
 		if (preferSecondAsSource(source, target)) {
 			String temp = target;
 			target = source;
 			source = temp;
 		}
 		Data2 other = (Data2) m.get(source);
 		if (other != null) {
 			if (target.equals(other.target)) return;
 			System.out.println("conflict");
 			System.out.println(formatLine(source, target, count));
 			System.out.println(formatLine(source, other.target, other.count));
 			// skip adding this, and instead add result -> other.target
 			add(m, target, other.target, count);
 		} else {
 			m.put(source, new Data2(target, count));
 		}
 	};
 */

 	// Integer marker values, from 50 down to 10.
 	// Presumably these are the ranks stored in IdentifierInfo.lowerIsBetter, which
 	// _BetterTargetIsLess.getValue reads as Integers -- TODO confirm.
 	// NOTE(review): MARK_OUTPUT and MARK_ASCII have the same numeric value (10) but are
 	// distinct objects, so they are equal under equals() and different under ==.
 	static Integer
 		MARK_NOT_NFC = new Integer(50),
 		MARK_NFC = new Integer(40),
 		MARK_INPUT_LENIENT = new Integer(30),
 		MARK_INPUT_STRICT = new Integer(20),
 		MARK_OUTPUT = new Integer(10),
 		MARK_ASCII = new Integer(10);

 	// Shared comparator instance. generateSource() swaps a pair when compare(source, target) < 0,
 	// so that the string ranked lower (better) ends up as the target.
 	static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess();

     /** Set built from the UnicodeSet pattern <code>[:xidcontinue:]</code>. */
     static UnicodeSet XID = new UnicodeSet("[:xidcontinue:]");

     /**
      * Tests a string against {@link #XID}.
      *
      * @param x the string to test
      * @return the result of <code>XID.containsAll(x)</code>
      */
     static boolean isXid(String x) {
         final boolean allInXid = XID.containsAll(x);
         return allInXid;
     }

 	/**
 	 * Comparator over Strings in which the better target compares as less.
 	 * The criteria are applied in this order:
 	 * <ol>
 	 * <li>more code points is better;</li>
 	 * <li>a string for which isXid is true is better;</li>
 	 * <li>a lower getValue (the highest lowerIsBetter value of any code point) is better;</li>
 	 * <li>otherwise the result of codepointComparator decides.</li>
 	 * </ol>
 	 * Both arguments of compare must be Strings.
 	 */
 	static class _BetterTargetIsLess implements Comparator {
 		// not referenced anywhere inside this class
 		static final int BAD = 1000;

 		IdentifierInfo info = IdentifierInfo.getIdentifierInfo();

 		public int compare(Object o1, Object o2) {
 			String left = (String) o1;
 			String right = (String) o2;

 			// longer is better (less)
 			int leftCount = UTF16.countCodePoint(left);
 			int rightCount = UTF16.countCodePoint(right);
 			if (leftCount > rightCount) {
 				return -1;
 			}
 			if (leftCount < rightCount) {
 				return 1;
 			}

 			// being an identifier is better
 			boolean leftIsXid = isXid(left);
 			boolean rightIsXid = isXid(right);
 			if (leftIsXid && !rightIsXid) {
 				return -1;
 			}
 			if (rightIsXid && !leftIsXid) {
 				return 1;
 			}

 			// lower rank is better
 			int leftRank = getValue(left);
 			int rightRank = getValue(right);
 			if (leftRank < rightRank) {
 				return -1;
 			}
 			if (leftRank > rightRank) {
 				return 1;
 			}

 			return codepointComparator.compare(left, right);
 		}

 		/**
 		 * Returns the largest lowerIsBetter value found among the code points of
 		 * the string, or 0 for an empty string. Lower is better.
 		 * NOTE(review): a code point for which lowerIsBetter returns null causes a
 		 * NullPointerException here.
 		 */
 		private int getValue(String a) {
 			int worst = 0;
 			int index = 0;
 			while (index < a.length()) {
 				int cp = UTF16.charAt(a, index);
 				Integer rank = (Integer) info.lowerIsBetter.getValue(cp);
 				worst = Math.max(worst, rank.intValue());
 				index += UTF16.getCharCount(cp);
 			}
 			return worst;
 		}
 	}

 /*	static private boolean preferSecondAsSource(String a, String b) {
 		// if first is longer, prefer second
 		int ca = UTF16.countCodePoint(a);
 		int cb = UTF16.countCodePoint(b);
 		if (ca != cb) {
 			return ca > cb;
 		}
 		// if first is lower, prefer second
 		return a.compareTo(b) < 0;
 	}
 */
 	/**
 	 * Builds a display label for a string: the result of getCode, the string
 	 * itself inside padded parentheses, and the result of getName.
 	 *
 	 * @param a the string to describe
 	 * @return <code>code + "(  " + a + "  ) " + name</code>
 	 */
 	static String getCodeCharName(String a) {
 		StringBuffer label = new StringBuffer();
 		label.append(Default.ucd().getCode(a));
 		label.append("(  ").append(a).append("  ) ");
 		label.append(Default.ucd().getName(a));
 		return label.toString();
 	}
 	/**
 	 * Extracts the "reason" part of a file name: the text after the last '-'
 	 * that precedes the last '.', up to that '.'.
 	 * Without a '.', the end of the name is used instead. Without such a '-',
 	 * the result starts at the beginning of the name.
 	 *
 	 * @param type the file name, e.g. "confusables-foo.txt" gives "foo"
 	 * @return the extracted part, possibly empty
 	 */
 	public static String getReasonFromFilename(String type) {
 		final int dot = type.lastIndexOf('.');
 		final int end = dot >= 0 ? dot : type.length();
 		// lastIndexOf yields -1 when there is no dash, which makes start 0
 		final int start = type.lastIndexOf('-', end) + 1;
 		return type.substring(start, end);
 	}

     /** Lazily created by getModifiedNKFC; null until the first call. */
     static Normalizer modNFKC ;

      /**
       * Normalizes a string with a shared NFKC Normalizer on which
       * setSpacingSubstitute() has been called. The Normalizer is created for
       * Default.ucdVersion() on first use and reused afterwards. The lazy
       * creation is not synchronized.
       *
       * @param cf the string to normalize
       * @return the result of the normalizer's normalize call
       */
      private static String getModifiedNKFC(String cf) {
          if (modNFKC != null) {
              return modNFKC.normalize(cf);
          }
          // first use: publish the instance, then configure it
          modNFKC = new Normalizer(Normalizer.NFKC, Default.ucdVersion());
          modNFKC.setSpacingSubstitute();
          return modNFKC.normalize(cf);
      }

      /**
       * Opens a UTF-8 output file in the static <code>outdir</code> and writes
       * the standard header: a U+FEFF character followed by comment lines for
       * the title, file name, version, generation date and revision, and a
       * pointer to the documentation.
       *
       * @param filename name of the file to create; also printed in the header
       * @param title text of the first header line
       * @return the open writer, positioned after the header; the caller closes it
       * @throws IOException if the file cannot be opened
       */
      private static PrintWriter openAndWriteHeader(String filename, String title) throws IOException {
          final PrintWriter writer = BagFormatter.openUTF8Writer(outdir, filename);
          writer.print('\uFEFF');
          final String[] headerLines = {
              "# " + title,
              "# File: " + filename,
              "# Version: " + version,
              "# Generated: " + Default.getDate(),
              "# Checkin: $Revision: 1.12 $",
              "#",
              "# For documentation and usage, see http://www.unicode.org/reports/tr39/",
              "#"
          };
          for (int i = 0; i < headerLines.length; ++i) {
              writer.println(headerLines[i]);
          }
          return writer;
      }

 }