unicodetools/com/ibm/text/UCD/MakeNamesChart.java - external/github.com/unicode-org/icu - Git at Google

 package com.ibm.text.UCD;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.BitSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import com.ibm.icu.dev.test.util.BagFormatter;
 import com.ibm.icu.dev.test.util.TransliteratorUtilities;
 import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.dev.test.util.UnicodePropertySource;
 import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.Replaceable;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSetIterator;
 import com.ibm.icu.util.ULocale;
 import com.ibm.text.utility.Utility;
 import com.ibm.text.utility.Utility.Encoding;

 public class MakeNamesChart {

 	// Code point of the character entry currently being written to a names page.
 	// main sets it when it parses a character line; finishItem resets it to -1,
 	// and treats a negative value as "nothing pending".
 	static int lastCodePoint = -1;
 	// Not referenced anywhere else in this class.
 	static boolean lastCodePointIsOld = false;
 	// Decomposition type of lastCodePoint that has not yet been matched by a
 	// ':' or '#' annotation line. checkCanonical/checkCompatibility reset it to
 	// UCD.NONE; finishItem prints an alert if it is still set.
 	static int lastDecompType = UCD.NONE;

 	// File-name prefixes for the per-block chart page and names page.
 	static final String chartPrefix = "c_";
 	static final String namePrefix = "n_";

 	// The three sets below are assigned in main before any page is generated.
 	// skipChars is removed from each block's collected code points before the chart grid is drawn.
 	static UnicodeSet skipChars;// = new UnicodeSet("[[:gc=cn:]-[:noncharactercodepoint:]]");
 	// Code points that showChar wraps in U+200E marks.
 	static UnicodeSet rtl;// = new UnicodeSet("[[:bidiclass=r:][:bidiclass=al:]]");
 	// Code points that showChar renders as a substitute glyph inside a span.
 	static UnicodeSet usePicture;// = new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");

 	// UCD for version 4.1.0 (assigned in main); isNew compares allocation against it.
 	static UCD ucd41;

 	/** Directory that receives every generated HTML page. */
 	private static final String outputDir = "C:/DATA/GEN/charts/namelist/";

 	/**
 	 * Generates the HTML names-list charts from the Unicode 5.0.0 NamesList.txt.
 	 *
 	 * For every block (a group of lines introduced by a line starting with "@@" and a tab)
 	 * two pages are written: a chart page ("c_" prefix) with a 16-column grid of the
 	 * block's code points, and a names page ("n_" prefix) with one table row per
 	 * character plus its annotation lines. Afterwards mainList.html is written as an
 	 * index of all blocks, and the decomposition-name bookkeeping is printed to stdout.
 	 *
 	 * @param args ignored
 	 * @throws Exception if the input cannot be read, a page cannot be written, or a
 	 *         NamesList line cannot be parsed
 	 */
 	public static void main(String[] args) throws Exception {
 		BlockInfo blockInfo = new BlockInfo("5.0.0", "NamesList.txt");
 		// http://www.unicode.org/~book/incoming/kenfiles/U50M051010.lst
 		List nameList = new ArrayList();
 		BitSet nameListNew = new BitSet();
 		try {
 			Default.setUCD("5.0.0");
 			ucd41 = UCD.make("4.1.0");
 			ToolUnicodePropertySource up = ToolUnicodePropertySource.make("5.0.0");
 			// [[:gc=cn:]-[:noncharactercodepoint:]] : unassigned code points, but keep noncharacters.
 			skipChars = new UnicodeSet(up.getSet("gc=cn")).removeAll(up.getSet("noncharactercodepoint=true"));
 			// [[:bidiclass=r:][:bidiclass=al:]]
 			rtl = new UnicodeSet(up.getSet("bidiclass=r")).addAll(up.getSet("bidiclass=al"));
 			// [[:whitespace:][:defaultignorablecodepoint:]]
 			usePicture = new UnicodeSet(up.getSet("whitespace=true")).addAll(up.getSet("defaultignorablecodepoint=true"));

 			List lines = new ArrayList();
 			while (blockInfo.next(lines)) {
 				String firstLine = (String) lines.get(0);
 				// "@@@" lines introduce the file title, not a block.
 				if (firstLine.startsWith("@@@")) continue;
 				String[] headerParts = firstLine.split("\t");
 				String fileName = headerParts[1] + ".html";
 				nameList.add(firstLine);
 				System.out.println();
 				System.out.println("file: " + chartPrefix + fileName);
 				writeChartPage(fileName, headerParts, lines);
 				writeNamesPage(fileName, lines, nameListNew, nameList.size() - 1);
 			}
 		} finally {
 			blockInfo.in.close();
 		}
 		writeMainList(nameList, nameListNew);

 		System.out.println("Name differences: Canonical");
 		showNameDifferences(hasNameCan, hasNoNameCan);
 		System.out.println("Name differences: Compatibility");
 		showNameDifferences(hasNameComp, hasNoNameComp);
 		System.out.println("Done");
 	}

 	/** Writes the chart page for one block: page header plus the 16-column code point grid. */
 	private static void writeChartPage(String fileName, String[] headerParts, List lines) throws IOException {
 		PrintWriter out = BagFormatter.openUTF8Writer(outputDir, chartPrefix + fileName);
 		try {
 			out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" +
 					TransliteratorUtilities.toHTML.transliterate(getHeading(headerParts[2])) +
 					"</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
 					"<base target='names'></head><body>");

 			// header
 			out.println("<table class='headerTable'><tr><td class='headerLeft'>" +
 					headerParts[1] +
 					" <a href='help.html'>help</a></td><td class='headerCenter'>" +
 					getHeading(headerParts[2]) +
 					"</td><td class='headerRight'><a href='mainList.html'>index</a> " +
 					headerParts[3] +
 					"</td></tr></table>");

 			// Collect every code point that has its own entry line in this block.
 			UnicodeSet collectedCodePoints = new UnicodeSet();
 			for (int i = 1; i < lines.size(); ++i) {
 				String line = (String) lines.get(i);
 				int first = line.charAt(0);
 				if (first != '@' && first != '\t') {
 					collectedCodePoints.add(Integer.parseInt(line.split("\t")[0], 16));
 				}
 			}
 			collectedCodePoints.removeAll(skipChars);

 			if (collectedCodePoints.size() == 0) {
 				out.println("<p align='center'>No Names List</p>");
 			} else {
 				out.println("<div align='center'><table class='chart'><tr>");
 				int counter = 0;
 				for (UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); it.next();) {
 					if ((counter % 16) == 0 && counter != 0) {
 						out.println("</tr><tr>");
 					}
 					// The first row uses its own style class.
 					String tdclass = counter < 16 ? "cellw" : "cell";
 					if (isNew(it.codepoint)) tdclass += "new";
 					String hexcp = Utility.hex(it.codepoint, 4);
 					String title = "";
 					String name = Default.ucd().getName(it.codepoint);
 					if (name != null) {
 						// Fixed locale: character names must not be lower-cased with Turkish rules.
 						title = " title='" + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase(Locale.ENGLISH)) + "'";
 					}
 					out.println("<td class='" + tdclass + "'"
 							+ title
 							+ ">\u00A0"
 							+ showChar(it.codepoint) + "\u00A0<br><tt><a href='" + namePrefix + fileName + "#" + hexcp + "'>" +
 							hexcp + "</a></tt></td>");
 					counter++;
 				}
 				// Only a multi-row grid needs its last row padded to 16 cells.
 				if (counter > 16) {
 					int filled = counter & 0xF;
 					if (filled != 0) {
 						for (; filled < 16; ++filled) out.println("<td class='cell'>\u00A0</td>");
 					}
 				}
 				// Always close the grid, including when it has a single row.
 				out.println("</tr></table></div>");
 			}
 			out.println("</body></html>");
 		} finally {
 			out.close();
 		}
 	}

 	/**
 	 * Writes the names page for one block and flags the block in nameListNew
 	 * (at blockIndex) when any of its characters is new.
 	 */
 	private static void writeNamesPage(String fileName, List lines, BitSet nameListNew, int blockIndex) throws IOException {
 		PrintWriter out = BagFormatter.openUTF8Writer(outputDir, namePrefix + fileName);
 		try {
 			out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
 					"<link rel='stylesheet' type='text/css' href='namelist.css'></head><body>");

 			boolean inTable = false;
 			for (int i = 1; i < lines.size(); ++i) {
 				String line = (String) lines.get(i);
 				try {
 					if (line.startsWith("@")) {
 						// Subheading or comment: ends the pending character and the current table.
 						finishItem(out);
 						if (inTable) {
 							out.println("</table>");
 							inTable = false;
 						}
 						if (line.startsWith("@+")) {
 							out.println("<p class='comment'>" + line.substring(2).trim() + "</p>");
 						} else {
 							out.println("<h2>" + line.substring(1).trim() + "</h2>");
 						}
 						continue;
 					}
 					if (!inTable) {
 						out.println("<table>");
 						inTable = true;
 					}
 					if (line.startsWith("\t")) {
 						// Annotation line belonging to the pending character.
 						String body = line.trim();
 						char firstChar = body.charAt(0);
 						switch (firstChar) {
 						case '*': body = "\u2022 " + body.substring(2); break;
 						case ':': body = checkCanonical(lastCodePoint, body); break;
 						case '#': body = checkCompatibility(lastCodePoint, body); break;
 						case 'x': body = getOther(body); break;
 						case '=': break;
 						default: throw new IllegalArgumentException("Huh? " + body);
 						}
 						out.println("<tr><td>\u00A0</td><td>\u00A0</td><td>"
 								+ maybeNameStyle(showTextConvertingHex(body, firstChar != '='), firstChar == '=')
 								+ "</td></tr>");
 					} else {
 						// Character line: hex code point, tab, name.
 						finishItem(out);
 						String[] parts = line.split("\t");
 						String hex = parts[0];
 						lastCodePoint = Integer.parseInt(hex, 16);
 						boolean lastCodePointIsNew = isNew(lastCodePoint);
 						if (lastCodePointIsNew) nameListNew.set(blockIndex, true);
 						String newAttr = lastCodePointIsNew ? " class='new'" : "";
 						out.println("<tr><td"
 								+ newAttr
 								+ "><code><a name='" + hex + "'>" + hex + "</a></code></td><td>\u00A0"
 								+ showChar(lastCodePoint) + "\u00A0</td><td"
 								+ newAttr + ">"
 								+ nameStyle(showTextConvertingHex(parts[1], false)) + "</td></tr>");
 						lastDecompType = Default.ucd().getDecompositionType(lastCodePoint);
 					}
 				} catch (Exception e) {
 					throw (IllegalArgumentException) new IllegalArgumentException("Error on line: " + line)
 					.initCause(e);
 				}
 			}
 			finishItem(out);
 			if (inTable) out.println("</table>");
 			out.println("</body></html>");
 		} finally {
 			out.close();
 		}
 	}

 	/** Writes mainList.html, the index that links to every block's chart page. */
 	private static void writeMainList(List nameList, BitSet nameListNew) throws IOException {
 		PrintWriter out = BagFormatter.openUTF8Writer(outputDir, "mainList.html");
 		try {
 			out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
 					"<title>Main List</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
 					"<base target='chart'></head><body><table>");
 			for (int i = 0; i < nameList.size(); ++i) {
 				String[] parts = ((String) nameList.get(i)).split("\t");
 				String fileName = parts[1] + ".html";
 				out.println("<tr><td><code>" + parts[1] +
 						"</code></td><td"
 						+ (nameListNew.get(i) ? " class='new'" : "")
 						+ "><a href='" + chartPrefix + fileName + "'>" + getHeading(parts[2]) + "</a></td><td><code>" +
 						parts[3] + "</code></td></tr>");
 			}
 			out.println("</table></body></html>");
 		} finally {
 			out.close();
 		}
 	}

 	/**
 	 * Tells whether a code point is allocated in the default UCD but was not
 	 * yet allocated in the 4.1.0 UCD held in ucd41.
 	 */
 	private static boolean isNew(int codepoint) {
 		if (!Default.ucd().isAllocated(codepoint)) {
 			return false;
 		}
 		return !ucd41.isAllocated(codepoint);
 	}

 	/**
 	 * Prints every decomposition that is a key of both maps, together with the
 	 * value recorded for it in each map, followed by the number of such keys.
 	 *
 	 * @param hasName   decomposition string to the string recorded where the annotation carried a name
 	 * @param hasNoName decomposition string to the string recorded where the annotation carried no name
 	 */
 	private static void showNameDifferences(Map hasName, Map hasNoName) {
 		Set shared = new TreeSet(hasNoName.keySet());
 		shared.retainAll(hasName.keySet());
 		Iterator keys = shared.iterator();
 		while (keys.hasNext()) {
 			String key = (String) keys.next();
 			String withName = (String) hasName.get(key);
 			String withoutName = (String) hasNoName.get(key);
 			System.out.println();
 			System.out.println("decomp: " + Utility.hex(key));
 			System.out.println("Has name in: " + Utility.hex(withName));
 			System.out.println("Has no name in: " + Utility.hex(withoutName));
 		}
 		System.out.println("Count: " + shared.size());
 	}

 	// Source of confusable data used by finishItem. Stays null when loading
 	// fails with an IOException, which is only reported via its stack trace.
 	static TestIdentifiers ti;
 	static {
 		TestIdentifiers loaded = null;
 		try {
 			loaded = new TestIdentifiers("L");
 		} catch (IOException e) {
 			e.printStackTrace();
 		}
 		ti = loaded;
 	}

 	/**
 	 * Writes the derived rows for the pending character (lastCodePoint) and then
 	 * clears it. Does nothing when no character is pending.
 	 *
 	 * Rows are produced, each only when it differs from the forms already shown, for:
 	 * full upper case (U+2191), title case (U+2195), lower case (U+2193),
 	 * case folding (U+2194), NFD (U+21DB) and NFKD (U+21DD). When the character is
 	 * its own NFKD form, one row (U+279F) is added per confusable reported by ti.
 	 *
 	 * Prints an alert to stdout if the character has a decomposition type that no
 	 * annotation line has accounted for.
 	 *
 	 * @param out writer for the names page currently being generated
 	 */
 	private static void finishItem(PrintWriter out) {
 		if (lastCodePoint < 0) return;
 		if (lastDecompType != UCD.NONE) {
 			System.out.println("Alert: missing decomp for " + Utility.hex(lastCodePoint));
 		}
 		String str = UTF16.valueOf(lastCodePoint);
 		String upper = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.UPPER), "\u2191");
 		showForm(out, str, upper, null, Default.ucd().getCase(str,UCD.FULL,UCD.TITLE), "\u2195");
 		String lower = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.LOWER), "\u2193");
 		showForm(out, lower, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.FOLD), "\u2194");

 		String dc = Default.ucd().getDecompositionMapping(lastCodePoint);
 		String nfd = showForm(out, dc, str, null, Default.nfd().normalize(lastCodePoint), "\u21DB");
 		String nfkd = showForm(out, dc, str, nfd, Default.nfkd().normalize(lastCodePoint), "\u21DD");

 		// ti is null when the static initializer could not load the identifier data;
 		// in that case the confusable rows are simply omitted.
 		if (ti != null && nfkd.equals(str)) {
 			Set s = ti.getConfusables(lastCodePoint, "MA");
 			if (s.size() > 1) {
 				sortedSet.clear();
 				for (Iterator it = s.iterator(); it.hasNext();) {
 					sortedSet.add(Default.nfkd().normalize((String)it.next()));
 				}
 				sortedSet.remove(nfkd); // remove me
 				for (Iterator it = sortedSet.iterator(); it.hasNext();) {
 					String other = (String)it.next();
 					if (nfkd.equals(Default.nfkd().normalize(other))) continue;
 					// Fixed locale so names are lower-cased the same way on every machine.
 					out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='conf'>\u279F\u00A0"
 							+ showTextConvertingHex(Utility.hex(other, 4, " + "), true)
 							+ " "
 							+ Default.ucd().getName(other, UCD.NORMAL, " + ").toLowerCase(Locale.ENGLISH)
 							+ "</td></tr>");
 				}
 			}
 		}
 		lastCodePoint = -1;
 	}

 	// Scratch set reused by finishItem to order confusables with an English collator.
 	static Set sortedSet = new TreeSet(Collator.getInstance(ULocale.ENGLISH));

 	/**
 	 * Writes one table row for a derived form of a character, unless that form
 	 * equals any of the three comparison strings.
 	 *
 	 * The row shows the symbol, the form's code points in hex (each followed by
 	 * its glyph), and, when the form is a single code point, its lower-cased name.
 	 *
 	 * @param out         writer for the names page
 	 * @param str         first string the form is compared with (may be null)
 	 * @param str2        second string the form is compared with (may be null)
 	 * @param str3        third string the form is compared with (may be null)
 	 * @param transformed the derived form to show
 	 * @param symbol      marker placed at the start of the row
 	 * @return transformed, whether or not a row was written
 	 */
 	private static String showForm(PrintWriter out, String str, String str2, String str3, String transformed, String symbol) {
 		if (!transformed.equals(str) && !transformed.equals(str2) && !transformed.equals(str3)) {
 			String nameSuffix = "";
 			if (UTF16.countCodePoint(transformed) == 1) {
 				// Fixed locale: the default locale's casing rules (e.g. Turkish dotless i)
 				// must not leak into the generated names.
 				nameSuffix = " " + Default.ucd().getName(transformed, UCD.NORMAL, " + ").toLowerCase(Locale.ENGLISH);
 			}
 			out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='c'>" + symbol + "\u00A0"
 				+ showTextConvertingHex(Utility.hex(transformed, 4, " + "), true)
 				+ nameSuffix
 				+ "</td></tr>");
 		}
 		return transformed;
 	}

 	/**
 	 * Strips the last parenthesized suffix from a block name.
 	 *
 	 * @param name block name, possibly ending in " (...)"
 	 * @return the text before the last occurrence of " (", or name itself when there is none
 	 */
 	static public String getHeading(String name) {
 		int cut = name.lastIndexOf(" (");
 		return cut >= 0 ? name.substring(0, cut) : name;
 	}

 	/**
 	 * Applies nameStyle to the text only when requested and the text is
 	 * unchanged by English upper-casing; otherwise returns it untouched.
 	 *
 	 * @param string text to format
 	 * @param b      whether name styling may be applied at all
 	 */
 	private static String maybeNameStyle(String string, boolean b) {
 		if (!b) {
 			return string;
 		}
 		String shouted = string.toUpperCase(Locale.ENGLISH);
 		return string.equals(shouted) ? nameStyle(string) : string;
 	}


 	/**
 	 * Title-cases the text via the UCD and wraps it in italics. Anything that
 	 * then matches escapeMatch (an ampersand, one capital, lower-case letters,
 	 * a semicolon) is lower-cased again so HTML entities survive the title-casing.
 	 */
 	private static String nameStyle(String string) {
 		String styled = "<i>" + Default.ucd().getCase(string, UCD.FULL, UCD.TITLE) + "</i>";
 		int from = 0;
 		// The matcher is re-targeted on each pass because styled is rebuilt.
 		while (escapeMatch.reset(styled).find(from)) {
 			int begin = escapeMatch.start();
 			from = escapeMatch.end();
 			String entity = styled.substring(begin, from).toLowerCase();
 			styled = styled.substring(0, begin) + entity + styled.substring(from);
 		}
 		return styled;
 	}

 	// Shared matcher for title-cased entity references; reset before every use.
 	static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher("");

 	/**
 	 * HTML-escapes the text and, if requested, decorates every run of 4 to 6
 	 * upper-case hex digits that denotes a valid code point: the run is wrapped
 	 * in a code element and followed by a no-break space and the character's glyph.
 	 *
 	 * @param body          raw text
 	 * @param addCharToHex  whether hex runs should be decorated
 	 * @return the escaped, optionally decorated text
 	 */
 	private static String showTextConvertingHex(String body, boolean addCharToHex) {
 		body = TransliteratorUtilities.toHTML.transliterate(body);
 		if (!addCharToHex) {
 			return body;
 		}
 		int cursor = 0;
 		while (cursor < body.length() && findHex.reset(body).find(cursor)) {
 			int runStart = findHex.start();
 			int runEnd = findHex.end();
 			cursor = runEnd;
 			int digits = runEnd - runStart;
 			if (digits >= 4 && digits <= 6) {
 				int cp = Integer.parseInt(findHex.group(), 16);
 				if (cp <= 0x10FFFF) {
 					String glyph = "\u00A0" + showChar(cp);
 					String head = body.substring(0, runStart)
 						+ "<code>" + body.substring(runStart, runEnd) + "</code>"
 						+ glyph;
 					body = head + body.substring(runEnd);
 					// Resume after the inserted markup so it is never rescanned.
 					cursor = head.length();
 				}
 			}
 		}
 		return body;
 	}

 	// Shared matchers; each user calls reset(...) with its own input before matching.
 	// pointer: cross reference of the form "x (name - HEX)"; group 1 is the name, group 2 the hex digits.
 	static Matcher pointer = Pattern.compile("x \\((.*) - ([0-9A-F]+)\\)").matcher("");
 	// pointer2: cross reference of the form "x HEX" with 4 to 6 hex digits in group 1.
 	static Matcher pointer2 = Pattern.compile("x ([0-9A-F]{4,6})").matcher("");
 	// findHex: any run of upper-case hex digits; used by showTextConvertingHex.
 	static Matcher findHex = Pattern.compile("[0-9A-F]+").matcher("");

 	/**
 	 * Rewrites a cross-reference annotation. Input is either
 	 * "x (hyphenation point - 2027)" or the bare "x 2027"; the result is
 	 * U+2192, a space, the 4-digit hex, and the name when one was given.
 	 * A given name that differs (ignoring case) from the UCD name is reported on stdout.
 	 *
 	 * @throws IllegalArgumentException if the text matches neither form
 	 */
 	private static String getOther(String body) {
 		boolean named = pointer.reset(body).matches();
 		if (!named && !pointer2.reset(body).matches()) {
 			throw new IllegalArgumentException("Bad format: " + body);
 		}
 		if (!named) {
 			int bare = Integer.parseInt(pointer2.group(1), 16);
 			return "\u2192 " + Utility.hex(bare, 4);
 		}
 		int target = Integer.parseInt(pointer.group(2), 16);
 		String label = pointer.group(1);
 		String official = Default.ucd().getName(target);
 		if (official == null) {
 			official = "<not a character>";
 		}
 		if (!label.equalsIgnoreCase(official)) {
 			System.out.println("Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint));
 			System.out.println("\tName is: " + official);
 		}
 		return "\u2192 " + Utility.hex(target, 4) + " " + label;
 	}

 	/**
 	 * Returns the HTML used to display a code point.
 	 *
 	 * Members of usePicture get a substitute inside an "inv" span: U+2400 plus
 	 * the code point for values up to 0x20, U+2421 for 0x7F, otherwise U+2588.
 	 * Code points in categories Cn, Co and Cs are shown as a bare U+2588.
 	 * Everything else is the HTML-escaped character itself, preceded by U+25CC
 	 * for categories Me and Mn, or wrapped in U+200E for members of rtl.
 	 */
 	static String showChar(int cp) {
 		if (usePicture.contains(cp)) {
 			int rep;
 			if (cp <= 0x20) {
 				rep = 0x2400 + cp;
 			} else if (cp == 0x7F) {
 				rep = 0x2421;
 			} else {
 				rep = '\u2588';
 			}
 			return "<span class='inv'>" + (char)rep + "</span>";
 		}

 		int type = Default.ucd().getCategory(cp);
 		boolean noGlyph = type == UCD.Cn || type == UCD.Co || type == UCD.Cs;
 		if (noGlyph) {
 			return "\u2588";
 		}
 		String escaped = TransliteratorUtilities.toHTML.transliterate(UTF16.valueOf(cp));
 		if (type == UCD.Me || type == UCD.Mn) {
 			return "\u25CC" + escaped;
 		}
 		if (rtl.contains(cp)) {
 			return "\u200E" + escaped + "\u200E";
 		}
 		return escaped;
 	}

 	//static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]");
 	// Bookkeeping filled by checkCanonical (Can) and checkCompatibility (Comp).
 	// Key: the decomposition mapping string; value: the decomposing code point as a string.
 	// "hasName" records annotations that spelled out the target's name after the hex,
 	// "hasNoName" those that gave the hex only. main reports keys present in both.
 	static final Map hasNoNameCan = new TreeMap();
 	static final Map hasNameCan = new TreeMap();
 	static final Map hasNoNameComp = new TreeMap();
 	static final Map hasNameComp = new TreeMap();

 	/**
 	 * Validates a canonical-decomposition annotation against the UCD and
 	 * returns it prefixed with U+2261.
 	 *
 	 * The first two characters of the annotation (its marker) are dropped. The
 	 * rest must equal, ignoring case, the hex form of the pending character's
 	 * decomposition mapping, optionally followed by the lower-cased name when
 	 * the mapping is a single code point. Which of the two forms matched is
 	 * recorded in hasNoNameCan or hasNameCan; any mismatch is reported on stdout.
 	 * Clears lastDecompType.
 	 *
 	 * @param codePoint code point used in messages and as the recorded value
 	 * @param body      trimmed annotation line
 	 */
 	private static String checkCanonical(int codePoint, String body) {
 		body = body.substring(2);
 		if (lastDecompType != UCD.CANONICAL) {
 			System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint));
 		}
 		String mapping = Default.ucd().getDecompositionMapping(lastCodePoint);
 		String bare = Utility.hex(mapping, 4, " ");
 		String withName = bare;
 		if (UTF16.countCodePoint(mapping) == 1) {
 			withName = bare + " " + Default.ucd().getName(mapping).toLowerCase();
 		}

 		Map bucket = null;
 		if (bare.equalsIgnoreCase(body)) {
 			bucket = hasNoNameCan;
 		} else if (withName.equalsIgnoreCase(body)) {
 			bucket = hasNameCan;
 		}
 		if (bucket != null) {
 			bucket.put(mapping, UTF16.valueOf(codePoint));
 		} else {
 			System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint));
 			System.out.println("\tShould be: " + bare);
 		}
 		lastDecompType = UCD.NONE;
 		return "\u2261 " + body;
 	}

 	/**
 	 * Validates a compatibility-decomposition annotation against the UCD and
 	 * returns it prefixed with U+2248.
 	 *
 	 * The first two characters of the annotation (its marker) are dropped. The
 	 * expected text is the hex form of the pending character's decomposition
 	 * mapping, preceded by the decomposition type ID in angle brackets unless the
 	 * type is UCD.COMPAT_UNSPECIFIED, and optionally followed by the lower-cased
 	 * name when the mapping is a single code point. Which form matched is recorded
 	 * in hasNoNameComp or hasNameComp; any mismatch is reported on stdout.
 	 * Clears lastDecompType.
 	 *
 	 * @param codePoint code point used in messages and as the recorded value
 	 * @param body      trimmed annotation line
 	 */
 	private static String checkCompatibility(int codePoint, String body) {
 		body = body.substring(2);
 		if (lastDecompType <= UCD.CANONICAL) {
 			System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint));
 		}
 		String mapping = Default.ucd().getDecompositionMapping(lastCodePoint);
 		String bare = Utility.hex(mapping, 4, " ");
 		if (lastDecompType != UCD.COMPAT_UNSPECIFIED) {
 			bare = "<" + Default.ucd().getDecompositionTypeID(lastCodePoint) + "> " + bare;
 		}
 		String withName = bare;
 		if (UTF16.countCodePoint(mapping) == 1) {
 			withName = bare + " " + Default.ucd().getName(mapping).toLowerCase();
 		}

 		Map bucket = null;
 		if (bare.equalsIgnoreCase(body)) {
 			bucket = hasNoNameComp;
 		} else if (withName.equalsIgnoreCase(body)) {
 			bucket = hasNameComp;
 		}
 		if (bucket != null) {
 			bucket.put(mapping, UTF16.valueOf(codePoint));
 		} else {
 			System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint));
 			System.out.println("\tShould be: " + bare);
 		}
 		lastDecompType = UCD.NONE;
 		return "\u2248 " + body;
 	}

 	/**
 	 * Reads a names-list file one block at a time. A block starts at a line
 	 * beginning with "@@" followed by a tab and runs up to the next such line.
 	 */
 	static class BlockInfo {
 		// Underlying reader; the caller is responsible for closing it.
 		BufferedReader in;
 		// Header line of the next block, read ahead by the previous call to next().
 		String lastLine;

 		BlockInfo (String version, String filename) throws IOException {
 			in = Utility.openUnicodeFile(filename, version, true, Utility.LATIN1_WINDOWS);
 		}

 		/**
 		 * Replaces the contents of the list with the lines of the next block.
 		 *
 		 * @param inout list that is cleared and then filled with the block's lines
 		 * @return false when no lines were left to read
 		 */
 		boolean next(List inout) throws IOException {
 			inout.clear();
 			if (lastLine != null) {
 				inout.add(lastLine);
 				lastLine = null;
 			}
 			for (String line = in.readLine(); line != null; line = in.readLine()) {
 				if (line.startsWith("@@\t")) {
 					// Start of the following block: keep it for the next call.
 					lastLine = line;
 					break;
 				}
 				inout.add(line);
 			}
 			return !inout.isEmpty();
 		}

 	}
 }
	package com.ibm.text.UCD;

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.PrintWriter;
	import java.util.ArrayList;
	import java.util.BitSet;
	import java.util.Iterator;
	import java.util.List;
	import java.util.Locale;
	import java.util.Map;
	import java.util.Set;
	import java.util.TreeMap;
	import java.util.TreeSet;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import com.ibm.icu.dev.test.util.BagFormatter;
	import com.ibm.icu.dev.test.util.TransliteratorUtilities;
	import com.ibm.icu.dev.test.util.UnicodeMap;
	import com.ibm.icu.dev.test.util.UnicodePropertySource;
	import com.ibm.icu.text.Collator;
	import com.ibm.icu.text.Replaceable;
	import com.ibm.icu.text.UTF16;
	import com.ibm.icu.text.UnicodeSet;
	import com.ibm.icu.text.UnicodeSetIterator;
	import com.ibm.icu.util.ULocale;
	import com.ibm.text.utility.Utility;
	import com.ibm.text.utility.Utility.Encoding;

	public class MakeNamesChart {

	static int lastCodePoint = -1;
	static boolean lastCodePointIsOld = false;
	static int lastDecompType = UCD.NONE;

	static final String chartPrefix = "c_";
	static final String namePrefix = "n_";

	static UnicodeSet skipChars;// = new UnicodeSet("[[:gc=cn:]-[:noncharactercodepoint:]]");
	static UnicodeSet rtl;// = new UnicodeSet("[[:bidiclass=r:][:bidiclass=al:]]");
	static UnicodeSet usePicture;// = new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");

	static UCD ucd41;

	public static void main(String[] args) throws Exception {
	//ConvertUCD.main(new String[]{"5.0.0"});
	BlockInfo blockInfo = new BlockInfo("5.0.0", "NamesList.txt");
	// http://www.unicode.org/~book/incoming/kenfiles/U50M051010.lst
	Default.setUCD("5.0.0");
	ucd41 = UCD.make("4.1.0");
	ToolUnicodePropertySource up = ToolUnicodePropertySource.make("5.0.0");
	skipChars = new UnicodeSet(up.getSet("gc=cn")).removeAll(up.getSet("gc=cn"));
	//"[[:gc=cn:]-[:noncharactercodepoint:]]");
	rtl = new UnicodeSet(up.getSet("bidiclass=r")).addAll(up.getSet("bidiclass=al"));// "[[:bidiclass=r:][:bidiclass=al:]]");
	usePicture = new UnicodeSet(up.getSet("whitespace=true")).addAll(up.getSet("defaultignorablecodepoint=true"));// new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");

	List nameList = new ArrayList();
	ArrayList lines = new ArrayList();
	UnicodeSet collectedCodePoints = new UnicodeSet();
	BitSet nameListNew = new BitSet();

	int limit = Integer.MAX_VALUE;
	for (int count = 0; count < limit; ++count) {
	if (!blockInfo.next(lines)) break;
	String firstLine = (String)lines.get(0);
	if (firstLine.startsWith("@@@")) continue;
	String[] lineParts = firstLine.split("\t");
	String fileName = lineParts[1] + ".html";
	nameList.add(firstLine);
	System.out.println();
	System.out.println("file: " + chartPrefix + fileName);
	PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", chartPrefix + fileName);
	out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" +
	TransliteratorUtilities.toHTML.transliterate(getHeading(lineParts[2])) +
	"</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
	"<base target='names'></head><body>");

	// header
	out.println("<table class='headerTable'><tr><td class='headerLeft'>" +
	lineParts[1] +
	" <a href='help.html'>help</a></td><td class='headerCenter'>" +
	getHeading(lineParts[2]) +
	"</td><td class='headerRight'><a href='mainList.html'>index</a> " +
	lineParts[3] +
	"</td></tr></table>");

	if ("Unassigned".equals(lineParts[2])) {
	System.out.println("debug");
	}
	// first pass through and collect all the code points
	collectedCodePoints.clear();
	for (int i = 1; i < lines.size(); ++i) {
	String line = (String)lines.get(i);
	int cp1 = line.charAt(0);
	if (cp1 != '@' && cp1 != '\t') {
	int cp = Integer.parseInt(line.split("\t")[0],16);
	collectedCodePoints.add(cp);
	}
	}
	collectedCodePoints.removeAll(skipChars);
	if (collectedCodePoints.size() == 0) {
	out.println("<p align='center'>No Names List</p>");
	} else {
	out.println("<div align='center'><table class='chart'><tr>");
	int counter = 0;
	for (UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); it.next();) {
	if ((counter % 16) == 0 && counter != 0) {
	out.println("</tr><tr>");
	}
	String tdclass = "cell";
	if (counter < 16) tdclass = "cellw";
	if (it.codepoint == 0x242) {
	System.out.println("debug");
	}
	boolean isNew = isNew(it.codepoint);
	if (isNew) tdclass += "new";
	String hexcp = Utility.hex(it.codepoint, 4);
	String title = "";
	String name = Default.ucd().getName(it.codepoint);
	if (name != null) title = " title='" + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase()) + "'";
	out.println("<td class='" + tdclass + "'"
	+ title
	+ ">\u00A0"
	+ showChar(it.codepoint) + "\u00A0<br><tt><a href='" + namePrefix + fileName + "#"+ hexcp + "'>" +
	hexcp + "</a></tt></td>");
	counter++;
	}
	if (counter > 16) {
	counter &= 0xF;
	if (counter != 0) for (; counter < 16; ++counter) out.println("<td class='cell'>\u00A0</td>");
	out.println("</tr></table></div>");
	}
	}
	out.close();
	out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", namePrefix + fileName);
	out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
	"<link rel='stylesheet' type='text/css' href='namelist.css'></head><body>");

	// now do the characters
	boolean inTable = false;
	for (int i = 1; i < lines.size(); ++i) {
	String line = (String)lines.get(i);
	try {
	if (line.startsWith("@")) {
	finishItem(out);
	if (inTable) {
	out.println("</table>");
	inTable = false;
	}
	if (line.startsWith("@+")) {
	line = line.substring(2).trim();
	out.println("<p class='comment'>"
	+ line
	+ "</p>");
	} else {
	line = line.substring(1).trim();
	out.println("<h2>"
	+ line
	+ "</h2>");
	}
	} else {
	if (!inTable) {
	out.println("<table>");
	inTable = true;
	}
	//String line2 = lineParts[1];
	if (line.startsWith("\t")) {
	String body = line.trim();
	if (false && line.indexOf(body) != 1) {
	System.out.println("Format error: too much inital whitespace: <" + line + ">");
	}
	char firstChar = body.charAt(0);
	switch (firstChar) {
	case '*': body = "\u2022 " + body.substring(2); break;
	case ':': body = checkCanonical(lastCodePoint, body); break;
	case '#': body = checkCompatibility(lastCodePoint, body); break;
	case 'x': body = getOther(body); break;
	case '=': break;
	default: throw new IllegalArgumentException("Huh? " + body);
	}
	out.println("<tr><td>\u00A0</td><td>\u00A0</td><td>"
	+ maybeNameStyle(showTextConvertingHex(body, firstChar != '='), firstChar == '=')
	+ "</td></tr>");
	} else {
	finishItem(out);
	lineParts = line.split("\t");
	String x = lineParts[0];
	lastCodePoint = Integer.parseInt(x,16);
	boolean lastCodePointIsNew = isNew(lastCodePoint);
	if (lastCodePointIsNew) nameListNew.set(nameList.size()-1, true);
	out.println("<tr><td"
	+ (lastCodePointIsNew ? " class='new'" : "")
	+ "><code><a name='" + x + "'>" + x + "</a></code></td><td>\u00A0"
	+ showChar(lastCodePoint) + "\u00A0</td><td"
	+ (lastCodePointIsNew ? " class='new'" : "") + ">"
	+ nameStyle(showTextConvertingHex(lineParts[1], false)) + "</td></tr>");
	lastDecompType = Default.ucd().getDecompositionType(lastCodePoint);
	}
	}
	} catch (Exception e) {
	throw (IllegalArgumentException) new IllegalArgumentException("Error on line: " + line)
	.initCause(e);
	}
	}
	finishItem(out);
	out.close();
	}
	blockInfo.in.close();
	PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", "mainList.html");
	out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
	"<title>Main List</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
	"<base target='chart'></head><body><table>");
	for (int i = 0; i < nameList.size(); ++i) {
	String line = (String) nameList.get(i);
	String[] lineParts = line.split("\t");
	String fileName = lineParts[1] + ".html";
	out.println("<tr><td><code>" + lineParts[1] +
	"</code></td><td"
	+ (nameListNew.get(i) ? " class='new'" : "")
	+ "><a href='" + chartPrefix + fileName + "'>" + getHeading(lineParts[2]) + "</a></td><td><code>" +
	lineParts[3] +"</code></td></tr>");
	}
	out.println("</table></body></html>");
	out.close();
	BagFormatter bf = new BagFormatter();
	//System.out.println(bf.showSetDifferences("Has name in decomps", hasName, "Has no name in decomps", hasNoName));
	System.out.println("Name differences: Canonical");
	showNameDifferences(hasNameCan, hasNoNameCan);
	System.out.println("Name differences: Compatibility");
	showNameDifferences(hasNameComp, hasNoNameComp);
	// System.out.println("Characters with names in decomps: " + hasName.toPattern(true));
	// System.out.println("Characters without names in decomps: " + hasNoName.toPattern(true));
	// System.out.println("Characters sometimes with, sometimes without names in decomps: " + both.toPattern(true));
	System.out.println("Done");
	}

	private static boolean isNew(int codepoint) {
	return Default.ucd().isAllocated(codepoint) && !ucd41.isAllocated(codepoint);
	}

	private static void showNameDifferences(Map hasName, Map hasNoName) {
	Set both = new TreeSet(hasNoName.keySet());
	both.retainAll(hasName.keySet());
	//hasNoName.removeAll(both);
	//hasName.removeAll(both);
	for (Iterator it = both.iterator(); it.hasNext();) {
	String decomp = (String) it.next();
	System.out.println();
	System.out.println("decomp: " + Utility.hex(decomp));
	System.out.println("Has name in: " + Utility.hex((String)hasName.get(decomp)));
	System.out.println("Has no name in: " + Utility.hex((String)hasNoName.get(decomp)));
	}
	System.out.println("Count: " + both.size());
	}

	static TestIdentifiers ti;
	static {
	try {
	ti = new TestIdentifiers("L");
	} catch (IOException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	}
	}

	private static void finishItem(PrintWriter out) {
	if (lastCodePoint < 0) return;
	if (lastDecompType != UCD.NONE) {
	System.out.println("Alert: missing decomp for " + Utility.hex(lastCodePoint));
	}
	String str = UTF16.valueOf(lastCodePoint);
	String upper = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.UPPER), "\u2191");
	showForm(out, str, upper, null, Default.ucd().getCase(str,UCD.FULL,UCD.TITLE), "\u2195");
	String lower = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.LOWER), "\u2193");
	showForm(out, lower, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.FOLD), "\u2194");

	String dc = Default.ucd().getDecompositionMapping(lastCodePoint);
	String nfd = showForm(out, dc, str, null, Default.nfd().normalize(lastCodePoint), "\u21DB");
	//String nfc = showForm(out, dc, null, Default.nfc().normalize(lastCodePoint), "\u21DB");
	String nfkd = showForm(out, dc, str, nfd, Default.nfkd().normalize(lastCodePoint), "\u21DD");

	if (nfkd.equals(str)) {
	Set s = ti.getConfusables(lastCodePoint, "MA");
	if (s.size() > 1) {
	sortedSet.clear();
	for (Iterator it = s.iterator(); it.hasNext();) {
	sortedSet.add(Default.nfkd().normalize((String)it.next()));
	}
	sortedSet.remove(nfkd); // remove me
	for (Iterator it = sortedSet.iterator(); it.hasNext();) {
	String other = (String)it.next();
	if (nfkd.equals(Default.nfkd().normalize(other))) continue;
	out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='conf'>\u279F\u00A0"
	+ showTextConvertingHex(Utility.hex(other, 4, " + "), true)
	+ " "
	+ Default.ucd().getName(other, UCD.NORMAL, " + ").toLowerCase()
	// maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=')
	+ "</td></tr>");
	}
	}
	}
	lastCodePoint = -1;
	}

	static Set sortedSet = new TreeSet(Collator.getInstance(ULocale.ENGLISH));

	/**
	 * Prints one table row for a transformed form of a character, unless that
	 * form is identical to any of the three comparison strings.
	 *
	 * <p>The row contains {@code symbol}, the hex code points of
	 * {@code transformed} (rendered through {@link #showTextConvertingHex}) and,
	 * when {@code transformed} is exactly one code point, its lower-cased name.
	 *
	 * @param out         destination for the generated HTML row
	 * @param str         first form to compare against; may be null
	 * @param str2        second form to compare against; may be null
	 * @param str3        third form to compare against; may be null
	 * @param transformed the form to display; must not be null
	 * @param symbol      marker placed at the start of the row's text cell
	 * @return {@code transformed}, unchanged, so callers can chain comparisons
	 */
	private static String showForm(PrintWriter out, String str, String str2, String str3, String transformed, String symbol) {
		if (!transformed.equals(str) && !transformed.equals(str2) && !transformed.equals(str3)) {
			out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='c'>" + symbol + "\u00A0"
				+ showTextConvertingHex(Utility.hex(transformed, 4, " + "), true)
				// Fixed locale: the generated HTML must not depend on the JVM default locale.
				+ (UTF16.countCodePoint(transformed) != 1 ? "" :
					" " + Default.ucd().getName(transformed, UCD.NORMAL, " + ").toLowerCase(Locale.ENGLISH))
				+ "</td></tr>");
		}
		return transformed;
	}

	/**
	 * Returns {@code name} with everything from its last {@code " ("} onward
	 * removed, or {@code name} itself when it contains no {@code " ("}.
	 *
	 * @param name heading text, possibly ending in a parenthesized suffix
	 * @return the text before the last {@code " ("}
	 */
	static public String getHeading(String name) {
		final int suffixStart = name.lastIndexOf(" (");
		return suffixStart >= 0 ? name.substring(0, suffixStart) : name;
	}

	/**
	 * Applies {@link #nameStyle} to {@code string} when {@code b} is set and the
	 * string is unchanged by English upper-casing; otherwise returns it as is.
	 *
	 * @param string text to style conditionally
	 * @param b      whether styling should be considered at all
	 * @return the styled or the original text
	 */
	private static String maybeNameStyle(String string, boolean b) {
		if (!b) {
			return string;
		}
		final boolean alreadyUpper = string.equals(string.toUpperCase(Locale.ENGLISH));
		return alreadyUpper ? nameStyle(string) : string;
	}


	/**
	 * Title-cases {@code string} using the UCD full title-case mapping and wraps
	 * the result in {@code <i>...</i>}.
	 *
	 * <p>Title-casing also capitalizes named character references, so every
	 * substring matched by {@code escapeMatch} is lower-cased again afterwards.
	 *
	 * @param string text to style
	 * @return the italicized, title-cased text
	 */
	private static String nameStyle(String string) {
		String result = "<i>" + Default.ucd().getCase(string, UCD.FULL, UCD.TITLE) + "</i>";
		// if it has any &xxx;, then restore them.
		int position = 0;
		while (escapeMatch.reset(result).find(position)) {
			int start = escapeMatch.start();
			position = escapeMatch.end();
			// Fixed locale: the default locale could map 'I' to a dotless i and
			// corrupt the entity name. The replacement has the same length, so
			// position stays valid for the next search.
			result = result.substring(0, start)
				+ result.substring(start, position).toLowerCase(Locale.ENGLISH)
				+ result.substring(position);
		}
		return result;
	}

	// Shared matcher used by nameStyle: matches '&', one upper-case ASCII letter,
	// any number of lower-case ASCII letters, then ';'. It is re-targeted with reset() on each use.
	static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher("");

	private static String showTextConvertingHex(String body, boolean addCharToHex) {
	body = TransliteratorUtilities.toHTML.transliterate(body);
	if (addCharToHex) {
	int position = 0;
	while (position < body.length()) {
	if (!findHex.reset(body).find(position)) break;
	position = findHex.end();
	int start = findHex.start();
	int len = position - start;
	if (len < 4 \|\| len > 6) continue;
	int cp = Integer.parseInt(findHex.group(),16);
	if (cp > 0x10FFFF) continue;
	String insert = "\u00A0" + showChar(cp);
	String beginning = body.substring(0,start)
	+ "<code>" + body.substring(start, position) + "</code>"
	+ insert;
	body = beginning + body.substring(position);
	position = beginning.length();
	}
	}
	return body;
	}

	// Shared matchers, re-targeted with reset() on each use.
	// pointer: cross reference of the form "x (name - HEX)", used by getOther.
	static Matcher pointer = Pattern.compile("x \\((.*) - ([0-9A-F]+)\\)").matcher("");
	// pointer2: cross reference of the form "x HEX" with 4 to 6 hex digits, used by getOther.
	static Matcher pointer2 = Pattern.compile("x ([0-9A-F]{4,6})").matcher("");
	// findHex: any run of upper-case hex digits, used by showTextConvertingHex.
	static Matcher findHex = Pattern.compile("[0-9A-F]+").matcher("");

	/**
	 * Converts a cross-reference line into its display form.
	 *
	 * <p>Two input shapes are accepted: {@code x (name - HEX)} and
	 * {@code x HEX}. For the first shape the given name is compared,
	 * ignoring case, with the name reported by the UCD and any difference is
	 * logged to standard output; the given name is kept in the result.
	 *
	 * @param body the cross-reference line
	 * @return a rightwards arrow (U+2192), a space, the code point in hex and,
	 *         for the first shape, a space and the name
	 * @throws IllegalArgumentException if {@code body} has neither shape
	 */
	private static String getOther(String body) {
		// of form: x (hyphenation point - 2027)
		// => arrow 2027 X hyphenation point
		int cp;
		String name = null;
		if (pointer.reset(body).matches()) {
			cp = Integer.parseInt(pointer.group(2),16);
			name = pointer.group(1);
			String name2 = Default.ucd().getName(cp);
			if (name2 == null) name2 = "<not a character>";
			if (!name.equalsIgnoreCase(name2)) {
				System.out.println("Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint));
				System.out.println("\tName is: " + name2);
			}
		} else if (pointer2.reset(body).matches()) {
			cp = Integer.parseInt(pointer2.group(1),16);
		} else {
			throw new IllegalArgumentException("Bad format: " + body);
		}
		return "\u2192 " + Utility.hex(cp,4) + (name != null ? " " + name : "");
	}

	static String showChar(int cp) {
	if (usePicture.contains(cp)) {
	int rep = '\u2588';
	if (cp <= 0x20) rep = 0x2400 + cp;
	else if (cp == 0x7F) rep = 0x2421;
	return "<span class='inv'>" + (char)rep + "</span>";
	//String hex = Utility.hex(cp);
	//return "<img alt='" + hex + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + hex + "'>";
	}

	int type = Default.ucd().getCategory(cp);
	if (type == UCD.Cn \|\| type == UCD.Co \|\| type == UCD.Cs) {
	return "\u2588";
	}
	String result = TransliteratorUtilities.toHTML.transliterate(UTF16.valueOf(cp));
	if (type == UCD.Me \|\| type == UCD.Mn) {
	result = "\u25CC" + result;
	} else if (rtl.contains(cp)) {
	result = "\u200E" + result + "\u200E";
	}
	return result;
	}

	//static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]");
	// Bookkeeping filled by checkCanonical (…Can) and checkCompatibility (…Comp).
	// Key: the decomposition mapping string; value: the decomposing character as a string.
	// "hasNoName" records lines that listed only the hex code points; "hasName"
	// records lines that listed the hex code points followed by the character name.
	static final Map hasNoNameCan = new TreeMap();
	static final Map hasNameCan = new TreeMap();
	static final Map hasNoNameComp = new TreeMap();
	static final Map hasNameComp = new TreeMap();

	/**
	 * Validates a canonical-decomposition line against the UCD data for
	 * {@code lastCodePoint} and returns its display form.
	 *
	 * <p>The first two characters of {@code body} are dropped. The remainder is
	 * expected to equal, ignoring case, either the hex form of the decomposition
	 * mapping or that hex form followed by the lower-cased character name (the
	 * latter only for single-code-point mappings). Matches are recorded in
	 * {@code hasNoNameCan} / {@code hasNameCan}; anything else is reported on
	 * standard output. {@code lastDecompType} is reset to {@code UCD.NONE}.
	 *
	 * @param codePoint code point used in diagnostics and as the recorded value
	 * @param body      the raw line, including its two-character prefix
	 * @return U+2261, a space, and the line without its prefix
	 */
	private static String checkCanonical(int codePoint, String body) {
		final String listed = body.substring(2);
		if (lastDecompType != UCD.CANONICAL) {
			System.out.println("Mismatching Decomposition Type: " + listed + " in " + Utility.hex(codePoint));
		}

		final String mapping = Default.ucd().getDecompositionMapping(lastCodePoint);
		final String hexOnly = Utility.hex(mapping, 4, " ");
		// The name variant only exists for a mapping of exactly one code point.
		final String hexWithName = UTF16.countCodePoint(mapping) == 1
			? hexOnly + " " + Default.ucd().getName(mapping).toLowerCase()
			: hexOnly;

		if (hexOnly.equalsIgnoreCase(listed)) {
			hasNoNameCan.put(mapping, UTF16.valueOf(codePoint));
		} else if (hexWithName.equalsIgnoreCase(listed)) {
			hasNameCan.put(mapping, UTF16.valueOf(codePoint));
		} else {
			System.out.println("Mismatching Decomposition: " + listed + " in " + Utility.hex(codePoint));
			System.out.println("\tShould be: " + hexOnly);
		}

		lastDecompType = UCD.NONE;
		return "\u2261 " + listed;
	}

	/**
	 * Validates a compatibility-decomposition line against the UCD data for
	 * {@code lastCodePoint} and returns its display form.
	 *
	 * <p>The first two characters of {@code body} are dropped. The expected
	 * text is the hex form of the decomposition mapping, prefixed with
	 * {@code <typeID> } unless the pending type is
	 * {@code UCD.COMPAT_UNSPECIFIED}, and optionally followed by the lower-cased
	 * character name for single-code-point mappings. Matches (ignoring case)
	 * are recorded in {@code hasNoNameComp} / {@code hasNameComp}; anything
	 * else is reported on standard output. {@code lastDecompType} is reset to
	 * {@code UCD.NONE}.
	 *
	 * @param codePoint code point used in diagnostics and as the recorded value
	 * @param body      the raw line, including its two-character prefix
	 * @return U+2248, a space, and the line without its prefix
	 */
	private static String checkCompatibility(int codePoint, String body) {
		final String listed = body.substring(2);
		if (lastDecompType <= UCD.CANONICAL) {
			System.out.println("Mismatching Decomposition Type: " + listed + " in " + Utility.hex(codePoint));
		}

		final String mapping = Default.ucd().getDecompositionMapping(lastCodePoint);
		String expected = Utility.hex(mapping, 4, " ");
		if (lastDecompType != UCD.COMPAT_UNSPECIFIED) {
			final String typeId = Default.ucd().getDecompositionTypeID(lastCodePoint);
			expected = "<" + typeId + "> " + expected;
		}
		// The name variant only exists for a mapping of exactly one code point.
		final String expectedWithName = UTF16.countCodePoint(mapping) == 1
			? expected + " " + Default.ucd().getName(mapping).toLowerCase()
			: expected;

		if (expected.equalsIgnoreCase(listed)) {
			hasNoNameComp.put(mapping, UTF16.valueOf(codePoint));
		} else if (expectedWithName.equalsIgnoreCase(listed)) {
			hasNameComp.put(mapping, UTF16.valueOf(codePoint));
		} else {
			System.out.println("Mismatching Decomposition: " + listed + " in " + Utility.hex(codePoint));
			System.out.println("\tShould be: " + expected);
		}

		lastDecompType = UCD.NONE;
		return "\u2248 " + listed;
	}

	/**
	 * Reads a Unicode data file in batches of lines, where a line starting
	 * with {@code "@@\t"} begins a new batch.
	 */
	static class BlockInfo {
		BufferedReader in;
		// Batch-start line read while finishing the previous batch; it becomes
		// the first line of the next batch.
		String lastLine;
		// Set once the end of the file has been reached and the reader closed.
		private boolean atEnd;

		/**
		 * Opens the named file through {@code Utility.openUnicodeFile}.
		 *
		 * @param version  Unicode version passed to the opener
		 * @param filename name of the file to read
		 * @throws IOException if the file cannot be opened
		 */
		BlockInfo (String version, String filename) throws IOException {
			in = Utility.openUnicodeFile(filename, version, true, Utility.LATIN1_WINDOWS);
		}

		/**
		 * Replaces the contents of {@code inout} with the next batch of lines.
		 *
		 * <p>The batch ends before the next line starting with {@code "@@\t"}
		 * or at the end of the file. The reader is closed when the end of the
		 * file is reached; later calls read nothing and return {@code false}.
		 *
		 * @param inout list that is cleared and then filled with the batch
		 * @return {@code true} if at least one line was added
		 * @throws IOException if reading or closing the file fails
		 */
		boolean next(List inout) throws IOException {
			inout.clear();
			if (lastLine != null) {
				inout.add(lastLine);
				lastLine = null;
			}
			while (!atEnd) {
				String line = in.readLine();
				if (line == null) {
					// Release the file handle as soon as the input is exhausted.
					atEnd = true;
					in.close();
					break;
				}
				if (line.startsWith("@@\t")) {
					lastLine = line;
					break;
				}
				inout.add(line);
			}
			return inout.size() > 0;
		}

	}
	}