// blob: 3895442b7c3f0c1a64f4b2a646b4d8535919b29d [file] [log] [blame]
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodePropertySource;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.text.utility.Utility;
import com.ibm.text.utility.Utility.Encoding;
public class MakeNamesChart {
// Code point of the names-list entry currently being written. main() sets it when it
// parses an entry line; finishItem() resets it to -1 once the entry's extra rows are out.
static int lastCodePoint = -1;
// Not referenced anywhere else in this class.
static boolean lastCodePointIsOld = false;
// Decomposition type of lastCodePoint as reported by the UCD. checkCanonical() and
// checkCompatibility() reset it to UCD.NONE once a matching annotation line was seen;
// finishItem() prints an alert if it is still set when the entry ends.
static int lastDecompType = UCD.NONE;
// File-name prefix of the per-block chart pages.
static final String chartPrefix = "c_";
// File-name prefix of the per-block names pages.
static final String namePrefix = "n_";
// Code points removed from a block's chart grid; assigned in main().
static UnicodeSet skipChars;// = new UnicodeSet("[[:gc=cn:]-[:noncharactercodepoint:]]");
// Right-to-left characters (bidi classes R and AL); assigned in main(), used by showChar().
static UnicodeSet rtl;// = new UnicodeSet("[[:bidiclass=r:][:bidiclass=al:]]");
// Whitespace and default-ignorable characters, which showChar() replaces by a
// substitute glyph; assigned in main().
static UnicodeSet usePicture;// = new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]");
// UCD for version 4.1.0; assigned in main() and used by isNew() as the "previous version".
static UCD ucd41;
/** Directory that receives every generated HTML file. */
private static final String OUTPUT_DIR = "C:/DATA/GEN/charts/namelist/";

/**
 * Reads NamesList.txt for Unicode 5.0.0 and writes, for every block in it, a chart
 * page ({@code c_XXXX.html}) and a names page ({@code n_XXXX.html}), followed by
 * the {@code mainList.html} index. Finally prints the decomposition-annotation
 * statistics collected by checkCanonical() and checkCompatibility().
 *
 * @param args ignored
 * @throws Exception if the names list cannot be read or an output file cannot be written
 */
public static void main(String[] args) throws Exception {
    //ConvertUCD.main(new String[]{"5.0.0"});
    BlockInfo blockInfo = new BlockInfo("5.0.0", "NamesList.txt");
    // http://www.unicode.org/~book/incoming/kenfiles/U50M051010.lst
    List nameList = new ArrayList();
    BitSet nameListNew = new BitSet();
    try {
        Default.setUCD("5.0.0");
        ucd41 = UCD.make("4.1.0");
        ToolUnicodePropertySource up = ToolUnicodePropertySource.make("5.0.0");
        // [[:gc=cn:]-[:noncharactercodepoint:]]
        // The original subtracted gc=cn from itself, which always produced an empty set.
        skipChars = new UnicodeSet(up.getSet("gc=cn")).removeAll(up.getSet("noncharactercodepoint=true"));
        // [[:bidiclass=r:][:bidiclass=al:]]
        rtl = new UnicodeSet(up.getSet("bidiclass=r")).addAll(up.getSet("bidiclass=al"));
        // [[:whitespace:][:defaultignorablecodepoint:]]
        usePicture = new UnicodeSet(up.getSet("whitespace=true")).addAll(up.getSet("defaultignorablecodepoint=true"));

        List lines = new ArrayList();
        while (blockInfo.next(lines)) {
            String firstLine = (String) lines.get(0);
            if (firstLine.startsWith("@@@")) continue; // file title lines, not a block
            // Block header: "@@" TAB start TAB block name TAB end
            String[] headerParts = firstLine.split("\t");
            String fileName = headerParts[1] + ".html";
            nameList.add(firstLine);
            System.out.println();
            System.out.println("file: " + chartPrefix + fileName);
            writeChartFile(fileName, headerParts, lines);
            writeNamesFile(fileName, lines, nameListNew, nameList.size() - 1);
        }
    } finally {
        blockInfo.in.close();
    }
    writeMainList(nameList, nameListNew);

    System.out.println("Name differences: Canonical");
    showNameDifferences(hasNameCan, hasNoNameCan);
    System.out.println("Name differences: Compatibility");
    showNameDifferences(hasNameComp, hasNoNameComp);
    System.out.println("Done");
}

/** Writes the chart page of one block: a 16-column grid of its code points. */
private static void writeChartFile(String fileName, String[] headerParts, List lines) throws IOException {
    PrintWriter out = BagFormatter.openUTF8Writer(OUTPUT_DIR, chartPrefix + fileName);
    try {
        // Escaped once and used for both the title and the visible header.
        String heading = TransliteratorUtilities.toHTML.transliterate(getHeading(headerParts[2]));
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" +
                heading +
                "</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
                "<base target='names'></head><body>");
        // header
        out.println("<table class='headerTable'><tr><td class='headerLeft'>" +
                headerParts[1] +
                " <a href='help.html'>help</a></td><td class='headerCenter'>" +
                heading +
                "</td><td class='headerRight'><a href='mainList.html'>index</a> " +
                headerParts[3] +
                "</td></tr></table>");

        // first pass through and collect all the code points
        UnicodeSet collectedCodePoints = new UnicodeSet();
        for (int i = 1; i < lines.size(); ++i) {
            String line = (String) lines.get(i);
            int first = line.charAt(0);
            if (first != '@' && first != '\t') {
                collectedCodePoints.add(Integer.parseInt(line.split("\t")[0], 16));
            }
        }
        collectedCodePoints.removeAll(skipChars);

        if (collectedCodePoints.size() == 0) {
            out.println("<p align='center'>No Names List</p>");
        } else {
            out.println("<div align='center'><table class='chart'><tr>");
            int counter = 0;
            for (UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); it.next();) {
                if ((counter % 16) == 0 && counter != 0) {
                    out.println("</tr><tr>");
                }
                // The first row gets its own style class.
                String tdclass = counter < 16 ? "cellw" : "cell";
                if (isNew(it.codepoint)) tdclass += "new";
                String hexcp = Utility.hex(it.codepoint, 4);
                String title = "";
                String name = Default.ucd().getName(it.codepoint);
                if (name != null) {
                    title = " title='"
                            + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase(Locale.ENGLISH))
                            + "'";
                }
                out.println("<td class='" + tdclass + "'"
                        + title
                        + ">\u00A0"
                        + showChar(it.codepoint) + "\u00A0<br><tt><a href='" + namePrefix + fileName + "#" + hexcp + "'>" +
                        hexcp + "</a></tt></td>");
                counter++;
            }
            if (counter > 16) {
                // Pad the last row of a multi-row grid to the full 16 columns.
                counter &= 0xF;
                if (counter != 0) {
                    for (; counter < 16; ++counter) out.println("<td class='cell'>\u00A0</td>");
                }
            }
            // Close the grid unconditionally; the original left it open for single-row grids.
            out.println("</tr></table></div>");
        }
        out.println("</body></html>");
    } finally {
        out.close();
    }
}

/**
 * Writes the names page of one block: one table row per entry plus rows for its
 * annotation lines, with headings and comments between the tables.
 * Sets bit {@code blockIndex} of {@code nameListNew} if the block contains a
 * code point that isNew().
 */
private static void writeNamesFile(String fileName, List lines, BitSet nameListNew, int blockIndex)
        throws IOException {
    PrintWriter out = BagFormatter.openUTF8Writer(OUTPUT_DIR, namePrefix + fileName);
    try {
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
                "<link rel='stylesheet' type='text/css' href='namelist.css'></head><body>");
        // now do the characters
        boolean inTable = false;
        for (int i = 1; i < lines.size(); ++i) {
            String line = (String) lines.get(i);
            try {
                if (line.startsWith("@")) {
                    // Heading or comment: flush the pending entry and leave the table.
                    finishItem(out);
                    if (inTable) {
                        out.println("</table>");
                        inTable = false;
                    }
                    if (line.startsWith("@+")) {
                        out.println("<p class='comment'>"
                                + TransliteratorUtilities.toHTML.transliterate(line.substring(2).trim())
                                + "</p>");
                    } else {
                        out.println("<h2>"
                                + TransliteratorUtilities.toHTML.transliterate(line.substring(1).trim())
                                + "</h2>");
                    }
                } else {
                    if (!inTable) {
                        out.println("<table>");
                        inTable = true;
                    }
                    if (line.startsWith("\t")) {
                        // Annotation line belonging to the current entry.
                        String body = line.trim();
                        char firstChar = body.charAt(0);
                        switch (firstChar) {
                        case '*': body = "\u2022 " + body.substring(2); break;
                        case ':': body = checkCanonical(lastCodePoint, body); break;
                        case '#': body = checkCompatibility(lastCodePoint, body); break;
                        case 'x': body = getOther(body); break;
                        case '=': break;
                        default: throw new IllegalArgumentException("Huh? " + body);
                        }
                        out.println("<tr><td>\u00A0</td><td>\u00A0</td><td>"
                                + maybeNameStyle(showTextConvertingHex(body, firstChar != '='), firstChar == '=')
                                + "</td></tr>");
                    } else {
                        // New entry: hex code point TAB name.
                        finishItem(out);
                        String[] entryParts = line.split("\t");
                        String hex = entryParts[0];
                        lastCodePoint = Integer.parseInt(hex, 16);
                        boolean lastCodePointIsNew = isNew(lastCodePoint);
                        if (lastCodePointIsNew) nameListNew.set(blockIndex, true);
                        String newClass = lastCodePointIsNew ? " class='new'" : "";
                        out.println("<tr><td"
                                + newClass
                                + "><code><a name='" + hex + "'>" + hex + "</a></code></td><td>\u00A0"
                                + showChar(lastCodePoint) + "\u00A0</td><td"
                                + newClass + ">"
                                + nameStyle(showTextConvertingHex(entryParts[1], false)) + "</td></tr>");
                        lastDecompType = Default.ucd().getDecompositionType(lastCodePoint);
                    }
                }
            } catch (Exception e) {
                throw (IllegalArgumentException) new IllegalArgumentException("Error on line: " + line)
                        .initCause(e);
            }
        }
        finishItem(out);
        if (inTable) out.println("</table>");
        out.println("</body></html>");
    } finally {
        out.close();
    }
}

/** Writes mainList.html, the index linking to every block's chart page. */
private static void writeMainList(List nameList, BitSet nameListNew) throws IOException {
    PrintWriter out = BagFormatter.openUTF8Writer(OUTPUT_DIR, "mainList.html");
    try {
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>" +
                "<title>Main List</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
                "<base target='chart'></head><body><table>");
        for (int i = 0; i < nameList.size(); ++i) {
            String[] headerParts = ((String) nameList.get(i)).split("\t");
            String fileName = headerParts[1] + ".html";
            out.println("<tr><td><code>" + headerParts[1] +
                    "</code></td><td"
                    + (nameListNew.get(i) ? " class='new'" : "")
                    + "><a href='" + chartPrefix + fileName + "'>"
                    + TransliteratorUtilities.toHTML.transliterate(getHeading(headerParts[2]))
                    + "</a></td><td><code>" +
                    headerParts[3] + "</code></td></tr>");
        }
        out.println("</table></body></html>");
    } finally {
        out.close();
    }
}
/**
 * Tells whether a code point is allocated in the current default UCD
 * but was not yet allocated in the 4.1.0 UCD held in ucd41.
 */
private static boolean isNew(int codepoint) {
    if (!Default.ucd().isAllocated(codepoint)) {
        return false;
    }
    return !ucd41.isAllocated(codepoint);
}
/**
 * Prints every key that occurs in both maps, together with the value each map
 * holds for it (all in hex), and finally the number of such keys.
 *
 * @param hasName   decomposition string to source character, for annotations with a name
 * @param hasNoName decomposition string to source character, for annotations without a name
 */
private static void showNameDifferences(Map hasName, Map hasNoName) {
    Set shared = new TreeSet(hasNoName.keySet());
    shared.retainAll(hasName.keySet());
    //hasNoName.removeAll(both);
    //hasName.removeAll(both);
    Iterator keys = shared.iterator();
    while (keys.hasNext()) {
        String decomp = (String) keys.next();
        System.out.println();
        System.out.println("decomp: " + Utility.hex(decomp));
        String withName = (String) hasName.get(decomp);
        System.out.println("Has name in: " + Utility.hex(withName));
        String withoutName = (String) hasNoName.get(decomp);
        System.out.println("Has no name in: " + Utility.hex(withoutName));
    }
    System.out.println("Count: " + shared.size());
}
/** Source of the confusable sets that finishItem() lists for each entry. */
static TestIdentifiers ti;
static {
    try {
        ti = new TestIdentifiers("L");
    } catch (IOException e) {
        // Fail fast with the real cause: with ti left null, finishItem() would
        // only fail later with a NullPointerException.
        throw (IllegalStateException) new IllegalStateException("Cannot load TestIdentifiers data")
                .initCause(e);
    }
}
/**
 * Writes the derived rows of the entry in lastCodePoint and then clears it.
 * Does nothing when no entry is pending (lastCodePoint &lt; 0).
 *
 * Rows are emitted through showForm() for the upper, title, lower and fold case
 * mappings and for the NFD and NFKD forms, each only when it adds something new.
 * If the character is its own NFKD form, its confusables from
 * {@code ti.getConfusables(cp, "MA")} are listed as well.
 *
 * @param out writer of the names page currently being produced
 */
private static void finishItem(PrintWriter out) {
    if (lastCodePoint < 0) return;
    if (lastDecompType != UCD.NONE) {
        // Neither checkCanonical() nor checkCompatibility() reset the type, so the
        // names list had no decomposition annotation for this entry.
        System.out.println("Alert: missing decomp for " + Utility.hex(lastCodePoint));
    }
    String str = UTF16.valueOf(lastCodePoint);
    String upper = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.UPPER), "\u2191");
    showForm(out, str, upper, null, Default.ucd().getCase(str,UCD.FULL,UCD.TITLE), "\u2195");
    String lower = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.LOWER), "\u2193");
    showForm(out, lower, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.FOLD), "\u2194");
    String dc = Default.ucd().getDecompositionMapping(lastCodePoint);
    String nfd = showForm(out, dc, str, null, Default.nfd().normalize(lastCodePoint), "\u21DB");
    //String nfc = showForm(out, dc, null, Default.nfc().normalize(lastCodePoint), "\u21DB");
    String nfkd = showForm(out, dc, str, nfd, Default.nfkd().normalize(lastCodePoint), "\u21DD");
    if (nfkd.equals(str)) {
        Set s = ti.getConfusables(lastCodePoint, "MA");
        if (s.size() > 1) {
            sortedSet.clear();
            for (Iterator it = s.iterator(); it.hasNext();) {
                sortedSet.add(Default.nfkd().normalize((String)it.next()));
            }
            sortedSet.remove(nfkd); // remove me
            for (Iterator it = sortedSet.iterator(); it.hasNext();) {
                String other = (String)it.next();
                if (nfkd.equals(Default.nfkd().normalize(other))) continue;
                // Locale.ENGLISH keeps the lower-casing independent of the JVM default
                // locale; the name is escaped like the chart tooltip is.
                String otherName = TransliteratorUtilities.toHTML.transliterate(
                        Default.ucd().getName(other, UCD.NORMAL, " + ").toLowerCase(Locale.ENGLISH));
                out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='conf'>\u279F\u00A0"
                        + showTextConvertingHex(Utility.hex(other, 4, " + "), true)
                        + " "
                        + otherName
                        + "</td></tr>");
            }
        }
    }
    lastCodePoint = -1;
}
/**
 * Scratch set reused by finishItem() to order confusables with an English collator.
 * NOTE(review): as a TreeSet over a Collator it presumably merges strings the
 * collator considers equal - confirm that this is intended.
 */
static Set sortedSet = new TreeSet(Collator.getInstance(ULocale.ENGLISH));
/**
 * Writes one table row for a derived form of the current entry, unless that form
 * equals any of the three reference strings (a null reference never matches).
 * The row shows the symbol, the form as hex with the characters added, and - only
 * when the form is a single code point - its lower-cased name.
 *
 * @param out         writer of the names page
 * @param str         first string the form is compared against
 * @param str2        second string to compare against, may be null
 * @param str3        third string to compare against, may be null
 * @param transformed the derived form
 * @param symbol      marker placed in front of the form
 * @return {@code transformed}, whether or not a row was written
 */
private static String showForm(PrintWriter out, String str, String str2, String str3, String transformed, String symbol) {
    boolean alreadyShown = transformed.equals(str) || transformed.equals(str2) || transformed.equals(str3);
    if (!alreadyShown) {
        String nameSuffix = "";
        if (UTF16.countCodePoint(transformed) == 1) {
            // Locale.ENGLISH keeps the lower-casing independent of the JVM default
            // locale; the name is escaped like the chart tooltip is.
            nameSuffix = " " + TransliteratorUtilities.toHTML.transliterate(
                    Default.ucd().getName(transformed, UCD.NORMAL, " + ").toLowerCase(Locale.ENGLISH));
        }
        out.println("<tr><td>\u00A0</td><td>\u00A0</td><td class='c'>" + symbol + "\u00A0"
                + showTextConvertingHex(Utility.hex(transformed, 4, " + "), true)
                + nameSuffix
                + "</td></tr>");
    }
    return transformed;
}
/**
 * Strips the last parenthesised suffix from a block name: everything from the
 * last occurrence of " (" onwards is dropped. Names without " (" are returned as is.
 */
static public String getHeading(String name) {
    final int suffixStart = name.lastIndexOf(" (");
    return suffixStart >= 0 ? name.substring(0, suffixStart) : name;
}
/**
 * Applies nameStyle() to the text when {@code b} is set and the text contains no
 * lower-case letters (it equals its own English upper-casing); otherwise returns
 * the text unchanged.
 */
private static String maybeNameStyle(String string, boolean b) {
    if (!b) {
        return string;
    }
    boolean allUpper = string.equals(string.toUpperCase(Locale.ENGLISH));
    return allUpper ? nameStyle(string) : string;
}
/**
 * Formats a character name for display: title-cases it and wraps it in
 * {@code <i>...</i>}. The input is already HTML-escaped, so title-casing turns an
 * entity such as {@code &amp;} into {@code &Amp;}; those are lower-cased again.
 *
 * @param string HTML-escaped name
 * @return the styled HTML fragment
 */
private static String nameStyle(String string) {
    String result = "<i>" + Default.ucd().getCase(string, UCD.FULL, UCD.TITLE) + "</i>";
    // if it has any &xxx;, then restore them.
    int position = 0;
    while (escapeMatch.reset(result).find(position)) {
        int start = escapeMatch.start();
        position = escapeMatch.end();
        // Locale.ENGLISH: entity names are ASCII and must not be lower-cased with
        // the rules of the JVM default locale.
        result = result.substring(0, start)
                + result.substring(start, position).toLowerCase(Locale.ENGLISH)
                + result.substring(position);
    }
    return result;
}
/** Matches a named entity whose first letter was title-cased; reset before each use. */
static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher("");
/**
 * HTML-escapes the text and, if requested, decorates every code point written in
 * hex: a run of 4 to 6 upper-case hex digits whose value is at most 10FFFF is
 * wrapped in {@code <code>} and followed by a no-break space and the character
 * itself as rendered by showChar().
 *
 * @param body         raw text
 * @param addCharToHex whether hex code points get the character appended
 * @return the HTML fragment
 */
private static String showTextConvertingHex(String body, boolean addCharToHex) {
    final String escaped = TransliteratorUtilities.toHTML.transliterate(body);
    if (!addCharToHex) {
        return escaped;
    }
    StringBuffer html = new StringBuffer();
    int copiedUpTo = 0; // everything before this index is already in html
    findHex.reset(escaped);
    while (findHex.find()) {
        final int start = findHex.start();
        final int end = findHex.end();
        final int digits = end - start;
        if (digits < 4 || digits > 6) {
            continue;
        }
        final int cp = Integer.parseInt(findHex.group(), 16);
        if (cp > 0x10FFFF) {
            continue;
        }
        final String glyph = showChar(cp);
        html.append(escaped.substring(copiedUpTo, start));
        html.append("<code>").append(escaped.substring(start, end)).append("</code>");
        html.append("\u00A0").append(glyph);
        copiedUpTo = end;
    }
    html.append(escaped.substring(copiedUpTo));
    return html.toString();
}
// Shared matchers; each user calls reset(...) with its own input before matching.
// Cross-reference of the form "x (name - HEX)": group 1 is the name, group 2 the hex code point.
static Matcher pointer = Pattern.compile("x \\((.*) - ([0-9A-F]+)\\)").matcher("");
// Short cross-reference of the form "x HEX" with 4 to 6 hex digits (group 1).
static Matcher pointer2 = Pattern.compile("x ([0-9A-F]{4,6})").matcher("");
// Any run of upper-case hex digits; showTextConvertingHex() filters the runs by length and value.
static Matcher findHex = Pattern.compile("[0-9A-F]+").matcher("");
/**
 * Rewrites a cross-reference annotation.
 * "x (hyphenation point - 2027)" becomes arrow, "2027", "hyphenation point";
 * the short form "x 2027" becomes arrow, "2027".
 * For the long form, a console message is printed when the quoted name differs
 * (ignoring case) from the name the UCD has for that code point.
 *
 * @param body trimmed annotation line starting with "x"
 * @return the rewritten annotation, not yet HTML-escaped
 * @throws IllegalArgumentException if the line matches neither form
 */
private static String getOther(String body) {
    final boolean longForm = pointer.reset(body).matches();
    if (!longForm && !pointer2.reset(body).matches()) {
        throw new IllegalArgumentException("Bad format: " + body);
    }
    final int cp;
    String name = null;
    if (longForm) {
        cp = Integer.parseInt(pointer.group(2), 16);
        name = pointer.group(1);
        String official = Default.ucd().getName(cp);
        if (official == null) {
            official = "<not a character>";
        }
        if (!name.equalsIgnoreCase(official)) {
            System.out.println("Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint));
            System.out.println("\tName is: " + official);
        }
    } else {
        cp = Integer.parseInt(pointer2.group(1), 16);
        // name = UCharacter.getName(cp).toLowerCase();
        // System.out.println("Irregular format: " + body);
    }
    final String reference = "\u2192 " + Utility.hex(cp, 4) /*+ " " + showChar(cp)*/;
    if (name == null) {
        return reference + "";
    }
    return reference + " " + name;
}
/**
 * Returns the HTML used to display a code point.
 * Characters in usePicture get a substitute glyph inside {@code <span class='inv'>}:
 * a control picture (U+2400 + cp) up to U+0020, U+2421 for U+007F, a full block
 * otherwise. Unassigned, private-use and surrogate code points become a full block.
 * Everything else is the HTML-escaped character, prefixed with a dotted circle for
 * Me/Mn marks or wrapped in U+200E marks for right-to-left characters.
 */
static String showChar(int cp) {
    if (usePicture.contains(cp)) {
        int rep;
        if (cp <= 0x20) {
            rep = 0x2400 + cp;
        } else if (cp == 0x7F) {
            rep = 0x2421;
        } else {
            rep = '\u2588';
        }
        return "<span class='inv'>" + (char)rep + "</span>";
        //String hex = Utility.hex(cp);
        //return "<img alt='" + hex + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + hex + "'>";
    }
    final int type = Default.ucd().getCategory(cp);
    final boolean noGlyph = type == UCD.Cn || type == UCD.Co || type == UCD.Cs;
    if (noGlyph) {
        return "\u2588";
    }
    final String escaped = TransliteratorUtilities.toHTML.transliterate(UTF16.valueOf(cp));
    if (type == UCD.Me || type == UCD.Mn) {
        return "\u25CC" + escaped;
    }
    if (rtl.contains(cp)) {
        return "\u200E" + escaped + "\u200E";
    }
    return escaped;
}
//static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]");
// Bookkeeping filled by checkCanonical() / checkCompatibility() and reported by
// showNameDifferences(). Key: the decomposition mapping; value: the character whose
// annotation was checked. "hasNoName" = the annotation was the bare hex form,
// "hasName" = the annotation was the hex form followed by the character name.
static final Map hasNoNameCan = new TreeMap();
static final Map hasNameCan = new TreeMap();
static final Map hasNoNameComp = new TreeMap();
static final Map hasNameComp = new TreeMap();
/**
 * Checks a canonical-decomposition annotation (a ":" line) against the UCD and
 * converts it for display.
 * Console messages are printed when the entry's decomposition type is not
 * canonical, or when the annotation matches neither the hex form of the UCD
 * mapping nor that form followed by the lower-cased name (the latter only exists
 * for single-code-point mappings). Matches are recorded in hasNoNameCan /
 * hasNameCan. Resets lastDecompType to UCD.NONE to mark the decomposition as seen.
 *
 * @param codePoint the entry the annotation belongs to
 * @param body      the trimmed annotation line, including its two-character prefix
 * @return the annotation text prefixed with U+2261 and a space
 */
private static String checkCanonical(int codePoint, String body) {
    body = body.substring(2);
    if (lastDecompType != UCD.CANONICAL) {
        System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint));
    }
    // Look up the same code point that the messages and maps below report on.
    String decomp = Default.ucd().getDecompositionMapping(codePoint);
    String hexed = Utility.hex(decomp, 4, " ");
    String hexedWithName = hexed;
    if (UTF16.countCodePoint(decomp) == 1) {
        // Locale.ENGLISH keeps the comparison text independent of the JVM default locale.
        hexedWithName += " " + Default.ucd().getName(decomp).toLowerCase(Locale.ENGLISH);
    }
    if (hexed.equalsIgnoreCase(body)) {
        hasNoNameCan.put(decomp, UTF16.valueOf(codePoint));
    } else if (hexedWithName.equalsIgnoreCase(body)) {
        hasNameCan.put(decomp, UTF16.valueOf(codePoint));
    } else {
        System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint));
        System.out.println("\tShould be: " + hexed);
    }
    lastDecompType = UCD.NONE;
    return "\u2261 " + body;
}
/**
 * Checks a compatibility-decomposition annotation (a "#" line) against the UCD
 * and converts it for display.
 * The expected text is the hex form of the UCD mapping, preceded by the
 * decomposition type ID in angle brackets unless the type is
 * UCD.COMPAT_UNSPECIFIED, and optionally followed by the lower-cased name for
 * single-code-point mappings. Console messages are printed on a type or text
 * mismatch; matches are recorded in hasNoNameComp / hasNameComp. Resets
 * lastDecompType to UCD.NONE to mark the decomposition as seen.
 *
 * @param codePoint the entry the annotation belongs to
 * @param body      the trimmed annotation line, including its two-character prefix
 * @return the annotation text prefixed with U+2248 and a space
 */
private static String checkCompatibility(int codePoint, String body) {
    body = body.substring(2);
    // NOTE(review): relies on the numeric order of the UCD type constants
    // (presumably NONE and CANONICAL sort below all compatibility types) - confirm.
    if (lastDecompType <= UCD.CANONICAL) {
        System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint));
    }
    // Look up the same code point that the messages and maps below report on.
    String decomp = Default.ucd().getDecompositionMapping(codePoint);
    String hexed = Utility.hex(decomp, 4, " ");
    if (lastDecompType != UCD.COMPAT_UNSPECIFIED) {
        String decompTypeId = Default.ucd().getDecompositionTypeID(codePoint);
        hexed = "<" + decompTypeId + "> " + hexed;
    }
    String hexedWithName = hexed;
    if (UTF16.countCodePoint(decomp) == 1) {
        // Locale.ENGLISH keeps the comparison text independent of the JVM default locale.
        hexedWithName += " " + Default.ucd().getName(decomp).toLowerCase(Locale.ENGLISH);
    }
    if (hexed.equalsIgnoreCase(body)) {
        hasNoNameComp.put(decomp, UTF16.valueOf(codePoint));
    } else if (hexedWithName.equalsIgnoreCase(body)) {
        hasNameComp.put(decomp, UTF16.valueOf(codePoint));
    } else {
        System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint));
        System.out.println("\tShould be: " + hexed);
    }
    lastDecompType = UCD.NONE;
    return "\u2248 " + body;
}
/**
 * Splits the names list into blocks. A block starts at a line beginning with
 * "@@" followed by a tab and runs up to, but not including, the next such line.
 * The caller is responsible for closing {@link #in}.
 */
static class BlockInfo {
    /** Reader over the names list file. */
    BufferedReader in;
    /** Block header already read while finishing the previous block, or null. */
    String lastLine;

    /**
     * Opens the given Unicode data file for the given version.
     *
     * @throws IOException if the file cannot be opened
     */
    BlockInfo (String version, String filename) throws IOException {
        in = Utility.openUnicodeFile(filename, version, true, Utility.LATIN1_WINDOWS);
        //in = BagFormatter.openUTF8Reader(dir, filename);
    }

    /**
     * Reads the next block into {@code inout}, replacing its previous contents.
     *
     * @param inout receives the lines of the block, header line first if there is one
     * @return false when the end of the file was reached and nothing was read
     * @throws IOException if reading fails
     */
    boolean next(List inout) throws IOException {
        inout.clear();
        if (lastLine != null) {
            inout.add(lastLine);
            lastLine = null;
        }
        while (true) {
            String line = in.readLine();
            if (line == null) break;
            if (line.startsWith("@@\t")) {
                if (inout.isEmpty()) {
                    // The header is the very first line read: it opens this block.
                    // Returning an empty block here would be taken as end of file.
                    inout.add(line);
                    continue;
                }
                lastLine = line;
                break;
            }
            inout.add(line);
        }
        return !inout.isEmpty();
    }
}
}