unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java - external/github.com/unicode-org/icu - Git at Google

 /**
 *******************************************************************************
 * Copyright (C) 1996-2001, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
 * $Date: 2006/04/05 22:12:45 $
 * $Revision: 1.18 $
 *
 *******************************************************************************
 */

 package com.ibm.text.UCD;

 import java.util.*;
 import java.io.*;

 import com.ibm.icu.text.UTF16;

 import com.ibm.text.utility.*;

 public class GenerateCaseFolding implements UCD_Types {
     // When true, getClosure and add trace their progress to System.err.
     public static boolean DEBUG = false;
     public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
     // When true, goodness() scores shorter strings higher (32 - length instead of length) and
     // makeCaseFold emits a single-code-point full folding as "C"; set by makeCaseFold.
     public static boolean PICK_SHORT = false;
     // Passed to getCaseFolding as nfClose: when true, getClosure also adds the NFD, NFC, NFKD
     // and NFKC forms of every member to its equivalence class; set by makeCaseFold.
     public static boolean NF_CLOSURE = false;
     // generateSpecialCasing prints a trace of its intermediate values for this code point.
     static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1

     // PICK_SHORT & NF_CLOSURE = false for old style


     /*public static void main(String[] args) throws java.io.IOException {
         makeCaseFold(arg[0]);
         //getAge();
     }
     */

     // Diagnostics writer: opened and closed by makeCaseFold, written to by getCaseFolding.
     // generateSpecialCasing declares its own local variable of the same name.
     static PrintWriter log;


     /**
      * Generates the case-folding data file in DerivedData/ ("CaseFolding", or
      * "CaseFolding-Normalized" when {@code normalized} is true) together with a
      * "CaseFoldingLog" diagnostics file.
      *
      * <p>Four folding tables are built with getCaseFolding: full and simple, each
      * for the default ("") and the Turkish ("tr") condition. Every code point
      * recorded in {@code charsUsed} is then written with one line per status:
      * <ul>
      *   <li>C - full and simple foldings agree, or PICK_SHORT is on and the full
      *       folding is a single code point;</li>
      *   <li>F / S - full / simple folding when they do not qualify for C;</li>
      *   <li>T / t - Turkish full / simple folding when it differs from the default.</li>
      * </ul>
      * U+0049, U+0130 and U+0131 are hard-coded inside the C branch (see the goal
      * comment following this method).
      *
      * @param normalized value assigned to both PICK_SHORT and NF_CLOSURE; also
      *                   selects the "-Normalized" file name
      * @throws java.io.IOException propagated from the file helpers used here
      */
     public static void makeCaseFold(boolean normalized) throws java.io.IOException {
         PICK_SHORT = NF_CLOSURE = normalized;

         log = Utility.openPrintWriter("CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
         try {
             System.out.println("Writing Log: " + "CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true));

             System.out.println("Making Full Data");
             Map fullData = getCaseFolding(true, NF_CLOSURE, "");
             Utility.fixDot();

             System.out.println("Making Simple Data");
             Map simpleData = getCaseFolding(false, NF_CLOSURE, "");

             System.out.println("Making Turkish Full Data");
             Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr");
             Utility.fixDot();

             // Was mislabelled "Making Simple Data" (copy-paste of the default pass).
             System.out.println("Making Turkish Simple Data");
             Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr");

             // write the data
             Utility.fixDot();
             System.out.println("Writing");
             String filename = "CaseFolding";
             if (normalized) filename += "-Normalized";
             String directory = "DerivedData/";
             UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, filename);
             PrintWriter out = fc.out;

             for (int ch = 0; ch <= 0x10FFFF; ++ch) {
                 Utility.dot(ch);

                 if (!charsUsed.get(ch)) continue;

                 // All four tables are keyed by the same string form of the code point.
                 String key = UTF32.valueOf32(ch);
                 String rFull = (String) fullData.get(key);
                 String rSimple = (String) simpleData.get(key);
                 String rFullTurkish = (String) fullDataTurkish.get(key);
                 String rSimpleTurkish = (String) simpleDataTurkish.get(key);
                 if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue;

                 // Explicit null guard: the C branch always needs a full folding, so do not
                 // depend on how countCodePoint treats a null argument.
                 boolean common = rFull != null
                     && (rFull.equals(rSimple) || (PICK_SHORT && UTF16.countCodePoint(rFull) == 1));

                 if (common) {
                     if (ch == 0x49) {
                         drawLine(out, ch, "C", "i");
                         drawLine(out, ch, "T", "\u0131");
                     } else if (ch == 0x130) {
                         drawLine(out, ch, "F", "i\u0307");
                         drawLine(out, ch, "T", "i");
                     } else if (ch != 0x131) {
                         // U+0131 gets no entry in this branch.
                         drawLine(out, ch, "C", rFull);
                     }
                 } else {
                     if (rFull != null) {
                         drawLine(out, ch, "F", rFull);
                     }
                     if (rSimple != null) {
                         drawLine(out, ch, "S", rSimple);
                     }
                 }
                 if (rFullTurkish != null && !rFullTurkish.equals(rFull)) {
                     drawLine(out, ch, "T", rFullTurkish);
                 }
                 if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) {
                     drawLine(out, ch, "t", rSimpleTurkish);
                 }
             }
             fc.close();
         } finally {
             // Always flush and release the log, even when generation fails part-way.
             log.close();
         }
     }

 /* Goal is following (with no entries for 0131 or 0069)

 0049; C; 0069; # LATIN CAPITAL LETTER I
 0049; T; 0131; # LATIN CAPITAL LETTER I

 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 */

     /**
      * Writes one data line of the form
      * {@code <hex code>; <type>; <hex mapping>; # [comment]<character name>}.
      *
      * <p>When COMMENT_DIFFS is on and the full lowercase of {@code ch} differs from
      * {@code result}, a "[Diff ...]" note showing that lowercase is put in front of
      * the character name.
      *
      * @param out    destination writer
      * @param ch     source code point
      * @param type   status field written verbatim (the callers pass C, F, S, T or t)
      * @param result mapping string, written as space-separated hex code points
      */
     static void drawLine(PrintWriter out, int ch, String type, String result) {
         String comment = "";
         if (COMMENT_DIFFS) {
             String source = UTF16.valueOf(ch);
             String lower = Default.ucd().getCase(source, FULL, LOWER);
             if (!lower.equals(result)) {
                 // NOTE(review): lower2 is computed exactly like lower, so the
                 // "PROBLEM WITH" branch can only fire if getCase is not repeatable.
                 // A different comparison may have been intended - confirm.
                 String lower2 = Default.ucd().getCase(source, FULL, LOWER);
                 if (lower.equals(lower2)) {
                     comment = "[Diff " + Utility.hex(lower, " ") + "] ";
                 } else {
                     Utility.fixDot();
                     System.out.println("PROBLEM WITH: " + Default.ucd().getCodeAndName(ch));
                     comment = "[DIFF " + Utility.hex(lower, " ") + ", " + Utility.hex(lower2, " ") + "] ";
                 }
             }
         }

         out.println(Utility.hex(ch)
             + "; " + type
             + "; " + Utility.hex(result, " ")
             + "; # " + comment + Default.ucd().getName(ch));
     }

     // Debugging probe: getCaseFolding turns on `show` (verbose console output) for any
     // equivalence class that contains the string form of this code point.
     static int probeCh = 0x01f0;
     static String shower = UTF16.valueOf(probeCh);

     /**
      * Builds a case-folding table.
      *
      * <p>Step 1 calls getClosure for every represented code point, filling
      * {@code data} with string -> equivalence-class (Set) entries. Step 2 walks the
      * keys of {@code data}, picks the member with the highest goodness() score as the
      * class representative, and maps every other single-code-point member to it.
      *
      * <p>Side effects: writes diagnostics to {@code log}, updates the static
      * {@code show} flag, and marks each mapped code point in {@code charsUsed}.
      * Several keys can refer to the same Set (see add), so a class may be visited
      * more than once; the resulting map entries are simply rewritten.
      *
      * @param full      true for full case mappings, false for simple ones
      * @param nfClose   forwarded to getClosure (adds normalization forms to the classes)
      * @param condition casing condition forwarded to the case functions; the caller
      *                  in this file passes "" or "tr"
      * @return map from single-code-point strings to their representative string
      * @throws java.io.IOException declared for callers; nothing in the visible body throws it directly
      */
     static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException {
         Map data = new TreeMap();
         Map repChar = new TreeMap();

         // get the equivalence classes

         for (int ch = 0; ch <= 0x10FFFF; ++ch) {
             Utility.dot(ch);
             if (!Default.ucd().isRepresented(ch)) continue;
             getClosure(ch, data, full, nfClose, condition);
         }

         // get the representative characters

         Iterator it = data.keySet().iterator();
         while (it.hasNext()) {
             String s = (String) it.next();
             Set set = (Set) data.get(s);
             show = set.contains(shower);
             if (show) {
                 Utility.fixDot();
                 System.out.println(toString(set));
             }

             // Pick the best available representative: the first member (in set order)
             // with the strictly highest score wins.

             String rep = null;
             int repGood = 0;
             Iterator it2 = set.iterator();
             while (it2.hasNext()) {
                 String s2 = (String) it2.next();
                 int s2Good = goodness(s2, full, condition);
                 if (s2Good > repGood) {
                     rep = s2;
                     repGood = s2Good;
                 }
             }
             if (rep == null) {
                 Utility.fixDot();
                 System.err.println("No representative for: " + toString(set));
                 // Nothing to map to: skip the class instead of logging and storing
                 // a null representative.
                 continue;
             }
             if ((repGood & (NFC_FORMAT | ISLOWER)) != (NFC_FORMAT | ISLOWER)) {
                 String message = "";
                 if ((repGood & NFC_FORMAT) == 0) {
                     message += " [NOT NFC FORMAT]";
                 }
                 if ((repGood & ISLOWER) == 0) {
                     message += " [NOT LOWERCASE]";
                 }
                 Utility.fixDot();
                 log.println("Non-Optimal Representative " + message);
                 log.println(" Rep:\t" + Default.ucd().getCodeAndName(rep));
                 log.println(" Set:\t" + toString(set,true, true));
             }

             log.println();
             log.println();
             log.println(rep + "\t#" + Default.ucd().getName(rep));

             // Add it for all the elements of the set

             it2 = set.iterator();
             while (it2.hasNext()) {
                 String s2 = (String) it2.next();
                 if (s2.equals(rep)) continue;

                 log.println(s2 + "\t#" + Default.ucd().getName(s2));

                 // Only single code points can be keys of the folding table.
                 if (UTF16.countCodePoint(s2) == 1) {
                     repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
                     charsUsed.set(UTF16.charAt(s2, 0));
                 }
             }
         }
         return repChar;
     }

     // Code points that received a mapping in any getCaseFolding call; read by makeCaseFold
     // to decide which code points to write. Not cleared anywhere in the visible code.
     static BitSet charsUsed = new BitSet();
     // Verbose flag set per equivalence class by getCaseFolding and read by goodness.
     static boolean show = false;
     // Flag bits ORed into the score returned by goodness.
     static final int NFC_FORMAT = 64;
     static final int ISLOWER = 128;

     /**
      * Scores a candidate representative; getCaseFolding keeps the highest score.
      *
      * <p>The base score is the string length (or 32 minus the length when
      * PICK_SHORT is on), shifted left by 8 for simple mappings. ISLOWER is ORed in
      * when the string equals lower(upper(s)), or - with PICK_SHORT - when the two
      * are equal after NFD normalization. NFC_FORMAT is ORed in when the string is
      * unchanged by NFC normalization.
      *
      * @param s         candidate string; null scores 0
      * @param full      true for full case mappings, false for simple ones
      * @param condition casing condition forwarded to upper/lower
      * @return the combined score
      */
     static int goodness(String s, boolean full, String condition) {
         if (s == null) {
             return 0;
         }

         final int length = s.length();
         int score = PICK_SHORT ? 32 - length : length;
         if (!full) {
             score <<= 8;
         }

         String roundTrip = lower(upper(s, full, condition), full, condition);
         boolean lowercase = s.equals(roundTrip)
             || (PICK_SHORT && Default.nfd().normalize(s).equals(Default.nfd().normalize(roundTrip)));
         if (lowercase) {
             score |= ISLOWER;
         }

         boolean alreadyNfc = s.equals(Default.nfc().normalize(s));
         if (alreadyNfc) {
             score |= NFC_FORMAT;
         }

         if (show) {
             Utility.fixDot();
             System.out.println(Utility.hex(score) + ", " + Default.ucd().getCodeAndName(s));
         }
         return score;
     }


     /*
     static HashSet temp = new HashSet();
     static void normalize(HashSet set) {
         temp.clear();
         temp.addAll(set);
         set.clear();
         Iterator it = temp.iterator();
         while (it.hasNext()) {
             String s = (String) it.next();
             String s2 = KC.normalize(s);
             set.add(s);
             data2.put(s,set);
             if (!s.equals(s2)) {
                 set.add(s2);
                 data2.put(s2,set);
                 System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2));
             }
         }
     }
     */

             /*
             String
             String lower1 = Default.ucd.getLowercase(ch);
             String lower2 = Default.ucd.toLowercase(ch,option);

             char ch2 = Default.ucd.getLowercase(Default.ucd.getUppercase(ch).charAt(0)).charAt(0);
             //String lower1 = String.valueOf(Default.ucd.getLowercase(ch));
             //String lower = Default.ucd.toLowercase(ch2,option);
             String upper = Default.ucd.toUppercase(ch2,option);
             String lowerUpper = Default.ucd.toLowercase(upper,option);
             //String title = Default.ucd.toTitlecase(ch2,option);
             //String lowerTitle = Default.ucd.toLowercase(upper,option);

             if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
                 output.println(Utility.hex(ch)
                     + "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
                     + "; " + Utility.hex(lowerUpper," ")
                     + ";\t#" + Default.ucd.getName(ch)
                     );
                 //if (!lowerUpper.equals(lower)) {
                 //    output.println("Warning1: " + Utility.hex(lower) + " " + Default.ucd.getName(lower));
                 //}
                 //if (!lowerUpper.equals(lowerTitle)) {
                 //    output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Default.ucd.getName(lowerTitle));
                 //}
             }
             */

     /**
      * Builds the case equivalence class of one code point and records it in {@code data}.
      *
      * <p>Code points whose lower, upper and title forms all equal the code point
      * itself are ignored. Otherwise a new sorted set is seeded with the code point
      * and its three case forms, then repeatedly extended with the case forms (and,
      * when {@code nfClose} is true, the NFD/NFC/NFKD/NFKC forms) of its members
      * until a complete pass adds nothing.
      *
      * @param ch        code point to process
      * @param data      string -> Set map shared across calls; updated here and by add
      * @param full      true for full case mappings, false for simple ones
      * @param nfClose   whether normalization forms take part in the closure
      * @param condition casing condition forwarded to the case functions
      */
     static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) {
         String source = UTF32.valueOf32(ch);
         String lowered = lower(source, full, condition);
         String titled = title(source, full, condition);
         String uppered = upper(source, full, condition);

         boolean caseless = source.equals(lowered) && source.equals(uppered) && source.equals(titled);
         if (caseless) {
             return;
         }
         if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));

         // Start a fresh class containing just the source character.
         Set set = new TreeSet();
         set.add(source);
         data.put(source, set);

         // Seed it with the direct case forms.
         add(set, lowered, data);
         add(set, uppered, data);
         add(set, titled, data);

         // Close it. The set cannot be modified while it is being iterated, so every
         // successful add abandons the current iterator and restarts the scan.
         boolean grew;
         do {
             grew = false;
             for (Iterator members = set.iterator(); !grew && members.hasNext();) {
                 String member = (String) members.next();
                 grew = (nfClose
                         && (add(set, Default.nfd().normalize(member), data)
                             || add(set, Default.nfc().normalize(member), data)
                             || add(set, Default.nfkd().normalize(member), data)
                             || add(set, Default.nfkc().normalize(member), data)))
                     || add(set, lower(member, full, condition), data)
                     || add(set, title(member, full, condition), data)
                     || add(set, upper(member, full, condition), data);
             }
         } while (grew);
     }

     /**
      * Lowercases {@code s} via lower2 and then replaces every U+03C2 in the result
      * with U+03C3 (the "HACK for lower").
      */
     static String lower(String s, boolean full, String condition) {
         final char from = '\u03C2';
         final char to = '\u03C3';
         return lower2(s, full, condition).replace(from, to);
     }

     // These functions are no longer necessary, since Default.ucd is parameterized,
     // but it's not worth changing

     /**
      * Returns the lowercase of {@code s} from the default UCD, using FULL mappings
      * when {@code full} is true and SIMPLE mappings otherwise.
      */
     static String lower2(String s, boolean full, String condition) {
         if (full) {
             return Default.ucd().getCase(s, FULL, LOWER, condition);
         }
         return Default.ucd().getCase(s, SIMPLE, LOWER, condition);
     }

     /**
      * Returns the uppercase of {@code s} from the default UCD, using FULL mappings
      * when {@code full} is true and SIMPLE mappings otherwise.
      */
     static String upper(String s, boolean full, String condition) {
         if (full) {
             return Default.ucd().getCase(s, FULL, UPPER, condition);
         }
         return Default.ucd().getCase(s, SIMPLE, UPPER, condition);
     }

     /**
      * Returns the titlecase of {@code s} from the default UCD, using FULL mappings
      * when {@code full} is true and SIMPLE mappings otherwise.
      */
     static String title(String s, boolean full, String condition) {
         if (full) {
             return Default.ucd().getCase(s, FULL, TITLE, condition);
         }
         return Default.ucd().getCase(s, SIMPLE, TITLE, condition);
     }

     /**
      * Adds {@code s} to {@code set}, merging in any other class that {@code data}
      * already records for {@code s}.
      *
      * @param set  equivalence class being built
      * @param s    string to add
      * @param data string -> Set map; on a merge every member of the absorbed class
      *             is re-pointed at {@code set}
      * @return false if {@code s} was already a member, true if the set changed
      */
     static boolean add(Set set, String s, Map data) {
         boolean inserted = set.add(s);
         if (!inserted) {
             return false;
         }
         if (DEBUG) System.err.println("adding: " + toString(set));

         Set absorbed = (Set) data.get(s);
         boolean needsMerge = absorbed != null && absorbed != set;
         if (needsMerge) {
             // Re-point every member of the absorbed class, then take its members.
             for (Iterator members = absorbed.iterator(); members.hasNext();) {
                 data.put(members.next(), set);
             }
             set.addAll(absorbed);
         }

         if (DEBUG) System.err.println("done adding: " + toString(set));
         return true;
     }

     /** Formats a set as hex code points on a single line: no names, no line breaks. */
     static String toString(Set set) {
         final boolean withNames = false;
         final boolean onePerLine = false;
         return toString(set, withNames, onePerLine);
     }

     /**
      * Formats the members of a set of strings between braces.
      *
      * @param set   strings to format, in iteration order
      * @param name  true to render each member with getCodeAndName, false for
      *              space-separated hex
      * @param crtab true to separate members with ";", CR LF and a tab; false for "; "
      * @return the formatted text, "{}" for an empty set
      */
     static String toString(Set set, boolean name, boolean crtab) {
         final String separator = crtab ? ";\r\n\t" : "; ";
         StringBuffer text = new StringBuffer("{");
         String pending = ""; // nothing in front of the first member
         for (Iterator members = set.iterator(); members.hasNext();) {
             String member = (String) members.next();
             text.append(pending);
             pending = separator;
             if (name) {
                 text.append(Default.ucd().getCodeAndName(member));
             } else {
                 text.append(Utility.hex(member, " "));
             }
         }
         return text.append("}").toString();
     }

     /**
      * Tells whether specialNormalization would change the code point: true for
      * U+00DF (es-zed) and for anything that is not NFKD-normalized.
      */
     static boolean specialNormalizationDiffers(int ch) {
         final int esZed = 0x00DF;
         return ch == esZed || !Default.nfkd().isNormalized(ch);
     }

     /**
      * NFKD normalization with one exception: the single-character string U+00DF
      * (es-zed) maps to "ss".
      */
     static String specialNormalization(String s) {
         boolean isEsZed = s.equals("\u00DF");
         return isEsZed ? "ss" : Default.nfkd().normalize(s);
     }

     /**
      * Tells whether generateSpecialCasing should write the code point's mapping to
      * its exceptions log instead of the SpecialCasing output.
      *
      * <p>Excluded: U+0132/U+0133 (IJ, ij), U+037A (GREEK YPOGEGRAMMENI),
      * U+249C..U+24B5 (PARENTHESIZED LATIN SMALL LETTER A..), U+20A8..U+217B
      * (Rupee..), and every code point whose decomposition type is COMPAT_SQUARE.
      * U+0130 and the COMPAT_UNSPECIFIED decomposition type are not excluded here.
      */
     static boolean isExcluded(int ch) {
         boolean ijLigature = ch == 0x0132 || ch == 0x0133;
         boolean ypogegrammeni = ch == 0x037A;
         boolean parenthesizedLatin = 0x249C <= ch && ch <= 0x24B5;
         boolean rupeeOnwards = 0x20A8 <= ch && ch <= 0x217B;
         if (ijLigature || ypogegrammeni || parenthesizedLatin || rupeeOnwards) {
             return true;
         }
         return Default.ucd().getDecompositionType(ch) == COMPAT_SQUARE;
     }

     /**
      * Generates DerivedData/SpecialCasing (or SpecialCasing-Normalized) plus a
      * "SpecialCasingExceptions" log.
      *
      * For every represented code point that specialNormalizationDiffers() accepts,
      * the simple case mappings of the character itself are compared with the simple
      * case mappings of its special normalization. Characters that survive the
      * filters below get a line "code; lower; title; upper; # name". Lines for
      * characters rejected by isExcluded() go to the log prefixed with "# "; all
      * others are collected in a TreeMap keyed by (group order, code point) and then
      * written group by group with a heading per group.
      *
      * @param normalize true selects the "-Normalized" file names and keeps the
      *                  mappings in NFC; false writes most groups in NFD (see the
      *                  "HACK" below)
      * @throws IOException declared for the file helpers used here
      */
     static void generateSpecialCasing(boolean normalize) throws IOException {
         // key: (order << 24) | code point, value: formatted output line
         Map sorted = new TreeMap();

         String suffix2 = "";
         if (normalize) suffix2 = "-Normalized";

         // Local writer; it shadows the static field `log` for the rest of this method.
         PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions"
             + suffix2 + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);

         for (int ch = 0; ch <= 0x10FFFF; ++ch) {
             Utility.dot(ch);
             if (!Default.ucd().isRepresented(ch)) continue;
             if (!specialNormalizationDiffers(ch)) continue;

             // Simple case mappings of the character itself, in NFC.
             String lower = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, LOWER));
             String upper = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, UPPER));
             String title = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, TITLE));

             String chstr = UTF16.valueOf(ch);

             // f*: simple case mappings applied to the special normalization of the character.
             String decomp = specialNormalization(chstr);
             String flower = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, LOWER));
             String fupper = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, UPPER));
             String ftitle = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, TITLE));

             // b*: special normalization applied to the character's own simple case mappings.
             String base = decomp;
             String blower = specialNormalization(lower);
             String bupper = specialNormalization(upper);
             String btitle = specialNormalization(title);

             // Always taken: brings base and the b* values into NFC (the f* values
             // were already passed through NFC above).
             if (true) {
                 flower = Default.nfc().normalize(flower);
                 fupper = Default.nfc().normalize(fupper);
                 ftitle = Default.nfc().normalize(ftitle);
                 base = Default.nfc().normalize(base);
                 blower = Default.nfc().normalize(blower);
                 bupper = Default.nfc().normalize(bupper);
                 btitle = Default.nfc().normalize(btitle);
             }

             // Debug trace for the probe character.
             if (ch == CHECK_CHAR) {
                 System.out.println("Code: " + Default.ucd().getCodeAndName(ch));
                 System.out.println("Decomp: " + Default.ucd().getCodeAndName(decomp));
                 System.out.println("Base: " + Default.ucd().getCodeAndName(base));
                 System.out.println("SLower: " + Default.ucd().getCodeAndName(lower));
                 System.out.println("FLower: " + Default.ucd().getCodeAndName(flower));
                 System.out.println("BLower: " + Default.ucd().getCodeAndName(blower));
                 System.out.println("STitle: " + Default.ucd().getCodeAndName(title));
                 System.out.println("FTitle: " + Default.ucd().getCodeAndName(ftitle));
                 System.out.println("BTitle: " + Default.ucd().getCodeAndName(btitle));
                 System.out.println("SUpper: " + Default.ucd().getCodeAndName(upper));
                 System.out.println("FUpper: " + Default.ucd().getCodeAndName(fupper));
                 System.out.println("BUpper: " + Default.ucd().getCodeAndName(bupper));
             }

             // presumably if there is a single code point, it would already be in the simple mappings
             // NOTE(review): the third test uses `title` (mapping of the character itself)
             // while the first two use the f* values; `ftitle` may have been intended - confirm.

             if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
                 	&& UTF16.countCodePoint(title) == 1) {
             	if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Default.ucd().getCodeAndName(ch));
             	continue;
             }

             // if there is no change from the base, skip

             if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) {
             	if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Default.ucd().getCodeAndName(ch));
             	continue;
             }

             // fix special cases
             // Where the f* value equals the b* value, use the character's own simple mapping.
             // if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
             if (flower.equals(blower)) flower = lower;
             if (fupper.equals(bupper)) fupper = upper;
             if (ftitle.equals(btitle)) ftitle = title;

             // if there are no changes from the original, or the expanded original, skip

             if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) {
             	if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Default.ucd().getCodeAndName(ch));
             	continue;
             }

             String name = Default.ucd().getName(ch);

             // Output group, first match wins:
             //   1 sharp s, 2 U+0130, 4 Armenian small ligatures, 3 other ligatures,
             //   5 names without "GEGRAMMENI", 6 single-code-point titlecase,
             //   7 two-code-point uppercase, 8 everything else.
             int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
                 : ch == 0x130 ? 2
                 : name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4
                 : name.indexOf("LIGATURE") >= 0 ? 3
                 : name.indexOf("GEGRAMMENI") < 0 ? 5
                 : UTF16.countCodePoint(ftitle) == 1 ? 6
                 : UTF16.countCodePoint(fupper) == 2 ? 7
                 : 8;

             if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Default.ucd().getCodeAndName(ch));

             // HACK
             // Unless the normalized file was requested, groups other than 6 and 7 are written in NFD.
             boolean denormalize = !normalize && order != 6 && order != 7;

             // A mapping equal to `base` is written as the character itself.
             String mapping = Utility.hex(ch)
                 + "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Default.nfd().normalize(flower) : flower)
                 + "; " + Utility.hex(ftitle.equals(base) ? chstr : denormalize ? Default.nfd().normalize(ftitle) : ftitle)
                 + "; " + Utility.hex(fupper.equals(base) ? chstr : denormalize ? Default.nfd().normalize(fupper) : fupper)
                 + "; # " + Default.ucd().getName(ch);

             // special exclusions
             if (isExcluded(ch)) {
                 log.println("# " + mapping);
             } else {
                 int x = ch;
                 if (ch == 0x01F0) x = 0x03B1; // HACK to reorder the same
                 // The group number goes in the bits above the code point, so the
                 // TreeMap orders by group first and code point second.
                 sorted.put(new Integer((order << 24) | x), mapping);
             }
         }
         log.close();

         System.out.println("Writing");
         //String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
         //PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);

         UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2);
         PrintWriter out = udf.out;

  /*       String[] batName = {""};
         String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName);
         out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
         out.println(UnicodeDataFile.generateDateLine());
         out.println("#");
 */
         //Utility.appendFile("com/ibm/text/UCD/SpecialCasingHeader.txt", Utility.UTF8, out);

         // Write the collected lines in key order, with a heading whenever the group changes.
         Iterator it = sorted.keySet().iterator();
         int lastOrder = -1;
         while (it.hasNext()) {
             Integer key = (Integer) it.next();
             String line = (String) sorted.get(key);
             int order = key.intValue() >> 24; // recover the group number from the key
             if (order != lastOrder) {
                 lastOrder = order;
                 out.println();
                 // Groups 4 and 8 print no heading and no blank line after the separator.
                 boolean skipLine = false;
                 switch(order) {
                 case 1:
                     out.println("# The German es-zed is special--the normal mapping is to SS.");
                     out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
                     break;
                 case 2:
                     out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below.");
 					break;
                 case 3: out.println("# Ligatures"); break;
                 case 4: skipLine = true; break;
                 case 5: out.println("# No corresponding uppercase precomposed character"); break;
                 case 6: Utility.appendFile("com/ibm/text/UCD/SpecialCasingIota.txt", Utility.UTF8, out); break;
                 case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break;
                 case 8: skipLine = true; break;
                 }
                 if (!skipLine) out.println();
             }
             out.println(line);
         }
         Utility.appendFile("com/ibm/text/UCD/SpecialCasingFooter.txt", Utility.UTF8, out);
         udf.close();
         //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
     }
 }
	/**
	*******************************************************************************
	* Copyright (C) 1996-2001, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*
	* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
	* $Date: 2006/04/05 22:12:45 $
	* $Revision: 1.18 $
	*
	*******************************************************************************
	*/

	package com.ibm.text.UCD;

	import java.util.*;
	import java.io.*;

	import com.ibm.icu.text.UTF16;

	import com.ibm.text.utility.*;

	public class GenerateCaseFolding implements UCD_Types {
	public static boolean DEBUG = false;
	public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
	public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting
	public static boolean NF_CLOSURE = false; // picks short value for SIMPLE if in FULL, changes weighting
	static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1

	// PICK_SHORT & NF_CLOSURE = false for old style


	/*public static void main(String[] args) throws java.io.IOException {
	makeCaseFold(arg[0]);
	//getAge();
	}
	*/

	static PrintWriter log;


	public static void makeCaseFold(boolean normalized) throws java.io.IOException {
	PICK_SHORT = NF_CLOSURE = normalized;

	log = Utility.openPrintWriter("CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
	System.out.println("Writing Log: " + "CaseFoldingLog" + UnicodeDataFile.getFileSuffix(true));

	System.out.println("Making Full Data");
	Map fullData = getCaseFolding(true, NF_CLOSURE, "");
	Utility.fixDot();

	System.out.println("Making Simple Data");
	Map simpleData = getCaseFolding(false, NF_CLOSURE, "");
	// write the data

	System.out.println("Making Turkish Full Data");
	Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr");
	Utility.fixDot();

	System.out.println("Making Simple Data");
	Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr");
	// write the data

	Utility.fixDot();
	System.out.println("Writing");
	String filename = "CaseFolding";
	if (normalized) filename += "-Normalized";
	String directory = "DerivedData/";
	UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, filename);
	PrintWriter out = fc.out;

	/*
	PrintWriter out = new PrintWriter(
	new BufferedWriter(
	new OutputStreamWriter(
	new FileOutputStream(directory + fileRoot + GenerateData.getFileSuffix()),
	"UTF8"),
	4*1024));
	*/

	for (int ch = 0; ch <= 0x10FFFF; ++ch) {
	Utility.dot(ch);

	if (!charsUsed.get(ch)) continue;

	String rFull = (String)fullData.get(UTF32.valueOf32(ch));
	String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
	String rFullTurkish = (String)fullDataTurkish.get(UTF32.valueOf32(ch));
	String rSimpleTurkish = (String)simpleDataTurkish.get(UTF32.valueOf32(ch));
	if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue;

	if (rFull != null && rFull.equals(rSimple)
	\|\| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
	String type = "C";
	if (ch == 0x49) {
	drawLine(out, ch, "C", "i");
	drawLine(out, ch, "T", "\u0131");
	} else if (ch == 0x130) {
	drawLine(out, ch, "F", "i\u0307");
	drawLine(out, ch, "T", "i");
	} else if (ch == 0x131) {
	// do nothing
	//drawLine(out, ch, "I", "i");
	} else {
	drawLine(out, ch, type, rFull);
	}
	} else {
	if (rFull != null) {
	drawLine(out, ch, "F", rFull);
	}
	if (rSimple != null) {
	drawLine(out, ch, "S", rSimple);
	}
	}
	if (rFullTurkish != null && !rFullTurkish.equals(rFull)) {
	drawLine(out, ch, "T", rFullTurkish);
	}
	if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) {
	drawLine(out, ch, "t", rSimpleTurkish);
	}
	}
	fc.close();
	log.close();
	}

	/* Goal is following (with no entries for 0131 or 0069)

	0049; C; 0069; # LATIN CAPITAL LETTER I
	0049; T; 0131; # LATIN CAPITAL LETTER I

	0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	*/

	static void drawLine(PrintWriter out, int ch, String type, String result) {
	String comment = "";
	if (COMMENT_DIFFS) {
	String lower = Default.ucd().getCase(UTF16.valueOf(ch), FULL, LOWER);
	if (!lower.equals(result)) {
	String upper = Default.ucd().getCase(UTF16.valueOf(ch), FULL, UPPER);
	String lower2 = Default.ucd().getCase(UTF16.valueOf(ch), FULL, LOWER);
	if (lower.equals(lower2)) {
	comment = "[Diff " + Utility.hex(lower, " ") + "] ";
	} else {
	Utility.fixDot();
	System.out.println("PROBLEM WITH: " + Default.ucd().getCodeAndName(ch));
	comment = "[DIFF " + Utility.hex(lower, " ") + ", " + Utility.hex(lower2, " ") + "] ";
	}
	}
	}

	out.println(Utility.hex(ch)
	+ "; " + type
	+ "; " + Utility.hex(result, " ")
	+ "; # " + comment + Default.ucd().getName(ch));
	}

	static int probeCh = 0x01f0;
	static String shower = UTF16.valueOf(probeCh);

	static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException {
	Map data = new TreeMap();
	Map repChar = new TreeMap();
	//String option = "";

	// get the equivalence classes

	for (int ch = 0; ch <= 0x10FFFF; ++ch) {
	Utility.dot(ch);
	//if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
	if (!Default.ucd().isRepresented(ch)) continue;
	getClosure(ch, data, full, nfClose, condition);
	}

	// get the representative characters

	Iterator it = data.keySet().iterator();
	while (it.hasNext()) {
	String s = (String) it.next();
	Set set = (Set) data.get(s);
	show = set.contains(shower);
	if (show) {
	Utility.fixDot();
	System.out.println(toString(set));
	}

	// Pick the best available representative

	String rep = null;
	int repGood = 0;
	String dup = null;
	Iterator it2 = set.iterator();
	while (it2.hasNext()) {
	String s2 = (String)it2.next();
	int s2Good = goodness(s2, full, condition);
	if (s2Good > repGood) {
	rep = s2;
	repGood = s2Good;
	dup = null;
	} else if (s2Good == repGood) {
	dup = s2;
	}
	}
	if (rep == null) {
	Utility.fixDot();
	System.err.println("No representative for: " + toString(set));
	} else if ((repGood & (NFC_FORMAT \| ISLOWER)) != (NFC_FORMAT \| ISLOWER)) {
	String message = "";
	if ((repGood & NFC_FORMAT) == 0) {
	message += " [NOT NFC FORMAT]";
	}
	if ((repGood & ISLOWER) == 0) {
	message += " [NOT LOWERCASE]";
	}
	Utility.fixDot();
	log.println("Non-Optimal Representative " + message);
	log.println(" Rep:\t" + Default.ucd().getCodeAndName(rep));
	log.println(" Set:\t" + toString(set,true, true));
	}

	log.println();
	log.println();
	log.println(rep + "\t#" + Default.ucd().getName(rep));

	// Add it for all the elements of the set

	it2 = set.iterator();
	while (it2.hasNext()) {
	String s2 = (String)it2.next();
	if (s2.equals(rep)) continue;

	log.println(s2 + "\t#" + Default.ucd().getName(s2));

	if (UTF16.countCodePoint(s2) == 1) {
	repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
	charsUsed.set(UTF16.charAt(s2, 0));
	}
	}
	}
	return repChar;
	}

	// One bit per code point that getCaseFolding has mapped to a representative.
	static BitSet charsUsed = new BitSet();
	// Debug flag: set by getCaseFolding while it processes a set containing 'shower';
	// when true, goodness() prints each score it computes.
	static boolean show = false;
	// Flag bits OR'ed into the score returned by goodness().
	static final int NFC_FORMAT = 64; // candidate equals its own NFC form
	static final int ISLOWER = 128; // candidate equals lower(upper(candidate))

	static int goodness(String s, boolean full, String condition) {
	if (s == null) return 0;
	int result = 32-s.length();
	if (!PICK_SHORT) {
	result = s.length();
	}
	if (!full) result <<= 8;
	String low = lower(upper(s, full, condition), full, condition);
	if (s.equals(low)) result \|= ISLOWER;
	else if (PICK_SHORT && Default.nfd().normalize(s).equals(Default.nfd().normalize(low))) result \|= ISLOWER;

	if (s.equals(Default.nfc().normalize(s))) result \|= NFC_FORMAT;

	if (show) {
	Utility.fixDot();
	System.out.println(Utility.hex(result) + ", " + Default.ucd().getCodeAndName(s));
	}
	return result;
	}


	/*
	static HashSet temp = new HashSet();
	static void normalize(HashSet set) {
	temp.clear();
	temp.addAll(set);
	set.clear();
	Iterator it = temp.iterator();
	while (it.hasNext()) {
	String s = (String) it.next();
	String s2 = KC.normalize(s);
	set.add(s);
	data2.put(s,set);
	if (!s.equals(s2)) {
	set.add(s2);
	data2.put(s2,set);
	System.err.println("Adding " + Utility.hex(s) + " by " + Utility.hex(s2));
	}
	}
	}
	*/

	/*
	String
	String lower1 = Default.ucd.getLowercase(ch);
	String lower2 = Default.ucd.toLowercase(ch,option);

	char ch2 = Default.ucd.getLowercase(Default.ucd.getUppercase(ch).charAt(0)).charAt(0);
	//String lower1 = String.valueOf(Default.ucd.getLowercase(ch));
	//String lower = Default.ucd.toLowercase(ch2,option);
	String upper = Default.ucd.toUppercase(ch2,option);
	String lowerUpper = Default.ucd.toLowercase(upper,option);
	//String title = Default.ucd.toTitlecase(ch2,option);
	//String lowerTitle = Default.ucd.toLowercase(upper,option);

	if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
	output.println(Utility.hex(ch)
	+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
	+ "; " + Utility.hex(lowerUpper," ")
	+ ";\t#" + Default.ucd.getName(ch)
	);
	//if (!lowerUpper.equals(lower)) {
	// output.println("Warning1: " + Utility.hex(lower) + " " + Default.ucd.getName(lower));
	//}
	//if (!lowerUpper.equals(lowerTitle)) {
	// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Default.ucd.getName(lowerTitle));
	//}
	}
	*/

	/**
	 * Adds the case closure of ch to data.
	 *
	 * A code point whose lower, upper and title forms all equal itself is ignored.
	 * Otherwise a new set is started with the code point and its three case forms, and
	 * then grown until nothing new appears: each member's lower, title and upper forms
	 * are added, preceded by its NFD, NFC, NFKD and NFKC forms when nfClose is true.
	 * The add() helper merges in any set already registered for a newly added string.
	 *
	 * @param ch        code point to close over
	 * @param data      map from member string to its equivalence set; updated in place
	 * @param full      true for full case mappings, false for simple ones
	 * @param nfClose   true to close under the four normalization forms as well
	 * @param condition condition string passed through to the case-mapping calls
	 */
	static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) {
		final String source = UTF32.valueOf32(ch);
		final String sourceLower = lower(source, full, condition);
		final String sourceTitle = title(source, full, condition);
		final String sourceUpper = upper(source, full, condition);

		final boolean caseless = source.equals(sourceLower)
			&& source.equals(sourceUpper)
			&& source.equals(sourceTitle);
		if (caseless) return;
		if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));

		// Start a fresh set for this code point and seed it with its case forms.
		Set closure = new TreeSet();
		closure.add(source);
		data.put(source, closure);
		add(closure, sourceLower, data);
		add(closure, sourceUpper, data);
		add(closure, sourceTitle, data);

		// Grow to a fixed point. The set cannot be modified while it is being iterated,
		// so each successful add abandons the current iterator and starts a new scan.
		boolean grew;
		do {
			grew = false;
			for (Iterator members = closure.iterator(); members.hasNext();) {
				String member = (String) members.next();
				// Normalization forms are tried only on request, so that unnormalized
				// source data is not silently normalized.
				if (nfClose
					&& (add(closure, Default.nfd().normalize(member), data)
						|| add(closure, Default.nfc().normalize(member), data)
						|| add(closure, Default.nfkd().normalize(member), data)
						|| add(closure, Default.nfkc().normalize(member), data))) {
					grew = true;
					break;
				}
				if (add(closure, lower(member, full, condition), data)
					|| add(closure, title(member, full, condition), data)
					|| add(closure, upper(member, full, condition), data)) {
					grew = true;
					break;
				}
			}
		} while (grew);
	}

	/**
	 * Lowercases s via lower2 and then replaces every U+03C2 (Greek final sigma)
	 * with U+03C3 (Greek sigma).
	 */
	static String lower(String s, boolean full, String condition) {
		final char finalSigma = '\u03C2';
		final char sigma = '\u03C3';
		// HACK for lower: treat final sigma as plain sigma.
		return lower2(s, full, condition).replace(finalSigma, sigma);
	}

	// These functions are no longer necessary, since Default.ucd is parameterized,
	// but it's not worth changing

	/**
	 * Returns the lowercase of s from the UCD, using the full mapping when
	 * full is true and the simple mapping otherwise.
	 */
	static String lower2(String s, boolean full, String condition) {
		if (full) {
			return Default.ucd().getCase(s, FULL, LOWER, condition);
		}
		return Default.ucd().getCase(s, SIMPLE, LOWER, condition);
	}

	/**
	 * Returns the uppercase of s from the UCD, using the full mapping when
	 * full is true and the simple mapping otherwise.
	 */
	static String upper(String s, boolean full, String condition) {
		if (full) {
			return Default.ucd().getCase(s, FULL, UPPER, condition);
		}
		return Default.ucd().getCase(s, SIMPLE, UPPER, condition);
	}

	/**
	 * Returns the titlecase of s from the UCD, using the full mapping when
	 * full is true and the simple mapping otherwise.
	 */
	static String title(String s, boolean full, String condition) {
		if (full) {
			return Default.ucd().getCase(s, FULL, TITLE, condition);
		}
		return Default.ucd().getCase(s, SIMPLE, TITLE, condition);
	}

	/**
	 * Adds s to set, merging in whatever other set data already holds for s.
	 *
	 * When s is registered in data under a different set, every member of that
	 * set is re-pointed at 'set' and copied into it.
	 *
	 * @param set  equivalence set being grown
	 * @param s    string to add
	 * @param data map from member string to its equivalence set
	 * @return true if s was newly added, false if set already contained it
	 */
	static boolean add(Set set, String s, Map data) {
		if (!set.add(s)) {
			return false; // already a member: nothing changed
		}
		if (DEBUG) System.err.println("adding: " + toString(set));

		Set previous = (Set) data.get(s);
		if (previous != null && previous != set) {
			// Merge: re-point every member of the old set at the surviving set.
			for (Iterator members = previous.iterator(); members.hasNext();) {
				data.put(members.next(), set);
			}
			set.addAll(previous);
		}

		if (DEBUG) System.err.println("done adding: " + toString(set));
		return true;
	}

	/**
	 * Formats a set of strings on one line, each member shown as hex code points.
	 */
	static String toString(Set set) {
		final boolean withNames = false;
		final boolean onePerLine = false;
		return toString(set, withNames, onePerLine);
	}

	/**
	 * Formats a set of strings as "{a; b; c}".
	 *
	 * @param set   strings to format, in the set's iteration order
	 * @param name  true to show each member as code plus name, false for hex only
	 * @param crtab true to separate members with ";", CR LF and a tab instead of "; "
	 * @return the formatted set
	 */
	static String toString(Set set, boolean name, boolean crtab) {
		final String separator = crtab ? ";\r\n\t" : "; ";
		StringBuffer text = new StringBuffer("{");
		boolean needSeparator = false;
		for (Iterator members = set.iterator(); members.hasNext();) {
			String member = (String) members.next();
			if (needSeparator) {
				text.append(separator);
			}
			needSeparator = true;
			text.append(name
				? Default.ucd().getCodeAndName(member)
				: Utility.hex(member, " "));
		}
		return text.append("}").toString();
	}

	/**
	 * Tells whether specialNormalization would change ch: true for U+00DF (es-zed)
	 * and for any code point that is not already NFKD-normalized.
	 */
	static boolean specialNormalizationDiffers(int ch) {
		final boolean isEsZed = (ch == 0x00DF);
		return isEsZed || !Default.nfkd().isNormalized(ch);
	}

	/**
	 * Returns the NFKD form of s, except that the single character U+00DF (es-zed)
	 * becomes "ss".
	 */
	static String specialNormalization(String s) {
		return s.equals("\u00DF") ? "ss" : Default.nfkd().normalize(s);
	}

	static boolean isExcluded(int ch) {
	// if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE
	if (ch == 0x0132 \|\| ch == 0x0133) return true; // skip IJ, ij
	if (ch == 0x037A) return true; // skip GREEK YPOGEGRAMMENI
	if (0x249C <= ch && ch <= 0x24B5) return true; // skip PARENTHESIZED LATIN SMALL LETTER A..
	if (0x20A8 <= ch && ch <= 0x217B) return true; // skip Rupee..

	byte type = Default.ucd().getDecompositionType(ch);
	if (type == COMPAT_SQUARE) return true;
	//if (type == COMPAT_UNSPECIFIED) return true;
	return false;
	}

	static void generateSpecialCasing(boolean normalize) throws IOException {
	Map sorted = new TreeMap();

	String suffix2 = "";
	if (normalize) suffix2 = "-Normalized";

	PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions"
	+ suffix2 + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);

	for (int ch = 0; ch <= 0x10FFFF; ++ch) {
	Utility.dot(ch);
	if (!Default.ucd().isRepresented(ch)) continue;
	if (!specialNormalizationDiffers(ch)) continue;

	String lower = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, LOWER));
	String upper = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, UPPER));
	String title = Default.nfc().normalize(Default.ucd().getCase(ch, SIMPLE, TITLE));

	String chstr = UTF16.valueOf(ch);

	String decomp = specialNormalization(chstr);
	String flower = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, LOWER));
	String fupper = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, UPPER));
	String ftitle = Default.nfc().normalize(Default.ucd().getCase(decomp, SIMPLE, TITLE));

	String base = decomp;
	String blower = specialNormalization(lower);
	String bupper = specialNormalization(upper);
	String btitle = specialNormalization(title);

	if (true) {
	flower = Default.nfc().normalize(flower);
	fupper = Default.nfc().normalize(fupper);
	ftitle = Default.nfc().normalize(ftitle);
	base = Default.nfc().normalize(base);
	blower = Default.nfc().normalize(blower);
	bupper = Default.nfc().normalize(bupper);
	btitle = Default.nfc().normalize(btitle);
	}

	if (ch == CHECK_CHAR) {
	System.out.println("Code: " + Default.ucd().getCodeAndName(ch));
	System.out.println("Decomp: " + Default.ucd().getCodeAndName(decomp));
	System.out.println("Base: " + Default.ucd().getCodeAndName(base));
	System.out.println("SLower: " + Default.ucd().getCodeAndName(lower));
	System.out.println("FLower: " + Default.ucd().getCodeAndName(flower));
	System.out.println("BLower: " + Default.ucd().getCodeAndName(blower));
	System.out.println("STitle: " + Default.ucd().getCodeAndName(title));
	System.out.println("FTitle: " + Default.ucd().getCodeAndName(ftitle));
	System.out.println("BTitle: " + Default.ucd().getCodeAndName(btitle));
	System.out.println("SUpper: " + Default.ucd().getCodeAndName(upper));
	System.out.println("FUpper: " + Default.ucd().getCodeAndName(fupper));
	System.out.println("BUpper: " + Default.ucd().getCodeAndName(bupper));
	}

	// presumably if there is a single code point, it would already be in the simple mappings

	if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
	&& UTF16.countCodePoint(title) == 1) {
	if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Default.ucd().getCodeAndName(ch));
	continue;
	}

	// if there is no change from the base, skip

	if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) {
	if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Default.ucd().getCodeAndName(ch));
	continue;
	}

	// fix special cases
	// if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
	if (flower.equals(blower)) flower = lower;
	if (fupper.equals(bupper)) fupper = upper;
	if (ftitle.equals(btitle)) ftitle = title;

	// if there are no changes from the original, or the expanded original, skip

	if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) {
	if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Default.ucd().getCodeAndName(ch));
	continue;
	}

	String name = Default.ucd().getName(ch);

	int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
	: ch == 0x130 ? 2
	: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4
	: name.indexOf("LIGATURE") >= 0 ? 3
	: name.indexOf("GEGRAMMENI") < 0 ? 5
	: UTF16.countCodePoint(ftitle) == 1 ? 6
	: UTF16.countCodePoint(fupper) == 2 ? 7
	: 8;

	if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Default.ucd().getCodeAndName(ch));

	// HACK
	boolean denormalize = !normalize && order != 6 && order != 7;

	String mapping = Utility.hex(ch)
	+ "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Default.nfd().normalize(flower) : flower)
	+ "; " + Utility.hex(ftitle.equals(base) ? chstr : denormalize ? Default.nfd().normalize(ftitle) : ftitle)
	+ "; " + Utility.hex(fupper.equals(base) ? chstr : denormalize ? Default.nfd().normalize(fupper) : fupper)
	+ "; # " + Default.ucd().getName(ch);

	// special exclusions
	if (isExcluded(ch)) {
	log.println("# " + mapping);
	} else {
	int x = ch;
	if (ch == 0x01F0) x = 0x03B1; // HACK to reorder the same
	sorted.put(new Integer((order << 24) \| x), mapping);
	}
	}
	log.close();

	System.out.println("Writing");
	//String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
	//PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);

	UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2);
	PrintWriter out = udf.out;

	/* String[] batName = {""};
	String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName);
	out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
	out.println(UnicodeDataFile.generateDateLine());
	out.println("#");
	*/
	//Utility.appendFile("com/ibm/text/UCD/SpecialCasingHeader.txt", Utility.UTF8, out);

	Iterator it = sorted.keySet().iterator();
	int lastOrder = -1;
	while (it.hasNext()) {
	Integer key = (Integer) it.next();
	String line = (String) sorted.get(key);
	int order = key.intValue() >> 24;
	if (order != lastOrder) {
	lastOrder = order;
	out.println();
	boolean skipLine = false;
	switch(order) {
	case 1:
	out.println("# The German es-zed is special--the normal mapping is to SS.");
	out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
	break;
	case 2:
	out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below.");
	break;
	case 3: out.println("# Ligatures"); break;
	case 4: skipLine = true; break;
	case 5: out.println("# No corresponding uppercase precomposed character"); break;
	case 6: Utility.appendFile("com/ibm/text/UCD/SpecialCasingIota.txt", Utility.UTF8, out); break;
	case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break;
	case 8: skipLine = true; break;
	}
	if (!skipLine) out.println();
	}
	out.println(line);
	}
	Utility.appendFile("com/ibm/text/UCD/SpecialCasingFooter.txt", Utility.UTF8, out);
	udf.close();
	//Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
	}
	}