tools/misc/src/com/ibm/icu/dev/tool/translit/UnicodeSetCloseOver.java - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 * Copyright (c) 2003-2012, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
 * Author: Alan Liu
 * Created: February 11 2003
 * Since: ICU 2.6
 **********************************************************************
 */
 package com.ibm.icu.dev.tool.translit;

 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.Vector;

 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

 /**
  * This class produces the data tables used by the closeOver() method
  * of UnicodeSet.
  *
  * Whenever the Unicode database changes, this tool must be re-run
  * (AFTER the data file(s) underlying ICU4J are udpated).
  *
  * The output of this tool should then be pasted into the appropriate
  * files:
  *
  * ICU4J: com.ibm.icu.text.UnicodeSet.java
  * ICU4C: /icu/source/common/uniset.cpp
  */
 class UnicodeSetCloseOver {

     // Our output files
     static final String JAVA_OUT          = "to_UnicodeSet.java";
     static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
     static final String C_SET_OUT         = "to_uniset.cpp";
     static final String C_UCHAR_OUT       = "to_uchar.c";

     // Source code "do not edit" warning
     static final String WARNING = "MACHINE-GENERATED; Unicode version " +
         UCharacter.getUnicodeVersion() +
         "; DO NOT EDIT; See " +
         UnicodeSetCloseOver.class.getName();

     // Case folding options flag.  This must correspond to the options
     // used in UnicodeSet.closeOver() in Java and C++.
     static final boolean DEFAULT_CASE_MAP = true; // false for Turkish

     public static void main(String[] args) throws IOException {
         System.out.println("This tool will generate several output files.  Each is named according");
         System.out.println("the target file.  For example, the contents of to_UnicodeSet.java should");
         System.out.println("be pasted into UnicodeSet.java.");
         System.out.println();

         generateCaseData();
     }

     /**
      * Create a map of String => Set.  The String in this case is a
      * folded string for which
      * UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
      * The Set contains all single-character strings x for which
      * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
      * well as folded itself.
      */
     static Map createCaseFoldEquivalencyClasses() {
         Map equivClasses = new HashMap();
         for (int i = 0; i <= 0x10FFFF; ++i) {
             int cat = UCharacter.getType(i);
             if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE)
                 continue;

             String cp = UTF16.valueOf(i);
             String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
             if (folded.equals(cp)) continue;

             // At this point, have different case folding.  Add
             // the code point and its folded equivalent into the
             // equivalency class.
             TreeSet s = (TreeSet) equivClasses.get(folded);
             if (s == null) {
                 s = new TreeSet();
                 s.add(folded); // add the case fold result itself
                 equivClasses.put(folded, s);
             }
             s.add(cp);
         }
         return equivClasses;
     }

     /**
      * Analyze the case fold equivalency classes.  Break them into two
      * groups: 'pairs', and 'nonpairs'.  Create a tally of the length
      * configurations of the nonpairs.
      *
      * Length configurations of equivalency classes, as of Unicode
      * 3.2.  Most of the classes (83%) have two single codepoints.
      * Here "112:28" means there are 28 equivalency classes with 2
      * single codepoints and one string of length 2.
      *
      * 11:656
      * 111:16
      * 1111:3
      * 112:28
      * 113:2
      * 12:31
      * 13:12
      * 22:38
      *
      * Note: This method does not count the frequencies of the
      * different length configurations (as shown above after ':'); it
      * merely records which configurations occur.
      *
      * @param pairs Accumulate equivalency classes that consist of
      * exactly two codepoints here.  This is 83+% of the classes.
      * E.g., {"a", "A"}.
      * @param nonpairs Accumulate other equivalency classes here, as
      * lists of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
      * @param lengths Accumulate a list of unique length structures,
      * not including pairs.  Each length structure is represented by a
      * string of digits.  The digit string "12" means the equivalency
      * class contains a single code point and a string of length 2.
      * Typical contents of 'lengths': { "111", "1111", "112",
      * "113", "12", "13", "22" }.  Note the absence of "11".
      */
     static void analyzeCaseData(Map equivClasses,
                                 StringBuffer pairs,
                                 Vector nonpairs,
                                 Vector lengths) {
         Iterator i = new TreeSet(equivClasses.keySet()).iterator();
         StringBuffer buf = new StringBuffer();
         while (i.hasNext()) {
             Object key = i.next();
             Vector v = new Vector((Set) equivClasses.get(key));
             if (v.size() == 2) {
                 String a = (String) v.elementAt(0);
                 String b = (String) v.elementAt(1);
                 if (a.length() == 1 && b.length() == 1) {
                     pairs.append(a).append(b);
                     continue;
                     // Note that pairs are included in 'lengths'
                 }
             }
             String[] a = new String[v.size()];
             v.toArray(a);
             nonpairs.add(a);
             //int singleCount = 0;
             //int stringCount = 0;
             // Make a string of the lengths, e.g., "111" means 3
             // single code points; "13" means a single code point
             // and a string of length 3.
             v.clear();
             for (int j=0; j<a.length; ++j) {
                 v.add(new Integer(a[j].length()));
             }
             Collections.sort(v);
             buf.setLength(0);
             for (int j=0; j<v.size(); ++j) {
                 buf.append(String.valueOf(v.elementAt(j)));
             }
             if (!lengths.contains(buf.toString())) {
                 lengths.add(buf.toString());
             }
         }
     }

     @SuppressWarnings("resource")
     static void generateCaseData() throws IOException {

         Map equivClasses = createCaseFoldEquivalencyClasses();

         // Accumulate equivalency classes that consist of exactly
         // two codepoints here.  This is 83+% of the classes.
         // E.g., {"a", "A"}.
         StringBuffer pairs = new StringBuffer();

         // Accumulate other equivalency classes here, as lists
         // of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
         Vector nonpairs = new Vector(); // contains String[]
         Vector lengths = new Vector(); // "111", "12", "22", etc.

         analyzeCaseData(equivClasses, pairs, nonpairs, lengths);

         //-------------------------------------------------------------
         // Emit Java source
         PrintStream out = new PrintStream(new FileOutputStream(JAVA_OUT));
         System.out.println("Writing " + JAVA_OUT);

         out.println("    // " + WARNING);
         out.println("    private static final String CASE_PAIRS =");
         out.println(Utility.formatForSource(pairs.toString()) + ";");
         out.println();
         out.println("    // " + WARNING);
         out.println("    private static final String[][] CASE_NONPAIRS = {");
         for (int j=0; j<nonpairs.size(); ++j) {
             String[] a = (String[]) nonpairs.elementAt(j);
             out.print("        {");
             for (int k=0; k<a.length; ++k) {
                 if (k != 0) out.print(", ");
                 out.print(Utility.format1ForSource(a[k]));
             }
             out.println("},");
         }
         out.println("    };");

         //-------------------------------------------------------------
         // Emit C++ source

         // In C++, the pairs are again emitted in an array, but this
         // array is the final representation form -- it will not be
         // reprocessed into a hash.  It will be binary searched by
         // looking at the even elements [0], [2], [4], etc., and
         // ignoring the odd elements.  The even elements must contain
         // the folded members of the pairs.  That is, in the pair
         // {'A', 'a'}, the even element must be 'a', not 'A'.  Then a
         // code point to be located is first folded ('Y' => 'y') then
         // it binary searched against [0]='A', [2]='B', etc.  When a
         // match is found at k, the pair is [k], [k+1].

         out = new PrintStream(new FileOutputStream(C_SET_OUT));
         System.out.println("Writing " + C_SET_OUT);

         // Sort the pairs.  They must be ordered by the folded element.
         // Store these as two-character strings, with charAt(0) being
         // the folded member of the pair.
         TreeSet sortPairs = new TreeSet(new Comparator() {
             public int compare(Object a, Object b) {
                 return ((int) ((String) a).charAt(0)) -
                        ((int) ((String) b).charAt(0));
             }
             public boolean equals(Object obj) {
                 return false;
             }
         });
         for (int i=0; i<pairs.length(); i+=2) {
             String a = String.valueOf(pairs.charAt(i));
             String b = String.valueOf(pairs.charAt(i+1));
             String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
             if (a.equals(folded)) {
                 sortPairs.add(a + b);
             } else {
                 sortPairs.add(b + a);
             }
         }

         // Emit the pairs
         out.println("// " + WARNING);
         out.println("static const UChar CASE_PAIRS[] = {");
         Iterator it = sortPairs.iterator();
         while (it.hasNext()) {
             out.print("    ");
             int n = 0;
             while (n++ < 5 && it.hasNext()) {
                 String s = (String) it.next();
                 //out.print((int) s.charAt(0) + "," +
                 //                 (int) s.charAt(1) + ",");
                 out.print("0x" + Utility.hex(s.charAt(0)) + ",0x" +
                                  Utility.hex(s.charAt(1)) + ",");
             }
             out.println();
         }
         out.println("};");
         out.println();

         // The non-pairs are encoded in the following way.  All the
         // single codepoints in each class are grouped together
         // followed by a zero.  Then each multi-character string is
         // added, followed by a zero.  Finally, another zero is added.
         // Some examples:
         //  {"iQ", "R"}           =>  [ 'R', 0, 'i', 'Q', 0, 0 ]
         //  {"S", "D", "F", "G"}  =>  [ 'S', 'D', 'F', 'G', 0, 0 ]
         //  {"jW", "jY"}          =>  [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
         // The end-result is a short, flat array of UChar values that
         // can be used to initialize a UChar[] array in C.

         int maxLen = 0; // Maximum encoded length of any class, including zeros
         out.println("// " + WARNING);
         out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
         for (int j=0; j<nonpairs.size(); ++j) {
             int len = 0;
             String[] a = (String[]) nonpairs.elementAt(j);
             out.print("    {");
             // Emit single code points
             for (int k=0; k<a.length; ++k) {
                 if (a[k].length() != 1) continue;
                 //out.print((int) a[k].charAt(0) + ",");
                 out.print("0x"+Utility.hex(a[k].charAt(0)) + ",");
                 ++len;
             }
             out.print("0,  "); // End of single code points
             ++len;
             // Emit multi-character strings
             for (int k=0; k<a.length; ++k) {
                 if (a[k].length() == 1) continue;
                 for (int m=0; m<a[k].length(); ++m) {
                     //out.print((int) a[k].charAt(m) + ",");
                     out.print("0x"+Utility.hex(a[k].charAt(m)) + ",");
                     ++len;
                 }
                 out.print("0, "); // End of string
                 ++len;
             }
             out.println("0},"); // End of equivalency class
             ++len;
             if (len > maxLen) maxLen = len;
         }
         out.println("};");

         // Make sure the CaseEquivClass data can fit.
         if (maxLen > 8) {
             throw new RuntimeException("Must adjust CaseEquivClass to accomodate " + maxLen + " UChars");
         }

         // Also make sure that we can map into this array using a
         // CompactByteArray.  We could do this check above, but we
         // keep it here, adjacent to the maxLen check.  We use one
         // value (-1 == 255) to indicate "no value."
         if (nonpairs.size() > 255) {
             throw new RuntimeException("Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
         }

         //-------------------------------------------------------------
         // Case-unique set:  All characters c for which closeOver(c)==c.

         // UPDATE: Instead of using this, we're using the related
         // notion of Case_Sensitive.  See below.  Note that
         // Case_Sensitive != ^Case_Unique.

         if (false) {
             UnicodeSet caseUnique = new UnicodeSet();
             for (int i = 0; i <= 0x10FFFF; ++i) {
                 String cp = UTF16.valueOf(i);
                 if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) {
                     caseUnique.add(i);
                 }
             }
             // out.println("caseUnique = " + caseUnique.toPattern(true));
         }

         UnicodeSet caseSensitive = getCaseSensitive();
         //System.out.println("caseSensitive = " + caseSensitive.toPattern(true));

         // Now for C, emit an array of ranges
         out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
         System.out.println("Writing " + C_UCHAR_OUT);

         out.println("/* " + WARNING + " */");
         emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES");

         // For Java, emit a string with the ranges (each pair of chars
         // in the string is a range).
         out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
         System.out.println("Writing " + JAVA_CHARPROP_OUT);
         out.println("    // " + WARNING);
         emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
     }

     /**
      * Create the set of case-sensitive characters.  These are characters
      * that participate in any case mapping operation as a source or
      * as a member of a target string.
      */
     static UnicodeSet getCaseSensitive() {
         UnicodeSet caseSensitive = new UnicodeSet();
         Locale loc = Locale.US;
         BreakIterator bi = BreakIterator.getTitleInstance(loc);
         for (int c = 0; c <= 0x10FFFF; ++c) {
             String cp = UTF16.valueOf(c);
             for (int j=0; j<4; ++j) {
                 String s = null;
                 switch (j) {
                 case 0: s = UCharacter.toUpperCase(loc, cp); break;
                 case 1: s = UCharacter.toLowerCase(loc, cp); break;
                 case 2: s = UCharacter.toTitleCase(loc, cp, bi); break;
                 case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break;
                 }
                 if (!s.equals(cp)) {
                     int cc;
                     for (int k=0; k<s.length(); k+=UTF16.getCharCount(cc)) {
                         cc = UTF16.charAt(s, k);
                         caseSensitive.add(cc);
                     }
                     for (int k=0; k<cp.length(); k+=UTF16.getCharCount(cc)) {
                         cc = UTF16.charAt(cp, k);
                         caseSensitive.add(cc);
                     }
                 }
             }
             // Also do the single-codepoint API.  This shouldn't add any
             // code points, but we do it for completeness.
             for (int j=0; j<4; ++j) {
                 int d = 0;
                 switch (j) {
                 case 0: d = UCharacter.toUpperCase(c); break;
                 case 1: d = UCharacter.toLowerCase(c); break;
                 case 2: d = UCharacter.toTitleCase(c); break;
                 case 3: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP); break;
                 }
                 if (d != c) {
                     if (!caseSensitive.contains(c) ||
                         !caseSensitive.contains(d)) {
                         System.out.println("Warning: code point " + c +
                                            " => " + d + " created NEW MAPPING"+
                                            " for Case_Sensitive");
                     }
                     caseSensitive.add(c);
                     caseSensitive.add(d);
                 }
             }
         }
         return caseSensitive;
     }

     /**
      * Given a UnicodeSet, emit it as an array of UChar pairs.  Each
      * pair will be the start/end of a range.  Code points >= U+10000
      * will be represented as surrogate pairs.
      */
     static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) {
         // Store the pairs in a StringBuffer.  This handles surrogate
         // representation.
         StringBuffer buf = new StringBuffer();
         for (int i=0; i<set.getRangeCount(); ++i) {
             UTF16.append(buf, set.getRangeStart(i));
             UTF16.append(buf, set.getRangeEnd(i));
         }
         // Emit the pairs
         out.println("static const UChar " + id + "[] = {");
         for (int i=0; i<buf.length(); ) {
             out.print("    ");
             for (int n=0; n++<10 && i<buf.length(); ++i) {
                 out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
             }
             out.println();
         }
         out.println("};");
         out.println("#define " + id + "_LENGTH (sizeof(" + id +
                     ")/sizeof(" + id + "[0]))");
     }

     /**
      * Given a UnicodeSet, emit it as a Java string.  The most economical
      * format is not the pattern, but instead a pairs list, with each
      * range pair represented as two adjacent characters.
      */
     static void emitRangesString(PrintStream out, UnicodeSet set, String id) {
         // Store the pairs in a StringBuffer.  This handles surrogate
         // representation.
         StringBuffer buf = new StringBuffer();
         for (int i=0; i<set.getRangeCount(); ++i) {
             UTF16.append(buf, set.getRangeStart(i));
             UTF16.append(buf, set.getRangeEnd(i));
         }
         // Emit the pairs
         out.println("    private static final String " + id + " =");
         out.println(Utility.formatForSource(buf.toString()) + ";");
     }
 }
	/*
	**********************************************************************
	* Copyright (c) 2003-2012, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* Author: Alan Liu
	* Created: February 11 2003
	* Since: ICU 2.6
	**********************************************************************
	*/
	package com.ibm.icu.dev.tool.translit;

	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.io.PrintStream;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.HashMap;
	import java.util.Iterator;
	import java.util.Locale;
	import java.util.Map;
	import java.util.Set;
	import java.util.TreeSet;
	import java.util.Vector;

	import com.ibm.icu.impl.Utility;
	import com.ibm.icu.lang.UCharacter;
	import com.ibm.icu.text.BreakIterator;
	import com.ibm.icu.text.UTF16;
	import com.ibm.icu.text.UnicodeSet;

	/**
	* This class produces the data tables used by the closeOver() method
	* of UnicodeSet.
	*
	* Whenever the Unicode database changes, this tool must be re-run
	* (AFTER the data file(s) underlying ICU4J are udpated).
	*
	* The output of this tool should then be pasted into the appropriate
	* files:
	*
	* ICU4J: com.ibm.icu.text.UnicodeSet.java
	* ICU4C: /icu/source/common/uniset.cpp
	*/
	class UnicodeSetCloseOver {

	// Our output files
	static final String JAVA_OUT = "to_UnicodeSet.java";
	static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
	static final String C_SET_OUT = "to_uniset.cpp";
	static final String C_UCHAR_OUT = "to_uchar.c";

	// Source code "do not edit" warning
	static final String WARNING = "MACHINE-GENERATED; Unicode version " +
	UCharacter.getUnicodeVersion() +
	"; DO NOT EDIT; See " +
	UnicodeSetCloseOver.class.getName();

	// Case folding options flag. This must correspond to the options
	// used in UnicodeSet.closeOver() in Java and C++.
	static final boolean DEFAULT_CASE_MAP = true; // false for Turkish

	public static void main(String[] args) throws IOException {
	System.out.println("This tool will generate several output files. Each is named according");
	System.out.println("the target file. For example, the contents of to_UnicodeSet.java should");
	System.out.println("be pasted into UnicodeSet.java.");
	System.out.println();

	generateCaseData();
	}

	/**
	* Create a map of String => Set. The String in this case is a
	* folded string for which
	* UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
	* The Set contains all single-character strings x for which
	* UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
	* well as folded itself.
	*/
	static Map createCaseFoldEquivalencyClasses() {
	Map equivClasses = new HashMap();
	for (int i = 0; i <= 0x10FFFF; ++i) {
	int cat = UCharacter.getType(i);
	if (cat == Character.UNASSIGNED \|\| cat == Character.PRIVATE_USE)
	continue;

	String cp = UTF16.valueOf(i);
	String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
	if (folded.equals(cp)) continue;

	// At this point, have different case folding. Add
	// the code point and its folded equivalent into the
	// equivalency class.
	TreeSet s = (TreeSet) equivClasses.get(folded);
	if (s == null) {
	s = new TreeSet();
	s.add(folded); // add the case fold result itself
	equivClasses.put(folded, s);
	}
	s.add(cp);
	}
	return equivClasses;
	}

	/**
	* Analyze the case fold equivalency classes. Break them into two
	* groups: 'pairs', and 'nonpairs'. Create a tally of the length
	* configurations of the nonpairs.
	*
	* Length configurations of equivalency classes, as of Unicode
	* 3.2. Most of the classes (83%) have two single codepoints.
	* Here "112:28" means there are 28 equivalency classes with 2
	* single codepoints and one string of length 2.
	*
	* 11:656
	* 111:16
	* 1111:3
	* 112:28
	* 113:2
	* 12:31
	* 13:12
	* 22:38
	*
	* Note: This method does not count the frequencies of the
	* different length configurations (as shown above after ':'); it
	* merely records which configurations occur.
	*
	* @param pairs Accumulate equivalency classes that consist of
	* exactly two codepoints here. This is 83+% of the classes.
	* E.g., {"a", "A"}.
	* @param nonpairs Accumulate other equivalency classes here, as
	* lists of strings. E,g, {"st", "\uFB05", "\uFB06"}.
	* @param lengths Accumulate a list of unique length structures,
	* not including pairs. Each length structure is represented by a
	* string of digits. The digit string "12" means the equivalency
	* class contains a single code point and a string of length 2.
	* Typical contents of 'lengths': { "111", "1111", "112",
	* "113", "12", "13", "22" }. Note the absence of "11".
	*/
	static void analyzeCaseData(Map equivClasses,
	StringBuffer pairs,
	Vector nonpairs,
	Vector lengths) {
	Iterator i = new TreeSet(equivClasses.keySet()).iterator();
	StringBuffer buf = new StringBuffer();
	while (i.hasNext()) {
	Object key = i.next();
	Vector v = new Vector((Set) equivClasses.get(key));
	if (v.size() == 2) {
	String a = (String) v.elementAt(0);
	String b = (String) v.elementAt(1);
	if (a.length() == 1 && b.length() == 1) {
	pairs.append(a).append(b);
	continue;
	// Note that pairs are included in 'lengths'
	}
	}
	String[] a = new String[v.size()];
	v.toArray(a);
	nonpairs.add(a);
	//int singleCount = 0;
	//int stringCount = 0;
	// Make a string of the lengths, e.g., "111" means 3
	// single code points; "13" means a single code point
	// and a string of length 3.
	v.clear();
	for (int j=0; j<a.length; ++j) {
	v.add(new Integer(a[j].length()));
	}
	Collections.sort(v);
	buf.setLength(0);
	for (int j=0; j<v.size(); ++j) {
	buf.append(String.valueOf(v.elementAt(j)));
	}
	if (!lengths.contains(buf.toString())) {
	lengths.add(buf.toString());
	}
	}
	}

	@SuppressWarnings("resource")
	static void generateCaseData() throws IOException {

	Map equivClasses = createCaseFoldEquivalencyClasses();

	// Accumulate equivalency classes that consist of exactly
	// two codepoints here. This is 83+% of the classes.
	// E.g., {"a", "A"}.
	StringBuffer pairs = new StringBuffer();

	// Accumulate other equivalency classes here, as lists
	// of strings. E,g, {"st", "\uFB05", "\uFB06"}.
	Vector nonpairs = new Vector(); // contains String[]
	Vector lengths = new Vector(); // "111", "12", "22", etc.

	analyzeCaseData(equivClasses, pairs, nonpairs, lengths);

	//-------------------------------------------------------------
	// Emit Java source
	PrintStream out = new PrintStream(new FileOutputStream(JAVA_OUT));
	System.out.println("Writing " + JAVA_OUT);

	out.println(" // " + WARNING);
	out.println(" private static final String CASE_PAIRS =");
	out.println(Utility.formatForSource(pairs.toString()) + ";");
	out.println();
	out.println(" // " + WARNING);
	out.println(" private static final String[][] CASE_NONPAIRS = {");
	for (int j=0; j<nonpairs.size(); ++j) {
	String[] a = (String[]) nonpairs.elementAt(j);
	out.print(" {");
	for (int k=0; k<a.length; ++k) {
	if (k != 0) out.print(", ");
	out.print(Utility.format1ForSource(a[k]));
	}
	out.println("},");
	}
	out.println(" };");

	//-------------------------------------------------------------
	// Emit C++ source

	// In C++, the pairs are again emitted in an array, but this
	// array is the final representation form -- it will not be
	// reprocessed into a hash. It will be binary searched by
	// looking at the even elements [0], [2], [4], etc., and
	// ignoring the odd elements. The even elements must contain
	// the folded members of the pairs. That is, in the pair
	// {'A', 'a'}, the even element must be 'a', not 'A'. Then a
	// code point to be located is first folded ('Y' => 'y') then
	// it binary searched against [0]='A', [2]='B', etc. When a
	// match is found at k, the pair is [k], [k+1].

	out = new PrintStream(new FileOutputStream(C_SET_OUT));
	System.out.println("Writing " + C_SET_OUT);

	// Sort the pairs. They must be ordered by the folded element.
	// Store these as two-character strings, with charAt(0) being
	// the folded member of the pair.
	TreeSet sortPairs = new TreeSet(new Comparator() {
	public int compare(Object a, Object b) {
	return ((int) ((String) a).charAt(0)) -
	((int) ((String) b).charAt(0));
	}
	public boolean equals(Object obj) {
	return false;
	}
	});
	for (int i=0; i<pairs.length(); i+=2) {
	String a = String.valueOf(pairs.charAt(i));
	String b = String.valueOf(pairs.charAt(i+1));
	String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
	if (a.equals(folded)) {
	sortPairs.add(a + b);
	} else {
	sortPairs.add(b + a);
	}
	}

	// Emit the pairs
	out.println("// " + WARNING);
	out.println("static const UChar CASE_PAIRS[] = {");
	Iterator it = sortPairs.iterator();
	while (it.hasNext()) {
	out.print(" ");
	int n = 0;
	while (n++ < 5 && it.hasNext()) {
	String s = (String) it.next();
	//out.print((int) s.charAt(0) + "," +
	// (int) s.charAt(1) + ",");
	out.print("0x" + Utility.hex(s.charAt(0)) + ",0x" +
	Utility.hex(s.charAt(1)) + ",");
	}
	out.println();
	}
	out.println("};");
	out.println();

	// The non-pairs are encoded in the following way. All the
	// single codepoints in each class are grouped together
	// followed by a zero. Then each multi-character string is
	// added, followed by a zero. Finally, another zero is added.
	// Some examples:
	// {"iQ", "R"} => [ 'R', 0, 'i', 'Q', 0, 0 ]
	// {"S", "D", "F", "G"} => [ 'S', 'D', 'F', 'G', 0, 0 ]
	// {"jW", "jY"} => [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
	// The end-result is a short, flat array of UChar values that
	// can be used to initialize a UChar[] array in C.

	int maxLen = 0; // Maximum encoded length of any class, including zeros
	out.println("// " + WARNING);
	out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
	for (int j=0; j<nonpairs.size(); ++j) {
	int len = 0;
	String[] a = (String[]) nonpairs.elementAt(j);
	out.print(" {");
	// Emit single code points
	for (int k=0; k<a.length; ++k) {
	if (a[k].length() != 1) continue;
	//out.print((int) a[k].charAt(0) + ",");
	out.print("0x"+Utility.hex(a[k].charAt(0)) + ",");
	++len;
	}
	out.print("0, "); // End of single code points
	++len;
	// Emit multi-character strings
	for (int k=0; k<a.length; ++k) {
	if (a[k].length() == 1) continue;
	for (int m=0; m<a[k].length(); ++m) {
	//out.print((int) a[k].charAt(m) + ",");
	out.print("0x"+Utility.hex(a[k].charAt(m)) + ",");
	++len;
	}
	out.print("0, "); // End of string
	++len;
	}
	out.println("0},"); // End of equivalency class
	++len;
	if (len > maxLen) maxLen = len;
	}
	out.println("};");

	// Make sure the CaseEquivClass data can fit.
	if (maxLen > 8) {
	throw new RuntimeException("Must adjust CaseEquivClass to accomodate " + maxLen + " UChars");
	}

	// Also make sure that we can map into this array using a
	// CompactByteArray. We could do this check above, but we
	// keep it here, adjacent to the maxLen check. We use one
	// value (-1 == 255) to indicate "no value."
	if (nonpairs.size() > 255) {
	throw new RuntimeException("Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
	}

	//-------------------------------------------------------------
	// Case-unique set: All characters c for which closeOver(c)==c.

	// UPDATE: Instead of using this, we're using the related
	// notion of Case_Sensitive. See below. Note that
	// Case_Sensitive != ^Case_Unique.

	if (false) {
	UnicodeSet caseUnique = new UnicodeSet();
	for (int i = 0; i <= 0x10FFFF; ++i) {
	String cp = UTF16.valueOf(i);
	if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) {
	caseUnique.add(i);
	}
	}
	// out.println("caseUnique = " + caseUnique.toPattern(true));
	}

	UnicodeSet caseSensitive = getCaseSensitive();
	//System.out.println("caseSensitive = " + caseSensitive.toPattern(true));

	// Now for C, emit an array of ranges
	out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
	System.out.println("Writing " + C_UCHAR_OUT);

	out.println("/* " + WARNING + " */");
	emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES");

	// For Java, emit a string with the ranges (each pair of chars
	// in the string is a range).
	out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
	System.out.println("Writing " + JAVA_CHARPROP_OUT);
	out.println(" // " + WARNING);
	emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
	}

	/**
	* Create the set of case-sensitive characters. These are characters
	* that participate in any case mapping operation as a source or
	* as a member of a target string.
	*/
	static UnicodeSet getCaseSensitive() {
	UnicodeSet caseSensitive = new UnicodeSet();
	Locale loc = Locale.US;
	BreakIterator bi = BreakIterator.getTitleInstance(loc);
	for (int c = 0; c <= 0x10FFFF; ++c) {
	String cp = UTF16.valueOf(c);
	for (int j=0; j<4; ++j) {
	String s = null;
	switch (j) {
	case 0: s = UCharacter.toUpperCase(loc, cp); break;
	case 1: s = UCharacter.toLowerCase(loc, cp); break;
	case 2: s = UCharacter.toTitleCase(loc, cp, bi); break;
	case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break;
	}
	if (!s.equals(cp)) {
	int cc;
	for (int k=0; k<s.length(); k+=UTF16.getCharCount(cc)) {
	cc = UTF16.charAt(s, k);
	caseSensitive.add(cc);
	}
	for (int k=0; k<cp.length(); k+=UTF16.getCharCount(cc)) {
	cc = UTF16.charAt(cp, k);
	caseSensitive.add(cc);
	}
	}
	}
	// Also do the single-codepoint API. This shouldn't add any
	// code points, but we do it for completeness.
	for (int j=0; j<4; ++j) {
	int d = 0;
	switch (j) {
	case 0: d = UCharacter.toUpperCase(c); break;
	case 1: d = UCharacter.toLowerCase(c); break;
	case 2: d = UCharacter.toTitleCase(c); break;
	case 3: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP); break;
	}
	if (d != c) {
	if (!caseSensitive.contains(c) \|\|
	!caseSensitive.contains(d)) {
	System.out.println("Warning: code point " + c +
	" => " + d + " created NEW MAPPING"+
	" for Case_Sensitive");
	}
	caseSensitive.add(c);
	caseSensitive.add(d);
	}
	}
	}
	return caseSensitive;
	}

	/**
	* Given a UnicodeSet, emit it as an array of UChar pairs. Each
	* pair will be the start/end of a range. Code points >= U+10000
	* will be represented as surrogate pairs.
	*/
	static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) {
	// Store the pairs in a StringBuffer. This handles surrogate
	// representation.
	StringBuffer buf = new StringBuffer();
	for (int i=0; i<set.getRangeCount(); ++i) {
	UTF16.append(buf, set.getRangeStart(i));
	UTF16.append(buf, set.getRangeEnd(i));
	}
	// Emit the pairs
	out.println("static const UChar " + id + "[] = {");
	for (int i=0; i<buf.length(); ) {
	out.print(" ");
	for (int n=0; n++<10 && i<buf.length(); ++i) {
	out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
	}
	out.println();
	}
	out.println("};");
	out.println("#define " + id + "_LENGTH (sizeof(" + id +
	")/sizeof(" + id + "[0]))");
	}

	/**
	* Given a UnicodeSet, emit it as a Java string. The most economical
	* format is not the pattern, but instead a pairs list, with each
	* range pair represented as two adjacent characters.
	*/
	static void emitRangesString(PrintStream out, UnicodeSet set, String id) {
	// Store the pairs in a StringBuffer. This handles surrogate
	// representation.
	StringBuffer buf = new StringBuffer();
	for (int i=0; i<set.getRangeCount(); ++i) {
	UTF16.append(buf, set.getRangeStart(i));
	UTF16.append(buf, set.getRangeEnd(i));
	}
	// Emit the pairs
	out.println(" private static final String " + id + " =");
	out.println(Utility.formatForSource(buf.toString()) + ";");
	}
	}