unicodetools/com/ibm/text/UCA/UCA_Data.java - external/github.com/unicode-org/icu - Git at Google

 /**
 *******************************************************************************
 * Copyright (C) 1996-2001, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $
 * $Date: 2006/06/08 18:16:40 $
 * $Revision: 1.4 $
 *
 *******************************************************************************
 */

 package com.ibm.text.UCA;

 import java.util.*;
 import java.io.BufferedReader;
 import java.io.Reader;
 import java.io.PrintWriter;
 import java.io.FileReader;
 import java.text.MessageFormat;
 import java.io.IOException;
 import com.ibm.text.UCD.Normalizer;
 import com.ibm.text.UCD.UCD;
 import com.ibm.text.utility.*;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

 public class UCA_Data implements UCA_Types {
     /** General debug switch; in this class it is only referenced from a commented-out consistency check in add(). */
     static final boolean DEBUG = false;
     /** When true, add() prints every mapping (code, name and CE list) as it is registered. */
     static final boolean DEBUG_SHOW_ADD = false;
     /**
      * When true, get() skips completely ignorable characters while extending a contraction match.
      * NOTE(review): the name suggests this emulates UCA behavior before version 4.1.0 -- confirm.
      */
     static final boolean lessThan410 = false;

     /** Used by get() to read the canonical combining class of candidate combining marks. */
     private Normalizer toD;
     /** Used by getCEType() for Hangul/CJK classification and by add() for debug output. */
     private UCD ucd;

     /**
      * Creates a collation data store that relies on the supplied helpers.
      *
      * @param toD normalizer consulted for canonical combining classes
      * @param ucd character database consulted for character classification
      */
     public UCA_Data(Normalizer toD, UCD ucd) {
         this.ucd = ucd;
         this.toD = toD;
     }

     /**
      * The collation element data is stored in a couple of different structures.
      * First is collationElements, a table of simple collation elements that generally
      * contains the 32-bit CE corresponding to the data. It is directly indexed by
      * UTF-16 code unit (char).<br>
      * For brevity in the implementation, we just use a flat array.
      * A real implementation would use a multi-stage table, as described in TUS Section 5.<br>
      * Exceptional cases: expanding, contracting, unsupported are handled as described below.
      */
     private int[] collationElements = new int[65536];

     /**
      * Although a single character can expand into multiple CEs, we don't want to burden
      * the normal case with the storage. So, they get a special value in the collationElements
      * array. This value has a distinct primary weight, followed by an index into a separate
      * table called expandingTable. All of the CEs in that table, up to a TERMINATOR value,
      * will be used for the expansion. The table is implemented as a stack; this just makes it
      * easy to generate.<br>
      * add() records the current top of the stack as the index (OR-ed with EXPANDING_MASK),
      * then appends the CEs followed by TERMINATOR; get() reads back starting at
      * (ce &amp; EXCEPTION_INDEX_MASK) until it reaches TERMINATOR.
      */
     private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys

     /**
      * For now, this is just a simple mapping of strings to collation elements
      * (String keys, Integer values; see addToContractingTable).
      * The implementation depends on the contracting characters being "completed",
      * so that it can be efficiently determined when to stop looking.
      */
     private Map contractingTable = new TreeMap();

     // Instance initializer: resets the flat table and seeds the lead-surrogate entries.
     {
         // Every code unit starts out as unsupported.
         for (int slot = collationElements.length - 1; slot >= 0; --slot) {
             collationElements[slot] = UNSUPPORTED_FLAG;
         }
         // Each lead surrogate (D800..DBFF) is treated as the start of a contraction
         // and gets an "unsupported" placeholder entry in the contracting table.
         for (int lead = 0xD800; lead <= 0xDBFF; ++lead) {
             char leadUnit = (char) lead;
             collationElements[leadUnit] = CONTRACTING;
             addToContractingTable(String.valueOf(leadUnit), UNSUPPORTED_FLAG);
         }
         checkConsistency();
     }

     /**
      * Classifies the collation element stored for a code point.
      * Supplementary code points are looked up (and classified) through their lead surrogate.
      *
      * @param ch code point to classify
      * @return one of HANGUL_CE, CJK_CE, CJK_AB_CE, UNSUPPORTED_CE, CONTRACTING_CE,
      *         EXPANDING_CE or NORMAL_CE
      */
     public byte getCEType(int ch) {
         final int unit;
         if (ch > 0xFFFF) {
             unit = UTF16.getLeadSurrogate(ch);
         } else {
             unit = ch;
         }
         final int stored = collationElements[unit];

         if (stored != UNSUPPORTED_FLAG) {
             if (stored == CONTRACTING) {
                 return CONTRACTING_CE;
             }
             boolean expands = (stored & EXPANDING_MASK) == EXPANDING_MASK;
             if (expands) {
                 return EXPANDING_CE;
             }
             return NORMAL_CE;
         }

         // No explicit mapping: Hangul syllables and Han characters get their own types.
         // (Surrogates are deliberately not given a separate type here.)
         if (ucd.isHangulSyllable(unit)) {
             return HANGUL_CE;
         }
         if (ucd.isCJK_BASE(unit)) {
             return CJK_CE;
         }
         if (ucd.isCJK_AB(unit)) {
             return CJK_AB_CE;
         }
         return UNSUPPORTED_CE;
     }

     /**
      * Convenience overload: wraps the string in a StringBuffer and delegates to
      * add(StringBuffer, IntStack).
      *
      * @param source character sequence being mapped
      * @param ces collation elements assigned to the sequence
      */
     public void add(String source, IntStack ces) {
         StringBuffer wrapped = new StringBuffer(source);
         add(wrapped, ces);
     }

     /**
      * Registers the mapping from a character sequence to one or more collation elements.
      * A single CE is stored directly; several CEs are appended to expandingTable and
      * referenced through an EXPANDING_MASK index value. Sequences longer than one code
      * unit go into the contracting table and mark their first code unit as CONTRACTING.
      *
      * @param source character sequence being mapped (at least one code unit)
      * @param ces collation elements assigned to the sequence (at least one)
      * @throws IllegalArgumentException if either argument is empty
      */
     public void add(StringBuffer source, IntStack ces) {
         if (DEBUG_SHOW_ADD) {
             System.out.println("Adding: " + ucd.getCodeAndName(source.toString()) + CEList.toString(ces));
         }
         if (source.length() < 1 || ces.length() < 1) {
             throw new IllegalArgumentException("String or CEs too short");
         }

         // Work out the single int that represents the CE list.
         final int packed;
         if (ces.length() != 1) {
             packed = EXPANDING_MASK | expandingTable.getTop();
             expandingTable.append(ces);
             expandingTable.append(TERMINATOR);
         } else {
             packed = ces.get(0);
         }

         final char first = source.charAt(0);
         final int current = collationElements[first];

         if (source.length() == 1) {
             if (current == CONTRACTING) {
                 // The code unit already starts contractions, so its own value lives in the table.
                 addToContractingTable(source, packed);
             } else {
                 collationElements[first] = packed;
             }
             return;
         }

         // Multi-unit sequence: a contraction.
         addToContractingTable(source, packed);
         if (current != UNSUPPORTED_FLAG && current != CONTRACTING) {
             // Preserve the value the first code unit had on its own.
             addToContractingTable(String.valueOf(first), current);
         }
         collationElements[first] = CONTRACTING;
     }

     /**
      * Reports whether a code point maps to the single CE value 0.
      * Supplementary code points are looked up through their lead surrogate; when the slot is
      * marked CONTRACTING, the code point's own contracting-table entry decides.
      *
      * @param cp code point to test
      * @return true if the stored CE for cp is 0
      */
     boolean isCompletelyIgnoreable(int cp) {
         final int slot;
         if (cp >= UTF16.SUPPLEMENTARY_MIN_VALUE) {
             slot = UTF16.getLeadSurrogate(cp);
         } else {
             slot = cp;
         }
         final int stored = collationElements[slot];
         if (stored == 0) {
             return true;
         }
         if (stored == CONTRACTING) {
             Object entry = contractingTable.get(UTF16.valueOf(cp));
             return entry != null && ((Integer) entry).intValue() == 0;
         }
         return false;
     }

     /**
      * Looks up the collation element(s) for {@code ch}, absorbing any following text in
      * {@code decompositionBuffer} that forms a contraction with it, and pushes the resulting
      * CE(s) onto {@code result}.
      * <p>
      * Characters matched contiguously advance the returned position. Combining marks that are
      * matched further along (after skipping non-matching marks) are instead overwritten in the
      * buffer with U+0000, so the buffer may be modified by this call.
      *
      * @param ch code unit whose collation elements are wanted
      * @param decompositionBuffer text being processed; may be modified as described above
      * @param index position in the buffer where matching of following characters starts
      *        (NOTE(review): presumably the position just after ch -- confirm against callers)
      * @param result stack that receives the CE(s)
      * @return the new position: index advanced past any contiguously matched characters
      * @throws IllegalArgumentException if ch is marked CONTRACTING but has no table entry
      */
     public int get(char ch, StringBuffer decompositionBuffer, int index, IntStack result) {
         int ce = collationElements[ch];

         if (ce == CONTRACTING) {
             // Contracting is probably the most interesting (read "tricky") part
             // of the algorithm.
             // First get longest substring that is in the contracting table.
             // For simplicity, the contracting table is a plain string-keyed map (a TreeMap).
             // There are much better optimizations,
             // but they take a more complicated build algorithm than we want to show here.
             // NOTE: We are guaranteed that the first code unit is in the contracting table because
             // of the build process.
             String probe = String.valueOf(ch);
             Object value = contractingTable.get(probe);
             if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));

             // complete the first character, if part of supplementary:
             // try the lead surrogate plus the next code unit as a table key.
             if (UTF16.isLeadSurrogate(ch) && index < decompositionBuffer.length()) {
                 char ch2 = decompositionBuffer.charAt(index);
                 String newProbe = probe + ch2;
                 Object newValue = contractingTable.get(newProbe);
                 if (newValue != null) {
                     probe = newProbe;
                     value = newValue;
                     index++;
                 }
             }

             // We loop, trying to add successive code points to the longest substring.
             int cp2;
             while (index < decompositionBuffer.length()) {
                 //char ch2 = decompositionBuffer.charAt(index);
                 cp2 = UTF16.charAt(decompositionBuffer, index);
                 int increment = UTF16.getCharCount(cp2);

                 // CHECK if last char was completely ignorable
                 if (lessThan410 && isCompletelyIgnoreable(cp2)) {
                     index += increment; // just skip char don't set probe, value
                     continue;
                 }

                 // see whether the current string plus the next char are in
                 // the contracting table.
                 String newProbe = probe + UTF16.valueOf(cp2);
                 Object newValue = contractingTable.get(newProbe);
                 if (newValue == null) break;    // stop if not in table.

                 // We succeeded--so update our new values, and advance index
                 // to indicate that we swallowed another character.
                 probe = newProbe;
                 value = newValue;
                 index += increment;
             }

             // Now, see if we can add any combining marks.
             // Note that this loop walks with its own position i; index is not advanced here.
             short lastCan = 0;
             int increment;
             for (int i = index; i < decompositionBuffer.length(); i += increment) {
                 // We only take certain characters. They have to be accents,
                 // and they have to not be blocked.
                 // Unlike above, if we don't find a match (and it was an accent!)
                 // then we don't stop, we continue looping.
                 cp2 = UTF16.charAt(decompositionBuffer, i);
                 increment = UTF16.getCharCount(cp2);
                 short can = toD.getCanonicalClass(cp2);
                 if (can == 0) break;            // stop with any zero (non-accent)
                 if (can == lastCan) continue;   // blocked if same class as last
                 lastCan = can;                  // remember for next time

                 // CHECK if last char was completely ignorable. If so, skip it.
                 if (lessThan410 && isCompletelyIgnoreable(cp2)) {
                     continue;
                 }

                 // Now see if we can successfully add it onto our string
                 // and find it in the contracting table.
                 String newProbe = probe + UTF16.valueOf(cp2);
                 Object newValue = contractingTable.get(newProbe);
                 if (newValue == null) continue;

                 // We succeeded--so update our new values and remove the char from the
                 // buffer (by zeroing it) to indicate that we swallowed another character.
                 probe = newProbe;
                 value = newValue;
                 decompositionBuffer.setCharAt(i,'\u0000');  // zero char
                 if (increment == 2) {
                     // WARNING: we had a supplementary character. zero BOTH parts
                     decompositionBuffer.setCharAt(i+1,'\u0000');  // zero char
                 }
             }

             // we are all done, and can extract the CE from the last value set.
             ce = ((Integer)value).intValue();

         }

         // if the CE is not expanding, we are done.
         if ((ce & EXPANDING_MASK) != EXPANDING_MASK) {
             result.push(ce);
         } else {
             // expanding, so copy list of items onto stack
             int ii = ce & EXCEPTION_INDEX_MASK; // get index
             // copy onto stack from index until reach TERMINATOR
             while (true) {
                 ce = expandingTable.get(ii++);
                 if (ce == TERMINATOR) break;
                 result.push(ce);
             }
         }
         return index;
     }

     /**
      * Stores a CE in the contracting table under the string form of {@code s}.
      *
      * @param s key object; its toString() value is used as the table key
      * @param ce collation element value to store
      * @throws IllegalArgumentException if s is null
      */
     private void addToContractingTable(Object s, int ce) {
         if (s != null) {
             String key = s.toString();
             Integer boxed = new Integer(ce);
             contractingTable.put(key, boxed);
             return;
         }
         throw new IllegalArgumentException("String can't be null");
     }

     /**
      * Verifies that the flat table and the contracting table agree: a code unit is marked
      * CONTRACTING if and only if at least one contracting-table key starts with it.
      * On a mismatch the differences are printed and an exception is thrown.
      * <p>
      * At this point the contractingTable is also expected to be CLOSED (if a string of
      * length n is in the table, its first n-1 characters are too); that property is not
      * checked here.
      *
      * @throws IllegalArgumentException if the two structures disagree
      */
     void checkConsistency() {
         // Code units flagged as contraction starters in the flat table.
         UnicodeSet flagged = new UnicodeSet();
         for (int unit = 0; unit < collationElements.length; ++unit) {
             if (collationElements[unit] == CONTRACTING) {
                 flagged.add(unit);
             }
         }

         // First code units of every key in the contracting table.
         UnicodeSet tableStarts = new UnicodeSet();
         for (Iterator keys = contractingTable.keySet().iterator(); keys.hasNext();) {
             String key = (String) keys.next();
             char first = key.charAt(0);
             tableStarts.add(first);
         }

         if (flagged.equals(tableStarts)) {
             return;
         }

         System.out.println("In both: " + new UnicodeSet(flagged).retainAll(tableStarts).toPattern(true));
         System.out.println("CONTRACTING but not in table: " + new UnicodeSet(flagged).removeAll(tableStarts).toPattern(true));
         System.out.println("In table but not CONTRACTING: " + new UnicodeSet(tableStarts).removeAll(flagged).toPattern(true));
         throw new IllegalArgumentException("Inconsistent data");

 /*
 Disabled sketch for closing the table over Tibetan sequences:
 0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
 0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
         int[] temp1 = int[20];
         int[] temp2 = int[20];
         int[] temp3 = int[20];
         getCEs("\u0fb2", true, temp1);
         getCEs("\u0fb3", true, temp2);
         getCEs("\u0f71", true, temp3);
         add("\u0FB2\u0F71", concat(temp1, temp3));
 */
     }

     /**
      * @return an iterator over the keys (strings) of the contracting table
      */
     Iterator getContractions() {
         final Iterator keys = contractingTable.keySet().iterator();
         return keys;
     }

     /**
      * @return the number of entries currently in the contracting table
      */
     int getContractionCount() {
         final int entries = contractingTable.size();
         return entries;
     }

     /**
      * @param s candidate key
      * @return true if the contracting table holds a value for s
      */
     boolean contractionTableContains(String s) {
         Object entry = contractingTable.get(s);
         return entry != null;
     }

 }
	/**
	*******************************************************************************
	* Copyright (C) 1996-2001, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*
	* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $
	* $Date: 2006/06/08 18:16:40 $
	* $Revision: 1.4 $
	*
	*******************************************************************************
	*/

	package com.ibm.text.UCA;

	import java.util.*;
	import java.io.BufferedReader;
	import java.io.Reader;
	import java.io.PrintWriter;
	import java.io.FileReader;
	import java.text.MessageFormat;
	import java.io.IOException;
	import com.ibm.text.UCD.Normalizer;
	import com.ibm.text.UCD.UCD;
	import com.ibm.text.utility.*;
	import com.ibm.icu.text.UTF16;
	import com.ibm.icu.text.UnicodeSet;

	public class UCA_Data implements UCA_Types {
	/** General debug switch; in this class it is only referenced from a commented-out consistency check in add(). */
	static final boolean DEBUG = false;
	/** When true, add() prints every mapping (code, name and CE list) as it is registered. */
	static final boolean DEBUG_SHOW_ADD = false;
	/**
	* When true, get() skips completely ignorable characters while extending a contraction match.
	* NOTE(review): the name suggests this emulates UCA behavior before version 4.1.0 -- confirm.
	*/
	static final boolean lessThan410 = false;

	/** Used by get() to read the canonical combining class of candidate combining marks. */
	private Normalizer toD;
	/** Used by getCEType() for Hangul/CJK classification and by add() for debug output. */
	private UCD ucd;

	/**
	* Creates a collation data store that relies on the supplied helpers.
	*
	* @param toD normalizer consulted for canonical combining classes
	* @param ucd character database consulted for character classification
	*/
	public UCA_Data(Normalizer toD, UCD ucd) {
		this.ucd = ucd;
		this.toD = toD;
	}

	/**
	* The collation element data is stored in a couple of different structures.
	* First is collationElements, a table of simple collation elements that generally
	* contains the 32-bit CE corresponding to the data. It is directly indexed by
	* UTF-16 code unit (char).<br>
	* For brevity in the implementation, we just use a flat array.
	* A real implementation would use a multi-stage table, as described in TUS Section 5.<br>
	* Exceptional cases: expanding, contracting, unsupported are handled as described below.
	*/
	private int[] collationElements = new int[65536];

	/**
	* Although a single character can expand into multiple CEs, we don't want to burden
	* the normal case with the storage. So, they get a special value in the collationElements
	* array. This value has a distinct primary weight, followed by an index into a separate
	* table called expandingTable. All of the CEs in that table, up to a TERMINATOR value,
	* will be used for the expansion. The table is implemented as a stack; this just makes it
	* easy to generate.<br>
	* add() records the current top of the stack as the index (OR-ed with EXPANDING_MASK),
	* then appends the CEs followed by TERMINATOR; get() reads back starting at
	* (ce &amp; EXCEPTION_INDEX_MASK) until it reaches TERMINATOR.
	*/
	private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys

	/**
	* For now, this is just a simple mapping of strings to collation elements
	* (String keys, Integer values; see addToContractingTable).
	* The implementation depends on the contracting characters being "completed",
	* so that it can be efficiently determined when to stop looking.
	*/
	private Map contractingTable = new TreeMap();

	// Instance initializer: resets the flat table and seeds the lead-surrogate entries.
	{
		// Every code unit starts out as unsupported.
		for (int slot = collationElements.length - 1; slot >= 0; --slot) {
			collationElements[slot] = UNSUPPORTED_FLAG;
		}
		// Each lead surrogate (D800..DBFF) is treated as the start of a contraction
		// and gets an "unsupported" placeholder entry in the contracting table.
		for (int lead = 0xD800; lead <= 0xDBFF; ++lead) {
			char leadUnit = (char) lead;
			collationElements[leadUnit] = CONTRACTING;
			addToContractingTable(String.valueOf(leadUnit), UNSUPPORTED_FLAG);
		}
		checkConsistency();
	}

	/**
	* Classifies the collation element stored for a code point.
	* Supplementary code points are looked up (and classified) through their lead surrogate.
	*
	* @param ch code point to classify
	* @return one of HANGUL_CE, CJK_CE, CJK_AB_CE, UNSUPPORTED_CE, CONTRACTING_CE,
	*         EXPANDING_CE or NORMAL_CE
	*/
	public byte getCEType(int ch) {
		final int unit;
		if (ch > 0xFFFF) {
			unit = UTF16.getLeadSurrogate(ch);
		} else {
			unit = ch;
		}
		final int stored = collationElements[unit];

		if (stored != UNSUPPORTED_FLAG) {
			if (stored == CONTRACTING) {
				return CONTRACTING_CE;
			}
			boolean expands = (stored & EXPANDING_MASK) == EXPANDING_MASK;
			if (expands) {
				return EXPANDING_CE;
			}
			return NORMAL_CE;
		}

		// No explicit mapping: Hangul syllables and Han characters get their own types.
		// (Surrogates are deliberately not given a separate type here.)
		if (ucd.isHangulSyllable(unit)) {
			return HANGUL_CE;
		}
		if (ucd.isCJK_BASE(unit)) {
			return CJK_CE;
		}
		if (ucd.isCJK_AB(unit)) {
			return CJK_AB_CE;
		}
		return UNSUPPORTED_CE;
	}

	/**
	* Convenience overload: wraps the string in a StringBuffer and delegates to
	* add(StringBuffer, IntStack).
	*
	* @param source character sequence being mapped
	* @param ces collation elements assigned to the sequence
	*/
	public void add(String source, IntStack ces) {
		StringBuffer wrapped = new StringBuffer(source);
		add(wrapped, ces);
	}

	public void add(StringBuffer source, IntStack ces) {

	if (DEBUG_SHOW_ADD) {
	System.out.println("Adding: " + ucd.getCodeAndName(source.toString()) + CEList.toString(ces));
	}
	if (source.length() < 1 \|\| ces.length() < 1) {
	throw new IllegalArgumentException("String or CEs too short");
	}

	int ce;
	if (ces.length() == 1) {
	ce = ces.get(0);
	} else {
	ce = EXPANDING_MASK \| expandingTable.getTop();
	expandingTable.append(ces);
	expandingTable.append(TERMINATOR);
	}

	// assign CE(s) to char(s)
	char value = source.charAt(0);
	//if (value == 0x10000) System.out.print("DEBUG2: " + source);

	if (source.length() > 1) {
	addToContractingTable(source, ce);
	if (collationElements[value] == UNSUPPORTED_FLAG) {
	collationElements[value] = CONTRACTING; // mark special
	} else if (collationElements[value] != CONTRACTING) {
	// move old value to contracting table!
	//contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
	addToContractingTable(String.valueOf(value), collationElements[value]);
	collationElements[value] = CONTRACTING; // signal we must look up in table
	}
	} else if (collationElements[value] == CONTRACTING) {
	// must add old value to contracting table!
	addToContractingTable(source, ce);
	//contractingTable.put(source, new Integer(ce));
	} else {
	collationElements[source.charAt(0)] = ce; // normal
	}
	//if (DEBUG) checkConsistency();
	}

	/**
	* Reports whether a code point maps to the single CE value 0.
	* Supplementary code points are looked up through their lead surrogate; when the slot is
	* marked CONTRACTING, the code point's own contracting-table entry decides.
	*
	* @param cp code point to test
	* @return true if the stored CE for cp is 0
	*/
	boolean isCompletelyIgnoreable(int cp) {
		final int slot;
		if (cp >= UTF16.SUPPLEMENTARY_MIN_VALUE) {
			slot = UTF16.getLeadSurrogate(cp);
		} else {
			slot = cp;
		}
		final int stored = collationElements[slot];
		if (stored == 0) {
			return true;
		}
		if (stored == CONTRACTING) {
			Object entry = contractingTable.get(UTF16.valueOf(cp));
			return entry != null && ((Integer) entry).intValue() == 0;
		}
		return false;
	}

	/**
	* Looks up the collation element(s) for {@code ch}, absorbing any following text in
	* {@code decompositionBuffer} that forms a contraction with it, and pushes the resulting
	* CE(s) onto {@code result}.
	* <p>
	* Characters matched contiguously advance the returned position. Combining marks that are
	* matched further along (after skipping non-matching marks) are instead overwritten in the
	* buffer with U+0000, so the buffer may be modified by this call.
	*
	* @param ch code unit whose collation elements are wanted
	* @param decompositionBuffer text being processed; may be modified as described above
	* @param index position in the buffer where matching of following characters starts
	*        (NOTE(review): presumably the position just after ch -- confirm against callers)
	* @param result stack that receives the CE(s)
	* @return the new position: index advanced past any contiguously matched characters
	* @throws IllegalArgumentException if ch is marked CONTRACTING but has no table entry
	*/
	public int get(char ch, StringBuffer decompositionBuffer, int index, IntStack result) {
	int ce = collationElements[ch];

	if (ce == CONTRACTING) {
	// Contracting is probably the most interesting (read "tricky") part
	// of the algorithm.
	// First get longest substring that is in the contracting table.
	// For simplicity, the contracting table is a plain string-keyed map (a TreeMap).
	// There are much better optimizations,
	// but they take a more complicated build algorithm than we want to show here.
	// NOTE: We are guaranteed that the first code unit is in the contracting table because
	// of the build process.
	String probe = String.valueOf(ch);
	Object value = contractingTable.get(probe);
	if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));

	// complete the first character, if part of supplementary:
	// try the lead surrogate plus the next code unit as a table key.
	if (UTF16.isLeadSurrogate(ch) && index < decompositionBuffer.length()) {
	char ch2 = decompositionBuffer.charAt(index);
	String newProbe = probe + ch2;
	Object newValue = contractingTable.get(newProbe);
	if (newValue != null) {
	probe = newProbe;
	value = newValue;
	index++;
	}
	}

	// We loop, trying to add successive code points to the longest substring.
	int cp2;
	while (index < decompositionBuffer.length()) {
	//char ch2 = decompositionBuffer.charAt(index);
	cp2 = UTF16.charAt(decompositionBuffer, index);
	int increment = UTF16.getCharCount(cp2);

	// CHECK if last char was completely ignorable
	if (lessThan410 && isCompletelyIgnoreable(cp2)) {
	index += increment; // just skip char don't set probe, value
	continue;
	}

	// see whether the current string plus the next char are in
	// the contracting table.
	String newProbe = probe + UTF16.valueOf(cp2);
	Object newValue = contractingTable.get(newProbe);
	if (newValue == null) break; // stop if not in table.

	// We succeeded--so update our new values, and advance index
	// to indicate that we swallowed another character.
	probe = newProbe;
	value = newValue;
	index += increment;
	}

	// Now, see if we can add any combining marks.
	// Note that this loop walks with its own position i; index is not advanced here.
	short lastCan = 0;
	int increment;
	for (int i = index; i < decompositionBuffer.length(); i += increment) {
	// We only take certain characters. They have to be accents,
	// and they have to not be blocked.
	// Unlike above, if we don't find a match (and it was an accent!)
	// then we don't stop, we continue looping.
	cp2 = UTF16.charAt(decompositionBuffer, i);
	increment = UTF16.getCharCount(cp2);
	short can = toD.getCanonicalClass(cp2);
	if (can == 0) break; // stop with any zero (non-accent)
	if (can == lastCan) continue; // blocked if same class as last
	lastCan = can; // remember for next time

	// CHECK if last char was completely ignorable. If so, skip it.
	if (lessThan410 && isCompletelyIgnoreable(cp2)) {
	continue;
	}

	// Now see if we can successfully add it onto our string
	// and find it in the contracting table.
	String newProbe = probe + UTF16.valueOf(cp2);
	Object newValue = contractingTable.get(newProbe);
	if (newValue == null) continue;

	// We succeeded--so update our new values and remove the char from the
	// buffer (by zeroing it) to indicate that we swallowed another character.
	probe = newProbe;
	value = newValue;
	decompositionBuffer.setCharAt(i,'\u0000'); // zero char
	if (increment == 2) {
	// WARNING: we had a supplementary character. zero BOTH parts
	decompositionBuffer.setCharAt(i+1,'\u0000'); // zero char
	}
	}

	// we are all done, and can extract the CE from the last value set.
	ce = ((Integer)value).intValue();

	}

	// if the CE is not expanding, we are done.
	if ((ce & EXPANDING_MASK) != EXPANDING_MASK) {
	result.push(ce);
	} else {
	// expanding, so copy list of items onto stack
	int ii = ce & EXCEPTION_INDEX_MASK; // get index
	// copy onto stack from index until reach TERMINATOR
	while (true) {
	ce = expandingTable.get(ii++);
	if (ce == TERMINATOR) break;
	result.push(ce);
	}
	}
	return index;
	}

	/**
	* Stores a CE in the contracting table under the string form of {@code s}.
	*
	* @param s key object; its toString() value is used as the table key
	* @param ce collation element value to store
	* @throws IllegalArgumentException if s is null
	*/
	private void addToContractingTable(Object s, int ce) {
		if (s != null) {
			String key = s.toString();
			Integer boxed = new Integer(ce);
			contractingTable.put(key, boxed);
			return;
		}
		throw new IllegalArgumentException("String can't be null");
	}

	/**
	* Verifies that the flat table and the contracting table agree: a code unit is marked
	* CONTRACTING if and only if at least one contracting-table key starts with it.
	* On a mismatch the differences are printed and an exception is thrown.
	* <p>
	* At this point the contractingTable is also expected to be CLOSED (if a string of
	* length n is in the table, its first n-1 characters are too); that property is not
	* checked here.
	*
	* @throws IllegalArgumentException if the two structures disagree
	*/
	void checkConsistency() {
		// Code units flagged as contraction starters in the flat table.
		UnicodeSet flagged = new UnicodeSet();
		for (int unit = 0; unit < collationElements.length; ++unit) {
			if (collationElements[unit] == CONTRACTING) {
				flagged.add(unit);
			}
		}

		// First code units of every key in the contracting table.
		UnicodeSet tableStarts = new UnicodeSet();
		for (Iterator keys = contractingTable.keySet().iterator(); keys.hasNext();) {
			String key = (String) keys.next();
			char first = key.charAt(0);
			tableStarts.add(first);
		}

		if (flagged.equals(tableStarts)) {
			return;
		}

		System.out.println("In both: " + new UnicodeSet(flagged).retainAll(tableStarts).toPattern(true));
		System.out.println("CONTRACTING but not in table: " + new UnicodeSet(flagged).removeAll(tableStarts).toPattern(true));
		System.out.println("In table but not CONTRACTING: " + new UnicodeSet(tableStarts).removeAll(flagged).toPattern(true));
		throw new IllegalArgumentException("Inconsistent data");

	/*
	Disabled sketch for closing the table over Tibetan sequences:
	0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
	0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
	int[] temp1 = int[20];
	int[] temp2 = int[20];
	int[] temp3 = int[20];
	getCEs("\u0fb2", true, temp1);
	getCEs("\u0fb3", true, temp2);
	getCEs("\u0f71", true, temp3);
	add("\u0FB2\u0F71", concat(temp1, temp3));
	*/
	}

	/**
	* @return an iterator over the keys (strings) of the contracting table
	*/
	Iterator getContractions() {
		final Iterator keys = contractingTable.keySet().iterator();
		return keys;
	}

	/**
	* @return the number of entries currently in the contracting table
	*/
	int getContractionCount() {
		final int entries = contractingTable.size();
		return entries;
	}

	/**
	* @param s candidate key
	* @return true if the contracting table holds a value for s
	*/
	boolean contractionTableContains(String s) {
		Object entry = contractingTable.get(s);
		return entry != null;
	}

	}