src/com/ibm/text/UInfo.java - external/github.com/unicode-org/icu - Git at Google

 /*
  *******************************************************************************
  * Copyright (C) 1996-2000, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UInfo.java,v $
  * $Date: 2000/03/10 04:07:24 $
  * $Revision: 1.3 $
  *
  *****************************************************************************************
  */

 package com.ibm.text;

 import java.io.*;
 import java.util.*;

 /**
  * Lazily parsed view of a UnicodeData.txt-style file, indexed by 16-bit character.
  *
  * <p>The constructor stores every raw line of the file in an internal table; a line
  * is parsed into a {@code CharData} record the first time its character is queried.
  * Characters without a line in the file report the defaults of an empty
  * {@code CharData} (empty strings, category {@link #UNASSIGNED}, class 0).
  *
  * <p>Each instance owns its own table and fills it on demand without any locking.
  */
 public final class UInfo {
     static final boolean DEBUG = false;
     static final int UINFO_VERSION = 5;

     // these values are aligned with the java.lang.Character constants
     // (and with the indices of CATEGORY_TABLE below; 17 is unused)

     public static final byte
     UNASSIGNED      = 0,
     UPPERCASE_LETTER    = 1,
     LOWERCASE_LETTER    = 2,
     TITLECASE_LETTER    = 3,
     MODIFIER_LETTER     = 4,
     OTHER_LETTER        = 5,
     NON_SPACING_MARK    = 6,
     ENCLOSING_MARK      = 7,
     COMBINING_SPACING_MARK  = 8,
     DECIMAL_DIGIT_NUMBER    = 9,
     LETTER_NUMBER       = 10,
     OTHER_NUMBER        = 11,
     SPACE_SEPARATOR     = 12,
     LINE_SEPARATOR      = 13,
     PARAGRAPH_SEPARATOR = 14,
     CONTROL         = 15,
     FORMAT          = 16,
     PRIVATE_USE     = 18,
     SURROGATE       = 19,
     DASH_PUNCTUATION    = 20,
     START_PUNCTUATION   = 21,
     END_PUNCTUATION     = 22,
     CONNECTOR_PUNCTUATION   = 23,
     OTHER_PUNCTUATION   = 24,
     MATH_SYMBOL     = 25,
     CURRENCY_SYMBOL     = 26,
     MODIFIER_SYMBOL     = 27,
     OTHER_SYMBOL        = 28,
     INITIAL_QUOTE_PUNCTUATION = 29,   // "Pi" in CATEGORY_TABLE
     FINAL_QUOTE_PUNCTUATION   = 30;   // "Pf" in CATEGORY_TABLE

     // ---- simple per-character accessors; each one reads a field of the parsed record ----

     public String getName(char ch) {return getInfo(ch).name;}
     public String getDecomposition(char ch) {return getInfo(ch).decomposition;}
     public String getName10(char ch) {return getInfo(ch).name10;}
     public String getComment(char ch) {return getInfo(ch).comment;}

     /** Numeric value, or {@code Float.MIN_VALUE} when the field is empty. */
     public float getNumeric(char ch) {return getInfo(ch).numeric;}

     public short getCanonicalClass(char ch) {return getInfo(ch).canonical;}
     /** Decimal digit value, or {@code Short.MIN_VALUE} when the field is empty. */
     public short getDecimal(char ch) {return getInfo(ch).decimal;}
     /** Digit value, or {@code Short.MIN_VALUE} when the field is empty. */
     public short getDigit(char ch) {return getInfo(ch).digit;}

     public char getUppercase(char ch) {return getInfo(ch).uppercase;}
     public char getLowercase(char ch) {return getInfo(ch).lowercase;}
     public char getTitlecase(char ch) {return getInfo(ch).titlecase;}

     /** Index into CATEGORY_TABLE, or -1 if the category code in the file was not recognized. */
     public byte getCategory(char ch) {return getInfo(ch).category;}
     /** Index into BIDI_TABLE, or -1 if the bidi code in the file was not recognized. */
     public byte getBidiClass(char ch) {return getInfo(ch).bidi;}
     public boolean getMirrored(char ch) {return getInfo(ch).mirrored;}

     /**
      * True when the raw decomposition field is exactly four characters long, i.e. a
      * single 4-digit hex code with no compatibility tag.
      */
     public boolean isDisparaged(char ch) { return getDecomposition(ch).length() == 4; }

     // The isXxx tests below build a bit mask from the category constants.  An
     // unrecognized category (-1) shifts to bit 31 and therefore matches none of them.

     /** True for the categories Lu, Ll, Lt, Lm and Lo. */
     public boolean isLetter(char ch) {
         return (0 != ((1<<getCategory(ch)) &
           ((1<<UPPERCASE_LETTER)
           | (1<<LOWERCASE_LETTER)
           | (1<<TITLECASE_LETTER)
           | (1<<MODIFIER_LETTER)
           | (1<<OTHER_LETTER))));   // was MODIFIER_LETTER a second time, so Lo was missed
     }

     /** True for the categories Mn, Me and Mc. */
     public boolean isMark(char ch) {
         return (0 != ((1<<getCategory(ch)) &
           ((1<<NON_SPACING_MARK)
           | (1<<ENCLOSING_MARK)
           | (1<<COMBINING_SPACING_MARK))));
     }

     /** True for the categories Nd, Nl and No. */
     public boolean isNumber(char ch) {
         return (0 != ((1<<getCategory(ch)) &
           ((1<<DECIMAL_DIGIT_NUMBER)
           | (1<<LETTER_NUMBER)
           | (1<<OTHER_NUMBER))));
     }

     /** True for the categories Zs, Zl and Zp. */
     public boolean isSeparator(char ch) {
         return (0 != ((1<<getCategory(ch)) &
           ((1<<SPACE_SEPARATOR)
           | (1<<LINE_SEPARATOR)
           | (1<<PARAGRAPH_SEPARATOR))));
     }

     /** True for the categories Cc and Cf. */
     public boolean isFormat(char ch) {
         return (0 != ((1<<getCategory(ch)) &
           ((1<<CONTROL)
           | (1<<FORMAT))));
     }

     /** True for the categories Pd, Ps, Pe, Pc, Pi, Pf and Po. */
     public boolean isPunctuation(char ch) {
         return (0 != ((1<<getCategory(ch)) &
           ((1<<DASH_PUNCTUATION)
           | (1<<START_PUNCTUATION)
           | (1<<END_PUNCTUATION)
           | (1<<CONNECTOR_PUNCTUATION)
           // the original repeated START/END here; the quote categories were never tested
           | (1<<INITIAL_QUOTE_PUNCTUATION)
           | (1<<FINAL_QUOTE_PUNCTUATION)
           | (1<<OTHER_PUNCTUATION))));
     }

     /** True for the categories Sm, Sc, Sk and So. */
     public boolean isSymbol(char ch) {
         return (0 != ((1<<getCategory(ch)) &
           ((1<<MATH_SYMBOL)
           | (1<<CURRENCY_SYMBOL)
           | (1<<MODIFIER_SYMBOL)
           | (1<<OTHER_SYMBOL))));
     }

     //
     // Characters excluded from composition.  This will eventually be read from
     // an auxiliary data file, but for now it's hardcoded
     //
     static final String composeExclude =
             "\u0958\u0959\u095a\u095b\u095c\u095d\u095e\u095f"  // Devanagari
         +   "\ufb1f\ufb2a\ufb2b\ufb2c\ufb2d\ufb2e\ufb2f"        // Hebrew
         +   "\ufb30\ufb31\ufb32\ufb33\ufb34\ufb35\ufb36"
         +   "\ufb38\ufb39\ufb3a\ufb3b\ufb3c\ufb3e"
         +   "\ufb40\ufb41\ufb43\ufb44\ufb46\ufb47\ufb48"
         +   "\ufb49\ufb4a\ufb4b\ufb4c\ufb4d\ufb4e"
         ;

     /**
      * Is this character excluded from the composition algorithm by virtue
      * of being listed in the composition exclusion table in Tech Report #15?
      *
      * <p>As implemented, a character is excluded when it has a single-character
      * decomposition, appears in {@code composeExclude}, or its decomposition starts
      * with a character whose canonical class is non-zero.
      */
     public boolean isExcludedComposition(char ch) {
         return isDisparaged(ch)
             || composeExclude.indexOf(ch) >= 0
             || getCanonicalClass(getDecompositionChars(ch).charAt(0)) != 0;
     }


     /** Short names of all characters in {@code s}, joined with ", ". */
     public String getName(String s) {
         return getName(s,true);
     }

     /** Names of all characters in {@code s}, joined with ", ". */
     public String getName(String s, boolean shortVersion) {
         StringBuffer temp = new StringBuffer();
         for (int i = 0; i < s.length(); ++i) {
             if (i != 0) temp.append(", ");
             temp.append(getName(s.charAt(i), shortVersion));
         }
         return temp.toString();
     }

     /**
      * Returns the character name, optionally abbreviated.
      *
      * <p>The short form drops or abbreviates common words (first occurrence only,
      * see {@link #replace}) and lower-cases everything after the first character.
      * A character without a name yields the empty string.
      *
      * @param ch           character to describe
      * @param shortVersion {@code false} returns the name from the data file unchanged
      */
     public String getName(char ch, boolean shortVersion) {
         String result = getName(ch);
         if (!shortVersion) return result;
         result = replace(result,"LETTER ","");
         result = replace(result,"CHARACTER ","");
         result = replace(result,"SIGN ","");
         result = replace(result,"CAPITAL ","UC ");
         if (getCategory(ch) == LOWERCASE_LETTER)
           result = replace(result,"SMALL ","LC ");
         result = replace(result,"COMBINING ","-");
         result = replace(result,"WITH ","");
         result = replace(result,"AND ","");
         result = replace(result,"VARIA","GRAVE");
         result = replace(result,"OXIA","ACUTE");
         result = replace(result,"VRACHY","BREVE");
         result = replace(result,"VERTICAL LINE ABOVE","TONOS");
         result = replace(result,"PSILI","SMOOTH");
         result = replace(result,"DASIA","ROUGH");
         // The longer phrase has to be tried first: once "COMMA ABOVE" has been
         // rewritten, "REVERSED COMMA ABOVE" can no longer match.
         result = replace(result,"REVERSED COMMA ABOVE","ROUGH");
         result = replace(result,"COMMA ABOVE","SMOOTH");
         result = replace(result,"YPOGEGRAMMENI","IOTA-SUB");
         result = replace(result,"PROSGEGRAMMENI","IOTA-AD");
         result = replace(result,"DIALYTIKA","DIAERESIS");
         result = replace(result,"PERISPOMENI","CIRCUMFLEX");
         result = replace(result,"VOICED SOUND MARK","VOICED SIGN");
         // NOTE(review): mapping PROLONGED to "VOICED SIGN" looks like a copy/paste slip - confirm
         result = replace(result,"PROLONGED SOUND MARK","VOICED SIGN");
         result = replace(result,"KATAKANA-HIRAGANA","KANA");
         result = replace(result,"COMPATIBILITY IDEOGRAPH-","");
         result = replace(result,"CHOSEONG","INITIAL");
         result = replace(result,"JUNGSEONG","MEDIAL");
         result = replace(result,"JONGSEONG","FINAL");

         // unassigned characters have an empty name; substring(0,1) would throw
         if (result.length() == 0) return result;

         // fixed locale: names are ASCII and must not depend on the default locale
         return result.substring(0,1)
           + result.substring(1).toLowerCase(Locale.ENGLISH);
     }

     /**
      * Replaces the first occurrence of {@code replacee} in {@code source} with
      * {@code replacer}; returns {@code source} itself when there is no occurrence.
      */
     public String replace(String source,
       String replacee, String replacer) {
         int p = source.indexOf(replacee);
         if (p == -1) return source;
         return source.substring(0,p)
           + replacer
           + source.substring(p+replacee.length(),source.length());
     }

     /** True when {@code s} is one non-mark character followed by one or more marks. */
     public boolean isCCS(String s) {
         if (s.length() < 2) return false;
         if (isMark(s.charAt(0))) return false;
         for (int i = 1; i < s.length(); ++i) {
             if (!isMark(s.charAt(i))) return false;
         }
         return true;
     }

     // combining base sequence := <cat_zero>+ <cat_pos>*
     public boolean isCBS(String s) {
         if (s.length() == 0) return false;
         if (getCanonicalClass(s.charAt(0)) != 0) return false;
         boolean gotGreater = false;
         for (int i = 1; i < s.length(); ++i) {
             if (getCanonicalClass(s.charAt(i)) == 0) {
                 // a class-zero character after a combining one breaks the pattern
                 if (gotGreater) return false;
             } else {
                 gotGreater = true;
             }
         }
         return true;
     }

     /** True when the decomposition field is non-empty and carries no {@code <tag>}. */
     public boolean hasCanonicalDecomposition(char ch) {
         String decomp = getDecomposition(ch);
         return (decomp.length() != 0 && decomp.indexOf('<') == -1);
     }

     /** True when the decomposition field is non-empty and carries a {@code <tag>}. */
     public boolean hasCompatibilityDecomposition(char ch) {
         String decomp = getDecomposition(ch);
         return (decomp.length() != 0 && decomp.indexOf('<') != -1);
     }

     /** True when both strings have the same full decomposition of the requested kind. */
     public boolean isEquivalent(
       String a, String b, boolean canonical) {
         return getFullDecomposition(a, canonical).equals(
           getFullDecomposition(b, canonical));
     }

     // use very dumb algorithm. Don't need lower order one.

     /**
      * Recursively decomposes every character of {@code s} and puts the result into
      * canonical order.
      *
      * @param canonical when true, tagged (compatibility) decompositions are not applied
      */
     public String getFullDecomposition(
       String s, boolean canonical) {
         StringBuffer output = new StringBuffer();
         for (int i = 0; i < s.length(); ++i) {
             getFullDecomp2(s.charAt(i),canonical,output);
         }
         return fixCanonical(output).toString();
     }

     /**
      * Appends the full decomposition of {@code ch} to {@code output}, then reorders
      * the whole buffer (including what was already in it) and returns it.
      */
     public StringBuffer getFullDecomposition(
       char ch, boolean canonical, StringBuffer output) {

         StringBuffer result = getFullDecomp2(ch,canonical,output);
         return fixCanonical(result);
     }

     public String getFullDecomposition(
       char ch, boolean canonical) {
         return getFullDecomposition(ch, canonical, new StringBuffer()).toString();
     }

     /**
      * Sorts combining characters in place: whenever a character with a non-zero
      * canonical class follows one with a larger class, the two are swapped and the
      * scan steps back one position.  Characters of class zero never move backwards.
      */
     public StringBuffer fixCanonical(StringBuffer target) {
         for (int i = 1; i < target.length(); ++i) {
             char ch = target.charAt(i);
             short canClass = getCanonicalClass(ch);
             char chPrev = target.charAt(i-1);
             short canClassPrev = getCanonicalClass(chPrev);
             if (canClass != 0 && canClass < canClassPrev) {
                 target.setCharAt(i-1, ch);
                 target.setCharAt(i, chPrev);
                 if (i > 1) i -= 2; // backup (-1 to compensate for loop)
             }
         }
         return target;
     }

     public String fixCanonical(String source) {
         return fixCanonical(new StringBuffer(source)).toString();
     }


     // ============================================
     //                  PRIVATES
     // ============================================

     /** Parsed fields of one data line; the initializers are the "unassigned" defaults. */
     static class CharData {
         public CharData() {
         }

         String name = "";
         String decomposition = "";
         String name10 = "";
         String comment = "";

         float numeric = Float.MIN_VALUE;     // marker for "field empty"

         short canonical = 0;
         short decimal = Short.MIN_VALUE;     // marker for "field empty"
         short digit = Short.MIN_VALUE;       // marker for "field empty"

         char uppercase;
         char lowercase;
         char titlecase;

         byte category;
         byte bidi = 0;

         boolean mirrored;
     }

     private static final CharData UNASSIGNED_INFO = new CharData();
     private static char cachedChar = 0xFFFF;

     /**
      * Returns the record for {@code ch}, parsing its raw line on first use.
      * Characters with no line, and the character 0xFFFF, get the shared defaults.
      */
     private CharData getInfo(char ch) {
         if (ch == cachedChar) return UNASSIGNED_INFO;
         // remap special ranges: every character of such a range shares the entry
         // stored under the first code of the range
         if (ch >= 0x4E00 && ch < 0xF900) {
             if (ch <= 0x9FA5) ch = 0x4E00;
             else if (ch >= 0xAC00 && ch <= 0xD7A3) ch = 0xAC00;
             else if (ch >= 0xD800 && ch <= 0xDFFF) ch = 0xD800;
             else if (ch >= 0xE000) ch = 0xE000;
         }
         Object value = cache[ch];
         CharData result;
         if (value == null) {
             result = UNASSIGNED_INFO;
         } else if (value instanceof String) {
             // still the raw line: parse it now (updateCache also stores the result)
             result = updateCache((String)value);
         } else {
             result = (CharData)value;
         }
         return result;
     }

     /**
      * Appends the recursive decomposition of {@code ch} to {@code output} without
      * reordering it; the public callers apply {@link #fixCanonical} once at the end.
      */
     private StringBuffer getFullDecomp2(
       char ch, boolean canonical, StringBuffer output) {

         String decomp = getDecomposition(ch);
         if (decomp.length() == 0
           || (canonical && decomp.indexOf('<') != -1)) {
             output.append(ch);
             return output;
         }
         boolean inBrackets = false;
         for (int i = 0; i < decomp.length(); ++i) {
             char c = decomp.charAt(i);
             if (c == '<') inBrackets = true;
             else if (c == '>') inBrackets = false;
             else if (inBrackets) ; // skip
             else if (c == ' ') ; // skip
             else {
                 // every code in the field is written as four hex digits
                 String tempString = decomp.substring(i,i+4);
                 char temp = (char)Integer.parseInt(tempString,16);
                 // Recurse directly.  Going through getFullDecomposition() re-sorted the
                 // whole buffer for each nested character; the callers sort once at the end.
                 getFullDecomp2(temp,canonical,output);
                 i+= 3;
             }
         }
         return output;
     }

     /**
      * Returns the characters listed in the decomposition field of {@code ch}
      * (one level only, tags skipped), or {@code ch} itself when the field is empty.
      */
     public String getDecompositionChars(char ch) {
         StringBuffer output = new StringBuffer();
         String decomp = getDecomposition(ch);
         if (decomp.length() == 0) {
             output.append(ch);
             return output.toString();
         }
         boolean inBrackets = false;
         for (int i = 0; i < decomp.length(); ++i) {
             char c = decomp.charAt(i);
             if (c == '<') inBrackets = true;
             else if (c == '>') inBrackets = false;
             else if (inBrackets) ; // skip
             else if (c == ' ') ; // skip
             else {
                 String tempString = decomp.substring(i,i+4);
                 char temp = (char)Integer.parseInt(tempString,16);
                 output.append(temp);
                 i+= 3;
             }
         }
         return output.toString();
     }

     /**
      * Reads the data file and remembers each non-empty line under the character
      * given by its first four hex digits.  Lines are parsed later, on demand.
      *
      * @param fileName path of a semicolon-separated data file (see field list below)
      * @throws IllegalArgumentException if the file cannot be read or a line does not
      *         start with four hex digits
      */
     public UInfo(String fileName) {
         BufferedReader input = null;
         String line = null;
         try {
             input = new BufferedReader(new FileReader(fileName),64*1024);
             for (int count = 0;;++count) {
                 line = input.readLine();
                 if (line == null) break;
                 if (line.length() == 0) continue;
                 char ch = charFrom(line.substring(0,4));
                 if (DEBUG) if ((count % 100) == 0)
                     System.out.println("[" + count + "," + hex(ch) + ']');
                 cache[ch] = line;
             }
         } catch (Exception ex) {
             ex.printStackTrace();
             throw new IllegalArgumentException("Couldn't read file "
               + ex.getClass().getName() + " " + ex.getMessage()
               + " line = " + line
               );
         } finally {
             // close on success as well as on failure; input is null if the open failed
             if (input != null) {
                 try {
                     input.close();
                 } catch (IOException ignored) {
                     // nothing useful can be done if closing fails
                 }
             }
         }
     }

     public UInfo() {
         this("../src/data/unicode/UnicodeData.txt");
     }

     /*
   0 Code value in 4-digit hexadecimal format.
   1 Unicode 2.1 Character Name. These names match exactly the
   2 General Category. This is a useful breakdown into various "character
   3 Canonical Combining Classes. The classes used for the
   4 Bidirectional Category. See the list below for an explanation of the
   5 Character Decomposition. In the Unicode Standard, not all of
   6 Decimal digit value. This is a numeric field. If the character
   7 Digit value. This is a numeric field. If the character represents a
   8 Numeric value. This is a numeric field. If the character has the
   9 If the characters has been identified as a "mirrored" character in
  10 Unicode 1.0 Name. This is the old name as published in Unicode 1.0.
  11 10646 Comment field. This field is informative.
  12 Upper case equivalent mapping. If a character is part of an
  13 Lower case equivalent mapping. Similar to 12. This field is informative.
  14 Title case equivalent mapping. Similar to 12. This field is informative.
     */

     /**
      * Parses one raw line (fields as listed above), stores the record in the table
      * under the line's own code value and returns it.
      *
      * @throws NumberFormatException if a numeric or hex field is malformed
      */
     private CharData updateCache(String line) {
         try {
             String[] parts = new String[30];
             split(line,';',parts);
             CharData info = new CharData();
             char ch = charFrom(parts[0]);
             info.name = parts[1];
             info.category = (byte)lookup(parts[2], CATEGORY_TABLE);
             info.canonical = shortFrom(parts[3]);
             info.bidi = (byte)lookup(parts[4], BIDI_TABLE);
             info.decomposition = parts[5];
             info.decimal = shortFrom(parts[6]);
             info.digit = shortFrom(parts[7]);
             info.numeric = floatFrom(parts[8]);
             info.mirrored = charFrom(parts[9]) == 'Y';
             info.name10 = parts[10];
             info.comment = parts[11];
             // missing case mappings default to the character itself
             // (title case defaults to the upper case mapping)
             info.uppercase = charFrom(parts[12]);
             if (info.uppercase == 0) info.uppercase = ch;
             info.lowercase = charFrom(parts[13]);
             if (info.lowercase == 0) info.lowercase = ch;
             info.titlecase = charFrom(parts[14]);
             if (info.titlecase == 0) info.titlecase = info.uppercase;
             if (DEBUG) {
                 // only build the comparison string when it is actually printed
                 String trial = hex(ch) + ";" + info;
                 if (!trial.equals(line)) {
                     System.out.println("Difference between:");
                     System.out.println(line);
                     System.out.println(trial);
                 }
             }
             cache[ch] = info;
             return info;
         }
         catch (NumberFormatException e) {
             System.out.println("updateCache: error parsing '" + line + "'");
             throw e;
         }
     }

     private static CharData typeInfo = new CharData();

     // NOTE(review): unused; the lower bound 20 is decimal and was perhaps meant as 0x20 - confirm
     private boolean latin1(char c) {
         return ((c >= 20 && c <= 0x7F) || c > 0xA0);
     }

     private static final String[] YN_TABLE = {"N", "Y"};

     // index in this table == value of the category constants at the top of the class
     private static final String[] CATEGORY_TABLE = {
         "Cn", // = Other, Not Assigned

         "Lu", // = Letter, Uppercase
         "Ll", // = Letter, Lowercase
         "Lt", // = Letter, Titlecase
         "Lm", // = Letter, Modifier
         "Lo", // = Letter, Other

         "Mn", // = Mark, Non-Spacing
         "Me", // = Mark, Enclosing
         "Mc", // = Mark, Spacing Combining

         "Nd", // = Number, Decimal Digit
         "Nl", // = Number, Letter
         "No", // = Number, Other

         "Zs", // = Separator, Space
         "Zl", // = Separator, Line
         "Zp", // = Separator, Paragraph

         "Cc", // = Other, Control
         "Cf", // = Other, Format
         "",   // unused
         "Co", // = Other, Private Use
         "Cs", // = Other, Surrogate


         "Pd", // = Punctuation, Dash
         "Ps", // = Punctuation, Open
         "Pe", // = Punctuation, Close
         "Pc", // = Punctuation, Connector
         "Po", // = Punctuation, Other

         "Sm", // = Symbol, Math
         "Sc", // = Symbol, Currency
         "Sk", // = Symbol, Modifier
         "So", // = Symbol, Other

         "Pi", // = Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
         "Pf", // = Punctuation, Final quote (may behave like Ps or Pe depending on usage)
     };

     private static String[] BIDI_TABLE = {
         "L", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
         "R", // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
         "EN", //    European Number
         "ES", //    European Number Separator
         "ET", //    European Number Terminator
         "AN", //    Arabic Number
         "CS", //    Common Number Separator
         "B", // Block Separator
         "S", // Segment Separator
         "WS", //    Whitespace
         "ON" // Other Neutrals ; All other characters: punctuation, symbols
     };

     /** Parses a decimal short; an empty field yields {@code Short.MIN_VALUE}. */
     private static short shortFrom(String p) {
         if (p.length() == 0) return Short.MIN_VALUE;
         return Short.parseShort(p);
     }

     /**
      * Parses a number or a fraction written as "num/den"; an empty field yields
      * {@code Float.MIN_VALUE}.  A missing numerator or denominator counts as 0, so
      * "1/" produces an infinite result rather than an exception.
      */
     private static float floatFrom(String p) {
         try {
             if (p.length() == 0) return Float.MIN_VALUE;
             int fract = p.indexOf('/');
             if (fract == -1) return Float.valueOf(p).floatValue();
             String q = p.substring(0,fract);
             float num = 0;
             if (q.length() != 0) num = Integer.parseInt(q);
             p = p.substring(fract+1,p.length());
             float den = 0;
             if (p.length() != 0) den = Integer.parseInt(p);
             return num/den;
         }
         catch (NumberFormatException e) {
             System.out.println("floatFrom: error parsing '" + p + "'");
             throw e;
         }
     }

     /**
      * Converts a field to a char: empty gives 0, a single character is returned
      * as is (used for the Y/N flag), anything longer is parsed as hex.
      *
      * @throws NumberFormatException if the hex value is malformed or above 0xFFFF
      */
     private static char charFrom(String p) {
         if (p.length() == 0) return '\u0000';
         else if (p.length() == 1) return p.charAt(0);
         int temp = Integer.parseInt(p, 16);
         if (temp < 0 || temp > 0xFFFF)
             throw new NumberFormatException(
                 "Hex char out of range: " + p);
         return (char)temp;
     }


     // one slot per char: null (no data), the raw String line, or the parsed CharData
     private Object[] cache = new Object[65536];

     //-------------------------------------------------------------------------
     // Static utility methods....
     //-------------------------------------------------------------------------

     /** Upper-case hex form of {@code ch}, left-padded with zeros to four digits. */
     public static String hex(char ch) {
         StringBuffer temp = new StringBuffer();
         return hex(ch, temp).toString();
     }

     /** Four-digit hex form of each character of {@code s}, separated by commas. */
     public static String hex(String s) {
         StringBuffer temp = new StringBuffer();
         return hex(s, temp).toString();
     }

     public static StringBuffer hex(char ch, StringBuffer output) {
         String foo = Integer.toString(ch,16).toUpperCase();
         for (int i = foo.length(); i < 4; ++i) {
             output.append('0');
         }
         output.append(foo);
         return output;
     }

     public static StringBuffer hex(String s, StringBuffer result) {
         for (int i = 0; i < s.length(); ++i) {
             if (i != 0) result.append(',');
             hex(s.charAt(i), result);   // append in place, no temporary string
         }
         return result;
     }

     /**
      * Split a string into pieces based on the given divider character.
      * Unused trailing slots of {@code output} are set to the empty string; the
      * array must have room for every piece.
      */
     private static void split(String s, char divider, String[] output) {
         int last = 0;
         int current = 0;
         int i;
         for (i = 0; i < s.length(); ++i) {
             if (s.charAt(i) == divider) {
                 output[current++] = s.substring(last,i);
                 last = i+1;
             }
         }
         output[current++] = s.substring(last,i);
         while (current < output.length) {
             output[current++] = "";
         }
     }

     /**
      * Look up a given string in a string array.  Returns the index at which the
      * string was found in the array, or -1 if it was not found.
      */
     private static int lookup(String source, String[] target) {
         for (int i = 0; i < target.length; ++i) {
             if (source.equals(target[i])) return i;
         }
         return -1;
     }
 }
	/*
	*******************************************************************************
	* Copyright (C) 1996-2000, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*
	* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UInfo.java,v $
	* $Date: 2000/03/10 04:07:24 $
	* $Revision: 1.3 $
	*
	*****************************************************************************************
	*/

	package com.ibm.text;

	import java.io.*;
	import java.util.*;

	public final class UInfo {
	static final boolean DEBUG = false;
	static final int UINFO_VERSION = 5;

	// these values are aligned with the java.lang.Character constants

	public static final byte
	UNASSIGNED = 0,
	UPPERCASE_LETTER = 1,
	LOWERCASE_LETTER = 2,
	TITLECASE_LETTER = 3,
	MODIFIER_LETTER = 4,
	OTHER_LETTER = 5,
	NON_SPACING_MARK = 6,
	ENCLOSING_MARK = 7,
	COMBINING_SPACING_MARK = 8,
	DECIMAL_DIGIT_NUMBER = 9,
	LETTER_NUMBER = 10,
	OTHER_NUMBER = 11,
	SPACE_SEPARATOR = 12,
	LINE_SEPARATOR = 13,
	PARAGRAPH_SEPARATOR = 14,
	CONTROL = 15,
	FORMAT = 16,
	PRIVATE_USE = 18,
	SURROGATE = 19,
	DASH_PUNCTUATION = 20,
	START_PUNCTUATION = 21,
	END_PUNCTUATION = 22,
	CONNECTOR_PUNCTUATION = 23,
	OTHER_PUNCTUATION = 24,
	MATH_SYMBOL = 25,
	CURRENCY_SYMBOL = 26,
	MODIFIER_SYMBOL = 27,
	OTHER_SYMBOL = 28;

	/** Returns the name field recorded for {@code ch}. */
	public String getName(char ch) {
		return getInfo(ch).name;
	}

	/** Returns the raw decomposition field recorded for {@code ch}. */
	public String getDecomposition(char ch) {
		return getInfo(ch).decomposition;
	}

	/** Returns the {@code name10} field recorded for {@code ch}. */
	public String getName10(char ch) {
		return getInfo(ch).name10;
	}

	/** Returns the comment field recorded for {@code ch}. */
	public String getComment(char ch) {
		return getInfo(ch).comment;
	}

	/** Returns the numeric field recorded for {@code ch}. */
	public float getNumeric(char ch) {
		return getInfo(ch).numeric;
	}

	/** Returns the canonical combining class recorded for {@code ch}. */
	public short getCanonicalClass(char ch) {
		return getInfo(ch).canonical;
	}

	/** Returns the decimal field recorded for {@code ch}. */
	public short getDecimal(char ch) {
		return getInfo(ch).decimal;
	}

	/** Returns the digit field recorded for {@code ch}. */
	public short getDigit(char ch) {
		return getInfo(ch).digit;
	}

	/** Returns the upper case mapping recorded for {@code ch}. */
	public char getUppercase(char ch) {
		return getInfo(ch).uppercase;
	}

	/** Returns the lower case mapping recorded for {@code ch}. */
	public char getLowercase(char ch) {
		return getInfo(ch).lowercase;
	}

	/** Returns the title case mapping recorded for {@code ch}. */
	public char getTitlecase(char ch) {
		return getInfo(ch).titlecase;
	}

	/** Returns the general category code recorded for {@code ch}. */
	public byte getCategory(char ch) {
		return getInfo(ch).category;
	}

	/** Returns the bidi class code recorded for {@code ch}. */
	public byte getBidiClass(char ch) {
		return getInfo(ch).bidi;
	}

	/** Returns the mirrored flag recorded for {@code ch}. */
	public boolean getMirrored(char ch) {
		return getInfo(ch).mirrored;
	}

	/** True when the raw decomposition field of {@code ch} is exactly four characters long. */
	public boolean isDisparaged(char ch) {
		String mapping = getDecomposition(ch);
		return mapping.length() == 4;
	}

	public boolean isLetter(char ch) {
	return (0 != ((1<<getCategory(ch)) &
	((1<<UPPERCASE_LETTER)
	\| (1<<LOWERCASE_LETTER)
	\| (1<<TITLECASE_LETTER)
	\| (1<<MODIFIER_LETTER)
	\| (1<<MODIFIER_LETTER))));
	}

	public boolean isMark(char ch) {
	return (0 != ((1<<getCategory(ch)) &
	((1<<NON_SPACING_MARK)
	\| (1<<ENCLOSING_MARK)
	\| (1<<COMBINING_SPACING_MARK))));
	}

	public boolean isNumber(char ch) {
	return (0 != ((1<<getCategory(ch)) &
	((1<<DECIMAL_DIGIT_NUMBER)
	\| (1<<LETTER_NUMBER)
	\| (1<<OTHER_NUMBER))));
	}

	public boolean isSeparator(char ch) {
	return (0 != ((1<<getCategory(ch)) &
	((1<<SPACE_SEPARATOR)
	\| (1<<LINE_SEPARATOR)
	\| (1<<PARAGRAPH_SEPARATOR))));
	}

	public boolean isFormat(char ch) {
	return (0 != ((1<<getCategory(ch)) &
	((1<<CONTROL)
	\| (1<<FORMAT))));
	}

	public boolean isPunctuation(char ch) {
	return (0 != ((1<<getCategory(ch)) &
	((1<<DASH_PUNCTUATION)
	\| (1<<START_PUNCTUATION)
	\| (1<<END_PUNCTUATION)
	\| (1<<CONNECTOR_PUNCTUATION)
	\| (1<<START_PUNCTUATION)
	\| (1<<END_PUNCTUATION)
	\| (1<<OTHER_PUNCTUATION))));
	}

	public boolean isSymbol(char ch) {
	return (0 != ((1<<getCategory(ch)) &
	((1<<MATH_SYMBOL)
	\| (1<<CURRENCY_SYMBOL)
	\| (1<<MODIFIER_SYMBOL)
	\| (1<<OTHER_SYMBOL))));
	}

	//
	// Characters excluded from composition. This will eventually be read from
	// an auxiliary data file, but for now it's hardcoded
	//
	static final String composeExclude =
	"\u0958\u0959\u095a\u095b\u095c\u095d\u095e\u095f" // Devanagari
	+ "\ufb1f\ufb2a\ufb2b\ufb2c\ufb2d\ufb2e\ufb2f" // Hebrew
	+ "\ufb30\ufb31\ufb32\ufb33\ufb34\ufb35\ufb36"
	+ "\ufb38\ufb39\ufb3a\ufb3b\ufb3c\ufb3e"
	+ "\ufb40\ufb41\ufb43\ufb44\ufb46\ufb47\ufb48"
	+ "\ufb49\ufb4a\ufb4b\ufb4c\ufb4d\ufb4e"
	;

	/**
	* Is this character excluded from the composition algorithm by virtue
	* of being listed in the composition exclusion table in Tech Report #15?
	*/
	public boolean isExcludedComposition(char ch) {
	return isDisparaged(ch)
	\|\| composeExclude.indexOf(ch) >= 0
	\|\| getCanonicalClass(getDecompositionChars(ch).charAt(0)) != 0;
	}



	/** Convenience overload: names of all characters of {@code s} in the short form. */
	public String getName(String s) {
		final boolean useShortForm = true;
		return getName(s, useShortForm);
	}

	/**
	 * Builds a list of the names of every character of {@code s}, in order,
	 * separated by a comma and a space.
	 */
	public String getName(String s, boolean shortVersion) {
		StringBuffer joined = new StringBuffer();
		int pos = 0;
		while (pos < s.length()) {
			if (pos > 0) {
				joined.append(", ");
			}
			String oneName = getName(s.charAt(pos), shortVersion);
			joined.append(oneName);
			pos++;
		}
		return joined.toString();
	}

	/**
	 * Returns the character name, optionally abbreviated.
	 *
	 * The short form drops or abbreviates common words (first occurrence only, as
	 * done by {@code replace}) and lower-cases everything after the first character.
	 * An empty name is returned unchanged.
	 *
	 * @param ch           character to describe
	 * @param shortVersion {@code false} returns the stored name unchanged
	 */
	public String getName(char ch, boolean shortVersion) {
		String result = getName(ch);
		if (!shortVersion) return result;
		result = replace(result,"LETTER ","");
		result = replace(result,"CHARACTER ","");
		result = replace(result,"SIGN ","");
		result = replace(result,"CAPITAL ","UC ");
		if (getCategory(ch) == LOWERCASE_LETTER)
			result = replace(result,"SMALL ","LC ");
		result = replace(result,"COMBINING ","-");
		result = replace(result,"WITH ","");
		result = replace(result,"AND ","");
		result = replace(result,"VARIA","GRAVE");
		result = replace(result,"OXIA","ACUTE");
		result = replace(result,"VRACHY","BREVE");
		result = replace(result,"VERTICAL LINE ABOVE","TONOS");
		result = replace(result,"PSILI","SMOOTH");
		result = replace(result,"DASIA","ROUGH");
		// The longer phrase has to be tried first: once "COMMA ABOVE" has been
		// rewritten, "REVERSED COMMA ABOVE" can no longer match.
		result = replace(result,"REVERSED COMMA ABOVE","ROUGH");
		result = replace(result,"COMMA ABOVE","SMOOTH");
		result = replace(result,"YPOGEGRAMMENI","IOTA-SUB");
		result = replace(result,"PROSGEGRAMMENI","IOTA-AD");
		result = replace(result,"DIALYTIKA","DIAERESIS");
		result = replace(result,"PERISPOMENI","CIRCUMFLEX");
		result = replace(result,"VOICED SOUND MARK","VOICED SIGN");
		// NOTE(review): mapping PROLONGED to "VOICED SIGN" looks like a copy/paste slip - confirm
		result = replace(result,"PROLONGED SOUND MARK","VOICED SIGN");
		result = replace(result,"KATAKANA-HIRAGANA","KANA");
		result = replace(result,"COMPATIBILITY IDEOGRAPH-","");
		result = replace(result,"CHOSEONG","INITIAL");
		result = replace(result,"JUNGSEONG","MEDIAL");
		result = replace(result,"JONGSEONG","FINAL");

		// an empty name would make substring(0,1) throw
		if (result.length() == 0) return result;

		// fixed locale so the result does not depend on the default locale
		return result.substring(0,1)
			+ result.substring(1).toLowerCase(Locale.ENGLISH);
	}

	/**
	 * Substitutes {@code replacer} for the first occurrence of {@code replacee}
	 * in {@code source}.  When there is no occurrence the same {@code source}
	 * object is handed back.
	 */
	public String replace(String source,
		String replacee, String replacer) {
		final int at = source.indexOf(replacee);
		if (at == -1) {
			return source;
		}
		String head = source.substring(0, at);
		String tail = source.substring(at + replacee.length(), source.length());
		return head + replacer + tail;
	}

	/**
	 * True when {@code s} has at least two characters, the first is not a mark
	 * and all following characters are marks.
	 */
	public boolean isCCS(String s) {
		final int len = s.length();
		if (len < 2 || isMark(s.charAt(0))) {
			return false;
		}
		int pos = 1;
		while (pos < len && isMark(s.charAt(pos))) {
			pos++;
		}
		// reaching the end means no non-mark was met after the first character
		return pos == len;
	}

	// combining base sequence := <cat_zero>+ <cat_pos>*
	// i.e. a run of canonical-class-zero characters followed only by characters
	// of non-zero class; the empty string does not qualify.
	public boolean isCBS(String s) {
		final int len = s.length();
		if (len == 0 || getCanonicalClass(s.charAt(0)) != 0) {
			return false;
		}
		boolean seenNonZero = false;
		for (int pos = 1; pos < len; pos++) {
			boolean classZero = getCanonicalClass(s.charAt(pos)) == 0;
			if (!classZero) {
				seenNonZero = true;
			} else if (seenNonZero) {
				// class zero after a non-zero class breaks the pattern
				return false;
			}
		}
		return true;
	}

	/** True when the decomposition field of {@code ch} is non-empty and contains no '<'. */
	public boolean hasCanonicalDecomposition(char ch) {
		String mapping = getDecomposition(ch);
		if (mapping.length() == 0) {
			return false;
		}
		return mapping.indexOf('<') == -1;
	}

	/** True when the decomposition field of {@code ch} is non-empty and contains a '<'. */
	public boolean hasCompatibilityDecomposition(char ch) {
		String mapping = getDecomposition(ch);
		if (mapping.length() == 0) {
			return false;
		}
		return mapping.indexOf('<') != -1;
	}

	/** True when {@code a} and {@code b} have equal full decompositions of the requested kind. */
	public boolean isEquivalent(
		String a, String b, boolean canonical) {
		String left = getFullDecomposition(a, canonical);
		String right = getFullDecomposition(b, canonical);
		return left.equals(right);
	}

	// use very dumb algorithm. Don't need lower order one.

	/**
	 * Decomposes every character of {@code s} into one buffer, then reorders the
	 * buffer with {@code fixCanonical} and returns it as a string.
	 */
	public String getFullDecomposition(
		String s, boolean canonical) {
		StringBuffer pieces = new StringBuffer();
		int pos = 0;
		while (pos < s.length()) {
			getFullDecomp2(s.charAt(pos), canonical, pieces);
			pos++;
		}
		StringBuffer ordered = fixCanonical(pieces);
		return ordered.toString();
	}

	/**
	 * Appends the decomposition of {@code ch} to {@code output} and then reorders
	 * the buffer with {@code fixCanonical}; the reordered buffer is returned.
	 */
	public StringBuffer getFullDecomposition(
		char ch, boolean canonical, StringBuffer output) {
		return fixCanonical(getFullDecomp2(ch, canonical, output));
	}

	/** Full decomposition of a single character, returned as a new string. */
	public String getFullDecomposition(
		char ch, boolean canonical) {
		StringBuffer scratch = new StringBuffer();
		StringBuffer filled = getFullDecomposition(ch, canonical, scratch);
		return filled.toString();
	}

	/**
	 * Reorders {@code target} in place by swapping adjacent characters: a character
	 * with a non-zero canonical class is moved in front of its predecessor when the
	 * predecessor's class is larger.  After a swap the scan steps back one position
	 * (unless it is already at index 1).  Returns {@code target}.
	 */
	public StringBuffer fixCanonical(StringBuffer target) {
		int pos = 1;
		while (pos < target.length()) {
			char current = target.charAt(pos);
			char before = target.charAt(pos - 1);
			short currentClass = getCanonicalClass(current);
			short beforeClass = getCanonicalClass(before);
			boolean outOfOrder = currentClass != 0 && currentClass < beforeClass;
			if (outOfOrder) {
				target.setCharAt(pos - 1, current);
				target.setCharAt(pos, before);
				if (pos > 1) {
					// re-check the moved character against its new predecessor
					pos--;
					continue;
				}
			}
			pos++;
		}
		return target;
	}

	/** String flavour of {@code fixCanonical}: works on a copy and returns the reordered text. */
	public String fixCanonical(String source) {
		StringBuffer copy = new StringBuffer(source);
		fixCanonical(copy);   // reorders the buffer in place and returns the same object
		return copy.toString();
	}


	// ============================================
	// PRIVATES
	// ============================================

	/**
	 * Plain record of the properties parsed for one character. Every field
	 * starts out with the value a freshly constructed instance carries: empty
	 * strings, {@code Short.MIN_VALUE} / {@code Float.MIN_VALUE} for the
	 * decimal, digit and numeric values, and zero / {@code false} for the rest.
	 */
	static class CharData {
	    public CharData() {
	    }

	    // textual properties
	    String name = "";
	    String name10 = "";
	    String comment = "";
	    String decomposition = "";

	    // numeric properties
	    short canonical;
	    short decimal = Short.MIN_VALUE;
	    short digit = Short.MIN_VALUE;
	    float numeric = Float.MIN_VALUE;

	    // case mappings
	    char uppercase;
	    char lowercase;
	    char titlecase;

	    // classification
	    byte category;
	    byte bidi;
	    boolean mirrored;
	}

	// Shared default record: getInfo returns it when the cache holds no entry
	// for a character, and for the character equal to cachedChar.
	private static final CharData UNASSIGNED_INFO = new CharData();
	// Character for which getInfo short-circuits to UNASSIGNED_INFO. It is
	// initialised to 0xFFFF and not written anywhere in the code visible here.
	private static char cachedChar = 0xFFFF;

	/**
	 * Fetches the property record for a character, parsing its raw data line
	 * on first use.
	 *
	 * Characters in U+4E00..U+9FA5, U+AC00..U+D7A3, U+D800..U+DFFF and
	 * U+E000..U+F8FF are looked up under the first code point of their range.
	 * NOTE(review): these look like the CJK ideograph, Hangul syllable,
	 * surrogate and private-use ranges, presumably listed in the data file by
	 * a single representative entry; confirm against the data.
	 *
	 * @param ch the character to look up
	 * @return the cached or freshly parsed record, or the shared unassigned
	 *         record when nothing is stored for the character
	 */
	private CharData getInfo(char ch) {
	    if (ch == cachedChar) {
	        return UNASSIGNED_INFO;
	    }
	    // collapse the special ranges onto one cache slot each
	    char key = ch;
	    if (0x4E00 <= ch && ch < 0xF900) {
	        if (ch <= 0x9FA5) {
	            key = 0x4E00;
	        } else if (0xAC00 <= ch && ch <= 0xD7A3) {
	            key = 0xAC00;
	        } else if (0xD800 <= ch && ch <= 0xDFFF) {
	            key = 0xD800;
	        } else if (ch >= 0xE000) {
	            key = 0xE000;
	        }
	    }
	    Object entry = cache[key];
	    if (entry == null) {
	        return UNASSIGNED_INFO;
	    }
	    if (entry instanceof String) {
	        // still the raw line: parse it now (updateCache stores the result)
	        return updateCache((String) entry);
	    }
	    return (CharData) entry;
	}

	private StringBuffer getFullDecomp2(
	char ch, boolean canonical, StringBuffer output) {

	String decomp = getDecomposition(ch);
	if (decomp.length() == 0
	\|\| (canonical && decomp.indexOf('<') != -1)) {
	output.append(ch);
	return output;
	}
	boolean inBrackets = false;
	for (int i = 0; i < decomp.length(); ++i) {
	char c = decomp.charAt(i);
	if (c == '<') inBrackets = true;
	else if (c == '>') inBrackets = false;
	else if (inBrackets) ; // skip
	else if (c == ' ') ; // skip
	else {
	String tempString = decomp.substring(i,i+4);
	char temp = (char)Integer.parseInt(tempString,16);
	getFullDecomposition(temp,canonical,output);
	i+= 3;
	}
	}
	return output;
	}

	/**
	 * Returns the characters listed in the one-level decomposition mapping of
	 * {@code ch}, without expanding them further. Text inside {@code <...>}
	 * and spaces are skipped; every other position is read as a 4-digit hex
	 * code. A character without a mapping is returned as itself.
	 *
	 * @param ch the character to query
	 * @return the mapped characters, or {@code ch} alone if there is no mapping
	 */
	public String getDecompositionChars(char ch) {
	    String mapping = getDecomposition(ch);
	    if (mapping.length() == 0) {
	        return String.valueOf(ch);
	    }
	    StringBuffer collected = new StringBuffer();
	    boolean insideTag = false;
	    for (int pos = 0; pos < mapping.length(); ++pos) {
	        char c = mapping.charAt(pos);
	        if (c == '<') {
	            insideTag = true;
	            continue;
	        }
	        if (c == '>') {
	            insideTag = false;
	            continue;
	        }
	        if (insideTag || c == ' ') {
	            continue;
	        }
	        String digits = mapping.substring(pos, pos + 4);
	        collected.append((char) Integer.parseInt(digits, 16));
	        pos += 3; // the loop increment consumes the fourth digit
	    }
	    return collected.toString();
	}

	/**
	 * Loads the raw data lines from {@code fileName}. Each non-empty line is
	 * stored unparsed in the cache under the character given by its first
	 * four characters (read as hex); parsing is left to the first lookup.
	 * The reader is closed whether or not loading succeeds.
	 *
	 * @param fileName path of the data file to read
	 * @throws IllegalArgumentException if the file cannot be opened or read,
	 *         or a line does not start with a valid character code; the
	 *         message names the underlying exception and the offending line
	 */
	public UInfo(String fileName) {
	    BufferedReader input = null;
	    String line = null;
	    try {
	        input = new BufferedReader(new FileReader(fileName),64*1024);
	        for (int count = 0;;++count) {
	            line = input.readLine();
	            if (line == null) break;
	            if (line.length() == 0) continue;
	            char ch = charFrom(line.substring(0,4));
	            if (DEBUG && (count % 100) == 0) {
	                System.out.println("[" + count + "," + hex(ch) + ']');
	            }
	            cache[ch] = line;
	        }
	    } catch (Exception ex) {
	        ex.printStackTrace();
	        throw new IllegalArgumentException("Couldn't read file "
	            + ex.getClass().getName() + " " + ex.getMessage()
	            + " line = " + line
	            );
	    } finally {
	        // previously the reader was closed only on the failure path
	        if (input != null) {
	            try {
	                input.close();
	            } catch (IOException ignored) {
	                // nothing useful can be done if closing a read-only stream fails
	            }
	        }
	    }
	}

	/**
	 * Creates an instance that loads its data from the default relative path
	 * "../src/data/unicode/UnicodeData.txt" by delegating to
	 * {@link #UInfo(String)}.
	 */
	public UInfo() {
	this("../src/data/unicode/UnicodeData.txt");
	}

	/*
	Layout of one data line: fields are separated by ';' and numbered from 0.
	The descriptions below are abbreviated; "..." marks text that was cut off.
	0 Code value in 4-digit hexadecimal format.
	1 Unicode 2.1 Character Name. These names match exactly the ...
	2 General Category. This is a useful breakdown into various "character ...
	3 Canonical Combining Classes. The classes used for the ...
	4 Bidirectional Category. See the list below for an explanation of the ...
	5 Character Decomposition. In the Unicode Standard, not all of ...
	6 Decimal digit value. This is a numeric field. If the character ...
	7 Digit value. This is a numeric field. If the character represents a ...
	8 Numeric value. This is a numeric field. If the character has the ...
	9 If the character has been identified as a "mirrored" character in ...
	10 Unicode 1.0 Name. This is the old name as published in Unicode 1.0.
	11 10646 Comment field. This field is informative.
	12 Upper case equivalent mapping. If a character is part of an ...
	13 Lower case equivalent mapping. Similar to 12. This field is informative.
	14 Title case equivalent mapping. Similar to 12. This field is informative.
	*/

	/**
	 * Parses one raw data line into a {@code CharData} record, stores the
	 * record in the cache under the line's character code (replacing the raw
	 * line) and returns it.
	 *
	 * The line is split on ';' into up to 30 fields, of which fields 0-14 are
	 * used. Missing upper- and lowercase mappings default to the character
	 * itself; a missing titlecase mapping defaults to the uppercase mapping.
	 * A category or bidi code not found in its table is stored as -1.
	 *
	 * @param line the raw ';'-separated data line
	 * @return the parsed record
	 * @throws NumberFormatException if a numeric or hex field cannot be
	 *         parsed; the offending line is printed to System.out first
	 */
	private CharData updateCache(String line) {
	    try {
	        String[] parts = new String[30];
	        split(line,';',parts);
	        CharData info = new CharData();
	        char ch = charFrom(parts[0]);
	        info.name = parts[1];
	        info.category = (byte)lookup(parts[2], CATEGORY_TABLE);
	        info.canonical = shortFrom(parts[3]);
	        info.bidi = (byte)lookup(parts[4], BIDI_TABLE);
	        info.decomposition = parts[5];
	        info.decimal = shortFrom(parts[6]);
	        info.digit = shortFrom(parts[7]);
	        info.numeric = floatFrom(parts[8]);
	        info.mirrored = charFrom(parts[9]) == 'Y';
	        info.name10 = parts[10];
	        info.comment = parts[11];
	        info.uppercase = charFrom(parts[12]);
	        if (info.uppercase == 0) info.uppercase = ch;
	        info.lowercase = charFrom(parts[13]);
	        if (info.lowercase == 0) info.lowercase = ch;
	        info.titlecase = charFrom(parts[14]);
	        if (info.titlecase == 0) info.titlecase = info.uppercase;
	        if (DEBUG) {
	            // round-trip check; built only when debugging so that normal
	            // lookups do not pay for the string concatenation
	            String trial = hex(ch) + ";" + info;
	            if (!trial.equals(line)) {
	                System.out.println("Difference between:");
	                System.out.println(line);
	                System.out.println(trial);
	            }
	        }
	        cache[ch] = info;
	        return info;
	    }
	    catch (NumberFormatException e) {
	        System.out.println("updateCache: error parsing '" + line + "'");
	        throw e;
	    }
	}

	// Spare CharData instance; it is not referenced by any code visible in
	// this part of the file. NOTE(review): check the rest of the class before removing.
	private static CharData typeInfo = new CharData();

	private boolean latin1(char c) {
	return ((c >= 20 && c <= 0x7F) \|\| c > 0xA0);
	}

	// "N"/"Y" strings indexed 0/1. Not referenced by the code visible in this
	// part of the file; presumably used to print boolean fields such as the
	// mirrored flag (confirm against the rest of the class).
	private static final String[] YN_TABLE = {"N", "Y"};

	// Two-letter general category codes. updateCache looks up field 2 of a data
	// line here and stores the index of the match as CharData.category.
	// Indices 0-28 line up with the byte constants declared at the top of the
	// class (UNASSIGNED = 0 ... OTHER_SYMBOL = 28); slot 17 is an empty
	// placeholder because no constant has that value. "Pi" and "Pf" take
	// indices 29 and 30.
	private static final String[] CATEGORY_TABLE = {
	"Cn", // = Other, Not Assigned

	"Lu", // = Letter, Uppercase
	"Ll", // = Letter, Lowercase
	"Lt", // = Letter, Titlecase
	"Lm", // = Letter, Modifier
	"Lo", // = Letter, Other

	"Mn", // = Mark, Non-Spacing
	"Me", // = Mark, Enclosing
	"Mc", // = Mark, Spacing Combining

	"Nd", // = Number, Decimal Digit
	"Nl", // = Number, Letter
	"No", // = Number, Other

	"Zs", // = Separator, Space
	"Zl", // = Separator, Line
	"Zp", // = Separator, Paragraph

	"Cc", // = Other, Control
	"Cf", // = Other, Format
	"", // unused
	"Co", // = Other, Private Use
	"Cs", // = Other, Surrogate


	"Pd", // = Punctuation, Dash
	"Ps", // = Punctuation, Open
	"Pe", // = Punctuation, Close
	"Pc", // = Punctuation, Connector
	"Po", // = Punctuation, Other

	"Sm", // = Symbol, Math
	"Sc", // = Symbol, Currency
	"Sk", // = Symbol, Modifier
	"So", // = Symbol, Other

	"Pi", // = Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
	"Pf", // = Punctuation, Final quote (may behave like Ps or Pe depending on usage)
	};

	// Bidirectional category codes. updateCache looks up field 4 of a data line
	// here and stores the index of the match as CharData.bidi; a code that is
	// not listed ends up as -1, the "not found" result of lookup.
	private static String[] BIDI_TABLE = {
	"L", // Left-Right; Most alphabetic, syllabic, and logographic characters (e.g., CJK ideographs)
	"R", // Right-Left; Arabic, Hebrew, and punctuation specific to those scripts
	"EN", // European Number
	"ES", // European Number Separator
	"ET", // European Number Terminator
	"AN", // Arabic Number
	"CS", // Common Number Separator
	"B", // Block Separator
	"S", // Segment Separator
	"WS", // Whitespace
	"ON" // Other Neutrals ; All other characters: punctuation, symbols
	};

	/**
	 * Parses a decimal field into a {@code short}.
	 *
	 * @param p the field text
	 * @return the parsed value, or {@code Short.MIN_VALUE} when the field is empty
	 * @throws NumberFormatException if a non-empty field is not a valid short
	 */
	private static short shortFrom(String p) {
	    return p.length() == 0 ? Short.MIN_VALUE : Short.parseShort(p);
	}

	/**
	 * Parses a numeric field that is either a plain number or a fraction
	 * written as {@code numerator/denominator}.
	 *
	 * An empty field yields {@code Float.MIN_VALUE}. In a fraction, a missing
	 * numerator or denominator counts as 0, and the division is carried out
	 * in floating point (so a missing denominator does not throw).
	 *
	 * @param p the field text
	 * @return the parsed value
	 * @throws NumberFormatException if a part cannot be parsed; the text being
	 *         parsed at that moment is printed to System.out first
	 */
	private static float floatFrom(String p) {
	    try {
	        if (p.length() == 0) {
	            return Float.MIN_VALUE;
	        }
	        int slash = p.indexOf('/');
	        if (slash < 0) {
	            return Float.valueOf(p).floatValue();
	        }
	        String numeratorText = p.substring(0, slash);
	        float numerator = (numeratorText.length() == 0) ? 0 : Integer.parseInt(numeratorText);
	        // p is narrowed to the denominator text; the error message below
	        // reports whatever p holds when parsing fails
	        p = p.substring(slash + 1);
	        float denominator = (p.length() == 0) ? 0 : Integer.parseInt(p);
	        return numerator / denominator;
	    }
	    catch (NumberFormatException e) {
	        System.out.println("floatFrom: error parsing '" + p + "'");
	        throw e;
	    }
	}

	private static char charFrom(String p) {
	if (p.length() == 0) return '\u0000';
	else if (p.length() == 1) return p.charAt(0);
	int temp = Integer.parseInt(p, 16);
	if (temp < 0 \|\| temp > 0xFFFF)
	throw new NumberFormatException(
	"Hex char out of range: " + p);
	return (char)temp;
	}


	// One slot per char value. The constructor stores the raw data line (a
	// String) for each character it reads; updateCache replaces that line with
	// the parsed CharData on first lookup. A null slot means no data was loaded.
	private Object[] cache = new Object[65536];

	//-------------------------------------------------------------------------
	// Static utility methods....
	//-------------------------------------------------------------------------
	/**
	 * Formats a character with the buffer-based {@code hex} overload and
	 * returns the text.
	 *
	 * @param ch the character to format
	 * @return the formatted character
	 */
	public static String hex(char ch) {
	    return hex(ch, new StringBuffer()).toString();
	}

	/**
	 * Formats every character of a string with the buffer-based {@code hex}
	 * overload and returns the text.
	 *
	 * @param s the string to format
	 * @return the formatted string
	 */
	public static String hex(String s) {
	    return hex(s, new StringBuffer()).toString();
	}

	/**
	 * Appends the upper-case hexadecimal value of {@code ch}, left-padded
	 * with zeros to four digits.
	 *
	 * @param ch the character to format
	 * @param output buffer to append to
	 * @return {@code output}
	 */
	public static StringBuffer hex(char ch, StringBuffer output) {
	    String digits = Integer.toString(ch, 16).toUpperCase();
	    for (int pad = 4 - digits.length(); pad > 0; --pad) {
	        output.append('0');
	    }
	    return output.append(digits);
	}

	/**
	 * Appends the hex form of every character of {@code s}, separated by
	 * commas.
	 *
	 * @param s the string to format
	 * @param result buffer to append to
	 * @return {@code result}
	 */
	public static StringBuffer hex(String s, StringBuffer result) {
	    final int length = s.length();
	    int pos = 0;
	    while (pos < length) {
	        if (pos > 0) {
	            result.append(',');
	        }
	        result.append(hex(s.charAt(pos)));
	        ++pos;
	    }
	    return result;
	}

	/**
	 * Cuts {@code s} at every occurrence of {@code divider} and writes the
	 * pieces into {@code output} from index 0 onwards; all remaining slots
	 * are set to the empty string. Adjacent dividers produce empty pieces.
	 * The array must have room for every piece: no bounds check is made here.
	 *
	 * @param s the text to split
	 * @param divider the separator character
	 * @param output receives the pieces, padded with "" to its full length
	 */
	private static void split(String s, char divider, String[] output) {
	    int slot = 0;
	    int start = 0;
	    int hit = s.indexOf(divider, start);
	    while (hit >= 0) {
	        output[slot++] = s.substring(start, hit);
	        start = hit + 1;
	        hit = s.indexOf(divider, start);
	    }
	    // whatever follows the last divider (possibly nothing)
	    output[slot++] = s.substring(start);
	    for (; slot < output.length; ++slot) {
	        output[slot] = "";
	    }
	}

	/**
	 * Finds the first position of {@code source} in {@code target}.
	 *
	 * @param source the string to search for
	 * @param target the array to search
	 * @return the index of the first equal element, or -1 if there is none
	 */
	private static int lookup(String source, String[] target) {
	    int index = 0;
	    while (index < target.length) {
	        if (source.equals(target[index])) {
	            return index;
	        }
	        ++index;
	    }
	    return -1;
	}
	}