src/com/ibm/icu/impl/UCharacterProperty.java - external/github.com/unicode-org/icu - Git at Google

 /**
 *******************************************************************************
 * Copyright (C) 1996-2003, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * $Source:
 *         /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterPropertyDB.java $
 * $Date: 2003/12/17 04:56:04 $
 * $Revision: 1.35 $
 *
 *******************************************************************************
 */

 package com.ibm.icu.impl;

 import java.io.InputStream;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.util.Locale;

 import com.ibm.icu.util.RangeValueIterator;
 import com.ibm.icu.util.VersionInfo;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterCategory;
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.*;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.BreakIterator;

 /**
 * <p>Internal class used for Unicode character property database.</p>
 * <p>This classes store binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, array of char is used
 * to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into more meaning form lies on
 * <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 * @draft 2.1
 */

 public final class UCharacterProperty implements Trie.DataManipulate
 {
 	// public data members -----------------------------------------------

 	/**
     * Trie data
     */
     public CharTrie m_trie_;
     /**
      * Optimization
      * CharTrie index array
      */
     public char[] m_trieIndex_;
     /**
      * Optimization
      * CharTrie data array
      */
     public char[] m_trieData_;
     /**
      * Optimization
      * CharTrie data offset
      */
     public int m_trieInitialValue_;
     /**
     * Character property table
     */
     public int m_property_[];
     /**
     * Unicode version
     */
     public VersionInfo m_unicodeVersion_;
     /**
      * Exception indicator for uppercase type
      */
     public static final int EXC_UPPERCASE_ = 0;
     /**
      * Exception indicator for lowercase type
      */
     public static final int EXC_LOWERCASE_ = 1;
     /**
      * Exception indicator for titlecase type
      */
     public static final int EXC_TITLECASE_ = 2;
     /**
      * Exception indicator for digit type
      */
     public static final int EXC_UNUSED_ = 3;
     /**
      * Exception indicator for numeric type
      */
     public static final int EXC_NUMERIC_VALUE_ = 4;
     /**
      * Exception indicator for denominator type
      */
     public static final int EXC_DENOMINATOR_VALUE_ = 5;
     /**
      * Exception indicator for mirror type
      */
     public static final int EXC_MIRROR_MAPPING_ = 6;
     /**
      * Exception indicator for special casing type
      */
     public static final int EXC_SPECIAL_CASING_ = 7;
     /**
      * Exception indicator for case folding type
      */
     public static final int EXC_CASE_FOLDING_ = 8;
     /**
      * EXC_COMBINING_CLASS_ is not found in ICU.
      * Used to retrieve the combining class of the character in the exception
      * value
      */
     public static final int EXC_COMBINING_CLASS_ = 9;
     /**
      * Maximum number of expansion for a case mapping
      */
     public static final int MAX_CASE_MAP_SIZE = 10;
     /**
     * Turkish ISO 639 2 character code
     */
     public static final String TURKISH_ = "tr";
     /**
     * Azerbaijani ISO 639 2 character code
     */
     public static final String AZERBAIJANI_ = "az";
     /**
     * Lithuanian ISO 639 2 character code
     */
     public static final String LITHUANIAN_ = "lt";
     /**
     * Latin capital letter i with dot above
     */
     public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
     /**
     * Latin small letter i with dot above
     */
     public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
     /**
     * Latin lowercase i
     */
     public static final char LATIN_SMALL_LETTER_I_ = 0x69;
     /**
     * Character type mask
     */
     public static final int TYPE_MASK = 0x1F;
     /**
     * Exception test mask
     */
     public static final int EXCEPTION_MASK = 0x20;
     /**
     * Mirror test mask
     */
     public static final int MIRROR_MASK = 1 << 11;

     // public methods ----------------------------------------------------

     /**
      * Java friends implementation
      */
     public void setIndexData(CharTrie.FriendAgent friendagent)
     {
         m_trieIndex_ = friendagent.getPrivateIndex();
         m_trieData_ = friendagent.getPrivateData();
         m_trieInitialValue_ = friendagent.getPrivateInitialValue();
     }

     /**
     * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
     * data the index array offset of the indexes for that lead surrogate.
     * @param property data value for a surrogate from the trie, including the
     *        folding offset
     * @return data offset or 0 if there is no data for the lead surrogate
     */
     public int getFoldingOffset(int value)
     {
         if ((value & SUPPLEMENTARY_FOLD_INDICATOR_MASK_) != 0) {
             return (value & SUPPLEMENTARY_FOLD_OFFSET_MASK_);
         }
         else {
             return 0;
         }
     }

     /**
     * Gets the property value at the index.
     * This is optimized.
     * Note this is alittle different from CharTrie the index m_trieData_
     * is never negative.
     * @param ch code point whose property value is to be retrieved
     * @return property value of code point
     */
     public int getProperty(int ch)
     {
         if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
             || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
                 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
             // BMP codepoint
             // optimized
             try {
                 return m_property_[
                     m_trieData_[
                     (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
                           << Trie.INDEX_STAGE_2_SHIFT_)
                     + (ch & Trie.INDEX_STAGE_3_MASK_)]];
             } catch (ArrayIndexOutOfBoundsException e) {
                 return m_property_[m_trieInitialValue_];
             }
         }
         if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
             return m_property_[
                     m_trieData_[
                     (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
                                   + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
                           << Trie.INDEX_STAGE_2_SHIFT_)
                     + (ch & Trie.INDEX_STAGE_3_MASK_)]];
         }
         // for optimization
         if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
             // look at the construction of supplementary characters
             // trail forms the ends of it.
             return m_property_[m_trie_.getSurrogateValue(
                                           UTF16.getLeadSurrogate(ch),
                                           (char)(ch & Trie.SURROGATE_MASK_))];
         }
         // return m_dataOffset_ if there is an error, in this case we return
         // the default value: m_initialValue_
         // we cannot assume that m_initialValue_ is at offset 0
         // this is for optimization.
         return m_property_[m_trieInitialValue_];
         // return m_property_[m_trie_.getCodePointValue(ch)];
     }

     /**
     * Getting the signed numeric value of a character embedded in the property
     * argument
     * @param prop the character
     * @return signed numberic value
     */
     public static int getSignedValue(int prop)
     {
         return (prop >> VALUE_SHIFT_);
     }

     /**
     * Getting the exception index for argument property
     * @param prop character property
     * @return exception index
     */
     public static int getExceptionIndex(int prop)
     {
         return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
     }

     /**
     * Getting the unsigned numeric value of a character embedded in the property
     * argument
     * @param prop the character
     * @return unsigned numberic value
     */
     ///CLOVER:OFF
     public static int getUnsignedValue(int prop)
     {
         return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
     }
     ///CLOVER:ON

     /**
     * Determines if the exception value passed in has the kind of information
     * which the indicator wants, e.g if the exception value contains the digit
     * value of the character
     * @param index exception index
     * @param indicator type indicator
     * @return true if type value exist
     */
     public boolean hasExceptionValue(int index, int indicator)
     {
         return (m_exception_[index] & (1 << indicator)) != 0;
     }

     /**
     * Gets the exception value for the argument properties, assuming that data
     * type is available. -1 is returned if data is not available.
     * Different from getException, this function tests if the type data is
     * available.
     * @param props property value
     * @param exception data type
     * @return exception data type value at index
     */
     ///CLOVER:OFF
     public int getExceptionValue(int props, int etype)
     {
         int index = getExceptionIndex(props);
         if (hasExceptionValue(index, etype)) {
             // contained in exception data
             // return getException(index, etype);
             if (etype == EXC_COMBINING_CLASS_) {
                 return m_exception_[index];
             }
             // contained in the exception digit address
             index = addExceptionOffset(m_exception_[index], etype, ++ index);
             return m_exception_[index];
         }
         return -1;
     }
     ///CLOVER:ON

     /**
     * Gets the exception value at the index, assuming that data type is
     * available. Result is undefined if data is not available. Use
     * hasExceptionValue() to determine data's availability.
     * @param index
     * @param exception data type
     * @return exception data type value at index
     */
     public int getException(int index, int etype)
     {
         // contained in exception data
         if (etype == EXC_COMBINING_CLASS_) {
             return m_exception_[index];
         }
         // contained in the exception digit address
         index = addExceptionOffset(m_exception_[index], etype, ++ index);
         return m_exception_[index];
     }

     /**
     * Gets the folded case value at the index
     * @param index of the case value to be retrieved
     * @return folded case value at index
     */
     /*
      * Issue for canonical caseless match (UAX #21):
      * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
      * canonical equivalence, unlike default-option casefolding.
      * For example, I-grave and I + grave fold to strings that are not canonically
      * equivalent.
      * For more details, see the comment in unorm_compare() in unorm.cpp
      * and the intermediate prototype changes for Jitterbug 2021.
      * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
      *
      * This did not get fixed because it appears that it is not possible to fix
      * it for uppercase and lowercase characters (I-grave vs. i-grave)
      * together in a way that they still fold to common result strings.
      */

     public int getFoldCase(int index)
     {
         char single = m_case_[index];
         if (UTF16.LEAD_SURROGATE_MIN_VALUE <= single &&
             single <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
         // Convert the UTF-16 surrogate pair if necessary.
         // For simplicity in usage, and because the frequency of pairs is low,
         // look both directions.

 	        if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 	            char trail = m_case_[index + 1];
 	            if (UTF16.LEAD_SURROGATE_MIN_VALUE <= trail &&
 		            trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
 	                return getRawSupplementary(single, trail);
 	            }
 	        }
 	        else
 	        {
 	            char lead = m_case_[index - 1];
 	            if (UTF16.LEAD_SURROGATE_MIN_VALUE <= lead &&
 		            lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 	                return getRawSupplementary(lead, single);
 	            }
 	        }
 	    }
 	    return single;
     }

     /**
     * Gets the folded case value at the index
     * @param index of the case value to be retrieved
     * @param count number of characters to retrieve
     * @param buffer string buffer to add result to
     */
     public void getFoldCase(int index, int count, StringBuffer str)
     {
         // first 2 chars are for the simple mappings
         index += 2;
         while (count > 0) {
         	str.append(m_case_[index]);
         	index ++;
         	count --;
         }
     }

     /**
     * Gets the upper case value at the index
     * @param index of the case value to be retrieved
     * @param buffer string buffer to add result to
     */
     public void getUpperCase(int index, StringBuffer buffer)
     {
     	int count = m_case_[index];
         // last 5 bits of the first char in m_case_ gives the position of the
         // alternate uppercase characters
         index += (count & LAST_5_BIT_MASK_) + 1;
         count = (count >> SHIFT_5_) & LAST_5_BIT_MASK_;

         for (int j = 0; j < count; j ++) {
         	buffer.append(m_case_[index + j]);
         }
     }

     /**
     * Gets the upper case value at the index
     * @param index of the case value to be retrieved
     * @param buffer string buffer to add result to
     */
     public void getTitleCase(int index, StringBuffer buffer)
     {
         int count = m_case_[index];
         // last 5 bits of the first char in m_case_ gives the position of the
         // alternate uppercase characters
         index += (count & LAST_5_BIT_MASK_) + 1 +
                  ((count >> SHIFT_5_) & LAST_5_BIT_MASK_);
         count = (count >> SHIFT_10_) & LAST_5_BIT_MASK_;

         for (int j = 0; j < count; j ++) {
         	buffer.append(m_case_[index + j]);
         }
     }

     /**
     * Gets the lower case value at the index
     * @param index of the case value to be retrieved
     * @param buffer string buffer to add result to
     */
     public void getLowerCase(int index, StringBuffer buffer)
     {
         int count = m_case_[index] & LAST_5_BIT_MASK_;
         // last 5 bits of the first char in m_case_ gives the size of the
         // lowercase characters
         index ++;
         for (int j = 0; j < count; j ++) {
         	buffer.append(m_case_[index + j]);
         }
     }

     /**
      * Gets the unicode additional properties.
      * C version getUnicodeProperties.
      * @param codepoint codepoint whose additional properties is to be
      *                  retrieved
      * @param column
      * @return unicode properties
      */
    	public int getAdditional(int codepoint, int column) {
         if (column == -1) {
             return getProperty(codepoint);
         }
    		if (column < 0 || column >= m_additionalColumnsCount_) {
            return 0;
        }
        return m_additionalVectors_[
                      m_additionalTrie_.getCodePointValue(codepoint) + column];
    	}

     static final int MY_MASK = UCharacterProperty.TYPE_MASK
         & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
             (1<<UCharacterCategory.LOWERCASE_LETTER) |
             (1<<UCharacterCategory.TITLECASE_LETTER) |
             (1<<UCharacterCategory.MODIFIER_LETTER) |
             (1<<UCharacterCategory.OTHER_LETTER));


    	/**
      * <p>Get the "age" of the code point.</p>
      * <p>The "age" is the Unicode version when the code point was first
      * designated (as a non-character or for Private Use) or assigned a
      * character.</p>
      * <p>This can be useful to avoid emitting code points to receiving
      * processes that do not accept newer characters.</p>
      * <p>The data is from the UCD file DerivedAge.txt.</p>
      * <p>This API does not check the validity of the codepoint.</p>
      * @param ch The code point.
      * @return the Unicode version number
      * @draft ICU 2.1
      */
     public VersionInfo getAge(int codepoint)
     {
     	int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
     	return VersionInfo.getInstance(
                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                            version & LAST_NIBBLE_MASK_, 0, 0);
     }
     private static final long UNSIGNED_INT_MASK = 0xffffffffL;
     private static final class BinaryProperties{
        int column;
        long mask;
        public BinaryProperties(int column,long mask){
        		this.column = column;
        		this.mask  = mask;
        }
    }
    BinaryProperties[] binProps={
        /*
         * column and mask values for binary properties from u_getUnicodeProperties().
         * Must be in order of corresponding UProperty,
         * and there must be exacly one entry per binary UProperty.
         */
        new BinaryProperties(  1, (  1 << ALPHABETIC_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << ASCII_HEX_DIGIT_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << BIDI_CONTROL_PROPERTY_) ),
        new BinaryProperties( -1, (  1 << MIRROR_SHIFT_) ),
        new BinaryProperties(  1, (  1 << DASH_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << DEPRECATED_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << DIACRITIC_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << EXTENDER_PROPERTY_) ),
        new BinaryProperties(  0, 0 ),                                  /* UCHAR_FULL_COMPOSITION_EXCLUSION */
        new BinaryProperties(  1, (  1 << GRAPHEME_BASE_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << GRAPHEME_EXTEND_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << GRAPHEME_LINK_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << HEX_DIGIT_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << HYPHEN_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << ID_CONTINUE_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << ID_START_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << IDEOGRAPHIC_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << IDS_BINARY_OPERATOR_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << IDS_TRINARY_OPERATOR_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << JOIN_CONTROL_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << LOGICAL_ORDER_EXCEPTION_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << LOWERCASE_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << MATH_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << NONCHARACTER_CODE_POINT_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << QUOTATION_MARK_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << RADICAL_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << SOFT_DOTTED_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << TERMINAL_PUNCTUATION_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << UNIFIED_IDEOGRAPH_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << UPPERCASE_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << WHITE_SPACE_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << XID_CONTINUE_PROPERTY_) ),
        new BinaryProperties(  1, (  1 << XID_START_PROPERTY_) ),
        new BinaryProperties( -1, (  1 << CASE_SENSITIVE_SHIFT_) )
    };


 	/**
 	 * <p>Check a binary Unicode property for a code point.</p>
 	 * <p>Unicode, especially in version 3.2, defines many more properties
 	 * than the original set in UnicodeData.txt.</p>
 	 * <p>This API is intended to reflect Unicode properties as defined in
 	 * the Unicode Character Database (UCD) and Unicode Technical Reports
 	 * (UTR).</p>
 	 * <p>For details about the properties see
 	 * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
 	 * <p>For names of Unicode properties see the UCD file
 	 * PropertyAliases.txt.</p>
 	 * <p>This API does not check the validity of the codepoint.</p>
 	 * <p>Important: If ICU is built with UCD files from Unicode versions
 	 * below 3.2, then properties marked with "new" are not or
 	 * not fully available.</p>
 	 * @param codepoint Code point to test.
 	 * @param property selector constant from com.ibm.icu.lang.UProperty,
 	 *        identifies which binary property to check.
 	 * @return true or false according to the binary Unicode property value
 	 *         for ch. Also false if property is out of bounds or if the
 	 *         Unicode version does not have data for the property at all, or
 	 *         not for this code point.
 	 * @see com.ibm.icu.lang.UProperty
 	 * @draft ICU 2.1
 	 */

 	public boolean hasBinaryProperty(int codepoint, int property)
 	{
 		 if(property <UProperty.BINARY_START || UProperty.BINARY_LIMIT<=property) {
 	        // not a known binary property
 	        return false;
 	    } else if(property == UProperty.FULL_COMPOSITION_EXCLUSION) {
 	        return NormalizerImpl.isFullCompositionExclusion(codepoint);
 	    } else {
 	        // systematic, directly stored properties
 	        return ((UNSIGNED_INT_MASK & getAdditional(codepoint, binProps[property].column)) & binProps[property].mask)!=0;
 	    }
 	}

 	/**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
     public static int getRawSupplementary(char lead, char trail)
     {
         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
     }

     /**
     * Loads the property data and initialize the UCharacterProperty instance.
     * @Exception thrown when data is missing or data has been corrupted.
     */
     public static UCharacterProperty getInstance() throws RuntimeException
     {
     	if (INSTANCE_ == null) {
     		try {
     			INSTANCE_ = new UCharacterProperty();
     		}
         	catch (Exception e) {
             	throw new RuntimeException(e.getMessage());
         	}
     	}
     	return INSTANCE_;
     }

     /**
     * Special casing lowercase management
     * @param locale current locale
     * @param ch code point to convert
     * @param index of exception containing special case information
     * @param uchariter text iterator with index at position of ch
     * @param buffer to add lowercase
     * @return size of the lower case character in UTF16 format
     */
     public int getSpecialLowerCase(Locale locale, int index, int ch,
                                    UCharacterIterator uchariter,
                                    StringBuffer buffer)
     {
     	int exception = getException(index,
                                      UCharacterProperty.EXC_SPECIAL_CASING_);
         if (exception < 0) {
         	int offset = uchariter.getIndex();
             // fill u and i with the case mapping result string
             // use hardcoded conditions and mappings
             // Test for conditional mappings first
             // (otherwise the unconditional default mappings are always taken),
             // then test for characters that have unconditional mappings in
             // SpecialCasing.txt, then get the UnicodeData.txt mappings.
             if (locale.getLanguage().equals(LITHUANIAN_) &&
                 // base characters, find accents above
                 (((ch == LATIN_CAPITAL_LETTER_I_ ||
                    ch == LATIN_CAPITAL_LETTER_J_ ||
                    ch == LATIN_CAPITAL_I_WITH_OGONEK_) &&
                   isFollowedByMOREABOVE(uchariter, offset)) ||
                   // precomposed with accent above, no need to find one
                   (ch == LATIN_CAPITAL_I_WITH_GRAVE_ ||
                    ch == LATIN_CAPITAL_I_WITH_ACUTE_ ||
                    ch == LATIN_CAPITAL_I_WITH_TILDE_))) {
                    // lithuanian: add a dot above if there are more accents
                    // above (to always have the dot)
                    // # Lithuanian
                    // # Lithuanian retains the dot in a lowercase i when
                    //   followed by accents.
                    // # Introduce an explicit dot above when lowercasing
                    // capital I's and J's
                    // whenever there are more accents above.
                    // (of the accents used in Lithuanian: grave, acute, tilde
                    // above, and ogonek)
                    // 0049; 0069 0307; 0049; 0049; lt More_Above;
                    // # LATIN CAPITAL LETTER I
                    // 004A; 006A 0307; 004A; 004A; lt More_Above;
                    // # LATIN CAPITAL LETTER J
                    // 012E; 012F 0307; 012E; 012E; lt More_Above;
                    // # LATIN CAPITAL LETTER I WITH OGONEK
                    // 00CC; 0069 0307 0300; 00CC; 00CC; lt;
                    // # LATIN CAPITAL LETTER I WITH GRAVE
                    // 00CD; 0069 0307 0301; 00CD; 00CD; lt;
                    // # LATIN CAPITAL LETTER I WITH ACUTE
                    // 0128; 0069 0307 0303; 0128; 0128; lt;
                    // # LATIN CAPITAL LETTER I WITH TILDE
                    switch(ch) {
                    case LATIN_CAPITAL_LETTER_I_:
                         buffer.append((char)LATIN_SMALL_LETTER_I_);
                         buffer.append((char)COMBINING_DOT_ABOVE_);
                         return 2;
                    case LATIN_CAPITAL_LETTER_J_:
                         buffer.append((char)LATIN_SMALL_LETTER_J_);
                         buffer.append((char)COMBINING_DOT_ABOVE_);
                         return 2;
                    case LATIN_CAPITAL_I_WITH_OGONEK_:
                         buffer.append((char)LATIN_SMALL_LETTER_I_WITH_OGONEK_);
                         buffer.append((char)COMBINING_DOT_ABOVE_);
                         return 2;
                    case LATIN_CAPITAL_I_WITH_GRAVE_:
                         buffer.append((char)LATIN_SMALL_LETTER_I_);
                         buffer.append((char)COMBINING_DOT_ABOVE_);
                         buffer.append((char)COMBINING_GRAVE_ACCENT_);
                         return 3;
                    case LATIN_CAPITAL_I_WITH_ACUTE_:
                         buffer.append((char)LATIN_SMALL_LETTER_I_);
                         buffer.append((char)COMBINING_DOT_ABOVE_);
                         buffer.append((char)COMBINING_ACUTE_ACCENT_);
                         return 3;
                    case LATIN_CAPITAL_I_WITH_TILDE_:
                         buffer.append((char)LATIN_SMALL_LETTER_I_);
                         buffer.append((char)COMBINING_DOT_ABOVE_);
                         buffer.append((char)COMBINING_TILDE_);
                         return 3;
                    }
             }

             String language = locale.getLanguage();
             if (language.equals(TURKISH_) || language.equals(AZERBAIJANI_)) {
                 if (ch == 0x130) {
                     // # I and i-dotless; I-dot and i are case pairs in Turkish
                     // and Azeri
                     // # The following rules handle those cases.
                     // 0130; 0069; 0130; 0130; tr
                     // # LATIN CAPITAL LETTER I WITH DOT ABOVE
                     // 0130; 0069; 0130; 0130; az
                     // # LATIN CAPITAL LETTER I WITH DOT ABOVE
                     buffer.append(LATIN_SMALL_LETTER_I_);
                     return 1;
                 }
                 if (ch == 0x307 && isPrecededByI(uchariter, offset)) {
                     // ### TODO see comment above about isAfter_I()
                     // # When lowercasing, remove dot_above in the sequence
                     // I + dot_above, which will turn into i.
                     // # This matches the behavior of the canonically
                     // equivalent I-dot_above
                     // 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                     // 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                     return 0; // remove the dot (continue without output)

                 }
                 if (ch == LATIN_CAPITAL_LETTER_I_ &&
                     !isFollowedByDotAbove(uchariter, offset)) {
                     // turkish: I maps to dotless i
                     // other languages or turkish with decomposed I+dot above:
                     // I maps to i
                     // # When lowercasing, unless an I is before a dot_above,
                     // it turns into a dotless i.
                     //  0049; 0131; 0049; 0049; tr Not_Before_Dot;
                     // # LATIN CAPITAL LETTER I
                     // 0049; 0131; 0049; 0049; az Not_Before_Dot;
                     // # LATIN CAPITAL LETTER I
                     buffer.append(LATIN_SMALL_LETTER_DOTLESS_I_);
                     return 1;
                 }
             }

 			if (ch == 0x130) {
                 // decomposed I+dot above becomes i (see handling of
                 // U+0049 for turkish) and removes the dot above
                 // # Preserve canonical equivalence for I with dot. Turkic is
                 // handled below.
                 // 0130; 0069 0307; 0130; 0130;
                 // # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 buffer.append(LATIN_SMALL_LETTER_I_);
                 buffer.append(COMBINING_DOT_ABOVE_);
                 return 2; // remove the dot (continue without output)
             }

             if (ch == GREEK_CAPITAL_LETTER_SIGMA_ &&
                 isCFINAL(uchariter, offset) &&
                 isNotCINITIAL(uchariter, offset)) {
                 // greek capital sigma maps depending on surrounding cased
                 // letters
                 // greek capital sigma maps depending on surrounding cased
                 // letters (see SpecialCasing.txt) */
                 // # Special case for final form of sigma
                 // 03A3; 03C2; 03A3; 03A3; Final_Sigma;
                 // # GREEK CAPITAL LETTER SIGMA
                 buffer.append(GREEK_SMALL_LETTER_RHO_);
                 return 1;
             }

 			// no known conditional special case mapping, use a normal mapping
             if (hasExceptionValue(index, UCharacterProperty.EXC_LOWERCASE_)) {
                 int oldlength = buffer.length();
                 UTF16.append(buffer, getException(index,
                                             UCharacterProperty.EXC_LOWERCASE_));
                 return buffer.length() - oldlength;
             }

 			UTF16.append(buffer, ch);
 			return UTF16.getCharCount(ch);
         }
         else {
             // get the special case mapping string from the data file
             index = exception & LAST_CHAR_MASK_;
             int oldlength = buffer.length();
             getLowerCase(index, buffer);
             return buffer.length() - oldlength;
         }
     }

     /**
      * Gets the lower case map of the argument codepoint
      * @param locale locale which the lowercase is looked for
      * @param ch codepoint whose lower case is to be matched
      * @param uchariter text iterator positioned at the codepoint ch
      * @param buffer buffer to store result string
      * @return size of the lowercased codepoint in UTF16 format
      */
     public int toLowerCase(Locale locale, int ch,
                                    UCharacterIterator uchariter,
                                    StringBuffer buffer)
     {
     	int props = getProperty(ch);
         if ((props & EXCEPTION_MASK) == 0) {
             int type = props & TYPE_MASK;
             if (type == UCharacterCategory.UPPERCASE_LETTER ||
                 type == UCharacterCategory.TITLECASE_LETTER) {
                 ch += UCharacterProperty.getSignedValue(props);
             }
         } else {
             int index = UCharacterProperty.getExceptionIndex(props);
             if (hasExceptionValue(index,
                                 UCharacterProperty.EXC_SPECIAL_CASING_)) {
                 return getSpecialLowerCase(locale, index, ch, uchariter,
                                            buffer);
             }
             if (hasExceptionValue(index,
                                        UCharacterProperty.EXC_LOWERCASE_)) {
                 ch = getException(index, UCharacterProperty.EXC_LOWERCASE_);
             }
         }
         UTF16.append(buffer, ch);
         return UTF16.getCharCount(ch);
 	}

 	/**
      * Gets the lower case map of the argument codepoint
      * @param locale locale which the lowercase is looked for
      * @param ch codepoint whose lower case is to be matched
      * @param uchariter text iterator positioned at the codepoint ch
      * @param result array of char to store the result
      * @return size oflowercased codepoint in UTF16 format
      */
     public int toLowerCase(Locale locale, int ch,
                            UCharacterIterator uchariter, char buffer[])
     {
         int props = getProperty(ch);
         if ((props & EXCEPTION_MASK) == 0) {
             int type = props & TYPE_MASK;
             if (type == UCharacterCategory.UPPERCASE_LETTER ||
                 type == UCharacterCategory.TITLECASE_LETTER) {
                 ch += UCharacterProperty.getSignedValue(props);
             }
         } else {
             int index = UCharacterProperty.getExceptionIndex(props);
             if (hasExceptionValue(index,
                                   UCharacterProperty.EXC_SPECIAL_CASING_)) {
                 StringBuffer strbuffer = new StringBuffer(1);
                 int result = getSpecialLowerCase(locale, index, ch, uchariter,
                                                  strbuffer);
                 strbuffer.getChars(0, result, buffer, 0);
                 return result;
             }
             if (hasExceptionValue(index, UCharacterProperty.EXC_LOWERCASE_)) {
                 ch = getException(index, UCharacterProperty.EXC_LOWERCASE_);
             }
         }
         if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
         	buffer[0] = (char)ch;
         	return 1;
         }
         buffer[0] = UTF16.getLeadSurrogate(ch);
         buffer[1] = UTF16.getTrailSurrogate(ch);
         return 2;
 	}

     /**
      * Gets the lower case mappings of the substring from index start to the
      * character before end.
      * @param locale locale which the mappings will be searched
      * @param str string to map
      * @param start start index of the substring to map
      * @param limit one index pass the last character to map
      * @param result string buffer to store lower case string
      */
     public void toLowerCase(Locale locale, String str, int start, int limit,
                             StringBuffer result)
     {
         UCharacterIterator ucharIter = UCharacterIterator.getInstance(str);
         int                strIndex  = start;

         while (strIndex < limit) {
         	ucharIter.setIndex(strIndex);
 	        int ch = ucharIter.currentCodePoint();

 	        toLowerCase(locale, ch, ucharIter, result);
 	        strIndex ++;
 	        if (ch >= UTF16.SUPPLEMENTARY_MIN_VALUE) {
 	        	strIndex ++;
 	        }
         }
     }

     /**
     * Special casing uppercase management
     * @param locale locale which the mappings will be based on
     * @param index of exception containing special case information
     * @param ch code point to convert
     * @param uchariter text iterator which ch belongs to
     * @param upperflag true if uppercase mapping is desired, false for title
     *        casing
     * @param buffer to add uppercase
     * @return size of uppercased codepoint in UTF16 format
     */
     public int getSpecialUpperOrTitleCase(Locale locale, int index, int ch,
                                           UCharacterIterator uchariter,
                                           boolean upperflag,
                                           StringBuffer buffer)
     {
         int exception = getException(index,
                                      UCharacterProperty.EXC_SPECIAL_CASING_);
         if (exception < 0) {
             String language = locale.getLanguage();
             // use hardcoded conditions and mappings
             if ((language.equals(TURKISH_) || language.equals(AZERBAIJANI_))
                 && ch == LATIN_SMALL_LETTER_I_) {
                 // turkish: i maps to dotted I
                 // # Turkish and Azeri
                 // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                 // # The following rules handle those cases.
                 // # When uppercasing, i turns into a dotted capital I
                 // 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
                 // 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
                 buffer.append(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_);
                 return 1;
             }

             if (language.equals(LITHUANIAN_) && ch == COMBINING_DOT_ABOVE_
                 && isPrecededBySoftDotted(uchariter, uchariter.getIndex())) {
                 // # Lithuanian
                 // # Lithuanian retains the dot in a lowercase i when followed
                 // by accents.
                 // # Remove DOT ABOVE after "i" with upper or titlecase
                 // 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
                                 // lithuanian: remove DOT ABOVE after U+0069 "i" with
                 // upper or titlecase
                 return 0; // remove the dot (continue without output)
             }

             // no known conditional special case mapping, use a normal mapping
            if (!upperflag && hasExceptionValue(index,
                                           UCharacterProperty.EXC_TITLECASE_)) {
                ch = getException(index, UCharacterProperty.EXC_TITLECASE_);
            }
            else {
                if (hasExceptionValue(index,
                                      UCharacterProperty.EXC_UPPERCASE_)) {
 	               ch = getException(index, UCharacterProperty.EXC_UPPERCASE_);
                }
            }

            UTF16.append(buffer, ch);
            return UTF16.getCharCount(ch);
         }

 		// get the special case mapping string from the data file
         index = exception & LAST_CHAR_MASK_;
         int oldlength = buffer.length();
         if (upperflag) {
 	        getUpperCase(index, buffer);
         }
         else {
           	getTitleCase(index, buffer);
         }
         return buffer.length() - oldlength;
     }

     /**
      * Gets the upper or title case map of the codepoint
      * @param locale locale which the mappings will be searched
      * @param ch codepoint whose upper or title case will be mapped
      * @param uchariter text iterator positioned at the codepoint
      * @param upperflag flag true if uppercase is desired, false for title case
      * @param buffer buffer to store result map
      * @return size of uppercased codepoint in UTF16 format
      */
 	public int toUpperOrTitleCase(Locale locale, int ch,
 	                              UCharacterIterator uchariter,
 	                              boolean upperflag, StringBuffer buffer)
     {
         int props = getProperty(ch);
         if ((props & EXCEPTION_MASK) == 0) {
             int type = props & TYPE_MASK;
             if (type == UCharacterCategory.LOWERCASE_LETTER) {
             	ch -= UCharacterProperty.getSignedValue(props);
             }
         } else {
             int index = UCharacterProperty.getExceptionIndex(props);
             if (hasExceptionValue(index,
                                   UCharacterProperty.EXC_SPECIAL_CASING_)) {
                 return getSpecialUpperOrTitleCase(locale, index, ch, uchariter,
                                                   upperflag, buffer);
             }
             if (!upperflag && hasExceptionValue(index,
                                          UCharacterProperty.EXC_TITLECASE_)) {
                 ch = getException(index, UCharacterProperty.EXC_TITLECASE_);
             }
             else {
              	if (hasExceptionValue(index,
                                       UCharacterProperty.EXC_UPPERCASE_)) {
                     ch = getException(index,
                                       UCharacterProperty.EXC_UPPERCASE_);
                 }
             }
         }
         UTF16.append(buffer, ch);
         return UTF16.getCharCount(ch);
     }

     /**
      * Gets the upper or title case map of the codepoint
      * @param locale locale which the mappings will be searched
      * @param ch codepoint whose upper or title case will be mapped
      * @param uchariter text iterator positioned at the codepoint
      * @param upperflag flag true if uppercase is desired, false for title case
      * @param buffer buffer to store result map
      * @return size of uppercased codepoint in UTF16 format
      */
 	public int toUpperOrTitleCase(Locale locale, int ch,
 	                              UCharacterIterator uchariter,
 	                              boolean upperflag, char buffer[])
     {
         int props = getProperty(ch);
         if ((props & EXCEPTION_MASK) == 0) {
             int type = props & TYPE_MASK;
             if (type == UCharacterCategory.LOWERCASE_LETTER) {
             	ch -= UCharacterProperty.getSignedValue(props);
             }
         } else {
             int index = UCharacterProperty.getExceptionIndex(props);
             if (hasExceptionValue(index,
                                   UCharacterProperty.EXC_SPECIAL_CASING_)) {
                	StringBuffer strbuffer = new StringBuffer(1);
                 int result = getSpecialUpperOrTitleCase(locale, index, ch,
                                                         uchariter, upperflag,
                                                         strbuffer);
                 strbuffer.getChars(0, result, buffer, 0);
                 return result;
             }
             if (!upperflag && hasExceptionValue(index,
                                          UCharacterProperty.EXC_TITLECASE_)) {
                 ch = getException(index, UCharacterProperty.EXC_TITLECASE_);
             }
             else {
              	if (hasExceptionValue(index,
                                       UCharacterProperty.EXC_UPPERCASE_)) {
                     ch = getException(index,
                                       UCharacterProperty.EXC_UPPERCASE_);
                 }
             }
         }
         if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
         	buffer[0] = (char)ch;
         	return 1;
         }
         buffer[0] = UTF16.getLeadSurrogate(ch);
         buffer[1] = UTF16.getTrailSurrogate(ch);
         return 2;
     }

     /**
      * Gets the uppercasing of the argument string.
      * @param locale locale which the mappings will be searched
      * @param str string to map
      * @param start start index of the substring to map
      * @param limit one index pass the last character to map
      */
     public String toUpperCase(Locale locale, String str, int start, int limit)
     {
         UCharacterIterator ucharIter = UCharacterIterator.getInstance(str);
         int                strIndex  = start;
         StringBuffer       result    = new StringBuffer(limit - start);

         while (strIndex < limit) {
         	ucharIter.setIndex(strIndex);
 	        int ch = ucharIter.currentCodePoint();

 	        toUpperOrTitleCase(locale, ch, ucharIter, true, result);
 	        strIndex ++;
 	        if (ch >= UTF16.SUPPLEMENTARY_MIN_VALUE) {
 	        	strIndex ++;
 	        }
         }
         return result.toString();
     }

     /**
     * <p>Gets the titlecase version of the argument string.</p>
     * <p>Position for titlecasing is determined by the argument break
     * iterator, hence the user can customized his break iterator for
     * a specialized titlecasing. In this case only the forward iteration
     * needs to be implemented.
     * If the break iterator passed in is null, the default Unicode algorithm
     * will be used to determine the titlecase positions.
     * </p>
     * <p>Only positions returned by the break iterator will be title cased,
     * character in between the positions will all be in lower case.</p>
     * <p>Casing is dependent on the default locale and context-sensitive</p>
     * @param str source string to be performed on
     * @param breakiter break iterator to determine the positions in which
     *        the character should be title cased.
     * @return lowercase version of the argument string
     */
  	public String toTitleCase(Locale locale, String str,
  	                          BreakIterator breakiter)
  	{
  		UCharacterIterator ucharIter = UCharacterIterator.getInstance(str);
 		int                length    = str.length();
         StringBuffer       result    = new StringBuffer();

         breakiter.setText(str);

         int                index     = breakiter.first();
        	// titlecasing loop
 	    while (index != BreakIterator.DONE && index < length) {
 	    	// titlecase the character at the found index
 	        int ch = UTF16.charAt(str, index);
 	        ucharIter.setIndex(index);
 	        index += UTF16.getCharCount(ch);
         	toUpperOrTitleCase(locale, ch, ucharIter, false, result);
         	int next = breakiter.next();
         	if (index != BreakIterator.DONE && index < next) {
 	        	// lowercase [prev..index]
         		toLowerCase(locale, str, index, next, result);
             }
             index = next;
         }
         return result.toString();
  	}

     /**
      * <p>
      * Unicode property names and property value names are compared
      * "loosely". Property[Value]Aliases.txt say:
      * <quote>
      *   "With loose matching of property names, the case distinctions,
      *    whitespace, and '_' are ignored."
      * </quote>
      * </p>
      * <p>
      * This function does just that, for ASCII (char *) name strings.
      * It is almost identical to ucnv_compareNames() but also ignores
      * ASCII White_Space characters (U+0009..U+000d).
      * </p>
      * @param name1 name to compare
      * @param name2 name to compare
      * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
      *         if name1 is greater than name2.
      */
     /* to be implemented in 2.4
      * public static int comparePropertyNames(String name1, String name2)
     {
         int result = 0;
         int i1 = 0;
         int i2 = 0;
         while (true) {
             char ch1 = 0;
             char ch2 = 0;
             // Ignore delimiters '-', '_', and ASCII White_Space
             if (i1 < name1.length()) {
                 ch1 = name1.charAt(i1 ++);
             }
             while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
                    || ch1 == '\n' // synwee what is || ch1 == '\v'
                    || ch1 == '\f' || ch1=='\r') {
                 if (i1 < name1.length()) {
                     ch1 = name1.charAt(i1 ++);
                 }
                 else {
                     ch1 = 0;
                 }
             }
             if (i2 < name2.length()) {
                 ch2 = name2.charAt(i2 ++);
             }
             while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
                    || ch2 == '\n' // synwee what is || ch1 == '\v'
                    || ch2 == '\f' || ch2=='\r') {
                 if (i2 < name2.length()) {
                     ch2 = name2.charAt(i2 ++);
                 }
                 else {
                     ch2 = 0;
                 }
             }

             // If we reach the ends of both strings then they match
             if (ch1 == 0 && ch2 == 0) {
                 return 0;
             }

             // Case-insensitive comparison
             if (ch1 != ch2) {
                 result = Character.toLowerCase(ch1)
                                                 - Character.toLowerCase(ch2);
                 if (result != 0) {
                     return result;
                 }
             }
         }
     }
     */

     /**
      * Checks if the argument c is to be treated as a white space in ICU
      * rules. Usually ICU rule white spaces are ignored unless quoted.
      * @param c codepoint to check
      * @return true if c is a ICU white space
      */
     public static boolean isRuleWhiteSpace(int c)
     {
         /* "white space" in the sense of ICU rule parsers
            This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
            See UTR #31: http://www.unicode.org/reports/tr31/.
            U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
         */
         return (c >= 0x0009 && c <= 0x2029 &&
                 (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
                  c == 0x200E || c == 0x200F || c >= 0x2028));
     }

     /**
      * Get the the maximum values for some enum/int properties.
      * @return maximum values for the integer properties.
      */
     public int getMaxValues(int column)
     {
        // return m_maxBlockScriptValue_;

         switch(column) {
         case 0:
             return m_maxBlockScriptValue_;
         case 2:
             return m_maxJTGValue_;
         default:
             return 0;
         }
     }

     /**
      * Gets the type mask
      * @param type character type
      * @return mask
      */
     public static int getMask(int type)
     {
         return 1 << type;
     }

     // protected variables -----------------------------------------------

     /**
     * Case table
     */
     char m_case_[];

     /**
     * Exception property table
     */
     int m_exception_[];
     /**
      * Extra property trie
      */
     CharTrie m_additionalTrie_;
     /**
      * Extra property vectors, 1st column for age and second for binary
      * properties.
      */
     int m_additionalVectors_[];
     /**
      * Number of additional columns
      */
     int m_additionalColumnsCount_;
     /**
      * Maximum values for block, bits used as in vector word
      * 0
      */
     int m_maxBlockScriptValue_;
     /**
      * Maximum values for script, bits used as in vector word
      * 0
      */
  	int m_maxJTGValue_;
     // private variables -------------------------------------------------

   	/**
      * UnicodeData.txt property object
      */
     private static UCharacterProperty INSTANCE_ = null;

     /**
     * Default name of the datafile
     */
     private static final String DATA_FILE_NAME_ = "data/uprops.icu";

     /**
     * Default buffer size of datafile
     */
     private static final int DATA_BUFFER_SIZE_ = 25000;

     /**
     * This, from what i infer is the max size of the indicators used for the
     * exception values.
     * Number of bits in an 8-bit integer value
     */
     private static final int EXC_GROUP_ = 8;

     /**
     * Mask to get the group
     */
     private static final int EXC_GROUP_MASK_ = 255;

     /**
     * Mask to get the digit value in the exception result
     */
     private static final int EXC_DIGIT_MASK_ = 0xFFFF;

     /**
     * Offset table for data in exception block.<br>
     * Table formed by the number of bits used for the index, e.g. 0 = 0 bits,
     * 1 = 1 bits.
     */
     private static final byte FLAGS_OFFSET_[] =
     {
         0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
         4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
     };

     /**
     * Numeric value shift
     */
     private static final int VALUE_SHIFT_ = 20;

     /**
     * Mask to be applied after shifting to obtain an unsigned numeric value
     */
     private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0x7FF;

     /**
 	 * Shift to get reserved value
 	 */
 	private static final int RESERVED_SHIFT_ = 15;

 	/**
 	 *
 	 */
 	private static final int BIDI_SHIFT_ = 6;
 	/**
 	 *
 	 */
 	private static final int MIRROR_SHIFT_ = BIDI_SHIFT_ + 5;

 	/**
 	 *
 	 */
 	private static final int NUMERIC_TYPE_SHIFT = 12;
 	/**
 	 *
 	 */
 	private static final int CASE_SENSITIVE_SHIFT_= NUMERIC_TYPE_SHIFT+3;
 	/**
 	 * Bit indicating exception
 	 */
   	private static final int EXCEPTION_BIT = 1 << 5;

   	/**
   	 * Bit to get the actual property value
   	 */
     private static final int VALUE_BITS_ = 0x10000 - VALUE_SHIFT_;

 	/**
 	 * Minimum value of a property
 	 */
     private static final int MIN_VALUE_ = -(1 << (VALUE_BITS_ - 1));

     /**
      * Maximum value of a property
      */
     private static final int MAX_VALUE_ = (1 << (VALUE_BITS_ - 1)) - 1;
     /**
      * Maximum number of exceptions
      */
     private static int MAX_EXCEPTIONS_COUNT_ = 1 << VALUE_BITS_;

     /**
     * To get the last 5 bits out from a data type
     */
     private static final int LAST_5_BIT_MASK_ = 0x1F;

     /**
     * Shift 5 bits
     */
     private static final int SHIFT_5_ = 5;
     /**
     * Shift 10 bits
     */
     private static final int SHIFT_10_ = 10;

     /**
     * Folding indicator mask
     */
     private static final int SUPPLEMENTARY_FOLD_INDICATOR_MASK_ = 0x8000;
     /**
     * Folding offset mask
     */
     private static final int SUPPLEMENTARY_FOLD_OFFSET_MASK_ = 0x7FFF;
     /**
     * Shift value for lead surrogate to form a supplementary character.
     */
 	private static final int LEAD_SURROGATE_SHIFT_ = 10;
 	/**
     * Offset to add to combined surrogate pair to avoid msking.
     */
     private static final int SURROGATE_OFFSET_ =
                            UTF16.SUPPLEMENTARY_MIN_VALUE -
                            (UTF16.SURROGATE_MIN_VALUE <<
                            LEAD_SURROGATE_SHIFT_) -
                            UTF16.TRAIL_SURROGATE_MIN_VALUE;
     /**
     * Latin uppercase I
     */
     private static final char LATIN_CAPITAL_LETTER_I_ = 0x49;
     /**
     * Combining dot above
     */
     private static final char COMBINING_DOT_ABOVE_ = 0x307;
     /**
     * LATIN SMALL LETTER J
     */
     private static final int LATIN_SMALL_LETTER_J_ = 0x6a;
     /**
     * LATIN SMALL LETTER I WITH OGONEK
     */
     private static final int LATIN_SMALL_LETTER_I_WITH_OGONEK_ = 0x12f;
     /**
     * LATIN SMALL LETTER I WITH TILDE BELOW
     */
     private static final int LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ = 0x1e2d;
     /**
     * LATIN SMALL LETTER I WITH DOT BELOW
     */
     private static final int LATIN_SMALL_LETTER_I_WITH_DOT_BELOW_ = 0x1ecb;
     /**
     * Combining class for combining mark above
     */
     private static final int COMBINING_MARK_ABOVE_CLASS_ = 230;

     /**
     * LATIN CAPITAL LETTER J
     */
     private static final int LATIN_CAPITAL_LETTER_J_ = 0x4a;

     /**
     * LATIN CAPITAL LETTER I WITH OGONEK
     */
     private static final int LATIN_CAPITAL_I_WITH_OGONEK_ = 0x12e;
     /**
     * LATIN CAPITAL LETTER I WITH TILDE
     */
     private static final int LATIN_CAPITAL_I_WITH_TILDE_ = 0x128;
     /**
     * LATIN CAPITAL LETTER I WITH GRAVE
     */
     private static final int LATIN_CAPITAL_I_WITH_GRAVE_ = 0xcc;
     /**
     * LATIN CAPITAL LETTER I WITH ACUTE
     */
     private static final int LATIN_CAPITAL_I_WITH_ACUTE_ = 0xcd;
     /**
     * COMBINING GRAVE ACCENT
     */
     private static final int COMBINING_GRAVE_ACCENT_ = 0x300;
     /**
     * COMBINING ACUTE ACCENT
     */
     private static final int COMBINING_ACUTE_ACCENT_ = 0x301;
     /**
     * COMBINING TILDE
     */
     private static final int COMBINING_TILDE_ = 0x303;
     /**
     * Greek capital letter sigma
     */
     private static final char GREEK_CAPITAL_LETTER_SIGMA_ = 0x3a3;
     /**
     * Greek small letter sigma
     */
     private static final char GREEK_SMALL_LETTER_SIGMA_ = 0x3c3;
     /**
     * Greek small letter rho
     */
     private static final char GREEK_SMALL_LETTER_RHO_ = 0x3c2;
     /**
     * Hyphens
     */
     private static final int HYPHEN_      = 0x2010;
     private static final int SOFT_HYPHEN_ = 0xAD;
     /**
     * To get the last character out from a data type
     */
     private static final int LAST_CHAR_MASK_ = 0xFFFF;
     /**
     * To get the last byte out from a data type
     */
     private static final int LAST_BYTE_MASK_ = 0xFF;
     /**
     * Shift 16 bits
     */
     private static final int SHIFT_16_ = 16;

    	// additional properties ----------------------------------------------

    	/**
    	 * Additional properties used in internal trie data
    	 */
    	/*
 	 * Properties in vector word 1
 	 * Each bit encodes one binary property.
 	 * The following constants represent the bit number, use 1<<UPROPS_XYZ.
 	 * UPROPS_BINARY_1_TOP<=32!
 	 *
 	 * Keep this list of property enums in sync with
 	 * propListNames[] in icu/source/tools/genprops/props2.c!
 	 *
 	 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 	 */
     private static final int WHITE_SPACE_PROPERTY_ = 0;
     private static final int BIDI_CONTROL_PROPERTY_ = 1;
     private static final int JOIN_CONTROL_PROPERTY_ = 2;
     private static final int DASH_PROPERTY_ = 3;
     private static final int HYPHEN_PROPERTY_ = 4;
     private static final int QUOTATION_MARK_PROPERTY_ = 5;
     private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 6;
     private static final int MATH_PROPERTY_ = 7;
     private static final int HEX_DIGIT_PROPERTY_ = 8;
     private static final int ASCII_HEX_DIGIT_PROPERTY_ = 9;
     private static final int ALPHABETIC_PROPERTY_ = 10;
     private static final int IDEOGRAPHIC_PROPERTY_ = 11;
     private static final int DIACRITIC_PROPERTY_ = 12;
     private static final int EXTENDER_PROPERTY_ = 13;
     private static final int LOWERCASE_PROPERTY_ = 14;
     private static final int UPPERCASE_PROPERTY_ = 15;
     private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 16;
     private static final int GRAPHEME_EXTEND_PROPERTY_ = 17;
     private static final int GRAPHEME_LINK_PROPERTY_ = 18;
     private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 19;
     private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 20;
     private static final int RADICAL_PROPERTY_ = 21;
     private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 22;
     private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 23;
     private static final int DEPRECATED_PROPERTY_ = 24;
     private static final int SOFT_DOTTED_PROPERTY_ = 25;
     private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 26;
     private static final int XID_START_PROPERTY_ = 27;
     private static final int XID_CONTINUE_PROPERTY_ = 28;
     private static final int ID_START_PROPERTY_	= 29;
     private static final int ID_CONTINUE_PROPERTY_ = 30;
     private static final int GRAPHEME_BASE_PROPERTY_ = 31;
     private static final int BINARY_1_TOP_PROPERTY_ = 32;

     /**
      * First nibble shift
      */
     private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
     /**
      * Second nibble mask
      */
     private static final int LAST_NIBBLE_MASK_ = 0xF;
     /**
      * Age value shift
      */
     private static final int AGE_SHIFT_ = 24;

     // private constructors --------------------------------------------------

     /**
     * Constructor
     * @exception thrown when data reading fails or data corrupted
     */
     private UCharacterProperty() throws IOException
     {
         // jar access
         InputStream i = getClass().getResourceAsStream(DATA_FILE_NAME_);
         if(i==null){
             throw new IOException("Could not load the file: "+DATA_FILE_NAME_);
         }
         BufferedInputStream b = new BufferedInputStream(i,
                                                         DATA_BUFFER_SIZE_);
         UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
         reader.read(this);
         b.close();
         i.close();
         m_trie_.putIndexData(this);
     }

 	// private methods -------------------------------------------------------

     /*
      * This section contains helper functions that check for conditions
      * in the input text surrounding the current code point
      * according to SpecialCasing.txt.
      *
      * Starting with ICU 2.1, the "surrounding text" is passed in as an
      * instance of UCharacterIterator to allow the core case mapping functions
      * to be used inside transliterators (using Replaceable instead of String)
      * etc.
      *
      * Each helper function gets the index
      * - after the current code point if it looks at following text
      * - before the current code point if it looks at preceding text
      *
      * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
      *
      * Final_Sigma
      *   C is preceded by a sequence consisting of a cased letter and a
      *   case-ignorable sequence, and C is not followed by a sequence
      *   consisting of an ignorable sequence and then a cased letter.
      *
      * More_Above
      *   C is followed by one or more characters of combining class 230 (ABOVE)
      *   in the combining character sequence.
      *
      * After_Soft_Dotted
      *   The last preceding character with combining class of zero before C
      *   was Soft_Dotted,
      *   and there is no intervening combining character class 230 (ABOVE).
      *
      * Before_Dot
      *   C is followed by combining dot above (U+0307).
      *   Any sequence of characters with a combining class that is neither 0
      *   nor 230 may intervene between the current character and the combining
      *   dot above.
      *
      * The erratum from 2002-10-31 adds the condition
      *
      * After_I
      *   The last preceding base character was an uppercase I, and there is no
      *   intervening combining character class 230 (ABOVE).
      *
      *   (See Jitterbug 2344 and the comments on After_I below.)
      *
      * Helper definitions in Unicode 3.2 UAX 21:
      *
      * D1. A character C is defined to be cased
      *     if it meets any of the following criteria:
      *
      *   - The general category of C is Titlecase Letter (Lt)
      *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
      *   - Given D = NFD(C), then it is not the case that:
      *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
      *     (This third criterium does not add any characters to the list
      *      for Unicode 3.2. Ignored.)
      *
      * D2. A character C is defined to be case-ignorable
      *     if it meets either of the following criteria:
      *
      *   - The general category of C is
      *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf),
      *     or Letter Modifier (Lm), or Symbol Modifier (Sk)
      *   - C is one of the following characters
      *     U+0027 APOSTROPHE
      *     U+00AD SOFT HYPHEN (SHY)
      *     U+2019 RIGHT SINGLE QUOTATION MARK
      *            (the preferred character for apostrophe)
      *
      * D3. A case-ignorable sequence is a sequence of
      *     zero or more case-ignorable characters.
      */

     /**
     * Determines if a string at offset is preceded by any soft dotted character
     * with no intervening character with combining class = 230
     * @param uchariter text iterator to be determined
     * @param offset offset in string to check
     * @return true if some characters preceding the offset index belongs to
     *         the set of soft dotted characters with no intervening character
     * @see SpecialCasing.txt
     */
     private boolean isPrecededBySoftDotted(
                                 UCharacterIterator uchariter, int offset)
     {
     	uchariter.setIndex(offset);

     	int ch = uchariter.previousCodePoint();

         while (ch != UCharacterIterator.DONE) {
             if (isSoftDotted(ch)) {
                 return true; // preceded by TYPE_i
             }

             int cc = NormalizerImpl.getCombiningClass(ch);
             if (cc == 0 || cc == COMBINING_MARK_ABOVE_CLASS_) {
                 // preceded by different base character not TYPE_i), or
                 // intervening cc == 230
                 return false;
             }
             ch = uchariter.previousCodePoint();
         }

         return false; // not preceded by TYPE_i
     }

     /**
     * Determines if codepoint at offset is not followed by a sequence
     * consisting of an ignorable sequence and then a cased letter
     * {Ll, Lu, Lt}.
     * @param uchariter String iterator to determine
     * @param offset codepoint offset in string to check
     * @return false if any character after offset in src is a cased letter
     * @see SpecialCasing.txt
     */
     private boolean isCFINAL(UCharacterIterator uchariter, int offset)
     {
     	// iterator should have been determined to be not null by caller
         uchariter.setIndex(offset);
     	uchariter.nextCodePoint(); // rid of current codepoint
         int ch = uchariter.nextCodePoint(); // start checking

     	while (ch != UCharacterIterator.DONE) {
             int cat = getProperty(ch) & TYPE_MASK;
             if (isCased(ch, cat)) {
                 return false; // followed by cased letter
             }
             if (!isCaseIgnorable(ch, cat)) {
                 return true; // not ignorable
             }
             ch = uchariter.nextCodePoint();
         }

         return true;
     }

     /**
     * Determines if codepoint at offset is not preceded by a sequence
     * consisting of a cased letter {Ll, Lu, Lt} and an ignorable sequence.
     * @param uchariter string iterator to determine
     * @param offset codepoint offset in string to check
     * @return true if any character before index in src is a cased letter
     * @see SpecialCasing.txt
     */
     private boolean isNotCINITIAL(UCharacterIterator uchariter,
                                          int offset)
     {
     	uchariter.setIndex(offset);
     	int ch = uchariter.previousCodePoint();

         while (ch != UCharacterIterator.DONE) {
             int cat = getProperty(ch) & TYPE_MASK;
             if (isCased(ch, cat)) {
                 return true; // preceded by cased letter
             }
             if (!isCaseIgnorable(ch, cat)) {
                 return false; // not ignorable
             }
 			ch = uchariter.previousCodePoint();
         }

         return false;
     }

     /**
      * <p>
      * See Jitterbug 2344:
      * The condition After_I for Turkic-lowercasing of U+0307 combining dot
      * above is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
      * we made those releases compatible with Unicode 3.2 which had not fixed
      * a related but in SpecialCasing.txt.
      * </p>
      * <p>
      * From the Jitterbug 2344 text:
      * ... this bug is listed as a Unicode erratum
      * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
      * </p>
      * <quote>
      * There are two errors in SpecialCasing.txt.
      * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
      * 2. An incorrect context definition. Correct as follows:
      * &lt; 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
      * &lt; 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
      * ---
      * &gtr; 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
      * &gtr; 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
      * where the context After_I is defined as:
      * The last preceding base character was an uppercase I, and there is no
      * intervening combining character class 230 (ABOVE).
      * </quote>
      * <p>
      * Note that SpecialCasing.txt even in Unicode 3.2 described the condition
      * as:
      * </p>
      * <p>
      * <ul>
      * <li> When lowercasing, remove dot_above in the sequence I + dot_above,
      *      which will turn into i.
      * <li> This matches the behavior of the canonically equivalent I-dot_above
      * </ul>
      * See also the description in this place in older versions of uchar.c
      * (revision 1.100).
      * </p>
      * Markus W. Scherer 2003-feb-15
      */

     /**
      * Is preceded by base character 'I' with no intervening cc=230 ?
      * @param uchariter string iterator to determine
      * @param offset codepoint offset in string to check
      */
     private boolean isPrecededByI(UCharacterIterator uchariter, int offset)
     {
         uchariter.setIndex(offset);
         for(;;) {
             int c = uchariter.previousCodePoint();
             if (c < 0) {
                 break;
             }
             if (c == LATIN_CAPITAL_LETTER_I_) {
                 return true; // preceded by I
             }

             int cc = NormalizerImpl.getCombiningClass(c);
             if (cc == 0 || cc == COMBINING_MARK_ABOVE_CLASS_) {
                 // preceded by different base character (not I),
                 // or intervening cc==230
                 return false;
             }
         }

         return false; // not preceded by I
     }

     /**
     * Determines if a codepoint at offset in string is followed by one or
     * more characters of combining class = 230.
     * @param uchariter text iterator to be determined
     * @param offset codepoint offset in string to check
     * @return true if a string at offset is followed by one or more characters
     *         of combining class = 230.
     * @see SpecialCasing.txt
     */
     private static boolean isFollowedByMOREABOVE(UCharacterIterator uchariter,
                                                  int offset)
     {
         uchariter.setIndex(offset);
         uchariter.nextCodePoint(); // rid of current codepoint
         int ch = uchariter.nextCodePoint(); // start checking

         while (ch != UCharacterIterator.DONE) {
             int cc = NormalizerImpl.getCombiningClass(ch);
             if (cc == COMBINING_MARK_ABOVE_CLASS_) {
                 return true; // at least one cc==230 following
             }
             if (cc == 0) {
                 return false; // next base character, no more cc==230 following
             }
             ch = uchariter.nextCodePoint();
         }

         return false; // no more cc == 230 following
     }

     /**
     * Determines if a codepoint at offset in string is followed by a dot
     * above with no characters of combining class == 230 in between
     * @param uchariter text iterator to be determined
     * @param offset codepoint offset of the character in string to check
     * @return true if a string at offset is followed by oa dot above
     *         with no characters of combining class == 230 in between
     * @see SpecialCasing.txt
     */
     private static boolean isFollowedByDotAbove(UCharacterIterator uchariter,
                                                 int offset)
     {
         uchariter.setIndex(offset);
         uchariter.nextCodePoint(); // rid off current character
         int ch = uchariter.nextCodePoint(); // start checking

         while (ch != UCharacterIterator.DONE) {
             if (ch == COMBINING_DOT_ABOVE_) {
                 return true;
             }
             int cc = NormalizerImpl.getCombiningClass(ch);
             if (cc == 0 || cc == COMBINING_MARK_ABOVE_CLASS_) {
                 return false; // next base character or cc==230 in between
             }
             ch = uchariter.nextCodePoint();
         }

         return false; // no dot above following
     }

     /**
     * Checks if the case ignorable
     * @param ch codepoint
     * @param cat category of the argument codepoint
     * @return true if ch is case ignorable.
     */
     private static boolean isCaseIgnorable(int ch, int cat)
     {
         return cat == UCharacterCategory.NON_SPACING_MARK
                || cat == UCharacterCategory.ENCLOSING_MARK
                || cat == UCharacterCategory.FORMAT
                || cat == UCharacterCategory.MODIFIER_LETTER
                || cat == UCharacterCategory.MODIFIER_SYMBOL
                || ch == 0x27 || ch == 0xad || ch == 0x2019;
     }

     /**
      * Is this a "cased" character?
      * @param ch codepoint
      * @param cat category of the argument
      * @return true if ch is a cased character
      */
     private boolean isCased(int ch, int cat)
     {
         // Lt + Uppercase + Lowercase = Lt + Lu + Ll
         // + Other_Uppercase+Other_Lowercase

         boolean result = (cat == UCharacterCategory.TITLECASE_LETTER
                || cat == UCharacterCategory.UPPERCASE_LETTER
                || cat == UCharacterCategory.LOWERCASE_LETTER);
         if (result) {
             return result;
         }
         int prop = getAdditional(ch, 1);
         return compareAdditionalType(prop, UPPERCASE_PROPERTY_)
                || compareAdditionalType(prop, LOWERCASE_PROPERTY_);
     }

     /**
      * Is Soft_Dotted?
      * @param ch codepoint
      * @return true if ch is soft dotted
      */
     private boolean isSoftDotted(int ch) {
         return compareAdditionalType(getAdditional(ch, 1),
                                      SOFT_DOTTED_PROPERTY_);
     }

     /* Is followed by {case-ignorable}* cased  ? */
     /**
     * Getting the correct address for data in the exception value
     * @param evalue exception value
     * @param indicator type of data to retrieve
     * @param address current address to move from
     * @return the correct address
     */
     private int addExceptionOffset(int evalue, int indicator, int address)
     {
         int result = address;
         if (indicator >= EXC_GROUP_) {
         result += FLAGS_OFFSET_[evalue & EXC_GROUP_MASK_];
         evalue >>= EXC_GROUP_;
         indicator -= EXC_GROUP_;
         }
         int mask = (1 << indicator) - 1;
         result += FLAGS_OFFSET_[evalue & mask];
         return result;
     }

     /**
      * Compare additional properties to see if it has argument type
      * @param property 32 bit properties
      * @param type character type
      * @return true if property has type
      */
     private boolean compareAdditionalType(int property, int type)
     {
     	return (property & (1 << type)) != 0;
     }


 	private static final int TAB     = 0x0009;
 	private static final int LF      = 0x000a;
 	private static final int FF      = 0x000c;
 	private static final int CR      = 0x000d;
 	private static final int U_A     = 0x0041;
 	private static final int U_Z     = 0x005a;
 	private static final int U_a     = 0x0061;
 	private static final int U_z     = 0x007a;
 	private static final int DEL     = 0x007f;
 	private static final int NL      = 0x0085;
 	private static final int NBSP    = 0x00a0;
 	private static final int CGJ     = 0x034f;
 	private static final int FIGURESP= 0x2007;
 	private static final int HAIRSP  = 0x200a;
 	private static final int ZWNJ    = 0x200c;
 	private static final int ZWJ     = 0x200d;
 	private static final int RLM     = 0x200f;
 	private static final int NNBSP   = 0x202f;
 	private static final int WJ      = 0x2060;
 	private static final int INHSWAP = 0x206a;
 	private static final int NOMDIG  = 0x206f;
 	private static final int ZWNBSP  = 0xfeff;

     public UnicodeSet addPropertyStarts(UnicodeSet set) {
         int c;

         /* add the start code point of each same-value range of each trie */
         //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
         TrieIterator propsIter = new TrieIterator(m_trie_);
         RangeValueIterator.Element propsResult = new RangeValueIterator.Element();
       	while(propsIter.next(propsResult)){
             set.add(propsResult.start);
         }
         //utrie_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, set);
         TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
         RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
         while(propsVectorsIter.next(propsVectorsResult)){
             set.add(propsVectorsResult.start);
         }


         /* add code points with hardcoded properties, plus the ones following them */

     	/* add for IS_THAT_CONTROL_SPACE() */
 	    set.add(TAB); /* range TAB..CR */
 	    set.add(CR+1);
 	    set.add(0x1c);
 	    set.add(0x1f+1);
 	    set.add(NL);
 	    set.add(NL+1);

 	    /* add for u_isIDIgnorable() what was not added above */
 	    set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
 	    set.add(HAIRSP);
 	    set.add(RLM+1);
 	    set.add(INHSWAP);
 	    set.add(NOMDIG+1);
 	    set.add(ZWNBSP);
 	    set.add(ZWNBSP+1);

 	    /* add no-break spaces for u_isWhitespace() what was not added above */
 	    set.add(NBSP);
 	    set.add(NBSP+1);
 	    set.add(FIGURESP);
 	    set.add(FIGURESP+1);
 	    set.add(NNBSP);
 	    set.add(NNBSP+1);

 	    /* add for u_charDigitValue() */
 	    set.add(0x3007);
 	    set.add(0x3008);
 	    set.add(0x4e00);
 	    set.add(0x4e01);
 	    set.add(0x4e8c);
 	    set.add(0x4e8d);
 	    set.add(0x4e09);
 	    set.add(0x4e0a);
 	    set.add(0x56db);
 	    set.add(0x56dc);
 	    set.add(0x4e94);
 	    set.add(0x4e95);
 	    set.add(0x516d);
 	    set.add(0x516e);
 	    set.add(0x4e03);
 	    set.add(0x4e04);
 	    set.add(0x516b);
 	    set.add(0x516c);
 	    set.add(0x4e5d);
 	    set.add(0x4e5e);

 	    /* add for u_digit() */
 	    set.add(U_a);
 	    set.add(U_z+1);
 	    set.add(U_A);
 	    set.add(U_Z+1);

 	    /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
 	    set.add(WJ); /* range WJ..NOMDIG */
 	    set.add(0xfff0);
 	    set.add(0xfffb+1);
 	    set.add(0xe0000);
 	    set.add(0xe0fff+1);

 	    /* add for UCHAR_GRAPHEME_BASE and others */
 	    set.add(CGJ);
 	    set.add(CGJ+1);

 	    /* add for UCHAR_JOINING_TYPE */
 	    set.add(ZWNJ); /* range ZWNJ..ZWJ */
 	    set.add(ZWJ+1);

 	    /* add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE */
 	    set.add(0x1100);
 	    int value= UCharacter.HangulSyllableType.LEADING_JAMO;
 	    int value2;
 	    for(c=0x115a; c<=0x115f; ++c) {
 	        value2= UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
 	        if(value!=value2) {
 	            value=value2;
 	            set.add(c);
 	        }
 	    }

 	    set.add(0x1160);
 	    value=UCharacter.HangulSyllableType.VOWEL_JAMO;
 	    for(c=0x11a3; c<=0x11a7; ++c) {
 	        value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
 	        if(value!=value2) {
 	            value=value2;
 	            set.add(c);
 	        }
 	    }

 	    set.add(0x11a8);
 	    value=UCharacter.HangulSyllableType.TRAILING_JAMO;
 	    for(c=0x11fa; c<=0x11ff; ++c) {
 	        value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
 	        if(value!=value2) {
 	            value=value2;
 	            set.add(c);
 	        }
 	    }


 	    /*
 	     * Omit code points for u_charCellWidth() because
 	     * - it is deprecated and not a real Unicode property
 	     * - they are probably already set from the trie enumeration
 	     */

 	    /*
 	     * Omit code points with hardcoded specialcasing properties
 	     * because we do not build property UnicodeSets for them right now.
 	     */
         return set; // for chaining
     }
 /*----------------------------------------------------------------
  * Inclusions list
  *----------------------------------------------------------------*/

     /*
      * Return a set of characters for property enumeration.
      * The set implicitly contains 0x110000 as well, which is one more than the highest
      * Unicode code point.
      *
      * This set is used as an ordered list - its code points are ordered, and
      * consecutive code points (in Unicode code point order) in the set define a range.
      * For each two consecutive characters (start, limit) in the set,
      * all of the UCD/normalization and related properties for
      * all code points start..limit-1 are all the same,
      * except for character names and ISO comments.
      *
      * All Unicode code points U+0000..U+10ffff are covered by these ranges.
      * The ranges define a partition of the Unicode code space.
      * ICU uses the inclusions set to enumerate properties for generating
      * UnicodeSets containing all code points that have a certain property value.
      *
      * The Inclusion List is generated from the UCD. It is generated
      * by enumerating the data tries, and code points for hardcoded properties
      * are added as well.
      *
      * --------------------------------------------------------------------------
      *
      * The following are ideas for getting properties-unique code point ranges,
      * with possible optimizations beyond the current implementation.
      * These optimizations would require more code and be more fragile.
      * The current implementation generates one single list (set) for all properties.
      *
      * To enumerate properties efficiently, one needs to know ranges of
      * repetitive values, so that the value of only each start code point
      * can be applied to the whole range.
      * This information is in principle available in the uprops.icu/unorm.icu data.
      *
      * There are two obstacles:
      *
      * 1. Some properties are computed from multiple data structures,
      *    making it necessary to get repetitive ranges by intersecting
      *    ranges from multiple tries.
      *
      * 2. It is not economical to write code for getting repetitive ranges
      *    that are precise for each of some 50 properties.
      *
      * Compromise ideas:
      *
      * - Get ranges per trie, not per individual property.
      *   Each range contains the same values for a whole group of properties.
      *   This would generate currently five range sets, two for uprops.icu tries
      *   and three for unorm.icu tries.
      *
      * - Combine sets of ranges for multiple tries to get sufficient sets
      *   for properties, e.g., the uprops.icu main and auxiliary tries
      *   for all non-normalization properties.
      *
      * Ideas for representing ranges and combining them:
      *
      * - A UnicodeSet could hold just the start code points of ranges.
      *   Multiple sets are easily combined by or-ing them together.
      *
      * - Alternatively, a UnicodeSet could hold each even-numbered range.
      *   All ranges could be enumerated by using each start code point
      *   (for the even-numbered ranges) as well as each limit (end+1) code point
      *   (for the odd-numbered ranges).
      *   It should be possible to combine two such sets by xor-ing them,
      *   but no more than two.
      *
      * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
      * but the first one is certainly simpler and applicable for combining more than
      * two range sets.
      *
      * It is possible to combine all range sets for all uprops/unorm tries into one
      * set that can be used for all properties.
      * As an optimization, there could be less-combined range sets for certain
      * groups of properties.
      * The relationship of which less-combined range set to use for which property
      * depends on the implementation of the properties and must be hardcoded
      * - somewhat error-prone and higher maintenance but can be tested easily
      * by building property sets "the simple way" in test code.
      *
      * ---
      *
      * Do not use a UnicodeSet pattern because that causes infinite recursion;
      * UnicodeSet depends on the inclusions set.
      */
     public UnicodeSet getInclusions() {
         UnicodeSet set = new UnicodeSet();
         NormalizerImpl.addPropertyStarts(set);
         addPropertyStarts(set);
         return set;
     }

 }