/**
*******************************************************************************
* Copyright (C) 1996-2005, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/

package com.ibm.icu.impl;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.Locale;
import java.util.MissingResourceException;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.VersionInfo;

/**
* <p>Internal class used for Unicode character property database.</p>
* <p>This classes store binary data read from uprops.icu.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, array of char is used
* to store the binary data.</p>
* <p>UCharacterPropertyDB also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into more meaning form lies on
* <a href=UCharacter.html>UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
* @draft 2.1
*/

public final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /**
    * Trie data
    */
    public CharTrie m_trie_;
    /**
     * Optimization
     * CharTrie index array
     */
    public char[] m_trieIndex_;
    /**
     * Optimization
     * CharTrie data array
     */
    public char[] m_trieData_;
    /**
     * Optimization
     * CharTrie data offset
     */
    public int m_trieInitialValue_;
    /**
    * Unicode version
    */
    public VersionInfo m_unicodeVersion_;
    /**
    * Latin capital letter i with dot above
    */
    public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
    /**
    * Latin small letter i with dot above
    */
    public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
    /**
    * Latin lowercase i
    */
    public static final char LATIN_SMALL_LETTER_I_ = 0x69;
    /**
    * Character type mask
    */
    public static final int TYPE_MASK = 0x1F;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** No source, not a supported property. */
    public static final int SRC_NONE=0;
    /** From uchar.c/uprops.icu main trie */
    public static final int SRC_CHAR=1;
    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** Hangul_Syllable_Type, from uchar.c/uprops.icu */
    public static final int SRC_HST=3;
    /** From unames.c/unames.icu */
    public static final int SRC_NAMES=4;
    /** From unorm.cpp/unorm.icu */
    public static final int SRC_NORM=5;
    /** From ucase.c/ucase.icu */
    public static final int SRC_CASE=6;
    /** From ubidi_props.c/ubidi.icu */
    public static final int SRC_BIDI=7;
    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
    public static final int SRC_CHAR_AND_PROPSVEC=8;
    /** One more than the highest UPropertySource (SRC_) constant. */
    public static final int SRC_COUNT=9;

    // public methods ----------------------------------------------------

    /**
     * Java friends implementation
     */
    public void setIndexData(CharTrie.FriendAgent friendagent)
    {
        m_trieIndex_ = friendagent.getPrivateIndex();
        m_trieData_ = friendagent.getPrivateData();
        m_trieInitialValue_ = friendagent.getPrivateInitialValue();
    }

    /**
    * Gets the property value at the index.
    * This is optimized.
    * Note this is alittle different from CharTrie the index m_trieData_
    * is never negative.
    * @param ch code point whose property value is to be retrieved
    * @return property value of code point
    */
    public final int getProperty(int ch)
    {
        if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
            || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
                && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
            // BMP codepoint 0000..D7FF or DC00..FFFF
            // optimized
            try { // using try for ch < 0 is faster than using an if statement
                return m_trieData_[
                    (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
                          << Trie.INDEX_STAGE_2_SHIFT_)
                    + (ch & Trie.INDEX_STAGE_3_MASK_)];
            } catch (ArrayIndexOutOfBoundsException e) {
                return m_trieInitialValue_;
            }
        }
        if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
            // lead surrogate D800..DBFF
            return m_trieData_[
                    (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
                                  + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
                          << Trie.INDEX_STAGE_2_SHIFT_)
                    + (ch & Trie.INDEX_STAGE_3_MASK_)];
        }
        if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
            // supplementary code point 10000..10FFFF
            // look at the construction of supplementary characters
            // trail forms the ends of it.
            return m_trie_.getSurrogateValue(
                                          UTF16.getLeadSurrogate(ch),
                                          (char)(ch & Trie.SURROGATE_MASK_));
        }
        // ch is out of bounds
        // return m_dataOffset_ if there is an error, in this case we return
        // the default value: m_initialValue_
        // we cannot assume that m_initialValue_ is at offset 0
        // this is for optimization.
        return m_trieInitialValue_;

        // this all is an inlined form of return m_trie_.getCodePointValue(ch);
    }

    /**
    * Getting the signed numeric value of a character embedded in the property
    * argument
    * @param prop the character
    * @return signed numberic value
    */
    public static int getSignedValue(int prop)
    {
        return ((short)prop >> VALUE_SHIFT_);
    }

    /**
    * Getting the unsigned numeric value of a character embedded in the property
    * argument
    * @param prop the character
    * @return unsigned numberic value
    */
    ///CLOVER:OFF
    public static int getUnsignedValue(int prop)
    {
        return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
    }
    ///CLOVER:ON

    /* internal numeric pseudo-types for special encodings of numeric values */
    public static final int NT_FRACTION=4; /* ==UCharacter.NumericType.COUNT, must not change unless binary format version changes */
    public static final int NT_LARGE=5;
    public static final int NT_COUNT=6;

    /**
     * Gets the unicode additional properties.
     * C version getUnicodeProperties.
     * @param codepoint codepoint whose additional properties is to be
     *                  retrieved
     * @param column
     * @return unicode properties
     */
       public int getAdditional(int codepoint, int column) {
        if (column == -1) {
            return getProperty(codepoint);
        }
           if (column < 0 || column >= m_additionalColumnsCount_) {
           return 0;
       }
       return m_additionalVectors_[
                     m_additionalTrie_.getCodePointValue(codepoint) + column];
       }

    static final int MY_MASK = UCharacterProperty.TYPE_MASK
        & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
            (1<<UCharacterCategory.LOWERCASE_LETTER) |
            (1<<UCharacterCategory.TITLECASE_LETTER) |
            (1<<UCharacterCategory.MODIFIER_LETTER) |
            (1<<UCharacterCategory.OTHER_LETTER));


       /**
     * <p>Get the "age" of the code point.</p>
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.</p>
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.</p>
     * <p>The data is from the UCD file DerivedAge.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * @param codepoint The code point.
     * @return the Unicode version number
     * @draft ICU 2.1
     */
    public VersionInfo getAge(int codepoint)
    {
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    private static final long UNSIGNED_INT_MASK = 0xffffffffL;

    private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
    private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
    private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
    private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
    private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
    private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
    /** Mask constant for multiple UCharCategory bits (Z Separators). */
    private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;

    /**
     * Checks if c is in
     * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
     * with space=\p{Whitespace} and Control=Cc.
     * Implements UCHAR_POSIX_GRAPH.
     * @internal
     */
    private static final boolean isgraphPOSIX(int c) {
        /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
        /* comparing ==0 returns FALSE for the categories mentioned */
        return (getMask(UCharacter.getType(c))&
                (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
               ==0;
    }

    private static final class BinaryProperties{
       int column;
       long mask;
       public BinaryProperties(int column,long mask){
               this.column = column;
               this.mask  = mask;
       }
   }
   BinaryProperties[] binProps={
       /*
        * column and mask values for binary properties from u_getUnicodeProperties().
        * Must be in order of corresponding UProperty,
        * and there must be exacly one entry per binary UProperty.
        */
       new BinaryProperties(  1,                (  1 << ALPHABETIC_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << ASCII_HEX_DIGIT_PROPERTY_) ),
       new BinaryProperties( SRC_BIDI,   0 ),                                       /* UCHAR_BIDI_CONTROL */
       new BinaryProperties( SRC_BIDI,   0 ),                                       /* UCHAR_BIDI_MIRRORED */
       new BinaryProperties(  1,                (  1 << DASH_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << DEPRECATED_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << DIACRITIC_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << EXTENDER_PROPERTY_) ),
       new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_FULL_COMPOSITION_EXCLUSION */
       new BinaryProperties(  1,                (  1 << GRAPHEME_BASE_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << GRAPHEME_EXTEND_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << GRAPHEME_LINK_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << HEX_DIGIT_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << HYPHEN_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << ID_CONTINUE_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << ID_START_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << IDEOGRAPHIC_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << IDS_BINARY_OPERATOR_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << IDS_TRINARY_OPERATOR_PROPERTY_) ),
       new BinaryProperties( SRC_BIDI,   0 ),                                       /* UCHAR_JOIN_CONTROL */
       new BinaryProperties(  1,                (  1 << LOGICAL_ORDER_EXCEPTION_PROPERTY_) ),
       new BinaryProperties( SRC_CASE,   0 ),                                       /* UCHAR_LOWERCASE */
       new BinaryProperties(  1,                (  1 << MATH_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << NONCHARACTER_CODE_POINT_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << QUOTATION_MARK_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << RADICAL_PROPERTY_) ),
       new BinaryProperties( SRC_CASE,   0 ),                                       /* UCHAR_SOFT_DOTTED */
       new BinaryProperties(  1,                (  1 << TERMINAL_PUNCTUATION_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << UNIFIED_IDEOGRAPH_PROPERTY_) ),
       new BinaryProperties( SRC_CASE,   0 ),                                       /* UCHAR_UPPERCASE */
       new BinaryProperties(  1,                (  1 << WHITE_SPACE_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << XID_CONTINUE_PROPERTY_) ),
       new BinaryProperties(  1,                (  1 << XID_START_PROPERTY_) ),
       new BinaryProperties( SRC_CASE,   0 ),                                       /* UCHAR_CASE_SENSITIVE */
       new BinaryProperties(  2,                (  1 << V2_S_TERM_PROPERTY_) ),
       new BinaryProperties(  2,                (  1 << V2_VARIATION_SELECTOR_PROPERTY_) ),
       new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_NFD_INERT */
       new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_NFKD_INERT */
       new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_NFC_INERT */
       new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_NFKC_INERT */
       new BinaryProperties( SRC_NORM,   0 ),                                       /* UCHAR_SEGMENT_STARTER */
       new BinaryProperties(  2,                (  1 << V2_PATTERN_SYNTAX) ),
       new BinaryProperties(  2,                (  1 << V2_PATTERN_WHITE_SPACE) ),
       new BinaryProperties( SRC_CHAR_AND_PROPSVEC,  0 ),                           /* UCHAR_POSIX_ALNUM */
       new BinaryProperties( SRC_CHAR,  0 ),                                        /* UCHAR_POSIX_BLANK */
       new BinaryProperties( SRC_CHAR,  0 ),                                        /* UCHAR_POSIX_GRAPH */
       new BinaryProperties( SRC_CHAR,  0 ),                                        /* UCHAR_POSIX_PRINT */
       new BinaryProperties( SRC_CHAR,  0 )                                         /* UCHAR_POSIX_XDIGIT */
   };


    /**
     * <p>Check a binary Unicode property for a code point.</p>
     * <p>Unicode, especially in version 3.2, defines many more properties
     * than the original set in UnicodeData.txt.</p>
     * <p>This API is intended to reflect Unicode properties as defined in
     * the Unicode Character Database (UCD) and Unicode Technical Reports
     * (UTR).</p>
     * <p>For details about the properties see
     * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
     * <p>For names of Unicode properties see the UCD file
     * PropertyAliases.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * <p>Important: If ICU is built with UCD files from Unicode versions
     * below 3.2, then properties marked with "new" are not or
     * not fully available.</p>
     * @param codepoint Code point to test.
     * @param property selector constant from com.ibm.icu.lang.UProperty,
     *        identifies which binary property to check.
     * @return true or false according to the binary Unicode property value
     *         for ch. Also false if property is out of bounds or if the
     *         Unicode version does not have data for the property at all, or
     *         not for this code point.
     * @see com.ibm.icu.lang.UProperty
     * @draft ICU 2.1
     */

    public boolean hasBinaryProperty(int codepoint, int property)
    {
         if(property <UProperty.BINARY_START || UProperty.BINARY_LIMIT<=property) {
            // not a known binary property
            return false;
        } else {
            long mask=binProps[property].mask;
            int column=binProps[property].column;
            if(mask!=0) {
                // systematic, directly stored properties
                return ((UNSIGNED_INT_MASK & getAdditional(codepoint, column)) & mask)!=0;
            } else {
                if(column==SRC_CASE) {
                    /* case mapping properties */
                    UCaseProps csp;
                    try {
                        csp = UCaseProps.getSingleton();
                    } catch (IOException e) {
                        return false;
                    }
                    switch(property) {
                    case UProperty.LOWERCASE:
                        return UCaseProps.LOWER==csp.getType(codepoint);
                    case UProperty.UPPERCASE:
                        return UCaseProps.UPPER==csp.getType(codepoint);
                    case UProperty.SOFT_DOTTED:
                        return csp.isSoftDotted(codepoint);
                    case UProperty.CASE_SENSITIVE:
                        return csp.isCaseSensitive(codepoint);
                    default:
                        break;
                    }
                } else if(column==SRC_NORM) {
                    /* normalization properties from unorm.icu */
                    switch(property) {
                    case UProperty.FULL_COMPOSITION_EXCLUSION:
                        return NormalizerImpl.isFullCompositionExclusion(codepoint);
                    case UProperty.NFD_INERT:
                        return Normalizer.isNFSkippable(codepoint, Normalizer.NFD);
                    case UProperty.NFKD_INERT:
                        return Normalizer.isNFSkippable(codepoint, Normalizer.NFKD);
                    case UProperty.NFC_INERT:
                        return Normalizer.isNFSkippable(codepoint, Normalizer.NFC);
                    case UProperty.NFKC_INERT:
                        return Normalizer.isNFSkippable(codepoint, Normalizer.NFKC);
                    case UProperty.SEGMENT_STARTER:
                        return NormalizerImpl.isCanonSafeStart(codepoint);
                    default:
                        break;
                    }
                } else if(column==SRC_BIDI) {
                    /* bidi/shaping properties */
					UBiDiProps bdp;
					try {
						bdp = UBiDiProps.getSingleton();
					} catch (IOException e) {
						return false;
					}
                    switch(property) {
                    case UProperty.BIDI_MIRRORED:
                        return bdp.isMirrored(codepoint);
                    case UProperty.BIDI_CONTROL:
                        return bdp.isBidiControl(codepoint);
                    case UProperty.JOIN_CONTROL:
                        return bdp.isJoinControl(codepoint);
                    default:
                        break;
                    }
                } else if(column==SRC_CHAR) {
                    switch(property) {
                    case UProperty.POSIX_BLANK:
                        // "horizontal space"
                        if(codepoint<=0x9f) {
                            return codepoint==9 || codepoint==0x20; /* TAB or SPACE */
                        } else {
                            /* Zs */
                            return UCharacter.getType(codepoint)==UCharacter.SPACE_SEPARATOR;
                        }
                    case UProperty.POSIX_GRAPH:
                        return isgraphPOSIX(codepoint);
                    case UProperty.POSIX_PRINT:
                        /*
                         * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
                         *
                         * The only cntrl character in graph+blank is TAB (in blank).
                         * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
                         */
                        return (UCharacter.getType(codepoint)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(codepoint);
                    case UProperty.POSIX_XDIGIT:
                        /* check ASCII and Fullwidth ASCII a-fA-F */
                        if(
                            (codepoint<=0x66 && codepoint>=0x41 && (codepoint<=0x46 || codepoint>=0x61)) ||
                            (codepoint>=0xff21 && codepoint<=0xff46 && (codepoint<=0xff26 || codepoint>=0xff41))
                        ) {
                            return true;
                        }
    
                        return UCharacter.getType(codepoint)==UCharacter.DECIMAL_DIGIT_NUMBER;
                    default:
                        break;
                    }
                } else if(column==SRC_CHAR_AND_PROPSVEC) {
                    switch(property) {
                    case UProperty.POSIX_ALNUM:
                        return UCharacter.isUAlphabetic(codepoint) || UCharacter.isDigit(codepoint);
                    default:
                        break;
                    }
                }
            }
        }
        return false;
    }

    public final int getSource(int which) {
        if(which<UProperty.BINARY_START) {
            return SRC_NONE; /* undefined */
        } else if(which<UProperty.BINARY_LIMIT) {
            if(binProps[which].mask!=0) {
                return SRC_PROPSVEC;
            } else {
                return binProps[which].column;
            }
        } else if(which<UProperty.INT_START) {
            return SRC_NONE; /* undefined */
        } else if(which<UProperty.INT_LIMIT) {
            switch(which) {
            case UProperty.GENERAL_CATEGORY:
            case UProperty.NUMERIC_TYPE:
                return SRC_CHAR;

            case UProperty.HANGUL_SYLLABLE_TYPE:
                return SRC_HST;

            case UProperty.CANONICAL_COMBINING_CLASS:
            case UProperty.NFD_QUICK_CHECK:
            case UProperty.NFKD_QUICK_CHECK:
            case UProperty.NFC_QUICK_CHECK:
            case UProperty.NFKC_QUICK_CHECK:
            case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
            case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
                return SRC_NORM;

            case UProperty.BIDI_CLASS:
            case UProperty.JOINING_GROUP:
            case UProperty.JOINING_TYPE:
                return SRC_BIDI;

            default:
                return SRC_PROPSVEC;
            }
        } else if(which<UProperty.STRING_START) {
            switch(which) {
            case UProperty.GENERAL_CATEGORY_MASK:
            case UProperty.NUMERIC_VALUE:
                return SRC_CHAR;

            default:
                return SRC_NONE;
            }
        } else if(which<UProperty.STRING_LIMIT) {
            switch(which) {
            case UProperty.AGE:
                return SRC_PROPSVEC;

            case UProperty.BIDI_MIRRORING_GLYPH:
                return SRC_BIDI;

            case UProperty.CASE_FOLDING:
            case UProperty.LOWERCASE_MAPPING:
            case UProperty.SIMPLE_CASE_FOLDING:
            case UProperty.SIMPLE_LOWERCASE_MAPPING:
            case UProperty.SIMPLE_TITLECASE_MAPPING:
            case UProperty.SIMPLE_UPPERCASE_MAPPING:
            case UProperty.TITLECASE_MAPPING:
            case UProperty.UPPERCASE_MAPPING:
                return SRC_CASE;

            case UProperty.ISO_COMMENT:
            case UProperty.NAME:
            case UProperty.UNICODE_1_NAME:
                return SRC_NAMES;

            default:
                return SRC_NONE;
            }
        } else {
            return SRC_NONE; /* undefined */
        }
    }

    /**
    * Forms a supplementary code point from the argument character<br>
    * Note this is for internal use hence no checks for the validity of the
    * surrogate characters are done
    * @param lead lead surrogate character
    * @param trail trailing surrogate character
    * @return code point of the supplementary character
    */
    public static int getRawSupplementary(char lead, char trail)
    {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
    * Loads the property data and initialize the UCharacterProperty instance.
    * @throws MissingResourceException when data is missing or data has been corrupted
    */
    public static UCharacterProperty getInstance()
    {
        if(INSTANCE_ == null) {
            try {
                INSTANCE_ = new UCharacterProperty();
            }
            catch (Exception e) {
                throw new MissingResourceException(e.getMessage(),"","");
            }
        }
        return INSTANCE_;
    }

    /**
     * <p>
     * Unicode property names and property value names are compared
     * "loosely". Property[Value]Aliases.txt say:
     * <quote>
     *   "With loose matching of property names, the case distinctions,
     *    whitespace, and '_' are ignored."
     * </quote>
     * </p>
     * <p>
     * This function does just that, for ASCII (char *) name strings.
     * It is almost identical to ucnv_compareNames() but also ignores
     * ASCII White_Space characters (U+0009..U+000d).
     * </p>
     * @param name1 name to compare
     * @param name2 name to compare
     * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
     *         if name1 is greater than name2.
     */
    /* to be implemented in 2.4
     * public static int comparePropertyNames(String name1, String name2)
    {
        int result = 0;
        int i1 = 0;
        int i2 = 0;
        while (true) {
            char ch1 = 0;
            char ch2 = 0;
            // Ignore delimiters '-', '_', and ASCII White_Space
            if (i1 < name1.length()) {
                ch1 = name1.charAt(i1 ++);
            }
            while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
                   || ch1 == '\n' // synwee what is || ch1 == '\v'
                   || ch1 == '\f' || ch1=='\r') {
                if (i1 < name1.length()) {
                    ch1 = name1.charAt(i1 ++);
                }
                else {
                    ch1 = 0;
                }
            }
            if (i2 < name2.length()) {
                ch2 = name2.charAt(i2 ++);
            }
            while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
                   || ch2 == '\n' // synwee what is || ch1 == '\v'
                   || ch2 == '\f' || ch2=='\r') {
                if (i2 < name2.length()) {
                    ch2 = name2.charAt(i2 ++);
                }
                else {
                    ch2 = 0;
                }
            }

            // If we reach the ends of both strings then they match
            if (ch1 == 0 && ch2 == 0) {
                return 0;
            }

            // Case-insensitive comparison
            if (ch1 != ch2) {
                result = Character.toLowerCase(ch1)
                                                - Character.toLowerCase(ch2);
                if (result != 0) {
                    return result;
                }
            }
        }
    }
    */

    /**
     * Checks if the argument c is to be treated as a white space in ICU
     * rules. Usually ICU rule white spaces are ignored unless quoted.
     * Equivalent to test for Pattern_White_Space Unicode property.
     * Stable set of characters, won't change.
     * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
     * @param c codepoint to check
     * @return true if c is a ICU white space
     */
    public static boolean isRuleWhiteSpace(int c)
    {
        /* "white space" in the sense of ICU rule parsers
           This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
           See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
           U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
           Equivalent to test for Pattern_White_Space Unicode property.
        */
        return (c >= 0x0009 && c <= 0x2029 &&
                (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
                 c == 0x200E || c == 0x200F || c >= 0x2028));
    }

    /**
     * Get the the maximum values for some enum/int properties.
     * @return maximum values for the integer properties.
     */
    public int getMaxValues(int column)
    {
       // return m_maxBlockScriptValue_;

        switch(column) {
        case 0:
            return m_maxBlockScriptValue_;
        case 2:
            return m_maxJTGValue_;
        default:
            return 0;
        }
    }

    /**
     * Gets the type mask
     * @param type character type
     * @return mask
     */
    public static final int getMask(int type)
    {
        return 1 << type;
    }

    // protected variables -----------------------------------------------

    /**
     * Extra property trie
     */
    CharTrie m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values for script, bits used as in vector word
     * 0
     */
     int m_maxJTGValue_;
    // private variables -------------------------------------------------

      /**
     * UnicodeData.txt property object
     */
    private static UCharacterProperty INSTANCE_ = null;

    /**
    * Default name of the datafile
    */
    private static final String DATA_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE+"/uprops.icu";

    /**
    * Default buffer size of datafile
    */
    private static final int DATA_BUFFER_SIZE_ = 25000;

    /**
    * Numeric value shift
    */
    private static final int VALUE_SHIFT_ = 8;

    /**
    * Mask to be applied after shifting to obtain an unsigned numeric value
    */
    private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;

    /**
     *
     */
    private static final int NUMERIC_TYPE_SHIFT = 5;

    /**
    * To get the last 5 bits out from a data type
    */
    private static final int LAST_5_BIT_MASK_ = 0x1F;

    /**
    * Shift 5 bits
    */
    private static final int SHIFT_5_ = 5;
    /**
    * Shift 10 bits
    */
    private static final int SHIFT_10_ = 10;

    /**
    * Shift value for lead surrogate to form a supplementary character.
    */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;
    /**
    * Offset to add to combined surrogate pair to avoid msking.
    */
    private static final int SURROGATE_OFFSET_ =
                           UTF16.SUPPLEMENTARY_MIN_VALUE -
                           (UTF16.SURROGATE_MIN_VALUE <<
                           LEAD_SURROGATE_SHIFT_) -
                           UTF16.TRAIL_SURROGATE_MIN_VALUE;
    /**
    * Latin uppercase I
    */
    private static final char LATIN_CAPITAL_LETTER_I_ = 0x49;
    /**
    * Combining dot above
    */
    private static final char COMBINING_DOT_ABOVE_ = 0x307;
    /**
    * LATIN SMALL LETTER J
    */
    private static final int LATIN_SMALL_LETTER_J_ = 0x6a;
    /**
    * LATIN SMALL LETTER I WITH OGONEK
    */
    private static final int LATIN_SMALL_LETTER_I_WITH_OGONEK_ = 0x12f;
    /**
    * LATIN SMALL LETTER I WITH TILDE BELOW
    */
    private static final int LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ = 0x1e2d;
    /**
    * LATIN SMALL LETTER I WITH DOT BELOW
    */
    private static final int LATIN_SMALL_LETTER_I_WITH_DOT_BELOW_ = 0x1ecb;
    /**
    * Combining class for combining mark above
    */
    private static final int COMBINING_MARK_ABOVE_CLASS_ = 230;

    /**
    * LATIN CAPITAL LETTER J
    */
    private static final int LATIN_CAPITAL_LETTER_J_ = 0x4a;

    /**
    * LATIN CAPITAL LETTER I WITH OGONEK
    */
    private static final int LATIN_CAPITAL_I_WITH_OGONEK_ = 0x12e;
    /**
    * LATIN CAPITAL LETTER I WITH TILDE
    */
    private static final int LATIN_CAPITAL_I_WITH_TILDE_ = 0x128;
    /**
    * LATIN CAPITAL LETTER I WITH GRAVE
    */
    private static final int LATIN_CAPITAL_I_WITH_GRAVE_ = 0xcc;
    /**
    * LATIN CAPITAL LETTER I WITH ACUTE
    */
    private static final int LATIN_CAPITAL_I_WITH_ACUTE_ = 0xcd;
    /**
    * COMBINING GRAVE ACCENT
    */
    private static final int COMBINING_GRAVE_ACCENT_ = 0x300;
    /**
    * COMBINING ACUTE ACCENT
    */
    private static final int COMBINING_ACUTE_ACCENT_ = 0x301;
    /**
    * COMBINING TILDE
    */
    private static final int COMBINING_TILDE_ = 0x303;
    /**
    * Greek capital letter sigma
    */
    private static final char GREEK_CAPITAL_LETTER_SIGMA_ = 0x3a3;
    /**
    * Greek small letter sigma
    */
    private static final char GREEK_SMALL_LETTER_SIGMA_ = 0x3c3;
    /**
    * Greek small letter rho
    */
    private static final char GREEK_SMALL_LETTER_RHO_ = 0x3c2;
    /**
    * Hyphens
    */
    private static final int HYPHEN_      = 0x2010;
    private static final int SOFT_HYPHEN_ = 0xAD;
    /**
    * To get the last character out from a data type
    */
    private static final int LAST_CHAR_MASK_ = 0xFFFF;
    /**
    * To get the last byte out from a data type
    */
    private static final int LAST_BYTE_MASK_ = 0xFF;
    /**
    * Shift 16 bits
    */
    private static final int SHIFT_16_ = 16;

    // additional properties ----------------------------------------------

    /**
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
     */
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int BIDI_CONTROL_PROPERTY_ = 1;
    private static final int JOIN_CONTROL_PROPERTY_ = 2;
    private static final int DASH_PROPERTY_ = 3;
    private static final int HYPHEN_PROPERTY_ = 4;
    private static final int QUOTATION_MARK_PROPERTY_ = 5;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 6;
    private static final int MATH_PROPERTY_ = 7;
    private static final int HEX_DIGIT_PROPERTY_ = 8;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 9;
    private static final int ALPHABETIC_PROPERTY_ = 10;
    private static final int IDEOGRAPHIC_PROPERTY_ = 11;
    private static final int DIACRITIC_PROPERTY_ = 12;
    private static final int EXTENDER_PROPERTY_ = 13;
    private static final int LOWERCASE_PROPERTY_ = 14;
    private static final int UPPERCASE_PROPERTY_ = 15;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 16;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 17;
    private static final int GRAPHEME_LINK_PROPERTY_ = 18;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 19;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 20;
    private static final int RADICAL_PROPERTY_ = 21;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 22;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 23;
    private static final int DEPRECATED_PROPERTY_ = 24;
    private static final int SOFT_DOTTED_PROPERTY_ = 25;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 26;
    private static final int XID_START_PROPERTY_ = 27;
    private static final int XID_CONTINUE_PROPERTY_ = 28;
    private static final int ID_START_PROPERTY_    = 29;
    private static final int ID_CONTINUE_PROPERTY_ = 30;
    private static final int GRAPHEME_BASE_PROPERTY_ = 31;
    private static final int BINARY_1_TOP_PROPERTY_ = 32;

    /**
     * First nibble shift
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Second nibble mask
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift
     */
    private static final int AGE_SHIFT_ = 24;

    // boolean properties in vector word 2
    private static final int V2_S_TERM_PROPERTY_ = 24;
    private static final int V2_VARIATION_SELECTOR_PROPERTY_ = 25;
    private static final int V2_PATTERN_SYNTAX = 26;                   /* new in ICU 3.4 and Unicode 4.1 */
    private static final int V2_PATTERN_WHITE_SPACE = 27;

    // private constructors --------------------------------------------------

    /**
    * Constructor
    * @exception thrown when data reading fails or data corrupted
    */
    private UCharacterProperty() throws IOException
    {
        // jar access
        InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
        BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
        UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
        reader.read(this);
        b.close();

        m_trie_.putIndexData(this);
    }

    // private methods -------------------------------------------------------

    /**
     * Compare additional properties to see if it has argument type
     * @param property 32 bit properties
     * @param type character type
     * @return true if property has type
     */
    private boolean compareAdditionalType(int property, int type)
    {
        return (property & (1 << type)) != 0;
    }

    // property starts for UnicodeSet -------------------------------------- ***

    private static final int TAB     = 0x0009;
    private static final int LF      = 0x000a;
    private static final int FF      = 0x000c;
    private static final int CR      = 0x000d;
    private static final int U_A     = 0x0041;
    private static final int U_F     = 0x0046;
    private static final int U_Z     = 0x005a;
    private static final int U_a     = 0x0061;
    private static final int U_f     = 0x0066;
    private static final int U_z     = 0x007a;
    private static final int DEL     = 0x007f;
    private static final int NL      = 0x0085;
    private static final int NBSP    = 0x00a0;
    private static final int CGJ     = 0x034f;
    private static final int FIGURESP= 0x2007;
    private static final int HAIRSP  = 0x200a;
    private static final int ZWNJ    = 0x200c;
    private static final int ZWJ     = 0x200d;
    private static final int RLM     = 0x200f;
    private static final int NNBSP   = 0x202f;
    private static final int WJ      = 0x2060;
    private static final int INHSWAP = 0x206a;
    private static final int NOMDIG  = 0x206f;
    private static final int U_FW_A  = 0xff21;
    private static final int U_FW_F  = 0xff26;
    private static final int U_FW_Z  = 0xff3a;
    private static final int U_FW_a  = 0xff41;
    private static final int U_FW_f  = 0xff46;
    private static final int U_FW_z  = 0xff5a;
    private static final int ZWNBSP  = 0xfeff;

    /* for Hangul_Syllable_Type */
    public void uhst_addPropertyStarts(UnicodeSet set) {
        /* add code points with hardcoded properties, plus the ones following them */

        /*
         * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE.
         * First, we add fixed boundaries for the blocks of Jamos.
         * Then we check in loops to see where the current Unicode version
         * actually stops assigning such Jamos. We start each loop
         * at the end of the per-Jamo-block assignments in Unicode 4 or earlier.
         * (These have not changed since Unicode 2.)
         */
        int c, value, value2;

        set.add(0x1100);
        value=UCharacter.HangulSyllableType.LEADING_JAMO;
        for(c=0x115a; c<=0x115f; ++c) {
            value2= UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
            if(value!=value2) {
                value=value2;
                set.add(c);
            }
        }

        set.add(0x1160);
        value=UCharacter.HangulSyllableType.VOWEL_JAMO;
        for(c=0x11a3; c<=0x11a7; ++c) {
            value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
            if(value!=value2) {
                value=value2;
                set.add(c);
            }
        }

        set.add(0x11a8);
        value=UCharacter.HangulSyllableType.TRAILING_JAMO;
        for(c=0x11fa; c<=0x11ff; ++c) {
            value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
            if(value!=value2) {
                value=value2;
                set.add(c);
            }
        }
    }

    public UnicodeSet addPropertyStarts(UnicodeSet set) {
        int c;

        /* add the start code point of each same-value range of the main trie */
        TrieIterator propsIter = new TrieIterator(m_trie_);
        RangeValueIterator.Element propsResult = new RangeValueIterator.Element();
          while(propsIter.next(propsResult)){
            set.add(propsResult.start);
        }

        /* add code points with hardcoded properties, plus the ones following them */

        /* add for u_isblank() */
        set.add(TAB);
        set.add(TAB+1);

        /* add for IS_THAT_CONTROL_SPACE() */
        set.add(CR+1); /* range TAB..CR */
        set.add(0x1c);
        set.add(0x1f+1);
        set.add(NL);
        set.add(NL+1);

        /* add for u_isIDIgnorable() what was not added above */
        set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
        set.add(HAIRSP);
        set.add(RLM+1);
        set.add(INHSWAP);
        set.add(NOMDIG+1);
        set.add(ZWNBSP);
        set.add(ZWNBSP+1);

        /* add no-break spaces for u_isWhitespace() what was not added above */
        set.add(NBSP);
        set.add(NBSP+1);
        set.add(FIGURESP);
        set.add(FIGURESP+1);
        set.add(NNBSP);
        set.add(NNBSP+1);

        /* add for u_charDigitValue() */
        // TODO remove when UCharacter.getHanNumericValue() is changed to just return
        // Unicode numeric values 
        set.add(0x3007);
        set.add(0x3008);
        set.add(0x4e00);
        set.add(0x4e01);
        set.add(0x4e8c);
        set.add(0x4e8d);
        set.add(0x4e09);
        set.add(0x4e0a);
        set.add(0x56db);
        set.add(0x56dc);
        set.add(0x4e94);
        set.add(0x4e95);
        set.add(0x516d);
        set.add(0x516e);
        set.add(0x4e03);
        set.add(0x4e04);
        set.add(0x516b);
        set.add(0x516c);
        set.add(0x4e5d);
        set.add(0x4e5e);

        /* add for u_digit() */
        set.add(U_a);
        set.add(U_z+1);
        set.add(U_A);
        set.add(U_Z+1);
        set.add(U_FW_a);
        set.add(U_FW_z+1);
        set.add(U_FW_A);
        set.add(U_FW_Z+1);

        /* add for u_isxdigit() */
        set.add(U_f+1);
        set.add(U_F+1);
        set.add(U_FW_f+1);
        set.add(U_FW_F+1);

        /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
        set.add(WJ); /* range WJ..NOMDIG */
        set.add(0xfff0);
        set.add(0xfffb+1);
        set.add(0xe0000);
        set.add(0xe0fff+1);

        /* add for UCHAR_GRAPHEME_BASE and others */
        set.add(CGJ);
        set.add(CGJ+1);

        return set; // for chaining
    }

    public void upropsvec_addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the properties vectors trie */
        if(m_additionalColumnsCount_>0) {
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
            TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
            RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
            while(propsVectorsIter.next(propsVectorsResult)){
                set.add(propsVectorsResult.start);
            }
        }
    }

/*----------------------------------------------------------------
 * Inclusions list
 *----------------------------------------------------------------*/

    /*
     * Return a set of characters for property enumeration.
     * The set implicitly contains 0x110000 as well, which is one more than the highest
     * Unicode code point.
     *
     * This set is used as an ordered list - its code points are ordered, and
     * consecutive code points (in Unicode code point order) in the set define a range.
     * For each two consecutive characters (start, limit) in the set,
     * all of the UCD/normalization and related properties for
     * all code points start..limit-1 are all the same,
     * except for character names and ISO comments.
     *
     * All Unicode code points U+0000..U+10ffff are covered by these ranges.
     * The ranges define a partition of the Unicode code space.
     * ICU uses the inclusions set to enumerate properties for generating
     * UnicodeSets containing all code points that have a certain property value.
     *
     * The Inclusion List is generated from the UCD. It is generated
     * by enumerating the data tries, and code points for hardcoded properties
     * are added as well.
     *
     * --------------------------------------------------------------------------
     *
     * The following are ideas for getting properties-unique code point ranges,
     * with possible optimizations beyond the current implementation.
     * These optimizations would require more code and be more fragile.
     * The current implementation generates one single list (set) for all properties.
     *
     * To enumerate properties efficiently, one needs to know ranges of
     * repetitive values, so that the value of only each start code point
     * can be applied to the whole range.
     * This information is in principle available in the uprops.icu/unorm.icu data.
     *
     * There are two obstacles:
     *
     * 1. Some properties are computed from multiple data structures,
     *    making it necessary to get repetitive ranges by intersecting
     *    ranges from multiple tries.
     *
     * 2. It is not economical to write code for getting repetitive ranges
     *    that are precise for each of some 50 properties.
     *
     * Compromise ideas:
     *
     * - Get ranges per trie, not per individual property.
     *   Each range contains the same values for a whole group of properties.
     *   This would generate currently five range sets, two for uprops.icu tries
     *   and three for unorm.icu tries.
     *
     * - Combine sets of ranges for multiple tries to get sufficient sets
     *   for properties, e.g., the uprops.icu main and auxiliary tries
     *   for all non-normalization properties.
     *
     * Ideas for representing ranges and combining them:
     *
     * - A UnicodeSet could hold just the start code points of ranges.
     *   Multiple sets are easily combined by or-ing them together.
     *
     * - Alternatively, a UnicodeSet could hold each even-numbered range.
     *   All ranges could be enumerated by using each start code point
     *   (for the even-numbered ranges) as well as each limit (end+1) code point
     *   (for the odd-numbered ranges).
     *   It should be possible to combine two such sets by xor-ing them,
     *   but no more than two.
     *
     * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
     * but the first one is certainly simpler and applicable for combining more than
     * two range sets.
     *
     * It is possible to combine all range sets for all uprops/unorm tries into one
     * set that can be used for all properties.
     * As an optimization, there could be less-combined range sets for certain
     * groups of properties.
     * The relationship of which less-combined range set to use for which property
     * depends on the implementation of the properties and must be hardcoded
     * - somewhat error-prone and higher maintenance but can be tested easily
     * by building property sets "the simple way" in test code.
     *
     * ---
     *
     * Do not use a UnicodeSet pattern because that causes infinite recursion;
     * UnicodeSet depends on the inclusions set.
     *
     * ---
     *
     * getInclusions() is commented out starting 2005-feb-12 because
     * UnicodeSet now calls the uxyz_addPropertyStarts() directly,
     * and only for the relevant property source.
     */
    /*
    public UnicodeSet getInclusions() {
        UnicodeSet set = new UnicodeSet();
        NormalizerImpl.addPropertyStarts(set);
        addPropertyStarts(set);
        return set;
    }
    */
}
