| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: |
| * /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterName.java $ |
| * $Date: 2002/03/15 22:48:07 $ |
| * $Revision: 1.15 $ |
| * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.lang; |
| |
| import java.util.Locale; |
| import java.io.InputStream; |
| import java.io.DataInputStream; |
| import java.io.BufferedInputStream; |
| import java.io.IOException; |
| import com.ibm.icu.impl.Utility; |
| import com.ibm.icu.text.UTF16; |
| |
| /** |
| * Internal class to manage character names. |
| * Since data in <a href=UCharacterNameDB.html>UCharacterNameDB</a> is stored |
| * in an array of char, by default the indexes used in this class refer to |
| * a 2 byte count, unless otherwise stated. In cases where the index refers |
| * to a byte count, the index is halved and, depending on whether the index is |
| * even or odd, the MSB or LSB of the resulting char at the halved index is |
| * returned. For indexes into an array of int, the index is multiplied by 2, |
| * and the char at the multiplied index and its following char are returned |
| * as an int. |
| * <a href=UCharacter.html>UCharacter</a> acts as a public facade for this class |
| * Note : 0 - 0x1F are control characters without names in Unicode 3.0 |
| * Information on parsing of the binary data is located at |
| * <a href=oss.software.ibm.com/icu4j/icu4jhtml/com/ibm/icu/text/readme.html> |
| * ReadMe</a> |
| * @author Syn Wee Quek |
| * @since nov0700 |
| */ |
| |
| final class UCharacterName |
| { |
| // public methods ---------------------------------------------------- |
| |
| /** |
| * toString method for printing |
| */ |
| public String toString() |
| { |
| StringBuffer result = new StringBuffer("names content \n"); |
| /*result.append(super.toString()); |
| result.append('\n'); |
| result.append("token string offset "); |
| result.append(m_tokenstringoffset_); |
| result.append("\n"); |
| result.append("group offset "); |
| result.append(m_groupsoffset_); |
| result.append("\n"); |
| result.append("group string offset "); |
| result.append(m_groupstringoffset_); |
| result.append("\n"); |
| result.append("alg names offset "); |
| result.append(m_algnamesoffset_); |
| result.append("\n"); |
| */ |
| return result.toString(); |
| } |
| |
| // package protected inner class ------------------------------------- |
| |
| /** |
| * Algorithmic name class |
| */ |
| static final class AlgorithmName |
| { |
| // protected data members ---------------------------------------- |
| |
| /** |
| * Constant type values of the different AlgorithmName kinds. |
| * For TYPE_0_ the name is the prefix followed by the code point written |
| * in hexadecimal digits; for TYPE_1_ the name is the prefix followed by |
| * factorized elements (see appendName). |
| */ |
| protected static final int TYPE_0_ = 0; |
| protected static final int TYPE_1_ = 1; |
| |
| // protected constructors ---------------------------------------- |
| |
/**
 * Creates an algorithmic name entry with no data set.
 * The range, factors, prefix and factor strings are supplied through
 * the set methods of this class.
 */
protected AlgorithmName() {}
| |
| // protected methods --------------------------------------------- |
| |
| /** |
| * Sets the information for accessing the algorithmic names |
| * @param rangestart starting code point that lies within this name group |
| * @param rangeend end code point that lies within this name group |
| * @param type algorithm type. There's 2 kinds of algorithmic type. First |
| * which uses code point as part of its name and the other uses |
| * variant postfix strings |
| * @param variant algorithmic variant |
| * @return true if values are valid |
| */ |
| protected boolean setInfo(int rangestart, int rangeend, byte type, |
| byte variant) |
| { |
| if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend |
| && rangeend <= UCharacter.MAX_VALUE && |
| (type == TYPE_0_ || type == TYPE_1_)) { |
| m_rangestart_ = rangestart; |
| m_rangeend_ = rangeend; |
| m_type_ = type; |
| m_variant_ = variant; |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Sets the factor data |
| * @param array of factor |
| * @return true if factors are valid |
| */ |
| protected boolean setFactor(char factor[]) |
| { |
| if (factor.length == m_variant_) { |
| m_factor_ = factor; |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Sets the name prefix |
| * @param prefix |
| * @return true if prefix is set |
| */ |
| protected boolean setPrefix(String prefix) |
| { |
| if (prefix != null && prefix.length() > 0) { |
| m_prefix_ = prefix; |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Sets the variant factorized name data |
| * @param string variant factorized name data |
| * @return true if values are set |
| */ |
| protected boolean setFactorString(byte string[]) |
| { |
| // factor and variant string can be empty for things like |
| // hanggul code points |
| m_factorstring_ = string; |
| return true; |
| } |
| |
| /** |
| * Checks if code point lies in Algorithm object at index |
| * @param ch code point |
| */ |
| protected boolean contains(int ch) |
| { |
| return m_rangestart_ <= ch && ch <= m_rangeend_; |
| } |
| |
| /** |
| * Appends algorithm name of code point into StringBuffer. |
| * Note this method does not check for validity of code point in Algorithm, |
| * result is undefined if code point does not belong in Algorithm. |
| * @param ch code point |
| * @param str StringBuffer to append to |
| */ |
| protected void appendName(int ch, StringBuffer str) |
| { |
| str.append(m_prefix_); |
| switch (m_type_) |
| { |
| case TYPE_0_: |
| // prefix followed by hex digits indicating variants |
| Utility.hex(ch, m_variant_, str); |
| break; |
| case TYPE_1_: |
| // prefix followed by factorized-elements |
| int offset = ch - m_rangestart_; |
| int indexes[] = new int[m_variant_]; |
| int factor; |
| |
| // write elements according to the factors |
| // the factorized elements are determined by modulo |
| // arithmetic |
| for (int i = m_variant_ - 1; i > 0; i --) |
| { |
| factor = m_factor_[i] & 0x00FF; |
| indexes[i] = offset % factor; |
| offset /= factor; |
| } |
| |
| // we don't need to calculate the last modulus because |
| // start <= code <= end guarantees here that |
| // code <= factors[0] |
| indexes[0] = offset; |
| |
| // joining up the factorized strings |
| String s[] = getFactorString(indexes); |
| if (s != null && s.length > 0) |
| { |
| int size = s.length; |
| for (int i = 0; i < size; i ++) |
| str.append(s[i]); |
| } |
| break; |
| } |
| } |
| |
| /** |
| * Gets the character for the argument algorithmic name |
| * @return the algorithmic char or -1 otherwise. |
| */ |
| protected int getAlgorithmChar(String name) |
| { |
| int prefixlen = m_prefix_.length(); |
| if (name.length() < prefixlen || |
| !m_prefix_.equals(name.substring(0, prefixlen))) { |
| return -1; |
| } |
| |
| switch (m_type_) |
| { |
| case TYPE_0_ : |
| try |
| { |
| int result = Integer.parseInt(name.substring(prefixlen), |
| 16); |
| // does it fit into the range? |
| if (m_rangestart_ <= result && result <= m_rangeend_) { |
| return result; |
| } |
| } |
| catch (NumberFormatException e) |
| { |
| return -1; |
| } |
| break; |
| case TYPE_1_ : |
| // repetitative suffix name comparison done here |
| // offset is the character code - start |
| for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++) |
| { |
| int offset = ch - m_rangestart_; |
| int indexes[] = new int[m_variant_]; |
| int factor; |
| |
| // write elements according to the factors |
| // the factorized elements are determined by modulo |
| // arithmetic |
| for (int i = m_variant_ - 1; i > 0; i --) |
| { |
| factor = m_factor_[i] & 0x00FF; |
| indexes[i] = offset % factor; |
| offset /= factor; |
| } |
| |
| // we don't need to calculate the last modulus |
| // because start <= code <= end guarantees here that |
| // code <= factors[0] |
| indexes[0] = offset; |
| |
| // joining up the factorized strings |
| if (compareFactorString(indexes, name, prefixlen)) { |
| return ch; |
| } |
| } |
| } |
| |
| return -1; |
| } |
| |
| // private data members ------------------------------------------ |
| |
| /** |
| * Algorithmic data information. |
| * m_rangestart_ and m_rangeend_ are the first and last code points |
| * covered (both inclusive, see contains). m_type_ is TYPE_0_ or TYPE_1_. |
| * m_variant_ is passed to Utility.hex for TYPE_0_ names and is the number |
| * of factors for TYPE_1_ names (setFactor requires a matching length). |
| * m_factor_ holds the factors, m_prefix_ the name prefix and |
| * m_factorstring_ the byte data the factorized name elements are read from. |
| */ |
| private int m_rangestart_; |
| private int m_rangeend_; |
| private byte m_type_; |
| private byte m_variant_; |
| private char m_factor_[]; |
| private String m_prefix_; |
| private byte m_factorstring_[]; |
| |
| // private methods ----------------------------------------------- |
| |
| /** |
| * Gets the indexth string in each of the argument factor block |
| * @param index array with each index corresponding to each factor block |
| * @return array of indexth factor string in factor block |
| */ |
| private String[] getFactorString(int index[]) |
| { |
| int size = m_factor_.length; |
| if (index == null || index.length != size) { |
| return null; |
| } |
| |
| String result[] = new String[size]; |
| StringBuffer str = new StringBuffer(); |
| int count = 0; |
| int factor; |
| size --; |
| for (int i = 0; i <= size; i ++) { |
| factor = m_factor_[i]; |
| count = UCharacterUtil.skipNullTermByteSubString( |
| m_factorstring_, count, index[i]); |
| count = UCharacterUtil.getNullTermByteSubString( |
| str, m_factorstring_, count); |
| if (i != size) { |
| count = UCharacterUtil.skipNullTermByteSubString( |
| m_factorstring_, count, |
| factor - index[i] - 1); |
| } |
| result[i] = str.toString(); |
| str.delete(0, str.length()); |
| } |
| return result; |
| } |
| |
| /** |
| * Compares the indexth string in each of the argument factor block with |
| * the argument string |
| * @param index array with each index corresponding to each factor block |
| * @param str string to compare with |
| * @param offset of str to start comparison |
| * @return true if string matches |
| */ |
| private boolean compareFactorString(int index[], String str, |
| int offset) |
| { |
| int size = m_factor_.length; |
| if (index == null || index.length != size) |
| return false; |
| |
| int count = 0; |
| int strcount = offset; |
| int factor; |
| size --; |
| for (int i = 0; i <= size; i ++) |
| { |
| factor = m_factor_[i]; |
| count = UCharacterUtil.skipNullTermByteSubString( |
| m_factorstring_, count, index[i]); |
| strcount = UCharacterUtil.compareNullTermByteSubString(str, |
| m_factorstring_, strcount, count); |
| if (strcount < 0) { |
| return false; |
| } |
| |
| if (i != size) { |
| count = UCharacterUtil.skipNullTermByteSubString( |
| m_factorstring_, count, factor - index[i]); |
| } |
| } |
| if (strcount != str.length()) { |
| return false; |
| } |
| return true; |
| } |
| } |
| |
| // protected data members -------------------------------------------- |
| |
| /** |
| * Number of groups, as set by setGroupCountSize |
| */ |
| protected int m_groupcount_ = 0; |
| /** |
| * Size of each group entry in the group information array, in number of |
| * char; used as the stride when indexing that array |
| */ |
| protected int m_groupsize_ = 0; |
| /** |
| * Number of lines per group |
| * 1 << GROUP_SHIFT_ |
| */ |
| protected static final int LINES_PER_GROUP_ = 1 << 5; |
| |
| // protected constructor --------------------------------------------- |
| |
/**
 * <p>Protected constructor for use in UCharacter.</p>
 * Opens the name data file as a class resource, lets a
 * UCharacterNameReader fill this object and closes the stream again,
 * also when reading fails.
 * @exception IOException thrown when the data file cannot be found or
 *            when data reading fails
 */
protected UCharacterName() throws IOException
{
    InputStream i = getClass().getResourceAsStream(NAME_FILE_NAME_);
    if (i == null) {
        // getResourceAsStream reports a missing resource with null
        throw new IOException("Unable to find data file "
                              + NAME_FILE_NAME_);
    }
    BufferedInputStream b = new BufferedInputStream(i,
                                                    NAME_BUFFER_SIZE_);
    try {
        UCharacterNameReader reader = new UCharacterNameReader(b);
        reader.read(this);
    }
    finally {
        // closing the buffered stream also closes the underlying stream
        b.close();
    }
}
| |
| // protected methods ------------------------------------------------- |
| |
| /** |
| * Retrieve the name of a Unicode code point. |
| * Depending on <code>choice</code>, the character name written into the |
| * buffer is the "modern" name or the name that was defined in Unicode |
| * version 1.0. |
| * The name contains only "invariant" characters |
| * like A-Z, 0-9, space, and '-'. |
| * |
| * @param ch the code point for which to get the name. |
| * @param choice Selector for which name to get. |
| * @return if code point is above 0x1fff, null is returned |
| */ |
| protected String getName(int ch, int choice) |
| { |
| if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE || |
| choice >= UCharacterNameChoice.U_CHAR_NAME_CHOICE_COUNT) { |
| return null; |
| } |
| |
| String result = null; |
| |
| result = getAlgName(ch, choice); |
| |
| // getting normal character name |
| if (result == null || result.length() == 0) { |
| if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { |
| result = getExtendedName(ch); |
| } else { |
| result = getGroupName(ch, choice); |
| } |
| } |
| |
| return result; |
| } |
| |
| /** |
| * Find a character by its name and return its code point value |
| * @param character name |
| * @param choice selector to indicate if argument name is a Unicode 1.0 |
| * or the most current version |
| * @return code point |
| */ |
| protected int getCharFromName(int choice, String name) |
| { |
| // checks for illegal arguments |
| if (choice >= UCharacterNameChoice.U_CHAR_NAME_CHOICE_COUNT || |
| name == null || name.length() == 0) { |
| return -1; |
| } |
| |
| // try extended names first |
| int result = getExtendedChar(name.toLowerCase(), choice); |
| if (result >= -1) { |
| return result; |
| } |
| |
| String upperCaseName = name.toUpperCase(); |
| // try algorithmic names first, if fails then try group names |
| // int result = getAlgorithmChar(choice, uppercasename); |
| |
| if (choice != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { |
| int count = 0; |
| if (m_algorithm_ != null) { |
| count = m_algorithm_.length; |
| } |
| for (count --; count >= 0; count --) { |
| result = m_algorithm_[count].getAlgorithmChar(upperCaseName); |
| if (result >= 0) { |
| return result; |
| } |
| } |
| } |
| |
| if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { |
| result = getGroupChar(upperCaseName, |
| UCharacterNameChoice.U_UNICODE_CHAR_NAME); |
| if (result == -1) { |
| result = getGroupChar(upperCaseName, |
| UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); |
| } |
| } |
| else { |
| result = getGroupChar(upperCaseName, choice); |
| } |
| return result; |
| } |
| |
| /** |
| * Sets the token data |
| * @param token array of tokens |
| * @param tokenstring array of string values of the tokens |
| * @return false if there is a data error |
| */ |
| protected boolean setToken(char token[], byte tokenstring[]) |
| { |
| if (token != null && tokenstring != null && token.length > 0 && |
| tokenstring.length > 0) { |
| m_tokentable_ = token; |
| m_tokenstring_ = tokenstring; |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Set the algorithm name information array |
| * @param algorithm information array |
| * @return true if the group string offset has been set correctly |
| */ |
| protected boolean setAlgorithm(AlgorithmName alg[]) |
| { |
| if (alg != null && alg.length != 0) { |
| m_algorithm_ = alg; |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Sets the number of group and size of each group in number of char |
| * @param count number of groups |
| * @param size size of group in char |
| * @return true if group size is set correctly |
| */ |
| protected boolean setGroupCountSize(int count, int size) |
| { |
| if (count <= 0 || size <= 0) { |
| return false; |
| } |
| m_groupcount_ = count; |
| m_groupsize_ = size; |
| return true; |
| } |
| |
| /** |
| * Sets the group name data |
| * @param group index information array |
| * @param groupstring name information array |
| * @return false if there is a data error |
| */ |
| protected boolean setGroup(char group[], byte groupstring[]) |
| { |
| if (group != null && groupstring != null && group.length > 0 && |
| groupstring.length > 0) { |
| m_groupinfo_ = group; |
| m_groupstring_ = groupstring; |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Reads a block of compressed lengths of 32 strings and expands them into |
| * offsets and lengths for each string. Lengths are stored with a |
| * variable-width encoding in consecutive nibbles: |
| * If a nibble<0xc, then it is the length itself (0 = empty string). |
| * If a nibble>=0xc, then it forms a length value with the following |
| * nibble. |
| * The offsets and lengths arrays must be at least 33 (one more) long |
| * because there is no check here at the end if the last nibble is still |
| * used. |
| * @param index of group string object in array |
| * @param offsets array to store the value of the string offsets |
| * @param lengths array to store the value of the string length |
| * @return next index of the data string immediately after the lengths |
| * in terms of byte address |
| */ |
| protected int getGroupLengths(int index, char offsets[], char lengths[]) |
| { |
| char length = 0xffff; |
| byte b = 0, |
| n = 0; |
| int shift; |
| index = index * m_groupsize_; // byte count offsets of group strings |
| int stringoffset = UCharacterUtil.toInt( |
| m_groupinfo_[index + OFFSET_HIGH_OFFSET_], |
| m_groupinfo_[index + OFFSET_LOW_OFFSET_]); |
| |
| offsets[0] = 0; |
| |
| // all 32 lengths must be read to get the offset of the first group |
| // string |
| for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) { |
| b = m_groupstring_[stringoffset]; |
| shift = 4; |
| |
| while (shift >= 0) { |
| // getting nibble |
| n = (byte)((b >> shift) & 0x0F); |
| if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) { |
| length = (char)((n - 12) << 4); |
| } |
| else { |
| if (length != 0xffff) { |
| lengths[i] = (char)((length | n) + 12); |
| } |
| else { |
| lengths[i] = (char)n; |
| } |
| |
| if (i < LINES_PER_GROUP_) { |
| offsets[i + 1] = (char)(offsets[i] + lengths[i]); |
| } |
| |
| length = 0xffff; |
| i ++; |
| } |
| |
| shift -= 4; |
| } |
| } |
| return stringoffset; |
| } |
| |
| /** |
| * Gets the name of the argument group index |
| * @param index of the group name string in byte count |
| * @param length of the group name string |
| * @param choice of Unicode 1.0 name or the most current name |
| * @return name of the group |
| */ |
| protected String getGroupName(int index, int length, int choice) |
| { |
| if (choice == UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { |
| int oldindex = index; |
| index += UCharacterUtil.skipByteSubString(m_groupstring_, |
| index, length, (byte)';'); |
| length -= (index - oldindex); |
| } |
| |
| StringBuffer s = new StringBuffer(); |
| byte b; |
| char token; |
| for (int i = 0; i < length;) { |
| b = m_groupstring_[index + i]; |
| i ++; |
| |
| if (b >= m_tokentable_.length) { |
| if (b == ';') { |
| break; |
| } |
| s.append(b); // implicit letter |
| } |
| else { |
| token = m_tokentable_[b & 0x00ff]; |
| if (token == 0xFFFE) { |
| // this is a lead byte for a double-byte token |
| token = m_tokentable_[b << 8 | |
| (m_groupstring_[index + i] & 0x00ff)]; |
| i ++; |
| } |
| if (token == 0xFFFF) { |
| if (b == ';') { |
| // skip the semicolon if we are seeking extended |
| // names and there was no 2.0 name but there |
| // is a 1.0 name. |
| if (s.length() == 0 && choice == |
| UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { |
| continue; |
| } |
| break; |
| } |
| s.append((char)(b & 0x00ff)); // explicit letter |
| } |
| else { // write token word |
| UCharacterUtil.getNullTermByteSubString(s, |
| m_tokenstring_, token); |
| } |
| } |
| } |
| |
| if (s.length() == 0) { |
| return null; |
| } |
| return s.toString(); |
| } |
| |
| /** |
| * Retrieves the extended name |
| */ |
| protected String getExtendedName(int ch) |
| { |
| String result = getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME); |
| if (result == null) { |
| if (getType(ch) == UCharacterCategory.CONTROL) { |
| result = getName(ch, |
| UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); |
| } |
| if (result == null) { |
| result = getExtendedOr10Name(ch); |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Gets the group index for the codepoint, or the group before it. |
| * @param codepoint |
| * @return group index containing codepoint or the group before it. |
| */ |
| protected int getGroup(int codepoint) |
| { |
| int endGroup = m_groupcount_; |
| int msb = getCodepointMSB(codepoint); |
| int result = 0; |
| // binary search for the group of names that contains the one for |
| // code |
| // find the group that contains codepoint, or the highest before it |
| while (result < endGroup - 1) { |
| int gindex = (result + endGroup) >> 1; |
| if (msb < getGroupMSB(gindex)) { |
| endGroup = gindex; |
| } |
| else { |
| result = gindex; |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Gets the extended and 1.0 name when the most current unicode names |
| * fail |
| * @param ch codepoint |
| * @return name of codepoint extended or 1.0 |
| */ |
| protected String getExtendedOr10Name(int ch) |
| { |
| String result = null; |
| if (getType(ch) == UCharacterCategory.CONTROL) { |
| result = getName(ch, |
| UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); |
| } |
| if (result == null) { |
| int type = getType(ch); |
| // Return unknown if the table of names above is not up to |
| // date. |
| if (type >= UCharacterCategory.TYPE_NAMES_.length) { |
| result = UCharacterCategory.UNKNOWN_TYPE_NAME_; |
| } |
| else { |
| result = UCharacterCategory.TYPE_NAMES_[type]; |
| } |
| StringBuffer tempResult = new StringBuffer(result); |
| tempResult.insert(0, '<'); |
| tempResult.append('-'); |
| String chStr = Integer.toHexString(ch).toUpperCase(); |
| int zeros = 4 - chStr.length(); |
| while (zeros > 0) { |
| tempResult.append('0'); |
| zeros --; |
| } |
| tempResult.append(chStr); |
| tempResult.append('>'); |
| result = tempResult.toString(); |
| } |
| return result; |
| } |
| |
| // these are all UCharacterNameIterator use methods ------------------- |
| |
| /** |
| * Gets the MSB from the group index |
| * @param gindex group index |
| * @return the MSB of the group if gindex is valid, -1 otherwise |
| */ |
| protected int getGroupMSB(int gindex) |
| { |
| if (gindex >= m_groupcount_) { |
| return -1; |
| } |
| return m_groupinfo_[gindex * m_groupsize_]; |
| } |
| |
| /** |
| * Gets the MSB of the codepoint |
| * @param codepoint |
| * @return the MSB of the codepoint |
| */ |
| protected int getCodepointMSB(int codepoint) |
| { |
| return codepoint >> GROUP_SHIFT_; |
| } |
| |
| /** |
| * Gets the maximum codepoint + 1 of the group |
| * @param msb most significant byte of the group |
| * @return limit codepoint of the group |
| */ |
| protected int getGroupLimit(int msb) |
| { |
| return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_; |
| } |
| |
| /** |
| * Gets the minimum codepoint of the group |
| * @param msb most significant byte of the group |
| * @return minimum codepoint of the group |
| */ |
| protected int getGroupMin(int msb) |
| { |
| return msb << GROUP_SHIFT_; |
| } |
| |
| /** |
| * Gets the offset to a group |
| * @param codepoint |
| * @return offset to a group |
| */ |
| protected int getGroupOffset(int codepoint) |
| { |
| return codepoint & GROUP_MASK_; |
| } |
| |
| /** |
| * Gets the minimum codepoint of a group |
| * @param codepoint |
| * @return minimum codepoint in the group which codepoint belongs to |
| */ |
| protected int getGroupMinFromCodepoint(int codepoint) |
| { |
| return codepoint & ~GROUP_MASK_; |
| } |
| |
| /** |
| * Get the Algorithm range length |
| * @return Algorithm range length |
| */ |
| protected int getAlgorithmLength() |
| { |
| return m_algorithm_.length; |
| } |
| |
| /** |
| * Gets the start of the range |
| * @param index algorithm index |
| * @return algorithm range start |
| */ |
| protected int getAlgorithmStart(int index) |
| { |
| return m_algorithm_[index].m_rangestart_; |
| } |
| |
| /** |
| * Gets the end of the range |
| * @param index algorithm index |
| * @return algorithm range end |
| */ |
| protected int getAlgorithmEnd(int index) |
| { |
| return m_algorithm_[index].m_rangeend_; |
| } |
| |
| /** |
| * Gets the Algorithmic name of the codepoint |
| * @param index algorithmic range index |
| * @param codepoint |
| * @return algorithmic name of codepoint |
| */ |
| protected String getAlgorithmName(int index, int codepoint) |
| { |
| StringBuffer result = new StringBuffer(); |
| m_algorithm_[index].appendName(codepoint, result); |
| return result.toString(); |
| } |
| |
| |
| // private data members ---------------------------------------------- |
| |
| /** |
| * Data used in unames.dat. |
| * m_tokentable_ maps a name byte to a token value, m_tokenstring_ holds |
| * the words the tokens expand to, m_groupinfo_ holds one entry of |
| * m_groupsize_ chars per group (MSB, then high and low half of the string |
| * offset), m_groupstring_ holds the compressed lengths and names of the |
| * groups, and m_algorithm_ holds the algorithmic name ranges. |
| */ |
| private char m_tokentable_[]; |
| private byte m_tokenstring_[]; |
| private char m_groupinfo_[]; |
| private byte m_groupstring_[]; |
| private AlgorithmName m_algorithm_[]; |
| |
| /** |
| * Group use: scratch arrays filled by getGroupLengths with the offsets |
| * and lengths of the names of one group. They are shared, which is why |
| * the methods filling and reading them are synchronized. |
| */ |
| private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1]; |
| private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1]; |
| |
| /** |
| * Default name of the name datafile, loaded as a class resource |
| */ |
| private static final String NAME_FILE_NAME_ = |
| "/com/ibm/icu/impl/data/unames.dat"; |
| /** |
| * Shift count to retrieve group information |
| */ |
| private static final int GROUP_SHIFT_ = 5; |
| /** |
| * Mask to retrieve the offset for a particular character within a group |
| */ |
| private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1; |
| /** |
| * Default buffer size of datafile |
| */ |
| private static final int NAME_BUFFER_SIZE_ = 100000; |
| |
| /** |
| * Position of offsethigh in group information array |
| */ |
| private static final int OFFSET_HIGH_OFFSET_ = 1; |
| |
| /** |
| * Position of offsetlow in group information array |
| */ |
| private static final int OFFSET_LOW_OFFSET_ = 2; |
| /** |
| * Double nibble indicator, any nibble > this number has to be combined |
| * with its following nibble |
| */ |
| private static final int SINGLE_NIBBLE_MAX_ = 11; |
| |
| |
| // private methods --------------------------------------------------- |
| |
| /** |
| * Gets the algorithmic name for the argument character |
| * @param ch character to determine name for |
| * @param choice name choice |
| * @return the algorithmic name or null if not found |
| */ |
| private String getAlgName(int ch, int choice) |
| { |
| // Do not write algorithmic Unicode 1.0 names because Unihan names are |
| // the same as the modern ones, extension A was only introduced with |
| // Unicode 3.0, and the Hangul syllable block was moved and changed |
| // around Unicode 1.1.5. |
| if (choice != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { |
| // index in terms integer index |
| StringBuffer s = new StringBuffer(); |
| |
| for (int index = m_algorithm_.length - 1; index >= 0; index --) { |
| if (m_algorithm_[index].contains(ch)) { |
| m_algorithm_[index].appendName(ch, s); |
| return s.toString(); |
| } |
| } |
| } |
| return null; |
| } |
| |
| /** |
| * Getting the character with the tokenized argument name |
| * @param name of the character |
| * @return character with the tokenized argument name or -1 if character |
| * is not found |
| */ |
| private synchronized int getGroupChar(String name, int choice) |
| { |
| for (int i = 0; i < m_groupcount_; i ++) { |
| // populating the data set of grouptable |
| |
| int startgpstrindex = getGroupLengths(i, m_groupoffsets_, |
| m_grouplengths_); |
| |
| // shift out to function |
| int result = getGroupChar(startgpstrindex, m_grouplengths_, name, |
| choice); |
| if (result != -1) { |
| return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_) |
| | result; |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Compares and retrieve character if name is found within the argument |
| * group |
| * @param index index where the set of names reside in the group block |
| * @param length list of lengths of the strings |
| * @param name character name to search for |
| * @param choice of either 1.0 or the most current unicode name |
| * @return relative character in the group which matches name, otherwise if |
| * not found, -1 will be returned |
| */ |
| private int getGroupChar(int index, char length[], String name, |
| int choice) |
| { |
| byte b = 0; |
| char token; |
| int len; |
| int namelen = name.length(); |
| int nindex; |
| int count; |
| |
| for (int result = 0; result <= LINES_PER_GROUP_; result ++) { |
| nindex = 0; |
| len = length[result]; |
| |
| if (choice == UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { |
| int oldindex = index; |
| index += UCharacterUtil.skipByteSubString(m_groupstring_, |
| index, len, (byte)';'); |
| len -= (index - oldindex); |
| } |
| |
| // number of tokens is > the length of the name |
| // write each letter directly, and write a token word per token |
| for (count = 0; count < len && nindex != -1 && nindex < namelen; |
| ) { |
| b = m_groupstring_[index + count]; |
| count ++; |
| |
| if (b >= m_tokentable_.length) { |
| if (name.charAt(nindex ++) != (b & 0xFF)) { |
| nindex = -1; |
| } |
| } |
| else { |
| token = m_tokentable_[b & 0xFF]; |
| if (token == 0xFFFE) { |
| // this is a lead byte for a double-byte token |
| token = m_tokentable_[b << 8 | |
| (m_groupstring_[index + count] & 0x00ff)]; |
| count ++; |
| } |
| if (token == 0xFFFF) { |
| if (name.charAt(nindex ++) != (b & 0xFF)) { |
| nindex = -1; |
| } |
| } |
| else { |
| // compare token with name |
| nindex = UCharacterUtil.compareNullTermByteSubString( |
| name, m_tokenstring_, nindex, token); |
| } |
| } |
| } |
| |
| if (namelen == nindex && |
| (count == len || m_groupstring_[index + count] == ';')) { |
| return result; |
| } |
| |
| index += len; |
| } |
| return -1; |
| } |
| |
| /** |
| * Binary search for the group strings set that contains the argument Unicode |
| * code point's most significant bits. |
| * The return value is always a valid group string set that contain msb. |
| * If group string set is not found, -1 is returned |
| * @param ch the code point to look for |
| * @return group string set index in datatable otherwise -1 is returned if |
| * group string set is not found |
| */ |
| private int getGroupStringIndex(int ch) |
| { |
| // gets the msb |
| int msb = ch >> GROUP_SHIFT_, |
| end = m_groupcount_, |
| start, |
| gindex = 0; |
| |
| // binary search for the group of names that contains the one for code |
| for (start = 0; start < end - 1;) { |
| gindex = (start + end) >> 1; |
| if (msb < m_groupinfo_[gindex * m_groupsize_]) { |
| end = gindex; |
| } |
| else { |
| start = gindex; |
| } |
| } |
| |
| // return this if it is an exact match |
| if (msb == m_groupinfo_[start * m_groupsize_]) { |
| start = start * m_groupsize_; |
| return UCharacterUtil.toInt( |
| m_groupinfo_[start + OFFSET_HIGH_OFFSET_], |
| m_groupinfo_[start + OFFSET_LOW_OFFSET_]); |
| } |
| return -1; |
| } |
| |
| /** |
| * Gets the group name of the character |
| * @param ch character to get the group name |
| * @param choice name choice selector to choose a unicode 1.0 or newer name |
| */ |
| private synchronized String getGroupName(int ch, int choice) |
| { |
| // gets the msb |
| int msb = getCodepointMSB(ch); |
| int group = getGroup(ch); |
| |
| // return this if it is an exact match |
| if (msb == m_groupinfo_[group * m_groupsize_]) { |
| int index = getGroupLengths(group, m_groupoffsets_, |
| m_grouplengths_); |
| int offset = ch & GROUP_MASK_; |
| return getGroupName(index + m_groupoffsets_[offset], |
| m_grouplengths_[offset], choice); |
| } |
| |
| return null; |
| } |
| |
| /** |
| * Gets the character extended type |
| * @param ch character to be tested |
| * @return extended type it is associated with |
| */ |
| private int getType(int ch) |
| { |
| if (UCharacter.isNonCharacter(ch)) { |
| // not a character we return a invalid category count |
| return UCharacterCategory.NON_CHARACTER_; |
| } |
| int result = UCharacter.getType(ch); |
| if (result == UCharacterCategory.SURROGATE) { |
| if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { |
| result = UCharacterCategory.LEAD_SURROGATE_; |
| } |
| else { |
| result = UCharacterCategory.TRAIL_SURROGATE_; |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Getting the character with extended name of the form <....>. |
| * @param name of the character to be found |
| * @param choice name choice |
| * @return character associated with the name, -1 if such character is not |
| * found and -2 if we should continue with the search. |
| */ |
| private int getExtendedChar(String name, int choice) |
| { |
| if (name.charAt(0) == '<') { |
| if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { |
| int endIndex = name.length() - 1; |
| if (name.charAt(endIndex) == '>') { |
| int startIndex = name.lastIndexOf('-'); |
| if (startIndex >= 0) { // We've got a category. |
| startIndex ++; |
| int result = -1; |
| try { |
| result = Integer.parseInt( |
| name.substring(startIndex, endIndex), |
| 16); |
| } |
| catch (NumberFormatException e) { |
| return -1; |
| } |
| // Now validate the category name. We could use a |
| // binary search, or a trie, if we really wanted to. |
| String type = name.substring(1, startIndex - 1); |
| int length = UCharacterCategory.TYPE_NAMES_.length; |
| for (int i = 0; i < length; ++ i) { |
| if (type.compareTo( |
| UCharacterCategory.TYPE_NAMES_[i]) == 0) { |
| if (getType(result) == i) { |
| return result; |
| } |
| break; |
| } |
| } |
| } |
| } |
| } |
| return -1; |
| } |
| return -2; |
| } |
| } |