| /* |
| ****************************************************************************** |
| * Copyright (C) 1996-2003, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ****************************************************************************** |
| * |
| * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Trie.java,v $ |
| * $Date: 2003/09/19 00:14:35 $ |
| * $Revision: 1.13 $ |
| * |
| ****************************************************************************** |
| */ |
| |
| package com.ibm.icu.impl; |
| |
| import java.io.InputStream; |
| import java.io.DataInputStream; |
| import java.io.IOException; |
| import java.util.Arrays; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.lang.UCharacter; |
| |
| /** |
| * <p>A trie is a kind of compressed, serializable table of values |
| * associated with Unicode code points (0..0x10ffff).</p> |
| * <p>This class defines the basic structure of a trie and provides methods |
| * to <b>retrieve the offsets to the actual data</b>.</p> |
| * <p>Data will be the form of an array of basic types, char or int.</p> |
| * <p>The actual data format will have to be specified by the user in the |
| * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p> |
| * <p>This trie implementation is optimized for getting offset while walking |
| * forward through a UTF-16 string. |
| * Therefore, the simplest and fastest access macros are the |
| * fromLead() and fromOffsetTrail() methods. |
| * The fromBMP() method are a little more complicated; they get offsets even |
| * for lead surrogate codepoints, while the fromLead() method get special |
| * "folded" offsets for lead surrogate code units if there is relevant data |
| * associated with them. |
| * From such a folded offsets, an offset needs to be extracted to supply |
| * to the fromOffsetTrail() methods. |
| * To handle such supplementary codepoints, some offset information are kept |
| * in the data.</p> |
| * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve |
| * that offset from the folded value for the lead surrogate unit.</p> |
| * <p>For examples of use, see com.ibm.icu.impl.CharTrie or |
| * com.ibm.icu.impl.IntTrie.</p> |
| * @author synwee |
| * @see com.ibm.icu.impl.CharTrie |
| * @see com.ibm.icu.impl.IntTrie |
| * @since release 2.1, Jan 01 2002 |
| */ |
| public abstract class Trie |
| { |
| // public class declaration ---------------------------------------- |
| |
| /** |
| * Character data in com.ibm.impl.Trie have different user-specified format |
| * for different purposes. |
| * This interface specifies methods to be implemented in order for |
| * com.ibm.impl.Trie, to surrogate offset information encapsulated within |
| * the data. |
| * @draft 2.1 |
| */ |
| public static interface DataManipulate |
| { |
| /** |
| * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's |
| * data |
| * the index array offset of the indexes for that lead surrogate. |
| * @param value data value for a surrogate from the trie, including the |
| * folding offset |
| * @return data offset or 0 if there is no data for the lead surrogate |
| * @draft 2.1 |
| */ |
| public int getFoldingOffset(int value); |
| } |
| |
| // public methods -------------------------------------------------- |
| |
| /** |
| * Determines if this trie has a linear latin 1 array |
| * @return true if this trie has a linear latin 1 array, false otherwise |
| */ |
| public final boolean isLatin1Linear() |
| { |
| return m_isLatin1Linear_; |
| } |
| |
| /** |
| * Checks if the argument Trie has the same data as this Trie. |
| * Attributes are checked but not the index data. |
| * @param other Trie to check |
| * @return true if the argument Trie has the same data as this Trie, false |
| * otherwise |
| */ |
| ///CLOVER:OFF |
| public boolean equals(Object other) |
| { |
| if (other == this) { |
| return true; |
| } |
| if (!(other instanceof Trie)) { |
| return false; |
| } |
| Trie othertrie = (Trie)other; |
| return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_ |
| && m_options_ == othertrie.m_options_ |
| && m_dataLength_ == othertrie.m_dataLength_ |
| && Arrays.equals(m_index_, othertrie.m_index_); |
| } |
| ///CLOVER:ON |
| |
| /** |
| * Gets the serialized data file size of the Trie. This is used during |
| * trie data reading for size checking purposes. |
| * @return size size of serialized trie data file in terms of the number |
| * of bytes |
| */ |
| public int getSerializedDataSize() |
| { |
| // includes signature, option, dataoffset and datalength output |
| int result = (4 << 2); |
| result += (m_dataOffset_ << 1); |
| if (isCharTrie()) { |
| result += (m_dataLength_ << 1); |
| } |
| else if (isIntTrie()) { |
| result += (m_dataLength_ << 2); |
| } |
| return result; |
| } |
| |
| // protected constructor ------------------------------------------- |
| |
| /** |
| * Trie constructor for CharTrie use. |
| * @param inputStream ICU data file input stream which contains the |
| * trie |
| * @param datamanipulate object containing the information to parse the |
| * trie data |
| * @exception IOException thrown when input stream does not have the |
| * right header. |
| * @draft 2.1 |
| */ |
| protected Trie(InputStream inputStream, |
| DataManipulate dataManipulate) throws IOException |
| { |
| DataInputStream input = new DataInputStream(inputStream); |
| // Magic number to authenticate the data. |
| int signature = input.readInt(); |
| m_options_ = input.readInt(); |
| |
| if (!checkHeader(signature)) { |
| throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); |
| } |
| |
| m_dataManipulate_ = dataManipulate; |
| m_isLatin1Linear_ = (m_options_ & |
| HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; |
| m_dataOffset_ = input.readInt(); |
| m_dataLength_ = input.readInt(); |
| unserialize(inputStream); |
| } |
| |
| /** |
| * Trie constructor |
| * @param index array to be used for index |
| * @param options used by the trie |
| * @param datamanipulate object containing the information to parse the |
| * trie data |
| * @draft 2.2 |
| */ |
| protected Trie(char index[], int options, DataManipulate dataManipulate) |
| { |
| m_options_ = options; |
| m_dataManipulate_ = dataManipulate; |
| m_isLatin1Linear_ = (m_options_ & |
| HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; |
| m_index_ = index; |
| m_dataOffset_ = m_index_.length; |
| } |
| |
| |
| // protected data members ------------------------------------------ |
| |
| /** |
| * Lead surrogate code points' index displacement in the index array. |
| * 0x10000-0xd800=0x2800 |
| * 0x2800 >> INDEX_STAGE_1_SHIFT_ |
| */ |
| protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; |
| /** |
| * Shift size for shifting right the input index. 1..9 |
| * @draft 2.1 |
| */ |
| protected static final int INDEX_STAGE_1_SHIFT_ = 5; |
| /** |
| * Shift size for shifting left the index array values. |
| * Increases possible data size with 16-bit index values at the cost |
| * of compactability. |
| * This requires blocks of stage 2 data to be aligned by |
| * DATA_GRANULARITY. |
| * 0..INDEX_STAGE_1_SHIFT |
| * @draft 2.1 |
| */ |
| protected static final int INDEX_STAGE_2_SHIFT_ = 2; |
| /** |
| * Mask for getting the lower bits from the input index. |
| * DATA_BLOCK_LENGTH_ - 1. |
| * @draft 2.1 |
| */ |
| protected static final int INDEX_STAGE_3_MASK_ = |
| (1 << INDEX_STAGE_1_SHIFT_) - 1; |
| /** |
| * Surrogate mask to use when shifting offset to retrieve supplementary |
| * values |
| * @draft 2.1 |
| */ |
| protected static final int SURROGATE_MASK_ = 0x3FF; |
| /** |
| * Index or UTF16 characters |
| * @draft 2.1 |
| */ |
| protected char m_index_[]; |
| /** |
| * Internal TrieValue which handles the parsing of the data value. |
| * This class is to be implemented by the user |
| * @draft 2.1 |
| */ |
| protected DataManipulate m_dataManipulate_; |
| /** |
| * Start index of the data portion of the trie. CharTrie combines |
| * index and data into a char array, so this is used to indicate the |
| * initial offset to the data portion. |
| * Note this index always points to the initial value. |
| * @draft 2.1 |
| */ |
| protected int m_dataOffset_; |
| /** |
| * Length of the data array |
| */ |
| protected int m_dataLength_; |
| |
| // protected methods ----------------------------------------------- |
| |
| /** |
| * Gets the offset to the data which the surrogate pair points to. |
| * @param lead lead surrogate |
| * @param trail trailing surrogate |
| * @return offset to data |
| * @draft 2.1 |
| */ |
| protected abstract int getSurrogateOffset(char lead, char trail); |
| |
| /** |
| * Gets the value at the argument index |
| * @param index value at index will be retrieved |
| * @return 32 bit value |
| * @draft 2.1 |
| */ |
| protected abstract int getValue(int index); |
| |
| /** |
| * Gets the default initial value |
| * @return 32 bit value |
| * @draft 2.1 |
| */ |
| protected abstract int getInitialValue(); |
| |
| /** |
| * Gets the offset to the data which the index ch after variable offset |
| * points to. |
| * Note for locating a non-supplementary character data offset, calling |
| * <p> |
| * getRawOffset(0, ch); |
| * </p> |
| * will do. Otherwise if it is a supplementary character formed by |
| * surrogates lead and trail. Then we would have to call getRawOffset() |
| * with getFoldingIndexOffset(). See getSurrogateOffset(). |
| * @param offset index offset which ch is to start from |
| * @param ch index to be used after offset |
| * @return offset to the data |
| * @draft 2.1 |
| */ |
| protected final int getRawOffset(int offset, char ch) |
| { |
| return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] |
| << INDEX_STAGE_2_SHIFT_) |
| + (ch & INDEX_STAGE_3_MASK_); |
| } |
| |
| /** |
| * Gets the offset to data which the BMP character points to |
| * Treats a lead surrogate as a normal code point. |
| * @param ch BMP character |
| * @return offset to data |
| * @draft 2.1 |
| */ |
| protected final int getBMPOffset(char ch) |
| { |
| return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE |
| && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) |
| ? getRawOffset(LEAD_INDEX_OFFSET_, ch) |
| : getRawOffset(0, ch); |
| // using a getRawOffset(ch) makes no diff |
| } |
| |
| /** |
| * Gets the offset to the data which this lead surrogate character points |
| * to. |
| * Data at the returned offset may contain folding offset information for |
| * the next trailing surrogate character. |
| * @param ch lead surrogate character |
| * @return offset to data |
| * @draft 2.1 |
| */ |
| protected final int getLeadOffset(char ch) |
| { |
| return getRawOffset(0, ch); |
| } |
| |
| /** |
| * Internal trie getter from a code point. |
| * Could be faster(?) but longer with |
| * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); } |
| * Gets the offset to data which the codepoint points to |
| * @param ch codepoint |
| * @return offset to data |
| * @draft 2.1 |
| */ |
| protected final int getCodePointOffset(int ch) |
| { |
| // if ((ch >> 16) == 0) slower |
| if (ch >= UTF16.CODEPOINT_MIN_VALUE |
| && ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { |
| // BMP codepoint |
| return getBMPOffset((char)ch); |
| } |
| // for optimization |
| if (ch >= UTF16.CODEPOINT_MIN_VALUE |
| && ch <= UCharacter.MAX_VALUE) { |
| // look at the construction of supplementary characters |
| // trail forms the ends of it. |
| return getSurrogateOffset(UTF16.getLeadSurrogate(ch), |
| (char)(ch & SURROGATE_MASK_)); |
| } |
| // return -1 if there is an error, in this case we return |
| return -1; |
| } |
| |
| /** |
| * <p>Parses the inputstream and creates the trie index with it.</p> |
| * <p>This is overwritten by the child classes. |
| * @param inputStream input stream containing the trie information |
| * @exception IOException thrown when data reading fails. |
| * @draft 2.1 |
| */ |
| protected void unserialize(InputStream inputStream) throws IOException |
| { |
| //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ |
| m_index_ = new char[m_dataOffset_]; |
| DataInputStream input = new DataInputStream(inputStream); |
| for (int i = 0; i < m_dataOffset_; i ++) { |
| m_index_[i] = input.readChar(); |
| } |
| } |
| |
| /** |
| * Determines if this is a 32 bit trie |
| * @return true if options specifies this is a 32 bit trie |
| * @draft 2.1 |
| */ |
| protected final boolean isIntTrie() |
| { |
| return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0; |
| } |
| |
| /** |
| * Determines if this is a 16 bit trie |
| * @return true if this is a 16 bit trie |
| * @draft 2.1 |
| */ |
| protected final boolean isCharTrie() |
| { |
| return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; |
| } |
| |
| // private data members -------------------------------------------- |
| |
| /** |
| * Signature index |
| */ |
| private static final int HEADER_SIGNATURE_INDEX_ = 0; |
| /** |
| * Options index |
| */ |
| private static final int HEADER_OPTIONS_INDEX_ = 1 << 1; |
| /** |
| * Index length index |
| */ |
| private static final int HEADER_INDEX_LENGTH_INDEX_ = 2 << 1; |
| /** |
| * Data length index |
| */ |
| private static final int HEADER_DATA_LENGTH_INDEX_ = 3 << 1; |
| /** |
| * Size of header |
| */ |
| private static final int HEADER_LENGTH_ = 4 << 1; |
| /** |
| * Latin 1 option mask |
| */ |
| private static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; |
| /** |
| * Constant number to authenticate the byte block |
| */ |
| private static final int HEADER_SIGNATURE_ = 0x54726965; |
| /** |
| * Header option formatting |
| */ |
| private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; |
| private static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; |
| private static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; |
| |
| /** |
| * Flag indicator for Latin quick access data block |
| */ |
| private boolean m_isLatin1Linear_; |
| |
| /** |
| * <p>Trie options field.</p> |
| * <p>options bit field:<br> |
| * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> |
| * 8 0 = 16-bit data, 1=32-bit data<br> |
| * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> |
| * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> |
| */ |
| private int m_options_; |
| |
| // private methods --------------------------------------------------- |
| |
| /** |
| * Authenticates raw data header. |
| * Checking the header information, signature and options. |
| * @param rawdata array of char data to be checked |
| * @return true if the header is authenticated valid |
| * @draft 2.1 |
| */ |
| private final boolean checkHeader(int signature) |
| { |
| // check the signature |
| // Trie in big-endian US-ASCII (0x54726965). |
| // Magic number to authenticate the data. |
| if (signature != HEADER_SIGNATURE_) { |
| return false; |
| } |
| |
| if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != |
| INDEX_STAGE_1_SHIFT_ || |
| ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & |
| HEADER_OPTIONS_SHIFT_MASK_) |
| != INDEX_STAGE_2_SHIFT_) { |
| return false; |
| } |
| return true; |
| } |
| } |