| /* |
| ****************************************************************************** |
| * Copyright (C) 1996-2010, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ****************************************************************************** |
| */ |
| |
| package com.ibm.icu.impl; |
| |
| import java.util.Arrays; |
| |
| import com.ibm.icu.lang.UCharacter; |
| |
| /** |
| * Builder class to manipulate and generate a trie. |
| * This is useful for ICU data in primitive types. |
| * Provides a compact way to store information that is indexed by Unicode |
| * values, such as character properties, types, keyboard values, etc. This is |
| * very useful when you have a block of Unicode data that contains significant |
| * values while the rest of the Unicode data is unused in the application or |
| * when you have a lot of redundance, such as where all 21,000 Han ideographs |
| * have the same value. However, lookup is much faster than a hash table. |
| * A trie of any primitive data type serves two purposes: |
| * <UL type = round> |
| * <LI>Fast access of the indexed values. |
| * <LI>Smaller memory footprint. |
| * </UL> |
| * This is a direct port from the ICU4C version |
| * @author Syn Wee Quek |
| */ |
| public class TrieBuilder |
| { |
| // public data member ---------------------------------------------- |
| |
| /** |
| * Number of data values in a stage 2 (data array) block. 2, 4, 8, .., |
| * 0x200 |
| */ |
| public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_; |
| |
| // public class declaration ---------------------------------------- |
| |
| /** |
| * Character data in com.ibm.impl.Trie have different user-specified format |
| * for different purposes. |
| * This interface specifies methods to be implemented in order for |
| * com.ibm.impl.Trie, to surrogate offset information encapsulated within |
| * the data. |
| */ |
| public static interface DataManipulate |
| { |
| /** |
| * Build-time trie callback function, used with serialize(). |
| * This function calculates a lead surrogate's value including a |
| * folding offset from the 1024 supplementary code points |
| * [start..start+1024[ . |
| * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0. |
| * The folding offset is provided by the caller. |
| * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT |
| * with n=0..1023. |
| * Instead of the offset itself, n can be stored in 10 bits - or fewer |
| * if it can be assumed that few lead surrogates have associated data. |
| * The returned value must be |
| * - not zero if and only if there is relevant data for the |
| * corresponding 1024 supplementary code points |
| * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., |
| * offset))==offset |
| * @return a folded value, or 0 if there is no relevant data for the |
| * lead surrogate. |
| */ |
| public int getFoldedValue(int start, int offset); |
| } |
| |
| // public methods ---------------------------------------------------- |
| |
| /** |
| * Checks if the character belongs to a zero block in the trie |
| * @param ch codepoint which data is to be retrieved |
| * @return true if ch is in the zero block |
| */ |
| public boolean isInZeroBlock(int ch) |
| { |
| // valid, uncompacted trie and valid c? |
| if (m_isCompacted_ || ch > UCharacter.MAX_VALUE |
| || ch < UCharacter.MIN_VALUE) { |
| return true; |
| } |
| |
| return m_index_[ch >> SHIFT_] == 0; |
| } |
| |
| // package private method ----------------------------------------------- |
| |
| // protected data member ----------------------------------------------- |
| |
| /** |
| * Index values at build-time are 32 bits wide for easier processing. |
| * Bit 31 is set if the data block is used by multiple index values |
| * (from setRange()). |
| */ |
| protected int m_index_[]; |
| protected int m_indexLength_; |
| protected int m_dataCapacity_; |
| protected int m_dataLength_; |
| protected boolean m_isLatin1Linear_; |
| protected boolean m_isCompacted_; |
| /** |
| * Map of adjusted indexes, used in utrie_compact(). |
| * Maps from original indexes to new ones. |
| */ |
| protected int m_map_[]; |
| |
| /** |
| * Shift size for shifting right the input index. 1..9 |
| */ |
| protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_; |
| /** |
| * Length of the index (stage 1) array before folding. |
| * Maximum number of Unicode code points (0x110000) shifted right by |
| * SHIFT. |
| */ |
| protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_); |
| /** |
| * Length of the BMP portion of the index (stage 1) array. |
| */ |
| protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_; |
| /** |
| * Number of index (stage 1) entries per lead surrogate. |
| * Same as number of indexe entries for 1024 trail surrogates, |
| * ==0x400>>UTRIE_SHIFT |
| * 10 - SHIFT == Number of bits of a trail surrogate that are used in |
| * index table lookups. |
| */ |
| protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_); |
| /** |
| * Mask for getting the lower bits from the input index. |
| * DATA_BLOCK_LENGTH - 1. |
| */ |
| protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_; |
| /** |
| * Shift size for shifting left the index array values. |
| * Increases possible data size with 16-bit index values at the cost |
| * of compactability. |
| * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY. |
| * 0..UTRIE_SHIFT |
| */ |
| protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_; |
| /** |
| * Maximum length of the runtime data (stage 2) array. |
| * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_. |
| */ |
| protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_); |
| /** |
| * Shifting to position the index value in options |
| */ |
| protected static final int OPTIONS_INDEX_SHIFT_ = 4; |
| /** |
| * If set, then the data (stage 2) array is 32 bits wide. |
| */ |
| protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100; |
| /** |
| * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data |
| * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH. |
| */ |
| protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200; |
| /** |
| * The alignment size of a stage 2 data block. Also the granularity for |
| * compaction. |
| */ |
| protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_; |
| |
| // protected constructor ---------------------------------------------- |
| |
| protected TrieBuilder() |
| { |
| m_index_ = new int[MAX_INDEX_LENGTH_]; |
| m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_]; |
| m_isLatin1Linear_ = false; |
| m_isCompacted_ = false; |
| m_indexLength_ = MAX_INDEX_LENGTH_; |
| } |
| |
| protected TrieBuilder(TrieBuilder table) |
| { |
| m_index_ = new int[MAX_INDEX_LENGTH_]; |
| m_indexLength_ = table.m_indexLength_; |
| System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_); |
| m_dataCapacity_ = table.m_dataCapacity_; |
| m_dataLength_ = table.m_dataLength_; |
| m_map_ = new int[table.m_map_.length]; |
| System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length); |
| m_isLatin1Linear_ = table.m_isLatin1Linear_; |
| m_isCompacted_ = table.m_isCompacted_; |
| } |
| |
| // protected functions ------------------------------------------------ |
| |
| /** |
| * Compare two sections of an array for equality. |
| */ |
| protected static final boolean equal_int(int[] array, int start1, int start2, int length) { |
| while(length>0 && array[start1]==array[start2]) { |
| ++start1; |
| ++start2; |
| --length; |
| } |
| return length==0; |
| } |
| |
| /** |
| * Set a value in the trie index map to indicate which data block |
| * is referenced and which one is not. |
| * utrie_compact() will remove data blocks that are not used at all. |
| * Set |
| * - 0 if it is used |
| * - -1 if it is not used |
| */ |
| protected void findUnusedBlocks() |
| { |
| // fill the entire map with "not used" |
| Arrays.fill(m_map_, 0xff); |
| |
| // mark each block that _is_ used with 0 |
| for (int i = 0; i < m_indexLength_; ++ i) { |
| m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0; |
| } |
| |
| // never move the all-initial-value block 0 |
| m_map_[0] = 0; |
| } |
| |
| /** |
| * Finds the same index block as the otherBlock |
| * @param index array |
| * @param indexLength size of index |
| * @param otherBlock |
| * @return same index block |
| */ |
| protected static final int findSameIndexBlock(int index[], int indexLength, |
| int otherBlock) |
| { |
| for (int block = BMP_INDEX_LENGTH_; block < indexLength; |
| block += SURROGATE_BLOCK_COUNT_) { |
| if(equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) { |
| return block; |
| } |
| } |
| return indexLength; |
| } |
| |
| // private data member ------------------------------------------------ |
| |
| /** |
| * Maximum length of the build-time data (stage 2) array. |
| * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400. |
| * (Number of Unicode code points + one all-initial-value block + |
| * possible duplicate entries for 1024 lead surrogates.) |
| */ |
| private static final int MAX_BUILD_TIME_DATA_LENGTH_ = |
| 0x110000 + DATA_BLOCK_LENGTH + 0x400; |
| } |