| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html#License |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2010-2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * created on: 2010nov23 |
| * created by: Markus W. Scherer |
| * ported from ICU4C bytestrie.h/.cpp |
| */ |
| package com.ibm.icu.util; |
| |
| import java.io.IOException; |
| import java.nio.ByteBuffer; |
| import java.util.ArrayList; |
| import java.util.NoSuchElementException; |
| |
| /** |
| * Light-weight, non-const reader class for a BytesTrie. |
| * Traverses a byte-serialized data structure with minimal state, |
| * for mapping byte sequences to non-negative integer values. |
| * |
| * <p>This class is not intended for public subclassing. |
| * |
| * @stable ICU 4.8 |
| * @author Markus W. Scherer |
| */ |
| public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> { |
/**
 * Creates a reader positioned at the root of a serialized BytesTrie.
 *
 * <p>{@code trieBytes} must hold a copy of a byte sequence produced by the
 * BytesTrieBuilder, and {@code offset} must index the first byte of that sequence.
 * The reader will not read more bytes than the BytesTrieBuilder generated
 * in the corresponding build() call.
 *
 * <p>The array is shared rather than copied; it must not be modified while
 * this BytesTrie object is in use.
 *
 * @param trieBytes Bytes array that contains the serialized trie.
 * @param offset Root offset of the trie in the array.
 * @stable ICU 4.8
 */
public BytesTrie(byte[] trieBytes, int offset) {
    bytes_ = trieBytes;
    root_ = offset;
    // Start reading at the root node.
    pos_ = offset;
    // Negative: not inside a linear-match node.
    remainingMatchLength_ = -1;
}
| |
| /** |
| * Clones this trie reader object and its state, |
| * but not the byte array which will be shared. |
| * @return A shallow clone of this trie. |
| * @stable ICU 4.8 |
| */ |
| @Override |
| public Object clone() throws CloneNotSupportedException { |
| return super.clone(); // A shallow copy is just what we need. |
| } |
| |
| /** |
| * Resets this trie to its initial state. |
| * @return this |
| * @stable ICU 4.8 |
| */ |
| public BytesTrie reset() { |
| pos_=root_; |
| remainingMatchLength_=-1; |
| return this; |
| } |
| |
| /** |
| * BytesTrie state object, for saving a trie's current state |
| * and resetting the trie back to this state later. |
| * @stable ICU 4.8 |
| */ |
| public static final class State { |
| /** |
| * Constructs an empty State. |
| * @stable ICU 4.8 |
| */ |
| public State() {} |
| private byte[] bytes; |
| private int root; |
| private int pos; |
| private int remainingMatchLength; |
| } |
| |
| /** |
| * Saves the state of this trie. |
| * @param state The State object to hold the trie's state. |
| * @return this |
| * @see #resetToState |
| * @stable ICU 4.8 |
| */ |
| public BytesTrie saveState(State state) /*const*/ { |
| state.bytes=bytes_; |
| state.root=root_; |
| state.pos=pos_; |
| state.remainingMatchLength=remainingMatchLength_; |
| return this; |
| } |
| |
| /** |
| * Resets this trie to the saved state. |
| * @param state The State object which holds a saved trie state. |
| * @return this |
| * @throws IllegalArgumentException if the state object contains no state, |
| * or the state of a different trie |
| * @see #saveState |
| * @see #reset |
| * @stable ICU 4.8 |
| */ |
| public BytesTrie resetToState(State state) { |
| if(bytes_==state.bytes && bytes_!=null && root_==state.root) { |
| pos_=state.pos; |
| remainingMatchLength_=state.remainingMatchLength; |
| } else { |
| throw new IllegalArgumentException("incompatible trie state"); |
| } |
| return this; |
| } |
| |
/**
 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods.
 * @stable ICU 4.8
 */
public enum Result {
    /**
     * The input unit(s) did not continue a matching string.
     * Once current()/next() return NO_MATCH,
     * all further calls to current()/next() will also return NO_MATCH,
     * until the trie is reset to its original state or to a saved state.
     * @stable ICU 4.8
     */
    NO_MATCH,
    /**
     * The input unit(s) continued a matching string
     * but there is no value for the string so far.
     * (It is a prefix of a longer string.)
     * @stable ICU 4.8
     */
    NO_VALUE,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * No further input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    FINAL_VALUE,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * Another input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    INTERMEDIATE_VALUE;

    // Note: Keep the declaration order of the constants unchanged.
    // Their ordinal() values are visible to callers, even though the
    // predicates below compare constants directly and no longer depend on them.

    /**
     * Same as (result!=NO_MATCH).
     * @return true if the input bytes/units so far are part of a matching string/byte sequence.
     * @stable ICU 4.8
     */
    public boolean matches() {
        return this != NO_MATCH;
    }

    /**
     * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE).
     * @return true if there is a value for the input bytes/units so far.
     * @see #getValue
     * @stable ICU 4.8
     */
    public boolean hasValue() {
        return this == FINAL_VALUE || this == INTERMEDIATE_VALUE;
    }

    /**
     * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE).
     * @return true if another input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    public boolean hasNext() {
        return this == NO_VALUE || this == INTERMEDIATE_VALUE;
    }
}
| |
| /** |
| * Determines whether the byte sequence so far matches, whether it has a value, |
| * and whether another input byte can continue a matching byte sequence. |
| * @return The match/value Result. |
| * @stable ICU 4.8 |
| */ |
| public Result current() /*const*/ { |
| int pos=pos_; |
| if(pos<0) { |
| return Result.NO_MATCH; |
| } else { |
| int node; |
| return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? |
| valueResults_[node&kValueIsFinal] : Result.NO_VALUE; |
| } |
| } |
| |
| /** |
| * Traverses the trie from the initial state for this input byte. |
| * Equivalent to reset().next(inByte). |
| * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. |
| * Values below -0x100 and above 0xff will never match. |
| * @return The match/value Result. |
| * @stable ICU 4.8 |
| */ |
| public Result first(int inByte) { |
| remainingMatchLength_=-1; |
| if(inByte<0) { |
| inByte+=0x100; |
| } |
| return nextImpl(root_, inByte); |
| } |
| |
| /** |
| * Traverses the trie from the current state for this input byte. |
| * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. |
| * Values below -0x100 and above 0xff will never match. |
| * @return The match/value Result. |
| * @stable ICU 4.8 |
| */ |
| public Result next(int inByte) { |
| int pos=pos_; |
| if(pos<0) { |
| return Result.NO_MATCH; |
| } |
| if(inByte<0) { |
| inByte+=0x100; |
| } |
| int length=remainingMatchLength_; // Actual remaining match length minus 1. |
| if(length>=0) { |
| // Remaining part of a linear-match node. |
| if(inByte==(bytes_[pos++]&0xff)) { |
| remainingMatchLength_=--length; |
| pos_=pos; |
| int node; |
| return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? |
| valueResults_[node&kValueIsFinal] : Result.NO_VALUE; |
| } else { |
| stop(); |
| return Result.NO_MATCH; |
| } |
| } |
| return nextImpl(pos, inByte); |
| } |
| |
| /** |
| * Traverses the trie from the current state for this byte sequence. |
| * Equivalent to |
| * <pre> |
| * Result result=current(); |
| * for(each c in s) |
| * if(!result.hasNext()) return Result.NO_MATCH; |
| * result=next(c); |
| * return result; |
| * </pre> |
| * @param s Contains a string or byte sequence. |
| * @param sIndex The start index of the byte sequence in s. |
| * @param sLimit The (exclusive) end index of the byte sequence in s. |
| * @return The match/value Result. |
| * @stable ICU 4.8 |
| */ |
| public Result next(byte[] s, int sIndex, int sLimit) { |
| if(sIndex>=sLimit) { |
| // Empty input. |
| return current(); |
| } |
| int pos=pos_; |
| if(pos<0) { |
| return Result.NO_MATCH; |
| } |
| int length=remainingMatchLength_; // Actual remaining match length minus 1. |
| for(;;) { |
| // Fetch the next input byte, if there is one. |
| // Continue a linear-match node. |
| byte inByte; |
| for(;;) { |
| if(sIndex==sLimit) { |
| remainingMatchLength_=length; |
| pos_=pos; |
| int node; |
| return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ? |
| valueResults_[node&kValueIsFinal] : Result.NO_VALUE; |
| } |
| inByte=s[sIndex++]; |
| if(length<0) { |
| remainingMatchLength_=length; |
| break; |
| } |
| if(inByte!=bytes_[pos]) { |
| stop(); |
| return Result.NO_MATCH; |
| } |
| ++pos; |
| --length; |
| } |
| for(;;) { |
| int node=bytes_[pos++]&0xff; |
| if(node<kMinLinearMatch) { |
| Result result=branchNext(pos, node, inByte&0xff); |
| if(result==Result.NO_MATCH) { |
| return Result.NO_MATCH; |
| } |
| // Fetch the next input byte, if there is one. |
| if(sIndex==sLimit) { |
| return result; |
| } |
| if(result==Result.FINAL_VALUE) { |
| // No further matching bytes. |
| stop(); |
| return Result.NO_MATCH; |
| } |
| inByte=s[sIndex++]; |
| pos=pos_; // branchNext() advanced pos and wrote it to pos_ . |
| } else if(node<kMinValueLead) { |
| // Match length+1 bytes. |
| length=node-kMinLinearMatch; // Actual match length minus 1. |
| if(inByte!=bytes_[pos]) { |
| stop(); |
| return Result.NO_MATCH; |
| } |
| ++pos; |
| --length; |
| break; |
| } else if((node&kValueIsFinal)!=0) { |
| // No further matching bytes. |
| stop(); |
| return Result.NO_MATCH; |
| } else { |
| // Skip intermediate value. |
| pos=skipValue(pos, node); |
| // The next node must not also be a value node. |
| assert((bytes_[pos]&0xff)<kMinValueLead); |
| } |
| } |
| } |
| } |
| |
| /** |
| * Returns a matching byte sequence's value if called immediately after |
| * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. |
| * getValue() can be called multiple times. |
| * |
| * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! |
| * @return The value for the byte sequence so far. |
| * @stable ICU 4.8 |
| */ |
| public int getValue() /*const*/ { |
| int pos=pos_; |
| int leadByte=bytes_[pos++]&0xff; |
| assert(leadByte>=kMinValueLead); |
| return readValue(bytes_, pos, leadByte>>1); |
| } |
| |
| /** |
| * Determines whether all byte sequences reachable from the current state |
| * map to the same value, and if so, returns that value. |
| * @return The unique value in bits 32..1 with bit 0 set, |
| * if all byte sequences reachable from the current state |
| * map to the same value; otherwise returns 0. |
| * @stable ICU 4.8 |
| */ |
| public long getUniqueValue() /*const*/ { |
| int pos=pos_; |
| if(pos<0) { |
| return 0; |
| } |
| // Skip the rest of a pending linear-match node. |
| long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0); |
| // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. |
| return (uniqueValue<<31)>>31; |
| } |
| |
| /** |
| * Finds each byte which continues the byte sequence from the current state. |
| * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now. |
| * @param out Each next byte is 0-extended to a char and appended to this object. |
| * (Only uses the out.append(c) method.) |
| * @return The number of bytes which continue the byte sequence from here. |
| * @stable ICU 4.8 |
| */ |
| public int getNextBytes(Appendable out) /*const*/ { |
| int pos=pos_; |
| if(pos<0) { |
| return 0; |
| } |
| if(remainingMatchLength_>=0) { |
| append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node. |
| return 1; |
| } |
| int node=bytes_[pos++]&0xff; |
| if(node>=kMinValueLead) { |
| if((node&kValueIsFinal)!=0) { |
| return 0; |
| } else { |
| pos=skipValue(pos, node); |
| node=bytes_[pos++]&0xff; |
| assert(node<kMinValueLead); |
| } |
| } |
| if(node<kMinLinearMatch) { |
| if(node==0) { |
| node=bytes_[pos++]&0xff; |
| } |
| getNextBranchBytes(bytes_, pos, ++node, out); |
| return node; |
| } else { |
| // First byte of the linear-match node. |
| append(out, bytes_[pos]&0xff); |
| return 1; |
| } |
| } |
| |
| /** |
| * Iterates from the current state of this trie. |
| * @return A new BytesTrie.Iterator. |
| * @stable ICU 4.8 |
| */ |
| @Override |
| public Iterator iterator() { |
| return new Iterator(bytes_, pos_, remainingMatchLength_, 0); |
| } |
| |
| /** |
| * Iterates from the current state of this trie. |
| * @param maxStringLength If 0, the iterator returns full strings/byte sequences. |
| * Otherwise, the iterator returns strings with this maximum length. |
| * @return A new BytesTrie.Iterator. |
| * @stable ICU 4.8 |
| */ |
| public Iterator iterator(int maxStringLength) { |
| return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength); |
| } |
| |
| /** |
| * Iterates from the root of a byte-serialized BytesTrie. |
| * @param trieBytes Bytes array that contains the serialized trie. |
| * @param offset Root offset of the trie in the array. |
| * @param maxStringLength If 0, the iterator returns full strings/byte sequences. |
| * Otherwise, the iterator returns strings with this maximum length. |
| * @return A new BytesTrie.Iterator. |
| * @stable ICU 4.8 |
| */ |
| public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) { |
| return new Iterator(trieBytes, offset, -1, maxStringLength); |
| } |
| |
| /** |
| * Return value type for the Iterator. |
| * @stable ICU 4.8 |
| */ |
| public static final class Entry { |
| private Entry(int capacity) { |
| bytes=new byte[capacity]; |
| } |
| |
| /** |
| * @return The length of the byte sequence. |
| * @stable ICU 4.8 |
| */ |
| public int bytesLength() { return length; } |
| /** |
| * Returns a byte of the byte sequence. |
| * @param index An index into the byte sequence. |
| * @return The index-th byte sequence byte. |
| * @stable ICU 4.8 |
| */ |
| public byte byteAt(int index) { return bytes[index]; } |
| /** |
| * Copies the byte sequence into a byte array. |
| * @param dest Destination byte array. |
| * @param destOffset Starting offset to where in dest the byte sequence is copied. |
| * @stable ICU 4.8 |
| */ |
| public void copyBytesTo(byte[] dest, int destOffset) { |
| System.arraycopy(bytes, 0, dest, destOffset, length); |
| } |
| /** |
| * @return The byte sequence as a read-only ByteBuffer. |
| * @stable ICU 4.8 |
| */ |
| public ByteBuffer bytesAsByteBuffer() { |
| return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer(); |
| } |
| |
| /** |
| * The value associated with the byte sequence. |
| * @stable ICU 4.8 |
| */ |
| public int value; |
| |
| private void ensureCapacity(int len) { |
| if(bytes.length<len) { |
| byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)]; |
| System.arraycopy(bytes, 0, newBytes, 0, length); |
| bytes=newBytes; |
| } |
| } |
| private void append(byte b) { |
| ensureCapacity(length+1); |
| bytes[length++]=b; |
| } |
| private void append(byte[] b, int off, int len) { |
| ensureCapacity(length+len); |
| System.arraycopy(b, off, bytes, length, len); |
| length+=len; |
| } |
| private void truncateString(int newLength) { length=newLength; } |
| |
| private byte[] bytes; |
| private int length; |
| } |
| |
| /** |
| * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. |
| * @stable ICU 4.8 |
| */ |
| public static final class Iterator implements java.util.Iterator<Entry> { |
| private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) { |
| bytes_=trieBytes; |
| pos_=initialPos_=offset; |
| remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; |
| maxLength_=maxStringLength; |
| entry_=new Entry(maxLength_!=0 ? maxLength_ : 32); |
| int length=remainingMatchLength_; // Actual remaining match length minus 1. |
| if(length>=0) { |
| // Pending linear-match node, append remaining bytes to entry_. |
| ++length; |
| if(maxLength_>0 && length>maxLength_) { |
| length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. |
| } |
| entry_.append(bytes_, pos_, length); |
| pos_+=length; |
| remainingMatchLength_-=length; |
| } |
| } |
| |
| /** |
| * Resets this iterator to its initial state. |
| * @return this |
| * @stable ICU 4.8 |
| */ |
| public Iterator reset() { |
| pos_=initialPos_; |
| remainingMatchLength_=initialRemainingMatchLength_; |
| int length=remainingMatchLength_+1; // Remaining match length. |
| if(maxLength_>0 && length>maxLength_) { |
| length=maxLength_; |
| } |
| entry_.truncateString(length); |
| pos_+=length; |
| remainingMatchLength_-=length; |
| stack_.clear(); |
| return this; |
| } |
| |
| /** |
| * @return true if there are more elements. |
| * @stable ICU 4.8 |
| */ |
| @Override |
| public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } |
| |
| /** |
| * Finds the next (byte sequence, value) pair if there is one. |
| * |
| * If the byte sequence is truncated to the maximum length and does not |
| * have a real value, then the value is set to -1. |
| * In this case, this "not a real value" is indistinguishable from |
| * a real value of -1. |
| * @return An Entry with the string and value of the next element. |
| * @throws NoSuchElementException - iteration has no more elements. |
| * @stable ICU 4.8 |
| */ |
| @Override |
| public Entry next() { |
| int pos=pos_; |
| if(pos<0) { |
| if(stack_.isEmpty()) { |
| throw new NoSuchElementException(); |
| } |
| // Pop the state off the stack and continue with the next outbound edge of |
| // the branch node. |
| long top=stack_.remove(stack_.size()-1); |
| int length=(int)top; |
| pos=(int)(top>>32); |
| entry_.truncateString(length&0xffff); |
| length>>>=16; |
| if(length>1) { |
| pos=branchNext(pos, length); |
| if(pos<0) { |
| return entry_; // Reached a final value. |
| } |
| } else { |
| entry_.append(bytes_[pos++]); |
| } |
| } |
| if(remainingMatchLength_>=0) { |
| // We only get here if we started in a pending linear-match node |
| // with more than maxLength remaining bytes. |
| return truncateAndStop(); |
| } |
| for(;;) { |
| int node=bytes_[pos++]&0xff; |
| if(node>=kMinValueLead) { |
| // Deliver value for the byte sequence so far. |
| boolean isFinal=(node&kValueIsFinal)!=0; |
| entry_.value=readValue(bytes_, pos, node>>1); |
| if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) { |
| pos_=-1; |
| } else { |
| pos_=skipValue(pos, node); |
| } |
| return entry_; |
| } |
| if(maxLength_>0 && entry_.length==maxLength_) { |
| return truncateAndStop(); |
| } |
| if(node<kMinLinearMatch) { |
| if(node==0) { |
| node=bytes_[pos++]&0xff; |
| } |
| pos=branchNext(pos, node+1); |
| if(pos<0) { |
| return entry_; // Reached a final value. |
| } |
| } else { |
| // Linear-match node, append length bytes to entry_. |
| int length=node-kMinLinearMatch+1; |
| if(maxLength_>0 && entry_.length+length>maxLength_) { |
| entry_.append(bytes_, pos, maxLength_-entry_.length); |
| return truncateAndStop(); |
| } |
| entry_.append(bytes_, pos, length); |
| pos+=length; |
| } |
| } |
| } |
| |
| /** |
| * Iterator.remove() is not supported. |
| * @throws UnsupportedOperationException (always) |
| * @stable ICU 4.8 |
| */ |
| @Override |
| public void remove() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| private Entry truncateAndStop() { |
| pos_=-1; |
| entry_.value=-1; // no real value for str |
| return entry_; |
| } |
| |
| private int branchNext(int pos, int length) { |
| while(length>kMaxBranchLinearSubNodeLength) { |
| ++pos; // ignore the comparison byte |
| // Push state for the greater-or-equal edge. |
| stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length); |
| // Follow the less-than edge. |
| length>>=1; |
| pos=jumpByDelta(bytes_, pos); |
| } |
| // List of key-value pairs where values are either final values or jump deltas. |
| // Read the first (key, value) pair. |
| byte trieByte=bytes_[pos++]; |
| int node=bytes_[pos++]&0xff; |
| boolean isFinal=(node&kValueIsFinal)!=0; |
| int value=readValue(bytes_, pos, node>>1); |
| pos=skipValue(pos, node); |
| stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length); |
| entry_.append(trieByte); |
| if(isFinal) { |
| pos_=-1; |
| entry_.value=value; |
| return -1; |
| } else { |
| return pos+value; |
| } |
| } |
| |
| private byte[] bytes_; |
| private int pos_; |
| private int initialPos_; |
| private int remainingMatchLength_; |
| private int initialRemainingMatchLength_; |
| |
| private int maxLength_; |
| private Entry entry_; |
| |
| // The stack stores longs for backtracking to another |
| // outbound edge of a branch node. |
| // Each long has the offset from bytes_ in bits 62..32, |
| // the entry_.stringLength() from before the node in bits 15..0, |
| // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) |
| // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, |
| // but the code looks more confusing that way.) |
| private ArrayList<Long> stack_=new ArrayList<Long>(); |
| } |
| |
| private void stop() { |
| pos_=-1; |
| } |
| |
| // Reads a compact 32-bit integer. |
| // pos is already after the leadByte, and the lead byte is already shifted right by 1. |
| private static int readValue(byte[] bytes, int pos, int leadByte) { |
| int value; |
| if(leadByte<kMinTwoByteValueLead) { |
| value=leadByte-kMinOneByteValueLead; |
| } else if(leadByte<kMinThreeByteValueLead) { |
| value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff); |
| } else if(leadByte<kFourByteValueLead) { |
| value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); |
| } else if(leadByte==kFourByteValueLead) { |
| value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); |
| } else { |
| value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); |
| } |
| return value; |
| } |
| private static int skipValue(int pos, int leadByte) { |
| assert(leadByte>=kMinValueLead); |
| if(leadByte>=(kMinTwoByteValueLead<<1)) { |
| if(leadByte<(kMinThreeByteValueLead<<1)) { |
| ++pos; |
| } else if(leadByte<(kFourByteValueLead<<1)) { |
| pos+=2; |
| } else { |
| pos+=3+((leadByte>>1)&1); |
| } |
| } |
| return pos; |
| } |
| private static int skipValue(byte[] bytes, int pos) { |
| int leadByte=bytes[pos++]&0xff; |
| return skipValue(pos, leadByte); |
| } |
| |
| // Reads a jump delta and jumps. |
| private static int jumpByDelta(byte[] bytes, int pos) { |
| int delta=bytes[pos++]&0xff; |
| if(delta<kMinTwoByteDeltaLead) { |
| // nothing to do |
| } else if(delta<kMinThreeByteDeltaLead) { |
| delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff); |
| } else if(delta<kFourByteDeltaLead) { |
| delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); |
| pos+=2; |
| } else if(delta==kFourByteDeltaLead) { |
| delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); |
| pos+=3; |
| } else { |
| delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); |
| pos+=4; |
| } |
| return pos+delta; |
| } |
| |
| private static int skipDelta(byte[] bytes, int pos) { |
| int delta=bytes[pos++]&0xff; |
| if(delta>=kMinTwoByteDeltaLead) { |
| if(delta<kMinThreeByteDeltaLead) { |
| ++pos; |
| } else if(delta<kFourByteDeltaLead) { |
| pos+=2; |
| } else { |
| pos+=3+(delta&1); |
| } |
| } |
| return pos; |
| } |
| |
| private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; |
| |
| // Handles a branch node for both next(byte) and next(string). |
| private Result branchNext(int pos, int length, int inByte) { |
| // Branch according to the current byte. |
| if(length==0) { |
| length=bytes_[pos++]&0xff; |
| } |
| ++length; |
| // The length of the branch is the number of bytes to select from. |
| // The data structure encodes a binary search. |
| while(length>kMaxBranchLinearSubNodeLength) { |
| if(inByte<(bytes_[pos++]&0xff)) { |
| length>>=1; |
| pos=jumpByDelta(bytes_, pos); |
| } else { |
| length=length-(length>>1); |
| pos=skipDelta(bytes_, pos); |
| } |
| } |
| // Drop down to linear search for the last few bytes. |
| // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 |
| // and divides length by 2. |
| do { |
| if(inByte==(bytes_[pos++]&0xff)) { |
| Result result; |
| int node=bytes_[pos]&0xff; |
| assert(node>=kMinValueLead); |
| if((node&kValueIsFinal)!=0) { |
| // Leave the final value for getValue() to read. |
| result=Result.FINAL_VALUE; |
| } else { |
| // Use the non-final value as the jump delta. |
| ++pos; |
| // int delta=readValue(pos, node>>1); |
| node>>=1; |
| int delta; |
| if(node<kMinTwoByteValueLead) { |
| delta=node-kMinOneByteValueLead; |
| } else if(node<kMinThreeByteValueLead) { |
| delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff); |
| } else if(node<kFourByteValueLead) { |
| delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff); |
| pos+=2; |
| } else if(node==kFourByteValueLead) { |
| delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff); |
| pos+=3; |
| } else { |
| delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff); |
| pos+=4; |
| } |
| // end readValue() |
| pos+=delta; |
| node=bytes_[pos]&0xff; |
| result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; |
| } |
| pos_=pos; |
| return result; |
| } |
| --length; |
| pos=skipValue(bytes_, pos); |
| } while(length>1); |
| if(inByte==(bytes_[pos++]&0xff)) { |
| pos_=pos; |
| int node=bytes_[pos]&0xff; |
| return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; |
| } else { |
| stop(); |
| return Result.NO_MATCH; |
| } |
| } |
| |
| // Requires remainingLength_<0. |
| private Result nextImpl(int pos, int inByte) { |
| for(;;) { |
| int node=bytes_[pos++]&0xff; |
| if(node<kMinLinearMatch) { |
| return branchNext(pos, node, inByte); |
| } else if(node<kMinValueLead) { |
| // Match the first of length+1 bytes. |
| int length=node-kMinLinearMatch; // Actual match length minus 1. |
| if(inByte==(bytes_[pos++]&0xff)) { |
| remainingMatchLength_=--length; |
| pos_=pos; |
| return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? |
| valueResults_[node&kValueIsFinal] : Result.NO_VALUE; |
| } else { |
| // No match. |
| break; |
| } |
| } else if((node&kValueIsFinal)!=0) { |
| // No further matching bytes. |
| break; |
| } else { |
| // Skip intermediate value. |
| pos=skipValue(pos, node); |
| // The next node must not also be a value node. |
| assert((bytes_[pos]&0xff)<kMinValueLead); |
| } |
| } |
| stop(); |
| return Result.NO_MATCH; |
| } |
| |
| // Helper functions for getUniqueValue(). |
| // Recursively finds a unique value (or whether there is not a unique one) |
| // from a branch. |
| // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). |
| // On return, if not 0, then bits 63..33 contain the updated non-negative pos. |
| private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length, |
| long uniqueValue) { |
| while(length>kMaxBranchLinearSubNodeLength) { |
| ++pos; // ignore the comparison byte |
| uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue); |
| if(uniqueValue==0) { |
| return 0; |
| } |
| length=length-(length>>1); |
| pos=skipDelta(bytes, pos); |
| } |
| do { |
| ++pos; // ignore a comparison byte |
| // handle its value |
| int node=bytes[pos++]&0xff; |
| boolean isFinal=(node&kValueIsFinal)!=0; |
| int value=readValue(bytes, pos, node>>1); |
| pos=skipValue(pos, node); |
| if(isFinal) { |
| if(uniqueValue!=0) { |
| if(value!=(int)(uniqueValue>>1)) { |
| return 0; |
| } |
| } else { |
| uniqueValue=((long)value<<1)|1; |
| } |
| } else { |
| uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue); |
| if(uniqueValue==0) { |
| return 0; |
| } |
| } |
| } while(--length>1); |
| // ignore the last comparison byte |
| return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); |
| } |
| // Recursively finds a unique value (or whether there is not a unique one) |
| // starting from a position on a node lead byte. |
| // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. |
| // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. |
| private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) { |
| for(;;) { |
| int node=bytes[pos++]&0xff; |
| if(node<kMinLinearMatch) { |
| if(node==0) { |
| node=bytes[pos++]&0xff; |
| } |
| uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue); |
| if(uniqueValue==0) { |
| return 0; |
| } |
| pos=(int)(uniqueValue>>>33); |
| } else if(node<kMinValueLead) { |
| // linear-match node |
| pos+=node-kMinLinearMatch+1; // Ignore the match bytes. |
| } else { |
| boolean isFinal=(node&kValueIsFinal)!=0; |
| int value=readValue(bytes, pos, node>>1); |
| if(uniqueValue!=0) { |
| if(value!=(int)(uniqueValue>>1)) { |
| return 0; |
| } |
| } else { |
| uniqueValue=((long)value<<1)|1; |
| } |
| if(isFinal) { |
| return uniqueValue; |
| } |
| pos=skipValue(pos, node); |
| } |
| } |
| } |
| |
| // Helper functions for getNextBytes(). |
| // getNextBytes() when pos is on a branch node. |
| private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) { |
| while(length>kMaxBranchLinearSubNodeLength) { |
| ++pos; // ignore the comparison byte |
| getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out); |
| length=length-(length>>1); |
| pos=skipDelta(bytes, pos); |
| } |
| do { |
| append(out, bytes[pos++]&0xff); |
| pos=skipValue(bytes, pos); |
| } while(--length>1); |
| append(out, bytes[pos]&0xff); |
| } |
| private static void append(Appendable out, int c) { |
| try { |
| out.append((char)c); |
| } catch(IOException e) { |
| throw new ICUUncheckedIOException(e); |
| } |
| } |
| |
| // BytesTrie data structure |
| // |
| // The trie consists of a series of byte-serialized nodes for incremental |
| // string/byte sequence matching. The root node is at the beginning of the trie data. |
| // |
| // Types of nodes are distinguished by their node lead byte ranges. |
| // After each node, except a final-value node, another node follows to |
| // encode match values or continue matching further bytes. |
| // |
| // Node types: |
| // - Value node: Stores a 32-bit integer in a compact, variable-length format. |
| // The value is for the string/byte sequence so far. |
| // One node bit indicates whether the value is final or whether |
| // matching continues with the next node. |
| // - Linear-match node: Matches a number of bytes. |
| // - Branch node: Branches to other nodes according to the current input byte. |
| // The node byte is the length of the branch (number of bytes to select from) |
| // minus 1. It is followed by a sub-node: |
| // - If the length is at most kMaxBranchLinearSubNodeLength, then |
| // there are length-1 (key, value) pairs and then one more comparison byte. |
| // If one of the key bytes matches, then the value is either a final value for |
| // the string/byte sequence so far, or a "jump" delta to the next node. |
| // If the last byte matches, then matching continues with the next node. |
| // (Values have the same encoding as value nodes.) |
| // - If the length is greater than kMaxBranchLinearSubNodeLength, then |
| // there is one byte and one "jump" delta. |
| // If the input byte is less than the sub-node byte, then "jump" by delta to |
| // the next sub-node which will have a length of length/2. |
| // (The delta has its own compact encoding.) |
| // Otherwise, skip the "jump" delta to the next sub-node |
| // which will have a length of length-length/2. |
| |
| // Node lead byte values. |
| |
| // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise |
| // the length is one more than the next byte. |
| |
| // For a branch sub-node with at most this many entries, we drop down |
| // to a linear search. |
| /*package*/ static final int kMaxBranchLinearSubNodeLength=5; |
| |
| // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. |
| /*package*/ static final int kMinLinearMatch=0x10; |
| /*package*/ static final int kMaxLinearMatchLength=0x10; |
| |
| // 20..ff: Variable-length value node. |
| // If odd, the value is final. (Otherwise, intermediate value or jump delta.) |
| // Then shift-right by 1 bit. |
| // The remaining lead byte value indicates the number of following bytes (0..4) |
| // and contains the value's top bits. |
| /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 |
| // It is a final value if bit 0 is set. |
| private static final int kValueIsFinal=1; |
| |
| // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. |
| /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10 |
| /*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte. |
| |
| /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 |
| /*package*/ static final int kMaxTwoByteValue=0x1aff; |
| |
| /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c |
| /*package*/ static final int kFourByteValueLead=0x7e; |
| |
| // A little more than Unicode code points. (0x11ffff) |
| /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; |
| |
| /*package*/ static final int kFiveByteValueLead=0x7f; |
| |
| // Compact delta integers. |
| /*package*/ static final int kMaxOneByteDelta=0xbf; |
| /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 |
| /*package*/ static final int kMinThreeByteDeltaLead=0xf0; |
| /*package*/ static final int kFourByteDeltaLead=0xfe; |
| /*package*/ static final int kFiveByteDeltaLead=0xff; |
| |
| /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff |
| /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff |
| |
| // Fixed value referencing the BytesTrie bytes. |
| private byte[] bytes_; |
| private int root_; |
| |
| // Iterator variables. |
| |
| // Index of next trie byte to read. Negative if no more matches. |
| private int pos_; |
| // Remaining length of a linear-match node, minus 1. Negative if not in such a node. |
| private int remainingMatchLength_; |
| }; |