blob: 2297728e2101e07f264a9fd333d96a7f244c829f [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
/**
* @author aheninger
*
* A read-only Trie2, holding 16 bit data values.
*
* A Trie2 is a highly optimized data structure for mapping from Unicode
* code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
*
* See class Trie2 for descriptions of the API for accessing the contents of a trie.
*
* The fundamental data access methods are declared final in this class, with
* the intent that applications might gain a little extra performance, when compared
* with calling the same methods via the abstract Trie2 base class.
*/
public final class Trie2_16 extends Trie2 {

    /**
     * Internal constructor, not for general use.
     */
    Trie2_16() {
    }

    /**
     * Reconstitutes a Trie2_16 from its serialized form; the inverse of utrie2_serialize().
     *
     * ICU4C and ICU4J share one serialized format, so data produced by either
     * implementation can be read here. The bytes on the stream may be in little or
     * big endian order, which means serialized tries from ICU4C can be used without
     * regard to the byte order of the machine that produced them.
     *
     * @param is an input stream positioned at the serialized form of a UTrie2.
     * @return the unserialized Trie2_16, ready for use.
     * @throws IllegalArgumentException if the stream does not contain a serialized Trie2.
     * @throws IOException if a read error occurs on the InputStream.
     * @throws ClassCastException if the stream contains a serialized Trie2_32
     */
    public static Trie2_16 createFromSerialized(InputStream is) throws IOException {
        Trie2 trie = Trie2.createFromSerialized(is);
        return (Trie2_16) trie;
    }

    /**
     * Looks up the value stored in this Trie2 for a code point.
     *
     * @param codePoint the code point
     * @return the value for the code point, or the trie's error value when the
     *         code point is negative or greater than 0x10ffff
     */
    @Override
    public final int get(int codePoint) {
        if (codePoint < 0) {
            // Negative input is never a legal code point.
            return errorValue;
        }

        if (codePoint <= 0xffff) {
            // BMP: a single level lookup. The 16 bit data lives in the index array itself.
            boolean isLeadSurrogate = codePoint >= 0xd800 && codePoint <= 0xdbff;
            int dataBlock;
            if (isLeadSurrogate) {
                // Lead surrogates have a separate index section holding the *code point*
                // data; the main index holds the code unit data, which is not what this
                // method returns. Surrogate code points are rare enough in practice that
                // this path is not worth optimizing further.
                dataBlock = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
            } else {
                // Ordinary BMP code point; the BMP index starts at offset 0.
                dataBlock = index[codePoint >> UTRIE2_SHIFT_2];
            }
            return index[(dataBlock << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK)];
        }

        if (codePoint < highStart) {
            // Supplementary code point below highStart: two-level lookup.
            int index1Pos = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)
                    + (codePoint >> UTRIE2_SHIFT_1);
            int index2Pos = index[index1Pos] + ((codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK);
            int dataBlock = index[index2Pos];
            return index[(dataBlock << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK)];
        }

        if (codePoint <= 0x10ffff) {
            // Everything from highStart up to 0x10ffff maps to the single stored high value.
            return index[highValueIndex];
        }

        // Beyond the legal range of 0..0x10ffff.
        return errorValue;
    }

    /**
     * Looks up the Trie2 value for a single UTF-16 code unit.
     *
     * For input outside of the lead surrogate range the result is the same as
     * that of get().
     *
     * A Trie2 stores two values for each input in the lead surrogate range.
     * This method returns the alternate (code unit) value, whereas Trie2.get()
     * returns the main value.
     *
     * @param codeUnit a 16 bit code unit or lead surrogate value.
     * @return the value
     */
    @Override
    public int getFromU16SingleLead(char codeUnit) {
        // A char is always within the BMP, so no range tests are required
        // before indexing with it.
        int dataBlock = index[codeUnit >> UTRIE2_SHIFT_2];
        return index[(dataBlock << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK)];
    }

    /**
     * Writes this Trie2_16 to an OutputStream in serialized form.
     *
     * A Trie2 can be serialized multiple times.
     * The serialized data is compatible with ICU4C UTrie2 serialization.
     * Trie2 serialization is unrelated to Java object serialization.
     *
     * @param os the stream to which the serialized Trie2 data will be written.
     * @return the number of bytes written.
     * @throws IOException on an error writing to the OutputStream.
     */
    public int serialize(OutputStream os) throws IOException {
        DataOutputStream out = new DataOutputStream(os);

        // The header comes first, followed by the 16 bit data values.
        int headerBytes = serializeHeader(out);
        for (int offset = 0; offset < dataLength; ++offset) {
            out.writeChar(index[data16 + offset]);
        }

        // Each data value was written as one two-byte char.
        return headerBytes + dataLength * 2;
    }
}