| /* |
| ******************************************************************************* |
| * Copyright (C) 2013-2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * CollationDataReader.java, ported from collationdatareader.h/.cpp |
| * |
| * C++ version created on: 2013feb07 |
| * created by: Markus W. Scherer |
| */ |
| |
| package com.ibm.icu.impl.coll; |
| |
| import java.io.BufferedInputStream; |
| import java.io.DataInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.Arrays; |
| |
| import com.ibm.icu.impl.ICUBinary; |
| import com.ibm.icu.impl.Trie2_32; |
| import com.ibm.icu.impl.USerializedSet; |
| import com.ibm.icu.text.Collator; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.util.ICUException; |
| |
| /** |
| * Collation binary data reader. |
| */ |
| final class CollationDataReader /* all static */ { |
| // The following constants are also copied into source/common/ucol_swp.cpp. |
| // Keep them in sync! |
| /** |
| * Number of int indexes. |
| * |
| * Can be 2 if there are only options. |
| * Can be 7 or 8 if there are only options and a script reordering. |
| * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. |
| */ |
| static final int IX_INDEXES_LENGTH = 0; |
| /** |
| * Bits 31..24: numericPrimary, for numeric collation |
| * 23..16: fast Latin format version (0 = no fast Latin table) |
| * 15.. 0: options bit set |
| */ |
| static final int IX_OPTIONS = 1; |
| static final int IX_RESERVED2 = 2; |
| static final int IX_RESERVED3 = 3; |
| |
| /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ |
| static final int IX_JAMO_CE32S_START = 4; |
| |
| // Byte offsets from the start of the data, after the generic header. |
| // The indexes[] are at byte offset 0, other data follows. |
| // Each data item is aligned properly. |
| // The data items should be in descending order of unit size, |
| // to minimize the need for padding. |
| // Each item's byte length is given by the difference between its offset and |
| // the next index/offset value. |
| /** Byte offset to int reorderCodes[]. */ |
| static final int IX_REORDER_CODES_OFFSET = 5; |
| /** |
| * Byte offset to uint8_t reorderTable[]. |
| * Empty table if <256 bytes (padding only). |
| * Otherwise 256 bytes or more (with padding). |
| */ |
| static final int IX_REORDER_TABLE_OFFSET = 6; |
| /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ |
| static final int IX_TRIE_OFFSET = 7; |
| |
| static final int IX_RESERVED8_OFFSET = 8; |
| /** Byte offset to long ces[]. */ |
| static final int IX_CES_OFFSET = 9; |
| static final int IX_RESERVED10_OFFSET = 10; |
| /** Byte offset to int ce32s[]. */ |
| static final int IX_CE32S_OFFSET = 11; |
| |
| /** Byte offset to uint32_t rootElements[]. */ |
| static final int IX_ROOT_ELEMENTS_OFFSET = 12; |
| /** Byte offset to UChar *contexts[]. */ |
| static final int IX_CONTEXTS_OFFSET = 13; |
| /** Byte offset to char [] with serialized unsafeBackwardSet. */ |
| static final int IX_UNSAFE_BWD_OFFSET = 14; |
| /** Byte offset to char fastLatinTable[]. */ |
| static final int IX_FAST_LATIN_TABLE_OFFSET = 15; |
| |
| /** Byte offset to char scripts[]. */ |
| static final int IX_SCRIPTS_OFFSET = 16; |
| /** |
| * Byte offset to boolean compressibleBytes[]. |
| * Empty table if <256 bytes (padding only). |
| * Otherwise 256 bytes or more (with padding). |
| */ |
| static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17; |
| static final int IX_RESERVED18_OFFSET = 18; |
| static final int IX_TOTAL_SIZE = 19; |
| |
| static void read(CollationTailoring base, InputStream inBytes, |
| CollationTailoring tailoring) throws IOException { |
| BufferedInputStream bis = new BufferedInputStream(inBytes); |
| tailoring.version = ICUBinary.readHeaderAndDataVersion(bis, DATA_FORMAT, IS_ACCEPTABLE); |
| if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) { |
| throw new ICUException("Tailoring UCA version differs from base data UCA version"); |
| } |
| |
| DataInputStream ds = new DataInputStream(bis); |
| int indexesLength = ds.readInt(); // inIndexes[IX_INDEXES_LENGTH] |
| if(indexesLength < 2) { |
| throw new ICUException("not enough indexes"); |
| } |
| int[] inIndexes = new int[IX_TOTAL_SIZE + 1]; |
| inIndexes[0] = indexesLength; |
| for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) { |
| inIndexes[i] = ds.readInt(); |
| } |
| for(int i = indexesLength; i < inIndexes.length; ++i) { |
| inIndexes[i] = -1; |
| } |
| if(indexesLength > inIndexes.length) { |
| ds.skipBytes((indexesLength - inIndexes.length) * 4); |
| } |
| |
| // Assume that the tailoring data is in initial state, |
| // with null pointers and 0 lengths. |
| |
| // Set pointers to non-empty data parts. |
| // Do this in order of their byte offsets. (Should help porting to Java.) |
| |
| int index; // one of the indexes[] slots |
| int offset; // byte offset for the index part |
| int length; // number of bytes in the index part |
| |
| CollationData baseData = base == null ? null : base.data; |
| int[] reorderCodes; |
| index = IX_REORDER_CODES_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 4) { |
| if(baseData == null) { |
| // We assume for collation settings that |
| // the base data does not have a reordering. |
| throw new ICUException("Collation base data must not reorder scripts"); |
| } |
| reorderCodes = new int[length / 4]; |
| for(int i = 0; i < length / 4; ++i) { |
| reorderCodes[i] = ds.readInt(); |
| } |
| length &= 3; |
| } else { |
| reorderCodes = new int[0]; |
| } |
| ds.skipBytes(length); |
| |
| // There should be a reorder table only if there are reorder codes. |
| // However, when there are reorder codes the reorder table may be omitted to reduce |
| // the data size. |
| byte[] reorderTable = null; |
| index = IX_REORDER_TABLE_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 256) { |
| if(reorderCodes.length == 0) { |
| throw new ICUException("Reordering table without reordering codes"); |
| } |
| reorderTable = new byte[256]; |
| ds.readFully(reorderTable); |
| length -= 256; |
| } else { |
| // If we have reorder codes, then build the reorderTable at the end, |
| // when the CollationData is otherwise complete. |
| } |
| ds.skipBytes(length); |
| |
| if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) { |
| throw new ICUException("Tailoring numeric primary weight differs from base data"); |
| } |
| CollationData data = null; // Remains null if there are no mappings. |
| |
| index = IX_TRIE_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 8) { |
| tailoring.ensureOwnedData(); |
| data = tailoring.ownedData; |
| data.base = baseData; |
| data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L; |
| data.trie = tailoring.trie = Trie2_32.createFromSerialized(ds); |
| int trieLength = data.trie.getSerializedLength(); |
| if(trieLength > length) { |
| throw new ICUException("Not enough bytes for the mappings trie"); // No mappings. |
| } |
| length -= trieLength; |
| } else if(baseData != null) { |
| // Use the base data. Only the settings are tailored. |
| tailoring.data = baseData; |
| } else { |
| throw new ICUException("Missing collation data mappings"); // No mappings. |
| } |
| ds.skipBytes(length); |
| |
| index = IX_RESERVED8_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| ds.skipBytes(length); |
| |
| index = IX_CES_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 8) { |
| if(data == null) { |
| throw new ICUException("Tailored ces without tailored trie"); |
| } |
| data.ces = new long[length / 8]; |
| for(int i = 0; i < length / 8; ++i) { |
| data.ces[i] = ds.readLong(); |
| } |
| length &= 7; |
| } |
| ds.skipBytes(length); |
| |
| index = IX_RESERVED10_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| ds.skipBytes(length); |
| |
| index = IX_CE32S_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 4) { |
| if(data == null) { |
| throw new ICUException("Tailored ce32s without tailored trie"); |
| } |
| data.ce32s = new int[length / 4]; |
| for(int i = 0; i < length / 4; ++i) { |
| data.ce32s[i] = ds.readInt(); |
| } |
| length &= 3; |
| } |
| ds.skipBytes(length); |
| |
| int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START]; |
| if(jamoCE32sStart >= 0) { |
| if(data == null || data.ce32s == null) { |
| throw new ICUException("JamoCE32sStart index into non-existent ce32s[]"); |
| } |
| data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; |
| System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH); |
| } else if(data == null) { |
| // Nothing to do. |
| } else if(baseData != null) { |
| data.jamoCE32s = baseData.jamoCE32s; |
| } else { |
| throw new ICUException("Missing Jamo CE32s for Hangul processing"); |
| } |
| |
| index = IX_ROOT_ELEMENTS_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 4) { |
| int rootElementsLength = length / 4; |
| if(data == null) { |
| throw new ICUException("Root elements but no mappings"); |
| } |
| if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) { |
| throw new ICUException("Root elements array too short"); |
| } |
| data.rootElements = new long[rootElementsLength]; |
| for(int i = 0; i < rootElementsLength; ++i) { |
| data.rootElements[i] = ds.readInt() & 0xffffffffL; // unsigned int -> long |
| } |
| long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE]; |
| if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) { |
| throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value"); |
| } |
| long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES]; |
| if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) { |
| // [fixed last secondary common byte] is too low, |
| // and secondary weights would collide with compressed common secondaries. |
| throw new ICUException("[fixed last secondary common byte] is too low"); |
| } |
| length &= 3; |
| } |
| ds.skipBytes(length); |
| |
| index = IX_CONTEXTS_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 2) { |
| if(data == null) { |
| throw new ICUException("Tailored contexts without tailored trie"); |
| } |
| StringBuilder sb = new StringBuilder(length / 2); |
| for(int i = 0; i < length / 2; ++i) { |
| sb.append(ds.readChar()); |
| } |
| data.contexts = sb.toString(); |
| length &= 1; |
| } |
| ds.skipBytes(length); |
| |
| index = IX_UNSAFE_BWD_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 2) { |
| if(data == null) { |
| throw new ICUException("Unsafe-backward-set but no mappings"); |
| } |
| if(baseData == null) { |
| // Create the unsafe-backward set for the root collator. |
| // Include all non-zero combining marks and trail surrogates. |
| // We do this at load time, rather than at build time, |
| // to simplify Unicode version bootstrapping: |
| // The root data builder only needs the new FractionalUCA.txt data, |
| // but it need not be built with a version of ICU already updated to |
| // the corresponding new Unicode Character Database. |
| // |
| // The following is an optimized version of |
| // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). |
| // It is faster and requires fewer code dependencies. |
| tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates |
| data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet); |
| } else { |
| // Clone the root collator's set contents. |
| tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed(); |
| } |
| // Add the ranges from the data file to the unsafe-backward set. |
| USerializedSet sset = new USerializedSet(); |
| char[] unsafeData = new char[length / 2]; |
| for(int i = 0; i < length / 2; ++i) { |
| unsafeData[i] = ds.readChar(); |
| } |
| length &= 1; |
| sset.getSet(unsafeData, 0); |
| int count = sset.countRanges(); |
| int[] range = new int[2]; |
| for(int i = 0; i < count; ++i) { |
| sset.getRange(i, range); |
| tailoring.unsafeBackwardSet.add(range[0], range[1]); |
| } |
| // Mark each lead surrogate as "unsafe" |
| // if any of its 1024 associated supplementary code points is "unsafe". |
| int c = 0x10000; |
| for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { |
| if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) { |
| tailoring.unsafeBackwardSet.add(lead); |
| } |
| } |
| tailoring.unsafeBackwardSet.freeze(); |
| data.unsafeBackwardSet = tailoring.unsafeBackwardSet; |
| } else if(data == null) { |
| // Nothing to do. |
| } else if(baseData != null) { |
| // No tailoring-specific data: Alias the root collator's set. |
| data.unsafeBackwardSet = baseData.unsafeBackwardSet; |
| } else { |
| throw new ICUException("Missing unsafe-backward-set"); |
| } |
| ds.skipBytes(length); |
| |
| // If the fast Latin format version is different, |
| // or the version is set to 0 for "no fast Latin table", |
| // then just always use the normal string comparison path. |
| index = IX_FAST_LATIN_TABLE_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(data != null) { |
| data.fastLatinTable = null; |
| data.fastLatinTableHeader = null; |
| if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) { |
| if(length >= 2) { |
| char header0 = ds.readChar(); |
| int headerLength = header0 & 0xff; |
| data.fastLatinTableHeader = new char[headerLength]; |
| data.fastLatinTableHeader[0] = header0; |
| for(int i = 1; i < headerLength; ++i) { |
| data.fastLatinTableHeader[i] = ds.readChar(); |
| } |
| int tableLength = length / 2 - headerLength; |
| data.fastLatinTable = new char[tableLength]; |
| for(int i = 0; i < tableLength; ++i) { |
| data.fastLatinTable[i] = ds.readChar(); |
| } |
| length &= 1; |
| if((header0 >> 8) != CollationFastLatin.VERSION) { |
| throw new ICUException("Fast-Latin table version differs from version in data header"); |
| } |
| } else if(baseData != null) { |
| data.fastLatinTable = baseData.fastLatinTable; |
| data.fastLatinTableHeader = baseData.fastLatinTableHeader; |
| } |
| } |
| } |
| ds.skipBytes(length); |
| |
| index = IX_SCRIPTS_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 2) { |
| if(data == null) { |
| throw new ICUException("Script order data but no mappings"); |
| } |
| data.scripts = new char[length / 2]; |
| for(int i = 0; i < length / 2; ++i) { |
| data.scripts[i] = ds.readChar(); |
| } |
| length &= 1; |
| } else if(data == null) { |
| // Nothing to do. |
| } else if(baseData != null) { |
| data.scripts = baseData.scripts; |
| } |
| ds.skipBytes(length); |
| |
| index = IX_COMPRESSIBLE_BYTES_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| if(length >= 256) { |
| if(data == null) { |
| throw new ICUException("Data for compressible primary lead bytes but no mappings"); |
| } |
| data.compressibleBytes = new boolean[256]; |
| for(int i = 0; i < 256; ++i) { |
| data.compressibleBytes[i] = ds.readBoolean(); |
| } |
| length -= 256; |
| } else if(data == null) { |
| // Nothing to do. |
| } else if(baseData != null) { |
| data.compressibleBytes = baseData.compressibleBytes; |
| } else { |
| throw new ICUException("Missing data for compressible primary lead bytes"); |
| } |
| ds.skipBytes(length); |
| |
| index = IX_RESERVED18_OFFSET; |
| offset = inIndexes[index]; |
| length = inIndexes[index + 1] - offset; |
| ds.skipBytes(length); |
| |
| ds.close(); |
| |
| CollationSettings ts = tailoring.settings.readOnly(); |
| int options = inIndexes[IX_OPTIONS] & 0xffff; |
| char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT]; |
| int fastLatinOptions = CollationFastLatin.getOptions( |
| tailoring.data, ts, fastLatinPrimaries); |
| if(options == ts.options && ts.variableTop != 0 && |
| Arrays.equals(reorderCodes, ts.reorderCodes) && |
| fastLatinOptions == ts.fastLatinOptions && |
| (fastLatinOptions < 0 || |
| Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) { |
| return; |
| } |
| |
| CollationSettings settings = tailoring.settings.copyOnWrite(); |
| settings.options = options; |
| // Set variableTop from options and scripts data. |
| settings.variableTop = tailoring.data.getLastPrimaryForGroup( |
| Collator.ReorderCodes.FIRST + settings.getMaxVariable()); |
| if(settings.variableTop == 0) { |
| throw new ICUException("The maxVariable could not be mapped to a variableTop"); |
| } |
| |
| if(reorderCodes.length == 0 || reorderTable != null) { |
| settings.setReordering(reorderCodes, reorderTable); |
| } else { |
| byte[] table = new byte[256]; |
| baseData.makeReorderTable(reorderCodes, table); |
| settings.setReordering(reorderCodes, table); |
| } |
| |
| settings.fastLatinOptions = CollationFastLatin.getOptions( |
| tailoring.data, settings, |
| settings.fastLatinPrimaries); |
| } |
| |
| private static final class IsAcceptable implements ICUBinary.Authenticate { |
| // @Override when we switch to Java 6 |
| public boolean isDataVersionAcceptable(byte version[]) { |
| return version[0] == 4; |
| } |
| } |
| private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); |
| private static final byte DATA_FORMAT[] = { 0x55, 0x43, 0x6f, 0x6c }; // "UCol" |
| |
| private CollationDataReader() {} // no constructor |
| } |
| |
| /* |
| * Format of collation data (ucadata.icu, binary data in coll/ *.res files): |
| * See ICU4C source/common/collationdatareader.h. |
| */ |