| /* |
| ******************************************************************************* |
| * Copyright (C) 2013-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * CollationSettings.java, ported from collationsettings.h/.cpp |
| * |
| * C++ version created on: 2013feb07 |
| * created by: Markus W. Scherer |
| */ |
| |
| package com.ibm.icu.impl.coll; |
| |
| import java.util.Arrays; |
| |
| import com.ibm.icu.text.Collator; |
| |
| /** |
| * Collation settings/options/attributes. |
| * These are the values that can be changed via API. |
| */ |
| public final class CollationSettings extends SharedObject { |
| /** |
| * Options bit 0: Perform the FCD check on the input text and deliver normalized text. |
| */ |
| public static final int CHECK_FCD = 1; |
| /** |
| * Options bit 1: Numeric collation. |
| * Also known as CODAN = COllate Digits As Numbers. |
| * |
| * Treat digit sequences as numbers with CE sequences in numeric order, |
| * rather than returning a normal CE for each digit. |
| */ |
| public static final int NUMERIC = 2; |
| /** |
| * "Shifted" alternate handling, see ALTERNATE_MASK. |
| */ |
| static final int SHIFTED = 4; |
| /** |
| * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable. |
| * Reserve values 8 and 0xc for shift-trimmed and blanked. |
| */ |
| static final int ALTERNATE_MASK = 0xc; |
| /** |
| * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value. |
| */ |
| static final int MAX_VARIABLE_SHIFT = 4; |
| /** maxVariable options bit mask before shifting. */ |
| static final int MAX_VARIABLE_MASK = 0x70; |
| /** Options bit 7: Reserved/unused/0. */ |
| /** |
| * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on. |
| */ |
| static final int UPPER_FIRST = 0x100; |
| /** |
| * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values) |
| * unless case level is on (when they are *moved* into the separate case level). |
| * By default, the case bits are removed from the tertiary weight (ignored). |
| * |
| * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to |
| * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST. |
| */ |
| public static final int CASE_FIRST = 0x200; |
| /** |
| * Options bit mask for caseFirst and upperFirst, before shifting. |
| * Same value as caseFirst==upperFirst. |
| */ |
| public static final int CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST; |
| /** |
| * Options bit 10: Insert the case level between the secondary and tertiary levels. |
| */ |
| public static final int CASE_LEVEL = 0x400; |
| /** |
| * Options bit 11: Compare secondary weights backwards. ("French secondary") |
| */ |
| public static final int BACKWARD_SECONDARY = 0x800; |
| /** |
| * Options bits 15..12: The 4-bit strength value bit field is shifted by this value. |
| * It is the top used bit field in the options. (No need to mask after shifting.) |
| */ |
| static final int STRENGTH_SHIFT = 12; |
| /** Strength options bit mask before shifting. */ |
| static final int STRENGTH_MASK = 0xf000; |
| |
| /** maxVariable values */ |
| static final int MAX_VAR_SPACE = 0; |
| static final int MAX_VAR_PUNCT = 1; |
| static final int MAX_VAR_SYMBOL = 2; |
| static final int MAX_VAR_CURRENCY = 3; |
| |
| CollationSettings() {} |
| |
| @Override |
| public CollationSettings clone() { |
| CollationSettings newSettings = (CollationSettings)super.clone(); |
| // Note: The reorderTable, reorderRanges, and reorderCodes need not be cloned |
| // because, in Java, they only get replaced but not modified. |
| newSettings.fastLatinPrimaries = fastLatinPrimaries.clone(); |
| return newSettings; |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| if(other == null) { return false; } |
| if(!this.getClass().equals(other.getClass())) { return false; } |
| CollationSettings o = (CollationSettings)other; |
| if(options != o.options) { return false; } |
| if((options & ALTERNATE_MASK) != 0 && variableTop != o.variableTop) { return false; } |
| if(!Arrays.equals(reorderCodes, o.reorderCodes)) { return false; } |
| return true; |
| } |
| |
| @Override |
| public int hashCode() { |
| int h = options << 8; |
| if((options & ALTERNATE_MASK) != 0) { h ^= variableTop; } |
| h ^= reorderCodes.length; |
| for(int i = 0; i < reorderCodes.length; ++i) { |
| h ^= (reorderCodes[i] << i); |
| } |
| return h; |
| } |
| |
| public void resetReordering() { |
| // When we turn off reordering, we want to set a null permutation |
| // rather than a no-op permutation. |
| reorderTable = null; |
| minHighNoReorder = 0; |
| reorderRanges = null; |
| reorderCodes = EMPTY_INT_ARRAY; |
| } |
| |
| void aliasReordering(CollationData data, int[] codesAndRanges, int codesLength, byte[] table) { |
| int[] codes; |
| if(codesLength == codesAndRanges.length) { |
| codes = codesAndRanges; |
| } else { |
| // TODO: Java 6: Arrays.copyOf(codes, codesLength); |
| codes = new int[codesLength]; |
| System.arraycopy(codesAndRanges, 0, codes, 0, codesLength); |
| } |
| int rangesStart = codesLength; |
| int rangesLimit = codesAndRanges.length; |
| int rangesLength = rangesLimit - rangesStart; |
| if(table != null && |
| (rangesLength == 0 ? |
| !reorderTableHasSplitBytes(table) : |
| rangesLength >= 2 && |
| // The first offset must be 0. The last offset must not be 0. |
| (codesAndRanges[rangesStart] & 0xffff) == 0 && |
| (codesAndRanges[rangesLimit - 1] & 0xffff) != 0)) { |
| reorderTable = table; |
| reorderCodes = codes; |
| // Drop ranges before the first split byte. They are reordered by the table. |
| // This then speeds up reordering of the remaining ranges. |
| int firstSplitByteRangeIndex = rangesStart; |
| while(firstSplitByteRangeIndex < rangesLimit && |
| (codesAndRanges[firstSplitByteRangeIndex] & 0xff0000) == 0) { |
| // The second byte of the primary limit is 0. |
| ++firstSplitByteRangeIndex; |
| } |
| if(firstSplitByteRangeIndex == rangesLimit) { |
| assert(!reorderTableHasSplitBytes(table)); |
| minHighNoReorder = 0; |
| reorderRanges = null; |
| } else { |
| assert(table[codesAndRanges[firstSplitByteRangeIndex] >>> 24] == 0); |
| minHighNoReorder = codesAndRanges[rangesLimit - 1] & 0xffff0000L; |
| setReorderRanges(codesAndRanges, firstSplitByteRangeIndex, |
| rangesLimit - firstSplitByteRangeIndex); |
| } |
| return; |
| } |
| // Regenerate missing data. |
| setReordering(data, codes); |
| } |
| |
| public void setReordering(CollationData data, int[] codes) { |
| if(codes.length == 0 || (codes.length == 1 && codes[0] == Collator.ReorderCodes.NONE)) { |
| resetReordering(); |
| return; |
| } |
| UVector32 rangesList = new UVector32(); |
| data.makeReorderRanges(codes, rangesList); |
| int rangesLength = rangesList.size(); |
| if(rangesLength == 0) { |
| resetReordering(); |
| return; |
| } |
| int[] ranges = rangesList.getBuffer(); |
| // ranges[] contains at least two (limit, offset) pairs. |
| // The first offset must be 0. The last offset must not be 0. |
| // Separators (at the low end) and trailing weights (at the high end) |
| // are never reordered. |
| assert(rangesLength >= 2); |
| assert((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0); |
| minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000L; |
| |
| // Write the lead byte permutation table. |
| // Set a 0 for each lead byte that has a range boundary in the middle. |
| byte[] table = new byte[256]; |
| int b = 0; |
| int firstSplitByteRangeIndex = -1; |
| for(int i = 0; i < rangesLength; ++i) { |
| int pair = ranges[i]; |
| int limit1 = pair >>> 24; |
| while(b < limit1) { |
| table[b] = (byte)(b + pair); |
| ++b; |
| } |
| // Check the second byte of the limit. |
| if((pair & 0xff0000) != 0) { |
| table[limit1] = 0; |
| b = limit1 + 1; |
| if(firstSplitByteRangeIndex < 0) { |
| firstSplitByteRangeIndex = i; |
| } |
| } |
| } |
| while(b <= 0xff) { |
| table[b] = (byte)b; |
| ++b; |
| } |
| int rangesStart; |
| if(firstSplitByteRangeIndex < 0) { |
| // The lead byte permutation table alone suffices for reordering. |
| rangesStart = rangesLength = 0; |
| } else { |
| // Remove the ranges below the first split byte. |
| rangesStart = firstSplitByteRangeIndex; |
| rangesLength -= firstSplitByteRangeIndex; |
| } |
| setReorderArrays(codes, ranges, rangesStart, rangesLength, table); |
| } |
| |
| private void setReorderArrays(int[] codes, |
| int[] ranges, int rangesStart, int rangesLength, byte[] table) { |
| // Very different from C++. See the comments after the reorderCodes declaration. |
| if(codes == null) { |
| codes = EMPTY_INT_ARRAY; |
| } |
| assert (codes.length == 0) == (table == null); |
| reorderTable = table; |
| reorderCodes = codes; |
| setReorderRanges(ranges, rangesStart, rangesLength); |
| } |
| |
| private void setReorderRanges(int[] ranges, int rangesStart, int rangesLength) { |
| if(rangesLength == 0) { |
| reorderRanges = null; |
| } else { |
| reorderRanges = new long[rangesLength]; |
| int i = 0; |
| do { |
| reorderRanges[i++] = ranges[rangesStart++] & 0xffffffffL; |
| } while(i < rangesLength); |
| } |
| } |
| |
| public void copyReorderingFrom(CollationSettings other) { |
| if(!other.hasReordering()) { |
| resetReordering(); |
| return; |
| } |
| minHighNoReorder = other.minHighNoReorder; |
| reorderTable = other.reorderTable; |
| reorderRanges = other.reorderRanges; |
| reorderCodes = other.reorderCodes; |
| } |
| |
| public boolean hasReordering() { return reorderTable != null; } |
| |
| private static boolean reorderTableHasSplitBytes(byte[] table) { |
| assert(table[0] == 0); |
| for(int i = 1; i < 256; ++i) { |
| if(table[i] == 0) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| public long reorder(long p) { |
| byte b = reorderTable[(int)p >>> 24]; |
| if(b != 0 || p <= Collation.NO_CE_PRIMARY) { |
| return ((b & 0xffL) << 24) | (p & 0xffffff); |
| } else { |
| return reorderEx(p); |
| } |
| } |
| |
| private long reorderEx(long p) { |
| assert minHighNoReorder > 0; |
| if(p >= minHighNoReorder) { return p; } |
| // Round up p so that its lower 16 bits are >= any offset bits. |
| // Then compare q directly with (limit, offset) pairs. |
| long q = p | 0xffff; |
| long r; |
| int i = 0; |
| while(q >= (r = reorderRanges[i])) { ++i; } |
| return p + ((long)(short)r << 24); |
| } |
| |
| // In C++, we use enums for attributes and their values, with a special value for the default. |
| // Combined getter/setter methods handle many attributes. |
| // In Java, we have specific methods for getting, setting, and set-to-default, |
| // except that this class uses bits in its own bit set for simple values. |
| |
| public void setStrength(int value) { |
| int noStrength = options & ~STRENGTH_MASK; |
| switch(value) { |
| case Collator.PRIMARY: |
| case Collator.SECONDARY: |
| case Collator.TERTIARY: |
| case Collator.QUATERNARY: |
| case Collator.IDENTICAL: |
| options = noStrength | (value << STRENGTH_SHIFT); |
| break; |
| default: |
| throw new IllegalArgumentException("illegal strength value " + value); |
| } |
| } |
| |
| public void setStrengthDefault(int defaultOptions) { |
| int noStrength = options & ~STRENGTH_MASK; |
| options = noStrength | (defaultOptions & STRENGTH_MASK); |
| } |
| |
| static int getStrength(int options) { |
| return options >> STRENGTH_SHIFT; |
| } |
| |
| public int getStrength() { |
| return getStrength(options); |
| } |
| |
| /** Sets the options bit for an on/off attribute. */ |
| public void setFlag(int bit, boolean value) { |
| if(value) { |
| options |= bit; |
| } else { |
| options &= ~bit; |
| } |
| } |
| |
| public void setFlagDefault(int bit, int defaultOptions) { |
| options = (options & ~bit) | (defaultOptions & bit); |
| } |
| |
| public boolean getFlag(int bit) { |
| return (options & bit) != 0; |
| } |
| |
| public void setCaseFirst(int value) { |
| assert value == 0 || value == CASE_FIRST || value == CASE_FIRST_AND_UPPER_MASK; |
| int noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK; |
| options = noCaseFirst | value; |
| } |
| |
| public void setCaseFirstDefault(int defaultOptions) { |
| int noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK; |
| options = noCaseFirst | (defaultOptions & CASE_FIRST_AND_UPPER_MASK); |
| } |
| |
| public int getCaseFirst() { |
| return options & CASE_FIRST_AND_UPPER_MASK; |
| } |
| |
| public void setAlternateHandlingShifted(boolean value) { |
| int noAlternate = options & ~ALTERNATE_MASK; |
| if(value) { |
| options = noAlternate | SHIFTED; |
| } else { |
| options = noAlternate; |
| } |
| } |
| |
| public void setAlternateHandlingDefault(int defaultOptions) { |
| int noAlternate = options & ~ALTERNATE_MASK; |
| options = noAlternate | (defaultOptions & ALTERNATE_MASK); |
| } |
| |
| public boolean getAlternateHandling() { |
| return (options & ALTERNATE_MASK) != 0; |
| } |
| |
| public void setMaxVariable(int value, int defaultOptions) { |
| int noMax = options & ~MAX_VARIABLE_MASK; |
| switch(value) { |
| case MAX_VAR_SPACE: |
| case MAX_VAR_PUNCT: |
| case MAX_VAR_SYMBOL: |
| case MAX_VAR_CURRENCY: |
| options = noMax | (value << MAX_VARIABLE_SHIFT); |
| break; |
| case -1: |
| options = noMax | (defaultOptions & MAX_VARIABLE_MASK); |
| break; |
| default: |
| throw new IllegalArgumentException("illegal maxVariable value " + value); |
| } |
| } |
| |
| public int getMaxVariable() { |
| return (options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT; |
| } |
| |
| /** |
| * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off. |
| */ |
| static boolean isTertiaryWithCaseBits(int options) { |
| return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST; |
| } |
| static int getTertiaryMask(int options) { |
| // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. |
| return isTertiaryWithCaseBits(options) ? |
| Collation.CASE_AND_TERTIARY_MASK : Collation.ONLY_TERTIARY_MASK; |
| } |
| |
| static boolean sortsTertiaryUpperCaseFirst(int options) { |
| // On tertiary level, consider case bits and sort uppercase first |
| // if caseLevel is off and caseFirst==upperFirst. |
| return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK; |
| } |
| |
| public boolean dontCheckFCD() { |
| return (options & CHECK_FCD) == 0; |
| } |
| |
| boolean hasBackwardSecondary() { |
| return (options & BACKWARD_SECONDARY) != 0; |
| } |
| |
| public boolean isNumeric() { |
| return (options & NUMERIC) != 0; |
| } |
| |
| /** CHECK_FCD etc. */ |
| public int options = (Collator.TERTIARY << STRENGTH_SHIFT) | // DEFAULT_STRENGTH |
| (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT); |
| /** Variable-top primary weight. */ |
| public long variableTop; |
| /** |
| * 256-byte table for reordering permutation of primary lead bytes; null if no reordering. |
| * A 0 entry at a non-zero index means that the primary lead byte is "split" |
| * (there are different offsets for primaries that share that lead byte) |
| * and the reordering offset must be determined via the reorderRanges. |
| */ |
| public byte[] reorderTable; |
| /** Limit of last reordered range. 0 if no reordering or no split bytes. */ |
| long minHighNoReorder; |
| /** |
| * Primary-weight ranges for script reordering, |
| * to be used by reorder(p) for split-reordered primary lead bytes. |
| * |
| * <p>Each entry is a (limit, offset) pair. |
| * The upper 16 bits of the entry are the upper 16 bits of the |
| * exclusive primary limit of a range. |
| * Primaries between the previous limit and this one have their lead bytes |
| * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits. |
| * |
| * <p>CollationData.makeReorderRanges() writes a full list where the first range |
| * (at least for terminators and separators) has a 0 offset. |
| * The last range has a non-zero offset. |
| * minHighNoReorder is set to the limit of that last range. |
| * |
| * <p>In the settings object, the initial ranges before the first split lead byte |
| * are omitted for efficiency; they are handled by reorder(p) via the reorderTable. |
| * If there are no split-reordered lead bytes, then no ranges are needed. |
| */ |
| long[] reorderRanges; |
| /** Array of reorder codes; ignored if length == 0. */ |
| public int[] reorderCodes = EMPTY_INT_ARRAY; |
| // Note: In C++, we keep a memory block around for the reorder codes, |
| // the ranges, and the permutation table, |
| // and modify them for new codes. |
| // In Java, we simply copy references and then never modify the array contents. |
| // The caller must abandon the arrays. |
| // Reorder codes from the public setter API must be cloned. |
| private static final int[] EMPTY_INT_ARRAY = new int[0]; |
| |
| /** Options for CollationFastLatin. Negative if disabled. */ |
| public int fastLatinOptions = -1; |
| // fastLatinPrimaries.length must be equal to CollationFastLatin.LATIN_LIMIT, |
| // but we do not import CollationFastLatin to reduce circular dependencies. |
| public char[] fastLatinPrimaries = new char[0x180]; // mutable contents |
| } |