| /** |
| ******************************************************************************* |
| * Copyright (C) 2001-2014, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| * CollationTest.java, ported from collationtest.cpp |
| * C++ version created on: 2012apr27 |
| * created by: Markus W. Scherer |
| */ |
| package com.ibm.icu.dev.test.collator; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.text.ParseException; |
| import java.util.HashSet; |
| import java.util.Set; |
| |
| import com.ibm.icu.dev.test.TestFmwk; |
| import com.ibm.icu.dev.test.TestUtil; |
| import com.ibm.icu.impl.Norm2AllModes; |
| import com.ibm.icu.impl.Utility; |
| import com.ibm.icu.impl.coll.Collation; |
| import com.ibm.icu.impl.coll.CollationData; |
| import com.ibm.icu.impl.coll.CollationFCD; |
| import com.ibm.icu.impl.coll.CollationIterator; |
| import com.ibm.icu.impl.coll.CollationRoot; |
| import com.ibm.icu.impl.coll.CollationRootElements; |
| import com.ibm.icu.impl.coll.CollationRuleParser; |
| import com.ibm.icu.impl.coll.CollationWeights; |
| import com.ibm.icu.impl.coll.FCDIterCollationIterator; |
| import com.ibm.icu.impl.coll.FCDUTF16CollationIterator; |
| import com.ibm.icu.impl.coll.UTF16CollationIterator; |
| import com.ibm.icu.impl.coll.UVector32; |
| import com.ibm.icu.text.CollationElementIterator; |
| import com.ibm.icu.text.CollationKey; |
| import com.ibm.icu.text.Collator; |
| import com.ibm.icu.text.Collator.ReorderCodes; |
| import com.ibm.icu.text.Normalizer2; |
| import com.ibm.icu.text.RawCollationKey; |
| import com.ibm.icu.text.RuleBasedCollator; |
| import com.ibm.icu.text.UCharacterIterator; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSetIterator; |
| import com.ibm.icu.util.IllformedLocaleException; |
| import com.ibm.icu.util.Output; |
| import com.ibm.icu.util.ULocale; |
| |
| public class CollationTest extends TestFmwk { |
| public static void main(String[] args) throws Exception{ |
| new CollationTest().run(args); |
| } |
| |
/** Creates the test object; no fields are initialized here. */
public CollationTest() {
    super();
}
| |
| // Fields |
| Normalizer2 fcd, nfd; |
| Collator coll; |
| String fileLine; |
| int fileLineNumber; |
| String fileTestName; |
| Throwable error; |
| |
| // package private methods ---------------------------------------------- |
| |
| static void doTest(TestFmwk test, RuleBasedCollator col, String source, |
| String target, int result) |
| { |
| doTestVariant(test, col, source, target, result); |
| if (result == -1) { |
| doTestVariant(test, col, target, source, 1); |
| } |
| else if (result == 1) { |
| doTestVariant(test, col, target, source, -1); |
| } |
| else { |
| doTestVariant(test, col, target, source, 0); |
| } |
| |
| CollationElementIterator iter = col.getCollationElementIterator(source); |
| backAndForth(test, iter); |
| iter.setText(target); |
| backAndForth(test, iter); |
| } |
| |
| /** |
| * Return an integer array containing all of the collation orders |
| * returned by calls to next on the specified iterator |
| */ |
| static int[] getOrders(CollationElementIterator iter) |
| { |
| int maxSize = 100; |
| int size = 0; |
| int[] orders = new int[maxSize]; |
| |
| int order; |
| while ((order = iter.next()) != CollationElementIterator.NULLORDER) { |
| if (size == maxSize) { |
| maxSize *= 2; |
| int[] temp = new int[maxSize]; |
| System.arraycopy(orders, 0, temp, 0, size); |
| orders = temp; |
| } |
| orders[size++] = order; |
| } |
| |
| if (maxSize > size) { |
| int[] temp = new int[size]; |
| System.arraycopy(orders, 0, temp, 0, size); |
| orders = temp; |
| } |
| return orders; |
| } |
| |
| static void backAndForth(TestFmwk test, CollationElementIterator iter) |
| { |
| // Run through the iterator forwards and stick it into an array |
| iter.reset(); |
| int[] orders = getOrders(iter); |
| |
| // Now go through it backwards and make sure we get the same values |
| int index = orders.length; |
| int o; |
| |
| // reset the iterator |
| iter.reset(); |
| |
| while ((o = iter.previous()) != CollationElementIterator.NULLORDER) { |
| if (o != orders[--index]) { |
| if (o == 0) { |
| index ++; |
| } else { |
| while (index > 0 && orders[index] == 0) { |
| index --; |
| } |
| if (o != orders[index]) { |
| test.errln("Mismatch at index " + index + ": 0x" |
| + Utility.hex(orders[index]) + " vs 0x" + Utility.hex(o)); |
| break; |
| } |
| } |
| } |
| } |
| |
| while (index != 0 && orders[index - 1] == 0) { |
| index --; |
| } |
| |
| if (index != 0) { |
| String msg = "Didn't get back to beginning - index is "; |
| test.errln(msg + index); |
| |
| iter.reset(); |
| test.err("next: "); |
| while ((o = iter.next()) != CollationElementIterator.NULLORDER) { |
| String hexString = "0x" + Utility.hex(o) + " "; |
| test.err(hexString); |
| } |
| test.errln(""); |
| test.err("prev: "); |
| while ((o = iter.previous()) != CollationElementIterator.NULLORDER) { |
| String hexString = "0x" + Utility.hex(o) + " "; |
| test.err(hexString); |
| } |
| test.errln(""); |
| } |
| } |
| |
| static final String appendCompareResult(int result, String target){ |
| if (result == -1) { |
| target += "LESS"; |
| } else if (result == 0) { |
| target += "EQUAL"; |
| } else if (result == 1) { |
| target += "GREATER"; |
| } else { |
| String huh = "?"; |
| target += huh + result; |
| } |
| return target; |
| } |
| |
| static final String prettify(CollationKey key) { |
| byte[] bytes = key.toByteArray(); |
| return prettify(bytes, bytes.length); |
| } |
| |
| static final String prettify(RawCollationKey key) { |
| return prettify(key.bytes, key.size); |
| } |
| |
| static final String prettify(byte[] skBytes, int length) { |
| StringBuilder target = new StringBuilder(length * 3 + 2).append('['); |
| |
| for (int i = 0; i < length; i++) { |
| String numStr = Integer.toHexString(skBytes[i] & 0xff); |
| if (numStr.length() < 2) { |
| target.append('0'); |
| } |
| target.append(numStr).append(' '); |
| } |
| target.append(']'); |
| return target.toString(); |
| } |
| |
| private static void doTestVariant(TestFmwk test, |
| RuleBasedCollator myCollation, |
| String source, String target, int result) |
| { |
| boolean printInfo = false; |
| int compareResult = myCollation.compare(source, target); |
| if (compareResult != result) { |
| |
| // !!! if not mod build, error, else nothing. |
| // warnln if not build, error, else always print warning. |
| // do we need a 'quiet warning?' (err or log). Hmmm, |
| // would it work to have the 'verbose' flag let you |
| // suppress warnings? Are there ever some warnings you |
| // want to suppress, and others you don't? |
| if(!test.isModularBuild()){ |
| test.errln("Comparing \"" + Utility.hex(source) + "\" with \"" |
| + Utility.hex(target) + "\" expected " + result |
| + " but got " + compareResult); |
| }else{ |
| printInfo = true; |
| } |
| } |
| CollationKey ssk = myCollation.getCollationKey(source); |
| CollationKey tsk = myCollation.getCollationKey(target); |
| compareResult = ssk.compareTo(tsk); |
| if (compareResult != result) { |
| |
| if(!test.isModularBuild()){ |
| test.errln("Comparing CollationKeys of \"" + Utility.hex(source) |
| + "\" with \"" + Utility.hex(target) |
| + "\" expected " + result + " but got " |
| + compareResult); |
| }else{ |
| printInfo = true; |
| } |
| } |
| RawCollationKey srsk = new RawCollationKey(); |
| myCollation.getRawCollationKey(source, srsk); |
| RawCollationKey trsk = new RawCollationKey(); |
| myCollation.getRawCollationKey(target, trsk); |
| compareResult = ssk.compareTo(tsk); |
| if (compareResult != result) { |
| |
| if(!test.isModularBuild()){ |
| test.errln("Comparing RawCollationKeys of \"" |
| + Utility.hex(source) |
| + "\" with \"" + Utility.hex(target) |
| + "\" expected " + result + " but got " |
| + compareResult); |
| }else{ |
| printInfo = true; |
| } |
| } |
| // hmmm, but here we issue a warning |
| // only difference is, one warning or two, and detailed info or not? |
| // hmmm, does seem preferable to omit detail if we know it is due to missing resource data. |
| // well, if we label the errors as warnings, we can let people know the details, but |
| // also know they may be due to missing resource data. basically this code is asserting |
| // that the errors are due to missing resource data, which may or may not be true. |
| if (printInfo) { |
| test.warnln("Could not load locale data skipping."); |
| } |
| } |
| |
| public void TestMinMax() { |
| setRootCollator(); |
| RuleBasedCollator rbc = (RuleBasedCollator)coll; |
| |
| final String s = "\uFFFE\uFFFF"; |
| long[] ces; |
| |
| ces = rbc.internalGetCEs(s); |
| if (ces.length != 2) { |
| errln("expected 2 CEs for <FFFE, FFFF>, got " + ces.length); |
| return; |
| } |
| |
| long ce = ces[0]; |
| long expected = Collation.makeCE(Collation.MERGE_SEPARATOR_PRIMARY); |
| if (ce != expected) { |
| errln("CE(U+fffe)=0x" + Utility.hex(ce) + " != 02.."); |
| } |
| |
| ce = ces[1]; |
| expected = Collation.makeCE(Collation.MAX_PRIMARY); |
| if (ce != expected) { |
| errln("CE(U+ffff)=0x" + Utility.hex(ce) + " != max.."); |
| } |
| } |
| |
| public void TestImplicits() { |
| CollationData cd = CollationRoot.getData(); |
| |
| // Implicit primary weights should be assigned for the following sets, |
| // and sort in ascending order by set and then code point. |
| // See http://www.unicode.org/reports/tr10/#Implicit_Weights |
| // core Han Unified Ideographs |
| UnicodeSet coreHan = new UnicodeSet("[\\p{unified_ideograph}&" |
| + "[\\p{Block=CJK_Unified_Ideographs}" |
| + "\\p{Block=CJK_Compatibility_Ideographs}]]"); |
| // all other Unified Han ideographs |
| UnicodeSet otherHan = new UnicodeSet("[\\p{unified ideograph}-" |
| + "[\\p{Block=CJK_Unified_Ideographs}" |
| + "\\p{Block=CJK_Compatibility_Ideographs}]]"); |
| |
| UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]"); |
| unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings. |
| |
| // Starting with CLDR 26/ICU 54, the root Han order may instead be |
| // the Unihan radical-stroke order. |
| // The tests should pass either way, so we only test the order of a small set of Han characters |
| // whose radical-stroke order is the same as their code point order. |
| UnicodeSet someHanInCPOrder = new UnicodeSet( |
| "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" + |
| "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]"); |
| UnicodeSet inOrder = new UnicodeSet(someHanInCPOrder); |
| inOrder.addAll(unassigned).freeze(); |
| |
| UnicodeSet[] sets = { coreHan, otherHan, unassigned }; |
| int prev = 0; |
| long prevPrimary = 0; |
| UTF16CollationIterator ci = new UTF16CollationIterator(cd, false, "", 0); |
| for (int i = 0; i < sets.length; ++i) { |
| UnicodeSetIterator iter = new UnicodeSetIterator(sets[i]); |
| while (iter.next()) { |
| String s = iter.getString(); |
| int c = s.codePointAt(0); |
| ci.setText(false, s, 0); |
| long ce = ci.nextCE(); |
| long ce2 = ci.nextCE(); |
| if (ce == Collation.NO_CE || ce2 != Collation.NO_CE) { |
| errln("CollationIterator.nextCE(0x" + Utility.hex(c) |
| + ") did not yield exactly one CE"); |
| continue; |
| |
| } |
| if ((ce & 0xffffffffL) != Collation.COMMON_SEC_AND_TER_CE) { |
| errln("CollationIterator.nextCE(U+" + Utility.hex(c, 4) |
| + ") has non-common sec/ter weights: 0x" + Utility.hex(ce & 0xffffffffL, 8)); |
| continue; |
| } |
| long primary = ce >>> 32; |
| if (!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) { |
| errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary) |
| + ".. not greater than CE(U+" + Utility.hex(prev) |
| + ")=0x" + Utility.hex(prevPrimary) + ".."); |
| |
| } |
| prev = c; |
| prevPrimary = primary; |
| } |
| } |
| } |
| |
| // ICU4C: TestNulTerminated / renamed for ICU4J |
| public void TestSubSequence() { |
| CollationData data = CollationRoot.getData(); |
| final String s = "abab"; // { 0x61, 0x62, 0x61, 0x62 } |
| |
| UTF16CollationIterator ci1 = new UTF16CollationIterator(data, false, s, 0); |
| UTF16CollationIterator ci2 = new UTF16CollationIterator(data, false, s, 2); |
| |
| for (int i = 0; i < 2; ++i) { |
| long ce1 = ci1.nextCE(); |
| long ce2 = ci2.nextCE(); |
| |
| if (ce1 != ce2) { |
| errln("CollationIterator.nextCE(with start position at 0) != " |
| + "nextCE(with start position at 2) at CE " + i); |
| } |
| } |
| } |
| |
| |
| // ICU4C: TestIllegalUTF8 / not applicable to ICU4J |
| |
| |
| private static void addLeadSurrogatesForSupplementary(UnicodeSet src, UnicodeSet dest) { |
| for(int c = 0x10000; c < 0x110000;) { |
| int next = c + 0x400; |
| if(src.containsSome(c, next - 1)) { |
| dest.add(UTF16.getLeadSurrogate(c)); |
| } |
| c = next; |
| } |
| } |
| |
| public void TestShortFCDData() { |
| UnicodeSet expectedLccc = new UnicodeSet("[:^lccc=0:]"); |
| expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates |
| addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc); |
| |
| UnicodeSet lccc = new UnicodeSet(); // actual |
| for (int c = 0; c <= 0xffff; ++c) { |
| if (CollationFCD.hasLccc(c)) { |
| lccc.add(c); |
| } |
| } |
| |
| UnicodeSet diff = new UnicodeSet(expectedLccc); |
| diff.removeAll(lccc); |
| diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP |
| |
| String empty = "[]"; |
| String diffString; |
| |
| diffString = diff.toPattern(true); |
| assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString); |
| |
| diff = lccc; |
| diff.removeAll(expectedLccc); |
| diffString = diff.toPattern(true); |
| assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString); |
| |
| UnicodeSet expectedTccc = new UnicodeSet("[:^tccc=0:]"); |
| addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc); |
| addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc); |
| |
| UnicodeSet tccc = new UnicodeSet(); // actual |
| for(int c = 0; c <= 0xffff; ++c) { |
| if (CollationFCD.hasTccc(c)) { |
| tccc.add(c); |
| } |
| } |
| |
| diff = new UnicodeSet(expectedTccc); |
| diff.removeAll(tccc); |
| diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP |
| assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString); |
| |
| diff = tccc; |
| diff.removeAll(expectedTccc); |
| diffString = diff.toPattern(true); |
| assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString); |
| } |
| |
| private static class CodePointIterator { |
| int[] cp; |
| int length; |
| int pos; |
| |
| CodePointIterator(int[] cp) { |
| this.cp = cp; |
| this.length = cp.length; |
| this.pos = 0; |
| } |
| |
| void resetToStart() { |
| pos = 0; |
| } |
| |
| int next() { |
| return (pos < length) ? cp[pos++] : Collation.SENTINEL_CP; |
| } |
| |
| int previous() { |
| return (pos > 0) ? cp[--pos] : Collation.SENTINEL_CP; |
| } |
| |
| int getLength() { |
| return length; |
| } |
| |
| int getIndex() { |
| return pos; |
| } |
| } |
| |
| private void checkFCD(String name, CollationIterator ci, CodePointIterator cpi) { |
| // Iterate forward to the limit. |
| for (;;) { |
| int c1 = ci.nextCodePoint(); |
| int c2 = cpi.next(); |
| if (c1 != c2) { |
| errln(name + ".nextCodePoint(to limit, 1st pass) = U+" + Utility.hex(c1) |
| + " != U+" + Utility.hex(c1) + " at " + cpi.getIndex()); |
| return; |
| } |
| if (c1 < 0) { |
| break; |
| } |
| } |
| |
| // Iterate backward most of the way. |
| for (int n = (cpi.getLength() * 2) / 3; n > 0; --n) { |
| int c1 = ci.previousCodePoint(); |
| int c2 = cpi.previous(); |
| if (c1 != c2) { |
| errln(name + ".previousCodePoint() = U+" + Utility.hex(c1) + |
| " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); |
| return; |
| } |
| } |
| |
| // Forward again. |
| for (;;) { |
| int c1 = ci.nextCodePoint(); |
| int c2 = cpi.next(); |
| if (c1 != c2) { |
| errln(name + ".nextCodePoint(to limit again) = U+" + Utility.hex(c1) |
| + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); |
| return; |
| } |
| if (c1 < 0) { |
| break; |
| } |
| } |
| |
| // Iterate backward to the start. |
| for (;;) { |
| int c1 = ci.previousCodePoint(); |
| int c2 = cpi.previous(); |
| if (c1 != c2) { |
| errln(name + ".nextCodePoint(to start) = U+" + Utility.hex(c1) |
| + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); |
| return; |
| } |
| if (c1 < 0) { |
| break; |
| } |
| } |
| } |
| |
| public void TestFCD() { |
| CollationData data = CollationRoot.getData(); |
| |
| // Input string, not FCD. |
| StringBuilder buf = new StringBuilder(); |
| buf.append("\u0308\u00e1\u0062\u0301\u0327\u0430\u0062") |
| .appendCodePoint(0x1D15F) // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216 |
| .append("\u0327\u0308") // ccc=202, 230 |
| .appendCodePoint(0x1D16D) // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226 |
| .appendCodePoint(0x1D15F) |
| .appendCodePoint(0x1D16D) |
| .append("\uac01") |
| .append("\u00e7") // Character with tccc!=0 decomposed together with mis-ordered sequence. |
| .appendCodePoint(0x1D16D).appendCodePoint(0x1D165) |
| .append("\u00e1") // Character with tccc!=0 decomposed together with decomposed sequence. |
| .append("\u0f73\u0f75") // Tibetan composite vowels must be decomposed. |
| .append("\u4e00\u0f81"); |
| String s = buf.toString(); |
| |
| // Expected code points. |
| int[] cp = { |
| 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62, |
| 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308, |
| 0x1D15F, 0x1D16D, |
| 0xac01, |
| 0x63, 0x327, 0x1D165, 0x1D16D, |
| 0x61, |
| 0xf71, 0xf71, 0xf72, 0xf74, 0x301, |
| 0x4e00, 0xf71, 0xf80 |
| }; |
| |
| FCDUTF16CollationIterator u16ci = new FCDUTF16CollationIterator(data, false, s, 0); |
| CodePointIterator cpi = new CodePointIterator(cp); |
| checkFCD("FCDUTF16CollationIterator", u16ci, cpi); |
| |
| cpi.resetToStart(); |
| UCharacterIterator iter = UCharacterIterator.getInstance(s); |
| FCDIterCollationIterator uici = new FCDIterCollationIterator(data, false, iter, 0); |
| checkFCD("FCDIterCollationIterator", uici, cpi); |
| } |
| |
| private void checkAllocWeights(CollationWeights cw, long lowerLimit, long upperLimit, |
| int n, int someLength, int minCount) { |
| |
| if (!cw.allocWeights(lowerLimit, upperLimit, n)) { |
| errln("CollationWeights::allocWeights(0x" |
| + Utility.hex(lowerLimit) + ",0x" |
| + Utility.hex(upperLimit) + "," |
| + n + ") = false"); |
| return; |
| } |
| long previous = lowerLimit; |
| int count = 0; // number of weights that have someLength |
| for (int i = 0; i < n; ++i) { |
| long w = cw.nextWeight(); |
| if (w == 0xffffffffL) { |
| errln("CollationWeights::allocWeights(0x" |
| + Utility.hex(lowerLimit) + ",0x" |
| + Utility.hex(upperLimit) + ",0x" |
| + n + ").nextWeight() returns only " |
| + i + " weights"); |
| return; |
| } |
| if (!(previous < w && w < upperLimit)) { |
| errln("CollationWeights::allocWeights(0x" |
| + Utility.hex(lowerLimit) + ",0x" |
| + Utility.hex(upperLimit) + "," |
| + n + ").nextWeight() number " |
| + (i + 1) + " -> 0x" + Utility.hex(w) |
| + " not between " |
| + Utility.hex(previous) + " and " |
| + Utility.hex(upperLimit)); |
| return; |
| } |
| if (CollationWeights.lengthOfWeight(w) == someLength) { |
| ++count; |
| } |
| } |
| if (count < minCount) { |
| errln("CollationWeights::allocWeights(0x" |
| + Utility.hex(lowerLimit) + ",0x" |
| + Utility.hex(upperLimit) + "," |
| + n + ").nextWeight() returns only " |
| + count + " < " + minCount + " weights of length " |
| + someLength); |
| |
| } |
| } |
| |
| public void TestCollationWeights() { |
| CollationWeights cw = new CollationWeights(); |
| |
| // Non-compressible primaries use 254 second bytes 02..FF. |
| logln("CollationWeights.initForPrimary(non-compressible)"); |
| cw.initForPrimary(false); |
| // Expect 1 weight 11 and 254 weights 12xx. |
| checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 1, 1); |
| checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 2, 254); |
| // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202. |
| checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 260, 2, 255); |
| // Expect 254 two-byte weights from the ranges 10ff and 11xx. |
| checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 600, 2, 254); |
| // Expect 254^2=64516 three-byte weights. |
| // During computation, there should be 3 three-byte ranges |
| // 10ffff, 11xxxx, 120202. |
| // The middle one should be split 64515:1, |
| // and the newly-split-off range and the last ranged lengthened. |
| checkAllocWeights(cw, 0x10fffe00L, 0x12020300L, 1 + 64516 + 254 + 1, 3, 64516); |
| // Expect weights 1102 & 1103. |
| checkAllocWeights(cw, 0x10ff0000L, 0x11040000L, 2, 2, 2); |
| // Expect weights 102102 & 102103. |
| checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2); |
| |
| // Compressible primaries use 251 second bytes 04..FE. |
| logln("CollationWeights.initForPrimary(compressible)"); |
| cw.initForPrimary(true); |
| // Expect 1 weight 11 and 251 weights 12xx. |
| checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 1, 1); |
| checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 2, 251); |
| // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204. |
| checkAllocWeights(cw, 0x10fdfe40L, 0x12050300L, 260, 2, 252); |
| // Expect weights 1104 & 1105. |
| checkAllocWeights(cw, 0x10fe0000L, 0x11060000L, 2, 2, 2); |
| // Expect weights 102102 & 102103. |
| checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2); |
| |
| // Secondary and tertiary weights use only bytes 3 & 4. |
| logln("CollationWeights.initForSecondary()"); |
| cw.initForSecondary(); |
| // Expect weights fbxx and all four fc..ff. |
| checkAllocWeights(cw, 0xfb20L, 0x10000L, 20, 3, 4); |
| |
| logln("CollationWeights.initForTertiary()"); |
| cw.initForTertiary(); |
| // Expect weights 3dxx and both 3e & 3f. |
| checkAllocWeights(cw, 0x3d02L, 0x4000L, 10, 3, 2); |
| } |
| |
| private static boolean isValidCE(CollationRootElements re, CollationData data, long p, long s, long ctq) { |
| long p1 = p >>> 24; |
| long p2 = (p >>> 16) & 0xff; |
| long p3 = (p >>> 8) & 0xff; |
| long p4 = p & 0xff; |
| long s1 = s >>> 8; |
| long s2 = s & 0xff; |
| // ctq = Case, Tertiary, Quaternary |
| long c = (ctq & Collation.CASE_MASK) >>> 14; |
| long t = ctq & Collation.ONLY_TERTIARY_MASK; |
| long t1 = t >>> 8; |
| long t2 = t & 0xff; |
| long q = ctq & Collation.QUATERNARY_MASK; |
| // No leading zero bytes. |
| if ((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) { |
| return false; |
| } |
| // No intermediate zero bytes. |
| if (p1 != 0 && p2 == 0 && (p & 0xffff) != 0) { |
| return false; |
| } |
| if (p2 != 0 && p3 == 0 && p4 != 0) { |
| return false; |
| } |
| // Minimum & maximum lead bytes. |
| if ((p1 != 0 && p1 <= Collation.MERGE_SEPARATOR_BYTE) |
| || s1 == Collation.LEVEL_SEPARATOR_BYTE |
| || t1 == Collation.LEVEL_SEPARATOR_BYTE || t1 > 0x3f) { |
| return false; |
| } |
| if (c > 2) { |
| return false; |
| } |
| // The valid byte range for the second primary byte depends on compressibility. |
| if (p2 != 0) { |
| if (data.isCompressibleLeadByte((int)p1)) { |
| if (p2 <= Collation.PRIMARY_COMPRESSION_LOW_BYTE |
| || Collation.PRIMARY_COMPRESSION_HIGH_BYTE <= p2) { |
| return false; |
| } |
| } else { |
| if (p2 <= Collation.LEVEL_SEPARATOR_BYTE) { |
| return false; |
| } |
| } |
| } |
| // Other bytes just need to avoid the level separator. |
| // Trailing zeros are ok. |
| // assert (Collation.LEVEL_SEPARATOR_BYTE == 1); |
| if (p3 == Collation.LEVEL_SEPARATOR_BYTE || p4 == Collation.LEVEL_SEPARATOR_BYTE |
| || s2 == Collation.LEVEL_SEPARATOR_BYTE || t2 == Collation.LEVEL_SEPARATOR_BYTE) { |
| return false; |
| } |
| // Well-formed CEs. |
| if (p == 0) { |
| if (s == 0) { |
| if (t == 0) { |
| // Completely ignorable CE. |
| // Quaternary CEs are not supported. |
| if (c != 0 || q != 0) { |
| return false; |
| } |
| } else { |
| // Tertiary CE. |
| if (t < re.getTertiaryBoundary() || c != 2) { |
| return false; |
| } |
| } |
| } else { |
| // Secondary CE. |
| if (s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) { |
| return false; |
| } |
| } |
| } else { |
| // Primary CE. |
| if (s == 0 || (Collation.COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) |
| || s >= re.getSecondaryBoundary()) { |
| return false; |
| } |
| if (t == 0 || t >= re.getTertiaryBoundary()) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private static boolean isValidCE(CollationRootElements re, CollationData data, long ce) { |
| long p = ce >>> 32; |
| long secTer = ce & 0xffffffffL; |
| return isValidCE(re, data, p, secTer >>> 16, secTer & 0xffff); |
| } |
| |
| private static class RootElementsIterator { |
| CollationData data; |
| long[] elements; |
| int length; |
| |
| long pri; |
| long secTer; |
| int index; |
| |
| RootElementsIterator(CollationData root) { |
| data = root; |
| elements = root.rootElements; |
| length = elements.length; |
| pri = 0; |
| secTer = 0; |
| index = (int)elements[CollationRootElements.IX_FIRST_TERTIARY_INDEX]; |
| } |
| |
| boolean next() { |
| if (index >= length) { |
| return false; |
| } |
| long p = elements[index]; |
| if (p == CollationRootElements.PRIMARY_SENTINEL) { |
| return false; |
| } |
| if ((p & CollationRootElements.SEC_TER_DELTA_FLAG) != 0) { |
| ++index; |
| secTer = p & ~CollationRootElements.SEC_TER_DELTA_FLAG; |
| return true; |
| } |
| if ((p & CollationRootElements.PRIMARY_STEP_MASK) != 0) { |
| // End of a range, enumerate the primaries in the range. |
| int step = (int)p & CollationRootElements.PRIMARY_STEP_MASK; |
| p &= 0xffffff00; |
| if (pri == p) { |
| // Finished the range, return the next CE after it. |
| ++index; |
| return next(); |
| } |
| assert (pri < p); |
| // Return the next primary in this range. |
| boolean isCompressible = data.isCompressiblePrimary(pri); |
| if ((pri & 0xffff) == 0) { |
| pri = Collation.incTwoBytePrimaryByOffset(pri, isCompressible, step); |
| } else { |
| pri = Collation.incThreeBytePrimaryByOffset(pri, isCompressible, step); |
| } |
| return true; |
| } |
| // Simple primary CE. |
| ++index; |
| pri = p; |
| secTer = Collation.COMMON_SEC_AND_TER_CE; |
| return true; |
| } |
| |
| long getPrimary() { |
| return pri; |
| } |
| |
| long getSecTer() { |
| return secTer; |
| } |
| } |
| |
| public void TestRootElements() { |
| CollationData root = CollationRoot.getData(); |
| |
| CollationRootElements rootElements = new CollationRootElements(root.rootElements); |
| RootElementsIterator iter = new RootElementsIterator(root); |
| |
| // We check each root CE for validity, |
| // and we also verify that there is a tailoring gap between each two CEs. |
| CollationWeights cw1c = new CollationWeights(); // compressible primary weights |
| CollationWeights cw1u = new CollationWeights(); // uncompressible primary weights |
| CollationWeights cw2 = new CollationWeights(); |
| CollationWeights cw3 = new CollationWeights(); |
| |
| cw1c.initForPrimary(true); |
| cw1u.initForPrimary(false); |
| cw2.initForSecondary(); |
| cw3.initForTertiary(); |
| |
| // Note: The root elements do not include Han-implicit or unassigned-implicit CEs, |
| // nor the special merge-separator CE for U+FFFE. |
| long prevPri = 0; |
| long prevSec = 0; |
| long prevTer = 0; |
| |
| while (iter.next()) { |
| long pri = iter.getPrimary(); |
| long secTer = iter.getSecTer(); |
| // CollationRootElements CEs must have 0 case and quaternary bits. |
| if ((secTer & Collation.CASE_AND_QUATERNARY_MASK) != 0) { |
| errln("CollationRootElements CE has non-zero case and/or quaternary bits: " |
| + "0x" + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); |
| } |
| long sec = secTer >>> 16; |
| long ter = secTer & Collation.ONLY_TERTIARY_MASK; |
| long ctq = ter; |
| if (pri == 0 && sec == 0 && ter != 0) { |
| // Tertiary CEs must have uppercase bits, |
| // but they are not stored in the CollationRootElements. |
| ctq |= 0x8000; |
| } |
| if (!isValidCE(rootElements, root, pri, sec, ctq)) { |
| errln("invalid root CE 0x" |
| + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); |
| } else { |
| if (pri != prevPri) { |
| long newWeight = 0; |
| if (prevPri == 0 || prevPri >= Collation.FFFD_PRIMARY) { |
| // There is currently no tailoring gap after primary ignorables, |
| // and we forbid tailoring after U+FFFD and U+FFFF. |
| } else if (root.isCompressiblePrimary(prevPri)) { |
| if (!cw1c.allocWeights(prevPri, pri, 1)) { |
| errln("no primary/compressible tailoring gap between " |
| + "0x" + Utility.hex(prevPri, 8) |
| + " and 0x" + Utility.hex(pri, 8)); |
| } else { |
| newWeight = cw1c.nextWeight(); |
| } |
| } else { |
| if (!cw1u.allocWeights(prevPri, pri, 1)) { |
| errln("no primary/uncompressible tailoring gap between " |
| + "0x" + Utility.hex(prevPri, 8) |
| + " and 0x" + Utility.hex(pri, 8)); |
| } else { |
| newWeight = cw1u.nextWeight(); |
| } |
| } |
| if (newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) { |
| errln("mis-allocated primary weight, should get " |
| + "0x" + Utility.hex(prevPri, 8) |
| + " < 0x" + Utility.hex(newWeight, 8) |
| + " < 0x" + Utility.hex(pri, 8)); |
| } |
| } else if (sec != prevSec) { |
| long lowerLimit = prevSec == 0 ? |
| rootElements.getSecondaryBoundary() - 0x100 : prevSec; |
| if (!cw2.allocWeights(lowerLimit, sec, 1)) { |
| errln("no secondary tailoring gap between " |
| + "0x" + Utility.hex(lowerLimit) |
| + " and 0x" + Utility.hex(sec)); |
| } else { |
| long newWeight = cw2.nextWeight(); |
| if (!(prevSec < newWeight && newWeight < sec)) { |
| errln("mis-allocated secondary weight, should get " |
| + "0x" + Utility.hex(lowerLimit) |
| + " < 0x" + Utility.hex(newWeight) |
| + " < 0x" + Utility.hex(sec)); |
| } |
| } |
| } else if (ter != prevTer) { |
| long lowerLimit = prevTer == 0 ? |
| rootElements.getTertiaryBoundary() - 0x100 : prevTer; |
| if (!cw3.allocWeights(lowerLimit, ter, 1)) { |
| errln("no tertiary tailoring gap between " |
| + "0x" + Utility.hex(lowerLimit) |
| + " and 0x" + Utility.hex(ter)); |
| } else { |
| long newWeight = cw3.nextWeight(); |
| if (!(prevTer < newWeight && newWeight < ter)) { |
| errln("mis-allocated tertiary weight, should get " |
| + "0x" + Utility.hex(lowerLimit) |
| + " < 0x" + Utility.hex(newWeight) |
| + " < 0x" + Utility.hex(ter)); |
| } |
| } |
| } else { |
| errln("duplicate root CE 0x" |
| + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); |
| } |
| } |
| prevPri = pri; |
| prevSec = sec; |
| prevTer = ter; |
| } |
| } |
| |
| public void TestTailoredElements() { |
| CollationData root = CollationRoot.getData(); |
| CollationRootElements rootElements = new CollationRootElements(root.rootElements); |
| |
| Set<String> prevLocales = new HashSet<String>(); |
| prevLocales.add(""); |
| prevLocales.add("root"); |
| prevLocales.add("root@collation=standard"); |
| |
| long[] ces; |
| ULocale[] locales = Collator.getAvailableULocales(); |
| String localeID = "root"; |
| int locIdx = 0; |
| |
| for (; locIdx < locales.length; localeID = locales[locIdx++].getName()) { |
| ULocale locale = new ULocale(localeID); |
| String[] types = Collator.getKeywordValuesForLocale("collation", locale, false); |
| for (int typeIdx = 0; typeIdx < types.length; ++typeIdx) { |
| String type = types[typeIdx]; // first: default type |
| if (type.startsWith("private-")) { |
| errln("Collator.getKeywordValuesForLocale(" + localeID + |
| ") returns private collation keyword: " + type); |
| } |
| ULocale localeWithType = locale.setKeywordValue("collation", type); |
| Collator coll = Collator.getInstance(localeWithType); |
| ULocale actual = coll.getLocale(ULocale.ACTUAL_LOCALE); |
| if (prevLocales.contains(actual.getName())) { |
| continue; |
| } |
| prevLocales.add(actual.getName()); |
| logln("TestTailoredElements(): requested " + localeWithType.getName() |
| + " -> actual " + actual.getName()); |
| if (!(coll instanceof RuleBasedCollator)) { |
| continue; |
| } |
| RuleBasedCollator rbc = (RuleBasedCollator) coll; |
| |
| // Note: It would be better to get tailored strings such that we can |
| // identify the prefix, and only get the CEs for the prefix+string, |
| // not also for the prefix. |
| // There is currently no API for that. |
| // It would help in an unusual case where a contraction starting in the prefix |
| // extends past its end, and we do not see the intended mapping. |
| // For example, for a mapping p|st, if there is also a contraction ps, |
| // then we get CEs(ps)+CEs(t), rather than CEs(p|st). |
| UnicodeSet tailored = coll.getTailoredSet(); |
| UnicodeSetIterator iter = new UnicodeSetIterator(tailored); |
| while (iter.next()) { |
| String s = iter.getString(); |
| ces = rbc.internalGetCEs(s); |
| for (int i = 0; i < ces.length; ++i) { |
| long ce = ces[i]; |
| if (!isValidCE(rootElements, root, ce)) { |
| logln(prettify(s)); |
| errln("invalid tailored CE 0x" + Utility.hex(ce, 16) |
| + " at CE index " + i + " from string:"); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| private static boolean isSpace(char c) { |
| return (c == 0x09 || c == 0x20 || c == 0x3000); |
| } |
| |
| private static boolean isSectionStarter(char c) { |
| return (c == '%' || c == '*' || c == '@'); |
| } |
| |
| private int skipSpaces(int i) { |
| while (isSpace(fileLine.charAt(i))) { |
| ++i; |
| } |
| return i; |
| } |
| |
| private String printSortKey(byte[] p) { |
| StringBuilder s = new StringBuilder(); |
| for (int i = 0; i < p.length; ++i) { |
| if (i > 0) { |
| s.append(' '); |
| } |
| byte b = p[i]; |
| if (b == 0) { |
| s.append('.'); |
| } else if (b == 1) { |
| s.append('|'); |
| } else { |
| s.append(String.format("%02x", b & 0xff)); |
| } |
| } |
| return s.toString(); |
| } |
| |
| private String printCollationKey(CollationKey key) { |
| byte[] p = key.toByteArray(); |
| return printSortKey(p); |
| } |
| |
| private boolean readLine(BufferedReader in) throws IOException { |
| String line = in.readLine(); |
| if (line == null) { |
| fileLine = null; |
| return false; |
| } |
| ++fileLineNumber; |
| // Strip trailing comments and spaces |
| int idx = line.indexOf('#'); |
| if (idx < 0) { |
| idx = line.length(); |
| } |
| for (; idx > 0; idx--) { |
| if (!isSpace(line.charAt(idx -1))) { |
| break; |
| } |
| } |
| |
| fileLine = idx < line.length() ? line.substring(0, idx) : line; |
| return true; |
| } |
| |
| private int parseString(int start, Output<String> prefix, Output<String> s) { |
| int length = fileLine.length(); |
| int i; |
| for (i = start; i < length && !isSpace(fileLine.charAt(i)); ++i) { |
| } |
| int pipeIndex = fileLine.indexOf('|', start); |
| if (pipeIndex >= 0 && pipeIndex < i) { |
| String tmpPrefix = Utility.unescape(fileLine.substring(start, pipeIndex)); |
| if (tmpPrefix.length() == 0) { |
| prefix.value = null; |
| logln(fileLine); |
| error = new ParseException("empty prefix on line " + fileLineNumber, fileLineNumber); |
| errln("empty prefix on line " + fileLineNumber); |
| return start; |
| } |
| prefix.value = tmpPrefix; |
| start = pipeIndex + 1; |
| } else { |
| prefix.value = null; |
| } |
| |
| String tmp = Utility.unescape(fileLine.substring(start, i)); |
| if (tmp.length() == 0) { |
| s.value = null; |
| logln(fileLine); |
| error = new ParseException("empty string on line " + fileLineNumber, fileLineNumber); |
| errln("empty string on line " + fileLineNumber); |
| return start; |
| } |
| s.value = tmp; |
| return i; |
| } |
| |
| private int parseRelationAndString(Output<String> s) { |
| int relation = Collation.NO_LEVEL; |
| int start; |
| if (fileLine.charAt(0) == '<') { |
| char second = fileLine.charAt(1); |
| start = 2; |
| switch(second) { |
| case 0x31: // <1 |
| relation = Collation.PRIMARY_LEVEL; |
| break; |
| case 0x32: // <2 |
| relation = Collation.SECONDARY_LEVEL; |
| break; |
| case 0x33: // <3 |
| relation = Collation.TERTIARY_LEVEL; |
| break; |
| case 0x34: // <4 |
| relation = Collation.QUATERNARY_LEVEL; |
| break; |
| case 0x63: // <c |
| relation = Collation.CASE_LEVEL; |
| break; |
| case 0x69: // <i |
| relation = Collation.IDENTICAL_LEVEL; |
| break; |
| default: // just < |
| relation = Collation.NO_LEVEL; |
| start = 1; |
| break; |
| } |
| } else if (fileLine.charAt(0) == '=') { |
| relation = Collation.ZERO_LEVEL; |
| start = 1; |
| } else { |
| start = 0; |
| } |
| |
| if (start == 0 || !isSpace(fileLine.charAt(start))) { |
| logln(fileLine); |
| error = new ParseException("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line " |
| + fileLineNumber, fileLineNumber); |
| errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line " + fileLineNumber); |
| return Collation.NO_LEVEL; |
| } |
| |
| start = skipSpaces(start); |
| Output<String> prefixOut = new Output<String>(); |
| start = parseString(start, prefixOut, s); |
| if (error == null && prefixOut.value != null) { |
| logln(fileLine); |
| error = new ParseException("prefix string not allowed for test string: on line " |
| + fileLineNumber, fileLineNumber); |
| errln("prefix string not allowed for test string: on line " + fileLineNumber); |
| return Collation.NO_LEVEL; |
| } |
| if (start < fileLine.length()) { |
| logln(fileLine); |
| error = new ParseException("unexpected line contents after test string on line " |
| + fileLineNumber, fileLineNumber); |
| errln("unexpected line contents after test string on line " + fileLineNumber); |
| return Collation.NO_LEVEL; |
| } |
| |
| return relation; |
| } |
| |
| private void parseAndSetAttribute() { |
| int start = skipSpaces(1); |
| int equalPos = fileLine.indexOf('='); |
| if (equalPos < 0) { |
| if (fileLine.regionMatches(start, "reorder", 0, 7)) { |
| parseAndSetReorderCodes(start + 7); |
| return; |
| } |
| logln(fileLine); |
| error = new ParseException("missing '=' on line " + fileLineNumber, fileLineNumber); |
| errln("missing '=' on line " + fileLineNumber); |
| return; |
| } |
| |
| String attrString = fileLine.substring(start, equalPos); |
| String valueString = fileLine.substring(equalPos + 1); |
| if (attrString.equals("maxVariable")) { |
| int max; |
| if (valueString.equals("space")) { |
| max = ReorderCodes.SPACE; |
| } else if(valueString.equals("punct")) { |
| max = ReorderCodes.PUNCTUATION; |
| } else if(valueString.equals("symbol")) { |
| max = ReorderCodes.SYMBOL; |
| } else if(valueString.equals("currency")) { |
| max = ReorderCodes.CURRENCY; |
| } else { |
| logln(fileLine); |
| error = new ParseException("invalid attribute value name on line " |
| + fileLineNumber, fileLineNumber); |
| errln("invalid attribute value name on line " + fileLineNumber); |
| return; |
| } |
| coll.setMaxVariable(max); |
| fileLine = null; |
| return; |
| } |
| |
| boolean parsed = true; |
| RuleBasedCollator rbc = (RuleBasedCollator)coll; |
| if (attrString.equals("backwards")) { |
| if (valueString.equals("on")) { |
| rbc.setFrenchCollation(true); |
| } else if (valueString.equals("off")) { |
| rbc.setFrenchCollation(false); |
| } else if (valueString.equals("default")) { |
| rbc.setFrenchCollationDefault(); |
| } else { |
| parsed = false; |
| } |
| } else if (attrString.equals("alternate")) { |
| if (valueString.equals("non-ignorable")) { |
| rbc.setAlternateHandlingShifted(false); |
| } else if (valueString.equals("shifted")) { |
| rbc.setAlternateHandlingShifted(true); |
| } else if (valueString.equals("default")) { |
| rbc.setAlternateHandlingDefault(); |
| } else { |
| parsed = false; |
| } |
| } else if (attrString.equals("caseFirst")) { |
| if (valueString.equals("upper")) { |
| rbc.setUpperCaseFirst(true); |
| } else if (valueString.equals("lower")) { |
| rbc.setLowerCaseFirst(true); |
| } else if (valueString.equals("default")) { |
| rbc.setCaseFirstDefault(); |
| } else { |
| parsed = false; |
| } |
| } else if (attrString.equals("caseLevel")) { |
| if (valueString.equals("on")) { |
| rbc.setCaseLevel(true); |
| } else if (valueString.equals("off")) { |
| rbc.setCaseLevel(false); |
| } else if (valueString.equals("default")) { |
| rbc.setCaseLevelDefault(); |
| } else { |
| parsed = false; |
| } |
| } else if (attrString.equals("strength")) { |
| if (valueString.equals("primary")) { |
| rbc.setStrength(Collator.PRIMARY); |
| } else if (valueString.equals("secondary")) { |
| rbc.setStrength(Collator.SECONDARY); |
| } else if (valueString.equals("tertiary")) { |
| rbc.setStrength(Collator.TERTIARY); |
| } else if (valueString.equals("quaternary")) { |
| rbc.setStrength(Collator.QUATERNARY); |
| } else if (valueString.equals("identical")) { |
| rbc.setStrength(Collator.IDENTICAL); |
| } else if (valueString.equals("default")) { |
| rbc.setStrengthDefault(); |
| } else { |
| parsed = false; |
| } |
| } else if (attrString.equals("numeric")) { |
| if (valueString.equals("on")) { |
| rbc.setNumericCollation(true); |
| } else if (valueString.equals("off")) { |
| rbc.setNumericCollation(false); |
| } else if (valueString.equals("default")) { |
| rbc.setNumericCollationDefault(); |
| } else { |
| parsed = false; |
| } |
| } else { |
| logln(fileLine); |
| error = new ParseException("invalid attribute value name on line " |
| + fileLineNumber, fileLineNumber); |
| errln("invalid attribute value name on line " + fileLineNumber); |
| return; |
| } |
| if (!parsed) { |
| logln(fileLine); |
| error = new ParseException("invalid attribute=value combination on line " |
| + fileLineNumber, fileLineNumber); |
| errln("invalid attribute=value combination on line " + fileLineNumber); |
| return; |
| } |
| |
| fileLine = null; |
| } |
| |
| private void parseAndSetReorderCodes(int start) { |
| UVector32 reorderCodes = new UVector32(); |
| while (start < fileLine.length()) { |
| start = skipSpaces(start); |
| int limit = start; |
| while (limit < fileLine.length() && !isSpace(fileLine.charAt(limit))) { |
| ++limit; |
| } |
| String name = fileLine.substring(start, limit); |
| int code = CollationRuleParser.getReorderCode(name); |
| if (code < -1) { |
| if (name.equalsIgnoreCase("default")) { |
| code = ReorderCodes.DEFAULT; // -1 |
| } else { |
| logln(fileLine); |
| error = new ParseException("invalid reorder code '" + name + "' on line " |
| + fileLineNumber, fileLineNumber); |
| return; |
| } |
| } |
| reorderCodes.addElement(code); |
| start = limit; |
| } |
| int[] reorderCodesArray = new int[reorderCodes.size()]; |
| System.arraycopy(reorderCodes.getBuffer(), 0, |
| reorderCodesArray, 0, reorderCodes.size()); |
| coll.setReorderCodes(reorderCodesArray); |
| |
| fileLine = null; |
| } |
| |
| private void buildTailoring(BufferedReader in) throws IOException { |
| StringBuilder rules = new StringBuilder(); |
| while (readLine(in)) { |
| if (fileLine.length() == 0) { |
| continue; |
| } |
| if (isSectionStarter(fileLine.charAt(0))) { |
| break; |
| } |
| rules.append(Utility.unescape(fileLine)); |
| } |
| |
| try { |
| coll = new RuleBasedCollator(rules.toString()); |
| } catch (Exception e) { |
| logln(rules.toString()); |
| errln("RuleBasedCollator(rules) failed - " + e.getMessage()); |
| error = e; |
| } |
| } |
| |
| private void setRootCollator() { |
| coll = Collator.getInstance(ULocale.ROOT); |
| } |
| |
| private void setLocaleCollator() { |
| ULocale locale = null; |
| if (fileLine.length() > 9) { |
| String localeID = fileLine.substring(9); // "@ locale <langTag>" |
| try { |
| locale = new ULocale(localeID); // either locale ID or language tag |
| } catch (IllformedLocaleException e) { |
| locale = null; |
| } |
| } |
| if (locale == null) { |
| logln(fileLine); |
| errln("invalid language tag on line " + fileLineNumber); |
| error = new ParseException("invalid langauge tag on line " + fileLineNumber, fileLineNumber); |
| return; |
| } |
| |
| logln("creating a collator for locale ID " + locale.getName()); |
| coll = Collator.getInstance(locale); |
| } |
| |
| private boolean needsNormalization(String s) { |
| if (!fcd.isNormalized(s)) { |
| return true; |
| } |
| // In some sequences with Tibetan composite vowel signs, |
| // even if the string passes the FCD check, |
| // those composites must be decomposed. |
| // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81. |
| int index = 0; |
| while((index = s.indexOf(0xf71, index)) >= 0) { |
| if (++index < s.length()) { |
| char c = s.charAt(index); |
| if (c == 0xf73 || c == 0xf75 || c == 0xf81) { |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| private boolean getCollationKey(String norm, String line, String s, Output<CollationKey> keyOut) { |
| CollationKey key = coll.getCollationKey(s); |
| keyOut.value = key; |
| |
| byte[] keyBytes = key.toByteArray(); |
| if (keyBytes.length == 0 || keyBytes[keyBytes.length - 1] != 0) { |
| logln(fileTestName); |
| logln(line); |
| logln(printCollationKey(key)); |
| errln("Collator(" + norm + ").getCollationKey() wrote an empty or unterminated key"); |
| return false; |
| } |
| |
| int numLevels = coll.getStrength(); |
| if (numLevels < Collator.IDENTICAL) { |
| ++numLevels; |
| } else { |
| numLevels = 5; |
| } |
| if (((RuleBasedCollator)coll).isCaseLevel()) { |
| ++numLevels; |
| } |
| int numLevelSeparators = 0; |
| for (int i = 0; i < (keyBytes.length - 1); ++i) { |
| byte b = keyBytes[i]; |
| if (b == 0) { |
| logln(fileTestName); |
| logln(line); |
| logln(printCollationKey(key)); |
| errln("Collator(" + norm + ").getCollationKey() contains a 00 byte"); |
| return false; |
| } |
| if (b == 1) { |
| ++numLevelSeparators; |
| } |
| } |
| if (numLevelSeparators != (numLevels - 1)) { |
| logln(fileTestName); |
| logln(line); |
| logln(printCollationKey(key)); |
| errln("Collator(" + norm + ").getCollationKey() has " |
| + numLevelSeparators + " level separators for " |
| + numLevels + " levels"); |
| return false; |
| } |
| |
| // No nextSortKeyPart support in ICU4J |
| |
| return true; |
| } |
| |
| /** |
| * Changes the key to the merged segments of the U+FFFE-separated substrings of s. |
| * Leaves key unchanged if s does not contain U+FFFE. |
| * @return true if the key was successfully changed |
| */ |
| private boolean getMergedCollationKey(String s, Output<CollationKey> key) { |
| CollationKey mergedKey = null; |
| int sLength = s.length(); |
| int segmentStart = 0; |
| for (int i = 0;;) { |
| if (i == sLength) { |
| if (segmentStart == 0) { |
| // s does not contain any U+FFFE. |
| return false; |
| } |
| } else if (s.charAt(i) != '\uFFFE') { |
| ++i; |
| continue; |
| } |
| // Get the sort key for another segment and merge it into mergedKey. |
| CollationKey tmpKey = coll.getCollationKey(s.substring(segmentStart, i)); |
| if (mergedKey == null) { |
| mergedKey = tmpKey; |
| } else { |
| mergedKey = mergedKey.merge(tmpKey); |
| } |
| if (i == sLength) { |
| break; |
| } |
| segmentStart = ++i; |
| } |
| key.value = mergedKey; |
| return true; |
| } |
| |
| private static int getDifferenceLevel(CollationKey prevKey, CollationKey key, |
| int order, boolean collHasCaseLevel) { |
| if (order == Collation.EQUAL) { |
| return Collation.NO_LEVEL; |
| } |
| byte[] prevBytes = prevKey.toByteArray(); |
| byte[] bytes = key.toByteArray(); |
| int level = Collation.PRIMARY_LEVEL; |
| for (int i = 0;; ++i) { |
| byte b = prevBytes[i]; |
| if (b != bytes[i]) { |
| break; |
| } |
| if ((int)b == Collation.LEVEL_SEPARATOR_BYTE) { |
| ++level; |
| if (level == Collation.CASE_LEVEL && !collHasCaseLevel) { |
| ++level; |
| } |
| } |
| } |
| return level; |
| } |
| |
| private boolean checkCompareTwo(String norm, String prevFileLine, String prevString, String s, |
| int expectedOrder, int expectedLevel) { |
| // Get the sort keys first, for error debug output. |
| Output<CollationKey> prevKeyOut = new Output<CollationKey>(); |
| CollationKey prevKey; |
| if (!getCollationKey(norm, fileLine, prevString, prevKeyOut)) { |
| return false; |
| } |
| prevKey = prevKeyOut.value; |
| |
| Output<CollationKey> keyOut = new Output<CollationKey>(); |
| CollationKey key; |
| if (!getCollationKey(norm, fileLine, s, keyOut)) { |
| return false; |
| } |
| key = keyOut.value; |
| |
| int order = coll.compare(prevString, s); |
| if (order != expectedOrder) { |
| logln(fileTestName); |
| logln(prevFileLine); |
| logln(fileLine); |
| logln(printCollationKey(prevKey)); |
| logln(printCollationKey(key)); |
| errln("line " + fileLineNumber |
| + " Collator(" + norm + ").compare(previous, current) wrong order: " |
| + order + " != " + expectedOrder); |
| return false; |
| } |
| order = coll.compare(s, prevString); |
| if (order != -expectedOrder) { |
| logln(fileTestName); |
| logln(prevFileLine); |
| logln(fileLine); |
| logln(printCollationKey(prevKey)); |
| logln(printCollationKey(key)); |
| errln("line " + fileLineNumber |
| + " Collator(" + norm + ").compare(current, previous) wrong order: " |
| + order + " != " + -expectedOrder); |
| return false; |
| } |
| |
| order = prevKey.compareTo(key); |
| if (order != expectedOrder) { |
| logln(fileTestName); |
| logln(prevFileLine); |
| logln(fileLine); |
| logln(printCollationKey(prevKey)); |
| logln(printCollationKey(key)); |
| errln("line " + fileLineNumber |
| + " Collator(" + norm + ").getCollationKey(previous, current).compareTo() wrong order: " |
| + order + " != " + expectedOrder); |
| return false; |
| } |
| boolean collHasCaseLevel = ((RuleBasedCollator)coll).isCaseLevel(); |
| int level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); |
| if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) { |
| if (level != expectedLevel) { |
| logln(fileTestName); |
| logln(prevFileLine); |
| logln(fileLine); |
| logln(printCollationKey(prevKey)); |
| logln(printCollationKey(key)); |
| errln("line " + fileLineNumber |
| + " Collator(" + norm + ").getCollationKey(previous, current).compareTo()=" |
| + order + " wrong level: " + level + " != " + expectedLevel); |
| return false; |
| } |
| } |
| |
| // If either string contains U+FFFE, then their sort keys must compare the same as |
| // the merged sort keys of each string's between-FFFE segments. |
| // |
| // It is not required that |
| // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2)) |
| // only that those two methods yield the same order. |
| // |
| // Use bit-wise OR so that getMergedCollationKey() is always called for both strings. |
| Output<CollationKey> outPrevKey = new Output<CollationKey>(prevKey); |
| Output<CollationKey> outKey = new Output<CollationKey>(key); |
| if (getMergedCollationKey(prevString, outPrevKey) | getMergedCollationKey(s, outKey)) { |
| prevKey = outPrevKey.value; |
| key = outKey.value; |
| order = prevKey.compareTo(key); |
| if (order != expectedOrder) { |
| logln(fileTestName); |
| errln("line " + fileLineNumber |
| + " Collator(" + norm + ").getCollationKey" |
| + "(previous, current segments between U+FFFE)).merge().compareTo() wrong order: " |
| + order + " != " + expectedOrder); |
| logln(prevFileLine); |
| logln(fileLine); |
| logln(printCollationKey(prevKey)); |
| logln(printCollationKey(key)); |
| return false; |
| } |
| int mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); |
| if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) { |
| if(mergedLevel != level) { |
| logln(fileTestName); |
| errln("line " + fileLineNumber |
| + " Collator(" + norm + ").getCollationKey" |
| + "(previous, current segments between U+FFFE)).merge().compareTo()=" |
| + order + " wrong level: " + mergedLevel + " != " + level); |
| logln(prevFileLine); |
| logln(fileLine); |
| logln(printCollationKey(prevKey)); |
| logln(printCollationKey(key)); |
| return false; |
| } |
| } |
| } |
| return true; |
| } |
| |
| private void checkCompareStrings(BufferedReader in) throws IOException { |
| String prevFileLine = "(none)"; |
| String prevString = ""; |
| String s; |
| Output<String> sOut = new Output<String>(); |
| while (readLine(in)) { |
| if (fileLine.length() == 0) { |
| continue; |
| } |
| if (isSectionStarter(fileLine.charAt(0))) { |
| break; |
| } |
| int relation = parseRelationAndString(sOut); |
| s = sOut.value; |
| int expectedOrder = (relation == Collation.ZERO_LEVEL) ? Collation.EQUAL : Collation.LESS; |
| int expectedLevel = relation; |
| boolean isOk = true; |
| if (!needsNormalization(prevString) && !needsNormalization(s)) { |
| coll.setDecomposition(Collator.NO_DECOMPOSITION); |
| isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s, |
| expectedOrder, expectedLevel); |
| } |
| if (isOk) { |
| coll.setDecomposition(Collator.CANONICAL_DECOMPOSITION); |
| isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s, |
| expectedOrder, expectedLevel); |
| } |
| if (isOk && (!nfd.isNormalized(prevString) || !nfd.isNormalized(s))) { |
| String pn = nfd.normalize(prevString); |
| String n = nfd.normalize(s); |
| isOk = checkCompareTwo("NFD input", prevFileLine, pn, n, |
| expectedOrder, expectedLevel); |
| } |
| prevFileLine = fileLine; |
| prevString = s; |
| } |
| } |
| |
| public void TestDataDriven() { |
| nfd = Normalizer2.getNFDInstance(); |
| fcd = Norm2AllModes.getFCDNormalizer2(); |
| |
| BufferedReader in = null; |
| |
| try { |
| in = TestUtil.getDataReader("collationtest.txt", "UTF-8"); |
| |
| // read first line and remove BOM if present |
| readLine(in); |
| if (fileLine != null && fileLine.charAt(0) == '\uFEFF') { |
| fileLine = fileLine.substring(1); |
| } |
| |
| while (error == null) { |
| if (fileLine == null || fileLine.length() == 0) { |
| if (!readLine(in)) { |
| break; |
| } |
| continue; |
| } |
| if (!isSectionStarter(fileLine.charAt(0))) { |
| logln(fileLine); |
| errln("syntax error on line " + fileLineNumber); |
| return; |
| } |
| if (fileLine.startsWith("** test: ")) { |
| fileTestName = fileLine; |
| logln(fileLine); |
| fileLine = null; |
| } else if (fileLine.equals("@ root")) { |
| setRootCollator(); |
| fileLine = null; |
| } else if (fileLine.startsWith("@ locale ")) { |
| setLocaleCollator(); |
| fileLine = null; |
| } else if (fileLine.equals("@ rules")) { |
| buildTailoring(in); |
| } else if (fileLine.charAt(0) == '%' |
| && fileLine.length() > 1 && isSpace(fileLine.charAt(1))) { |
| parseAndSetAttribute(); |
| } else if (fileLine.equals("* compare")) { |
| checkCompareStrings(in); |
| } else { |
| logln(fileLine); |
| errln("syntax error on line " + fileLineNumber); |
| return; |
| } |
| } |
| } catch (IOException e) { |
| errln(e.getMessage()); |
| } finally { |
| try { |
| if (in != null) { |
| in.close(); |
| } |
| } catch (IOException e) { |
| e.printStackTrace(); |
| } |
| } |
| } |
| } |