| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html#License |
| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-2016, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.icu.dev.test.normalizer; |
| |
| import java.text.StringCharacterIterator; |
| import java.util.Random; |
| |
| import org.junit.Test; |
| import org.junit.runner.RunWith; |
| import org.junit.runners.JUnit4; |
| |
| import com.ibm.icu.dev.test.TestFmwk; |
| import com.ibm.icu.impl.Norm2AllModes; |
| import com.ibm.icu.impl.Normalizer2Impl; |
| import com.ibm.icu.impl.USerializedSet; |
| import com.ibm.icu.impl.Utility; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UCharacterCategory; |
| import com.ibm.icu.lang.UProperty; |
| import com.ibm.icu.text.FilteredNormalizer2; |
| import com.ibm.icu.text.Normalizer; |
| import com.ibm.icu.text.Normalizer2; |
| import com.ibm.icu.text.UCharacterIterator; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.text.UnicodeSetIterator; |
| |
| |
| @RunWith(JUnit4.class) |
| public class BasicTest extends TestFmwk { |
| String[][] canonTests = { |
| // Input Decomposed Composed |
| { "cat", "cat", "cat" }, |
| { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", }, |
| |
| { "\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above |
| { "D\u0307", "D\u0307", "\u1e0a" }, // D dot_above |
| |
| { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above |
| { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below |
| { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above |
| |
| { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above |
| { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below |
| |
| { "\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron-grave |
| { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave |
| { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron |
| |
| { "\u212b", "A\u030a", "\u00c5" }, // angstrom_sign |
| { "\u00c5", "A\u030a", "\u00c5" }, // A-ring |
| |
| { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" }, |
| { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" }, |
| |
| { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0 |
| { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0 |
| |
| { "Henry IV", "Henry IV", "Henry IV" }, |
| { "Henry \u2163", "Henry \u2163", "Henry \u2163" }, |
| |
| { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana) |
| { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten |
| { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten |
| { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten |
| { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten |
| |
| { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" }, |
| {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"}, |
| }; |
| |
| String[][] compatTests = { |
| // Input Decomposed Composed |
| { "cat", "cat", "cat" }, |
| { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", }, // Alef-Lamed vs. Alef, Lamed |
| |
| { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" }, |
| { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" }, // ffi ligature -> f + f + i |
| |
| { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0 |
| { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i |
| |
| { "Henry IV", "Henry IV", "Henry IV" }, |
| { "Henry \u2163", "Henry IV", "Henry IV" }, |
| |
| { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana) |
| { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten |
| |
| { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten |
| |
| /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/ |
| { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" }, // hw_ka + hw_ten |
| { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" }, // ka + hw_ten |
| |
| }; |
| |
| // With Canonical decomposition, Hangul syllables should get decomposed |
| // into Jamo, but Jamo characters should not be decomposed into |
| // conjoining Jamo |
| String[][] hangulCanon = { |
| // Input Decomposed Composed |
| { "\ud4db", "\u1111\u1171\u11b6", "\ud4db" }, |
| { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" }, |
| }; |
| |
| // With compatibility decomposition turned on, |
| // it should go all the way down to conjoining Jamo characters. |
| // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE |
| String[][] hangulCompat = { |
| // Input Decomposed Composed |
| // { "\ud4db", "\u1111\u116e\u1175\u11af\u11c2", "\ud478\u1175\u11af\u11c2" }, |
| }; |
| |
| @Test |
| public void TestHangulCompose() |
| throws Exception{ |
| // Make sure that the static composition methods work |
| logln("Canonical composition..."); |
| staticTest(Normalizer.NFC, hangulCanon, 2); |
| logln("Compatibility composition..."); |
| staticTest(Normalizer.NFKC, hangulCompat, 2); |
| // Now try iterative composition.... |
| logln("Iterative composition..."); |
| Normalizer norm = new Normalizer("", Normalizer.NFC,0); |
| iterateTest(norm, hangulCanon, 2); |
| |
| norm.setMode(Normalizer.NFKD); |
| iterateTest(norm, hangulCompat, 2); |
| |
| // And finally, make sure you can do it in reverse too |
| logln("Reverse iteration..."); |
| norm.setMode(Normalizer.NFC); |
| backAndForth(norm, hangulCanon); |
| } |
| |
| @Test |
| public void TestHangulDecomp() throws Exception{ |
| // Make sure that the static decomposition methods work |
| logln("Canonical decomposition..."); |
| staticTest(Normalizer.NFD, hangulCanon, 1); |
| logln("Compatibility decomposition..."); |
| staticTest(Normalizer.NFKD, hangulCompat, 1); |
| |
| // Now the iterative decomposition methods... |
| logln("Iterative decomposition..."); |
| Normalizer norm = new Normalizer("", Normalizer.NFD,0); |
| iterateTest(norm, hangulCanon, 1); |
| |
| norm.setMode(Normalizer.NFKD); |
| iterateTest(norm, hangulCompat, 1); |
| |
| // And finally, make sure you can do it in reverse too |
| logln("Reverse iteration..."); |
| norm.setMode(Normalizer.NFD); |
| backAndForth(norm, hangulCanon); |
| } |
| @Test |
| public void TestNone() throws Exception{ |
| Normalizer norm = new Normalizer("", Normalizer.NONE,0); |
| iterateTest(norm, canonTests, 0); |
| staticTest(Normalizer.NONE, canonTests, 0); |
| } |
| @Test |
| public void TestDecomp() throws Exception{ |
| Normalizer norm = new Normalizer("", Normalizer.NFD,0); |
| iterateTest(norm, canonTests, 1); |
| staticTest(Normalizer.NFD, canonTests, 1); |
| decomposeTest(Normalizer.NFD, canonTests, 1); |
| } |
| |
| @Test |
| public void TestCompatDecomp() throws Exception{ |
| Normalizer norm = new Normalizer("", Normalizer.NFKD,0); |
| iterateTest(norm, compatTests, 1); |
| staticTest(Normalizer.NFKD,compatTests, 1); |
| decomposeTest(Normalizer.NFKD,compatTests, 1); |
| } |
| |
| @Test |
| public void TestCanonCompose() throws Exception{ |
| Normalizer norm = new Normalizer("", Normalizer.NFC,0); |
| staticTest(Normalizer.NFC, canonTests, 2); |
| iterateTest(norm, canonTests, 2); |
| composeTest(Normalizer.NFC, canonTests, 2); |
| } |
| |
| @Test |
| public void TestCompatCompose() throws Exception{ |
| Normalizer norm = new Normalizer("", Normalizer.NFKC,0); |
| iterateTest(norm, compatTests, 2); |
| staticTest(Normalizer.NFKC,compatTests, 2); |
| composeTest(Normalizer.NFKC,compatTests, 2); |
| } |
| |
| @Test |
| public void TestExplodingBase() throws Exception{ |
| // \u017f - Latin small letter long s |
| // \u0307 - combining dot above |
| // \u1e61 - Latin small letter s with dot above |
| // \u1e9b - Latin small letter long s with dot above |
| String[][] canon = { |
| // Input Decomposed Composed |
| { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" }, |
| { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" }, |
| }; |
| String[][] compat = { |
| // Input Decomposed Composed |
| { "\u017f", "s", "s" }, |
| { "\u1e9b", "s\u0307", "\u1e61" }, |
| }; |
| |
| staticTest(Normalizer.NFD, canon, 1); |
| staticTest(Normalizer.NFC, canon, 2); |
| |
| staticTest(Normalizer.NFKD, compat, 1); |
| staticTest(Normalizer.NFKC, compat, 2); |
| |
| } |
| |
| /** |
| * The Tibetan vowel sign AA, 0f71, was messed up prior to |
| * Unicode version 2.1.9. |
| * Once 2.1.9 or 3.0 is released, uncomment this test. |
| */ |
| @Test |
| public void TestTibetan() throws Exception{ |
| String[][] decomp = { |
| { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" } |
| }; |
| String[][] compose = { |
| { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" } |
| }; |
| |
| staticTest(Normalizer.NFD, decomp, 1); |
| staticTest(Normalizer.NFKD,decomp, 2); |
| staticTest(Normalizer.NFC, compose, 1); |
| staticTest(Normalizer.NFKC,compose, 2); |
| } |
| |
| /** |
| * Make sure characters in the CompositionExclusion.txt list do not get |
| * composed to. |
| */ |
| @Test |
| public void TestCompositionExclusion() |
| throws Exception{ |
| // This list is generated from CompositionExclusion.txt. |
| // Update whenever the normalizer tables are updated. Note |
| // that we test all characters listed, even those that can be |
| // derived from the Unicode DB and are therefore commented |
| // out. |
| String EXCLUDED = |
| "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" + |
| "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" + |
| "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" + |
| "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" + |
| "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" + |
| "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" + |
| "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" + |
| "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" + |
| "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" + |
| "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" + |
| "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" + |
| "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" + |
| "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" + |
| "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E"; |
| for (int i=0; i<EXCLUDED.length(); ++i) { |
| String a = String.valueOf(EXCLUDED.charAt(i)); |
| String b = Normalizer.normalize(a, Normalizer.NFKD); |
| String c = Normalizer.normalize(b, Normalizer.NFC); |
| if (c.equals(a)) { |
| errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + |
| hex(b) + " x COMPOSE => " + |
| hex(c)); |
| } else if (isVerbose()) { |
| logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + |
| hex(b) + " x COMPOSE => " + |
| hex(c)); |
| } |
| } |
| // The following method works too, but it is somewhat |
| // incestuous. It uses UInfo, which is the same database that |
| // NormalizerBuilder uses, so if something is wrong with |
| // UInfo, the following test won't show it. All it will show |
| // is that NormalizerBuilder has been run with whatever the |
| // current UInfo is. |
| // |
| // We comment this out in favor of the test above, which |
| // provides independent verification (but also requires |
| // independent updating). |
| // logln("---"); |
| // UInfo uinfo = new UInfo(); |
| // for (int i=0; i<=0xFFFF; ++i) { |
| // if (!uinfo.isExcludedComposition((char)i) || |
| // (!uinfo.hasCanonicalDecomposition((char)i) && |
| // !uinfo.hasCompatibilityDecomposition((char)i))) continue; |
| // String a = String.valueOf((char)i); |
| // String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0); |
| // String c = Normalizer.normalize(b,Normalizer.COMPOSE,0); |
| // if (c.equals(a)) { |
| // errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + |
| // hex(b) + " x COMPOSE => " + |
| // hex(c)); |
| // } else if (isVerbose()) { |
| // logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + |
| // hex(b) + " x COMPOSE => " + |
| // hex(c)); |
| // } |
| // } |
| } |
| |
| /** |
| * Test for a problem that showed up just before ICU 1.6 release |
| * having to do with combining characters with an index of zero. |
| * Such characters do not participate in any canonical |
| * decompositions. However, having an index of zero means that |
| * they all share one typeMask[] entry, that is, they all have to |
| * map to the same canonical class, which is not the case, in |
| * reality. |
| */ |
| @Test |
| public void TestZeroIndex() |
| throws Exception{ |
| String[] DATA = { |
| // Expect col1 x COMPOSE_COMPAT => col2 |
| // Expect col2 x DECOMP => col3 |
| "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300", |
| "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300", |
| "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300", |
| "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327", |
| "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321", |
| }; |
| |
| for (int i=0; i<DATA.length; i+=3) { |
| String a = DATA[i]; |
| String b = Normalizer.normalize(a, Normalizer.NFKC); |
| String exp = DATA[i+1]; |
| if (b.equals(exp)) { |
| logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b)); |
| } else { |
| errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) + |
| ", expect " + hex(exp)); |
| } |
| a = Normalizer.normalize(b, Normalizer.NFD); |
| exp = DATA[i+2]; |
| if (a.equals(exp)) { |
| logln("Ok: " + hex(b) + " x DECOMP => " + hex(a)); |
| } else { |
| errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) + |
| ", expect " + hex(exp)); |
| } |
| } |
| } |
| |
| /** |
| * Test for a problem found by Verisign. Problem is that |
| * characters at the start of a string are not put in canonical |
| * order correctly by compose() if there is no starter. |
| */ |
| @Test |
| public void TestVerisign() |
| throws Exception{ |
| String[] inputs = { |
| "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f", |
| "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad" |
| }; |
| String[] outputs = { |
| "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f", |
| "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4" |
| }; |
| |
| for (int i = 0; i < inputs.length; ++i) { |
| String input = inputs[i]; |
| String output = outputs[i]; |
| String result = Normalizer.decompose(input, false); |
| if (!result.equals(output)) { |
| errln("FAIL input: " + hex(input)); |
| errln(" decompose: " + hex(result)); |
| errln(" expected: " + hex(output)); |
| } |
| result = Normalizer.compose(input, false); |
| if (!result.equals(output)) { |
| errln("FAIL input: " + hex(input)); |
| errln(" compose: " + hex(result)); |
| errln(" expected: " + hex(output)); |
| } |
| } |
| |
| } |
| @Test |
| public void TestQuickCheckResultNO() |
| throws Exception{ |
| final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C, |
| 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E}; |
| final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB, |
| 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E}; |
| final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE, |
| 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; |
| final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE, |
| 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; |
| |
| |
| final int SIZE = 10; |
| |
| int count = 0; |
| for (; count < SIZE; count ++) |
| { |
| if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), |
| Normalizer.NFD,0) != Normalizer.NO) |
| { |
| errln("ERROR in NFD quick check at U+" + |
| Integer.toHexString(CPNFD[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), |
| Normalizer.NFC,0) !=Normalizer.NO) |
| { |
| errln("ERROR in NFC quick check at U+"+ |
| Integer.toHexString(CPNFC[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), |
| Normalizer.NFKD,0) != Normalizer.NO) |
| { |
| errln("ERROR in NFKD quick check at U+"+ |
| Integer.toHexString(CPNFKD[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), |
| Normalizer.NFKC,0) !=Normalizer.NO) |
| { |
| errln("ERROR in NFKC quick check at U+"+ |
| Integer.toHexString(CPNFKC[count])); |
| return; |
| } |
| // for improving coverage |
| if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), |
| Normalizer.NFKC) !=Normalizer.NO) |
| { |
| errln("ERROR in NFKC quick check at U+"+ |
| Integer.toHexString(CPNFKC[count])); |
| return; |
| } |
| } |
| } |
| |
| |
| @Test |
| public void TestQuickCheckResultYES() |
| throws Exception{ |
| final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A, |
| 0x2261, 0x3075, 0x4000, 0x5000, 0xF000}; |
| final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500, |
| 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000}; |
| final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB, |
| 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27}; |
| final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000, |
| 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E}; |
| |
| final int SIZE = 10; |
| int count = 0; |
| |
| char cp = 0; |
| while (cp < 0xA0) |
| { |
| if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0) |
| != Normalizer.YES) |
| { |
| errln("ERROR in NFD quick check at U+"+ |
| Integer.toHexString(cp)); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0) |
| != Normalizer.YES) |
| { |
| errln("ERROR in NFC quick check at U+"+ |
| Integer.toHexString(cp)); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0) |
| != Normalizer.YES) |
| { |
| errln("ERROR in NFKD quick check at U+" + |
| Integer.toHexString(cp)); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0) |
| != Normalizer.YES) |
| { |
| errln("ERROR in NFKC quick check at U+"+ |
| Integer.toHexString(cp)); |
| return; |
| } |
| // improve the coverage |
| if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC) |
| != Normalizer.YES) |
| { |
| errln("ERROR in NFKC quick check at U+"+ |
| Integer.toHexString(cp)); |
| return; |
| } |
| cp++; |
| } |
| |
| for (; count < SIZE; count ++) |
| { |
| if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), |
| Normalizer.NFD,0)!=Normalizer.YES) |
| { |
| errln("ERROR in NFD quick check at U+"+ |
| Integer.toHexString(CPNFD[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), |
| Normalizer.NFC,0)!=Normalizer.YES) |
| { |
| errln("ERROR in NFC quick check at U+"+ |
| Integer.toHexString(CPNFC[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), |
| Normalizer.NFKD,0)!=Normalizer.YES) |
| { |
| errln("ERROR in NFKD quick check at U+"+ |
| Integer.toHexString(CPNFKD[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), |
| Normalizer.NFKC,0)!=Normalizer.YES) |
| { |
| errln("ERROR in NFKC quick check at U+"+ |
| Integer.toHexString(CPNFKC[count])); |
| return; |
| } |
| // improve the coverage |
| if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), |
| Normalizer.NFKC)!=Normalizer.YES) |
| { |
| errln("ERROR in NFKC quick check at U+"+ |
| Integer.toHexString(CPNFKC[count])); |
| return; |
| } |
| } |
| } |
| @Test |
| public void TestBengali() throws Exception{ |
| String input = "\u09bc\u09be\u09cd\u09be"; |
| String output=Normalizer.normalize(input,Normalizer.NFC); |
| if(!input.equals(output)){ |
| errln("ERROR in NFC of string"); |
| } |
| } |
| @Test |
| public void TestQuickCheckResultMAYBE() |
| throws Exception{ |
| |
| final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161, |
| 0x116A, 0x1173, 0x1175, 0x3099, 0x309A}; |
| final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E, |
| 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099}; |
| |
| |
| final int SIZE = 10; |
| |
| int count = 0; |
| |
| /* NFD and NFKD does not have any MAYBE codepoints */ |
| for (; count < SIZE; count ++) |
| { |
| if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), |
| Normalizer.NFC,0)!=Normalizer.MAYBE) |
| { |
| errln("ERROR in NFC quick check at U+"+ |
| Integer.toHexString(CPNFC[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), |
| Normalizer.NFKC,0)!=Normalizer.MAYBE) |
| { |
| errln("ERROR in NFKC quick check at U+"+ |
| Integer.toHexString(CPNFKC[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(new char[]{CPNFC[count]}, |
| Normalizer.NFC,0)!=Normalizer.MAYBE) |
| { |
| errln("ERROR in NFC quick check at U+"+ |
| Integer.toHexString(CPNFC[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, |
| Normalizer.NFKC,0)!=Normalizer.MAYBE) |
| { |
| errln("ERROR in NFKC quick check at U+"+ |
| Integer.toHexString(CPNFKC[count])); |
| return; |
| } |
| if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, |
| Normalizer.NONE,0)!=Normalizer.YES) |
| { |
| errln("ERROR in NONE quick check at U+"+ |
| Integer.toHexString(CPNFKC[count])); |
| return; |
| } |
| } |
| } |
| |
| @Test |
| public void TestQuickCheckStringResult() |
| throws Exception{ |
| int count; |
| String d; |
| String c; |
| |
| for (count = 0; count < canonTests.length; count ++) |
| { |
| d = canonTests[count][1]; |
| c = canonTests[count][2]; |
| if (Normalizer.quickCheck(d,Normalizer.NFD,0) |
| != Normalizer.YES) |
| { |
| errln("ERROR in NFD quick check for string at count " + count); |
| return; |
| } |
| |
| if (Normalizer.quickCheck(c, Normalizer.NFC,0) |
| == Normalizer.NO) |
| { |
| errln("ERROR in NFC quick check for string at count " + count); |
| return; |
| } |
| } |
| |
| for (count = 0; count < compatTests.length; count ++) |
| { |
| d = compatTests[count][1]; |
| c = compatTests[count][2]; |
| if (Normalizer.quickCheck(d, Normalizer.NFKD,0) |
| != Normalizer.YES) |
| { |
| errln("ERROR in NFKD quick check for string at count " + count); |
| return; |
| } |
| |
| if (Normalizer.quickCheck(c, Normalizer.NFKC,0) |
| != Normalizer.YES) |
| { |
| errln("ERROR in NFKC quick check for string at count " + count); |
| return; |
| } |
| } |
| } |
| |
| static final int qcToInt(Normalizer.QuickCheckResult qc) { |
| if(qc==Normalizer.NO) { |
| return 0; |
| } else if(qc==Normalizer.YES) { |
| return 1; |
| } else /* Normalizer.MAYBE */ { |
| return 2; |
| } |
| } |
| |
| @Test |
| public void TestQuickCheckPerCP() { |
| int c, lead, trail; |
| String s, nfd; |
| int lccc1, lccc2, tccc1, tccc2; |
| int qc1, qc2; |
| |
| if( |
| UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES |
| UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 || |
| UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE |
| UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 || |
| UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) || |
| UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) |
| ) { |
| errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS"); |
| } |
| |
| /* |
| * compare the quick check property values for some code points |
| * to the quick check results for checking same-code point strings |
| */ |
| c=0; |
| while(c<0x110000) { |
| s=UTF16.valueOf(c); |
| |
| qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK); |
| qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC)); |
| if(qc1!=qc2) { |
| errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c)); |
| } |
| |
| qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK); |
| qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD)); |
| if(qc1!=qc2) { |
| errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c)); |
| } |
| |
| qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK); |
| qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC)); |
| if(qc1!=qc2) { |
| errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c)); |
| } |
| |
| qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK); |
| qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD)); |
| if(qc1!=qc2) { |
| errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c)); |
| } |
| |
| nfd=Normalizer.normalize(s, Normalizer.NFD); |
| lead=UTF16.charAt(nfd, 0); |
| trail=UTF16.charAt(nfd, nfd.length()-1); |
| |
| lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS); |
| lccc2=UCharacter.getCombiningClass(lead); |
| tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); |
| tccc2=UCharacter.getCombiningClass(trail); |
| |
| if(lccc1!=lccc2) { |
| errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c)); |
| } |
| if(tccc1!=tccc2) { |
| errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c)); |
| } |
| |
| /* skip some code points */ |
| c=(20*c)/19+1; |
| } |
| } |
| |
//------------------------------------------------------------------------
// Internal utilities
//
| |
| /* private void backAndForth(Normalizer iter, String input) |
| { |
| iter.setText(input); |
| |
| // Run through the iterator forwards and stick it into a StringBuffer |
| StringBuffer forward = new StringBuffer(); |
| for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) { |
| forward.append(ch); |
| } |
| |
| // Now do it backwards |
| StringBuffer reverse = new StringBuffer(); |
| for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) { |
| reverse.insert(0, ch); |
| } |
| |
| if (!forward.toString().equals(reverse.toString())) { |
| errln("FAIL: Forward/reverse mismatch for input " + hex(input) |
| + ", forward: " + hex(forward) + ", backward: "+hex(reverse)); |
| } else if (isVerbose()) { |
| logln("Ok: Forward/reverse for input " + hex(input) |
| + ", forward: " + hex(forward) + ", backward: "+hex(reverse)); |
| } |
| }*/ |
| |
| private void backAndForth(Normalizer iter, String[][] tests) |
| { |
| for (int i = 0; i < tests.length; i++) |
| { |
| iter.setText(tests[i][0]); |
| |
| // Run through the iterator forwards and stick it into a |
| // StringBuffer |
| StringBuffer forward = new StringBuffer(); |
| for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) { |
| forward.append(ch); |
| } |
| |
| // Now do it backwards |
| StringBuffer reverse = new StringBuffer(); |
| for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) { |
| reverse.insert(0, ch); |
| } |
| |
| if (!forward.toString().equals(reverse.toString())) { |
| errln("FAIL: Forward/reverse mismatch for input " |
| + hex(tests[i][0]) + ", forward: " + hex(forward) |
| + ", backward: " + hex(reverse)); |
| } else if (isVerbose()) { |
| logln("Ok: Forward/reverse for input " + hex(tests[i][0]) |
| + ", forward: " + hex(forward) + ", backward: " |
| + hex(reverse)); |
| } |
| } |
| } |
| |
| private void staticTest (Normalizer.Mode mode, |
| String[][] tests, int outCol) throws Exception{ |
| for (int i = 0; i < tests.length; i++) |
| { |
| String input = Utility.unescape(tests[i][0]); |
| String expect = Utility.unescape(tests[i][outCol]); |
| |
| logln("Normalizing '" + input + "' (" + hex(input) + ")" ); |
| |
| String output = Normalizer.normalize(input, mode); |
| |
| if (!output.equals(expect)) { |
| errln("FAIL: case " + i |
| + " expected '" + expect + "' (" + hex(expect) + ")" |
| + " but got '" + output + "' (" + hex(output) + ")" ); |
| } |
| } |
| char[] output = new char[1]; |
| for (int i = 0; i < tests.length; i++) |
| { |
| char[] input = Utility.unescape(tests[i][0]).toCharArray(); |
| String expect =Utility.unescape( tests[i][outCol]); |
| |
| logln("Normalizing '" + new String(input) + "' (" + |
| hex(new String(input)) + ")" ); |
| int reqLength=0; |
| while(true){ |
| try{ |
| reqLength=Normalizer.normalize(input,output, mode,0); |
| if(reqLength<=output.length ){ |
| break; |
| } |
| }catch(IndexOutOfBoundsException e){ |
| output= new char[Integer.parseInt(e.getMessage())]; |
| continue; |
| } |
| } |
| if (!expect.equals(new String(output,0,reqLength))) { |
| errln("FAIL: case " + i |
| + " expected '" + expect + "' (" + hex(expect) + ")" |
| + " but got '" + new String(output) |
| + "' (" + hex(new String(output)) + ")" ); |
| } |
| } |
| } |
| private void decomposeTest(Normalizer.Mode mode, |
| String[][] tests, int outCol) throws Exception{ |
| for (int i = 0; i < tests.length; i++) |
| { |
| String input = Utility.unescape(tests[i][0]); |
| String expect = Utility.unescape(tests[i][outCol]); |
| |
| logln("Normalizing '" + input + "' (" + hex(input) + ")" ); |
| |
| String output = Normalizer.decompose(input, mode==Normalizer.NFKD); |
| |
| if (!output.equals(expect)) { |
| errln("FAIL: case " + i |
| + " expected '" + expect + "' (" + hex(expect) + ")" |
| + " but got '" + output + "' (" + hex(output) + ")" ); |
| } |
| } |
| char[] output = new char[1]; |
| for (int i = 0; i < tests.length; i++) |
| { |
| char[] input = Utility.unescape(tests[i][0]).toCharArray(); |
| String expect = Utility.unescape(tests[i][outCol]); |
| |
| logln("Normalizing '" + new String(input) + "' (" + |
| hex(new String(input)) + ")" ); |
| int reqLength=0; |
| while(true){ |
| try{ |
| reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0); |
| if(reqLength<=output.length ){ |
| break; |
| } |
| }catch(IndexOutOfBoundsException e){ |
| output= new char[Integer.parseInt(e.getMessage())]; |
| continue; |
| } |
| } |
| if (!expect.equals(new String(output,0,reqLength))) { |
| errln("FAIL: case " + i |
| + " expected '" + expect + "' (" + hex(expect) + ")" |
| + " but got '" + new String(output) |
| + "' (" + hex(new String(output)) + ")" ); |
| } |
| } |
| output = new char[1]; |
| for (int i = 0; i < tests.length; i++) |
| { |
| char[] input = Utility.unescape(tests[i][0]).toCharArray(); |
| String expect = Utility.unescape(tests[i][outCol]); |
| |
| logln("Normalizing '" + new String(input) + "' (" + |
| hex(new String(input)) + ")" ); |
| int reqLength=0; |
| while(true){ |
| try{ |
| reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0); |
| if(reqLength<=output.length ){ |
| break; |
| } |
| }catch(IndexOutOfBoundsException e){ |
| output= new char[Integer.parseInt(e.getMessage())]; |
| continue; |
| } |
| } |
| if (!expect.equals(new String(output,0,reqLength))) { |
| errln("FAIL: case " + i |
| + " expected '" + expect + "' (" + hex(expect) + ")" |
| + " but got '" + new String(output) |
| + "' (" + hex(new String(output)) + ")" ); |
| } |
| char[] output2 = new char[reqLength * 2]; |
| System.arraycopy(output, 0, output2, 0, reqLength); |
| int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0); |
| if(retLength != reqLength){ |
| logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength); |
| } |
| } |
| } |
| |
| private void composeTest(Normalizer.Mode mode, |
| String[][] tests, int outCol) throws Exception{ |
| for (int i = 0; i < tests.length; i++) |
| { |
| String input = Utility.unescape(tests[i][0]); |
| String expect = Utility.unescape(tests[i][outCol]); |
| |
| logln("Normalizing '" + input + "' (" + hex(input) + ")" ); |
| |
| String output = Normalizer.compose(input, mode==Normalizer.NFKC); |
| |
| if (!output.equals(expect)) { |
| errln("FAIL: case " + i |
| + " expected '" + expect + "' (" + hex(expect) + ")" |
| + " but got '" + output + "' (" + hex(output) + ")" ); |
| } |
| } |
| char[] output = new char[1]; |
| for (int i = 0; i < tests.length; i++) |
| { |
| char[] input = Utility.unescape(tests[i][0]).toCharArray(); |
| String expect = Utility.unescape(tests[i][outCol]); |
| |
| logln("Normalizing '" + new String(input) + "' (" + |
| hex(new String(input)) + ")" ); |
| int reqLength=0; |
| while(true){ |
| try{ |
| reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0); |
| if(reqLength<=output.length ){ |
| break; |
| } |
| }catch(IndexOutOfBoundsException e){ |
| output= new char[Integer.parseInt(e.getMessage())]; |
| continue; |
| } |
| } |
| if (!expect.equals(new String(output,0,reqLength))) { |
| errln("FAIL: case " + i |
| + " expected '" + expect + "' (" + hex(expect) + ")" |
| + " but got '" + new String(output) |
| + "' (" + hex(new String(output)) + ")" ); |
| } |
| } |
| output = new char[1]; |
| for (int i = 0; i < tests.length; i++) |
| { |
| char[] input = Utility.unescape(tests[i][0]).toCharArray(); |
| String expect = Utility.unescape(tests[i][outCol]); |
| |
| logln("Normalizing '" + new String(input) + "' (" + |
| hex(new String(input)) + ")" ); |
| int reqLength=0; |
| while(true){ |
| try{ |
| reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0); |
| if(reqLength<=output.length ){ |
| break; |
| } |
| }catch(IndexOutOfBoundsException e){ |
| output= new char[Integer.parseInt(e.getMessage())]; |
| continue; |
| } |
| } |
| if (!expect.equals(new String(output,0,reqLength))) { |
| errln("FAIL: case " + i |
| + " expected '" + expect + "' (" + hex(expect) + ")" |
| + " but got '" + new String(output) |
| + "' (" + hex(new String(output)) + ")" ); |
| } |
| |
| char[] output2 = new char[reqLength * 2]; |
| System.arraycopy(output, 0, output2, 0, reqLength); |
| int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0); |
| if(retLength != reqLength){ |
| logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength); |
| } |
| } |
| } |
| private void iterateTest(Normalizer iter, String[][] tests, int outCol){ |
| for (int i = 0; i < tests.length; i++) |
| { |
| String input = Utility.unescape(tests[i][0]); |
| String expect = Utility.unescape(tests[i][outCol]); |
| |
| logln("Normalizing '" + input + "' (" + hex(input) + ")" ); |
| |
| iter.setText(input); |
| assertEqual(expect, iter, "case " + i + " "); |
| } |
| } |
| |
| private void assertEqual(String expected, Normalizer iter, String msg) |
| { |
| int index = 0; |
| int ch; |
| UCharacterIterator cIter = UCharacterIterator.getInstance(expected); |
| |
| while ((ch=iter.next())!= Normalizer.DONE){ |
| if (index >= expected.length()) { |
| errln("FAIL: " + msg + "Unexpected character '" + (char)ch |
| + "' (" + hex(ch) + ")" |
| + " at index " + index); |
| break; |
| } |
| int want = UTF16.charAt(expected,index); |
| if (ch != want) { |
| errln("FAIL: " + msg + "got '" + (char)ch |
| + "' (" + hex(ch) + ")" |
| + " but expected '" + want + "' (" + hex(want)+ ")" |
| + " at index " + index); |
| } |
| index+= UTF16.getCharCount(ch); |
| } |
| if (index < expected.length()) { |
| errln("FAIL: " + msg + "Only got " + index + " chars, expected " |
| + expected.length()); |
| } |
| |
| cIter.setToLimit(); |
| while((ch=iter.previous())!=Normalizer.DONE){ |
| int want = cIter.previousCodePoint(); |
| if (ch != want ) { |
| errln("FAIL: " + msg + "got '" + (char)ch |
| + "' (" + hex(ch) + ")" |
| + " but expected '" + want + "' (" + hex(want) + ")" |
| + " at index " + index); |
| } |
| } |
| } |
| //-------------------------------------------------------------------------- |
| |
| // NOTE: These tests are used for quick debugging so are not ported |
| // to ICU4C tsnorm.cpp in intltest |
| // |
| |
| @Test |
| public void TestDebugStatic(){ |
| String in = Utility.unescape("\\U0001D157\\U0001D165"); |
| if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){ |
| errln("isNormalized failed"); |
| } |
| |
| String input = "\uAD8B\uAD8B\uAD8B\uAD8B"+ |
| "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ |
| "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ |
| "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ |
| "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ |
| "\uAD8B\uAD8B\uAD8B\uAD8B"+ |
| "d\u031B\u0307\u0323"; |
| String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ |
| "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ |
| "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+ |
| "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ |
| "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ |
| "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+ |
| "cccccccccccccccccccccccccccccccccccccccccccccccc"+ |
| "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ |
| "dddddddddddddddddddddddd"+ |
| "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ |
| "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307"; |
| String output = Normalizer.normalize(Utility.unescape(input), |
| Normalizer.NFD); |
| if(!expect.equals(output)){ |
| errln("FAIL expected: "+hex(expect) + " got: "+hex(output)); |
| } |
| |
| |
| |
| } |
| @Test |
| public void TestDebugIter(){ |
| String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); |
| String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); |
| Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)), |
| Normalizer.NONE,0); |
| int index = 0; |
| int ch; |
| UCharacterIterator cIter = UCharacterIterator.getInstance(expected); |
| |
| while ((ch=iter.next())!= Normalizer.DONE){ |
| if (index >= expected.length()) { |
| errln("FAIL: " + "Unexpected character '" + (char)ch |
| + "' (" + hex(ch) + ")" |
| + " at index " + index); |
| break; |
| } |
| int want = UTF16.charAt(expected,index); |
| if (ch != want) { |
| errln("FAIL: " + "got '" + (char)ch |
| + "' (" + hex(ch) + ")" |
| + " but expected '" + want + "' (" + hex(want)+ ")" |
| + " at index " + index); |
| } |
| index+= UTF16.getCharCount(ch); |
| } |
| if (index < expected.length()) { |
| errln("FAIL: " + "Only got " + index + " chars, expected " |
| + expected.length()); |
| } |
| |
| cIter.setToLimit(); |
| while((ch=iter.previous())!=Normalizer.DONE){ |
| int want = cIter.previousCodePoint(); |
| if (ch != want ) { |
| errln("FAIL: " + "got '" + (char)ch |
| + "' (" + hex(ch) + ")" |
| + " but expected '" + want + "' (" + hex(want) + ")" |
| + " at index " + index); |
| } |
| } |
| } |
| @Test |
| public void TestDebugIterOld(){ |
| String input = "\\U0001D15E"; |
| String expected = "\uD834\uDD57\uD834\uDD65"; |
| String expectedReverse = "\uD834\uDD65\uD834\uDD57"; |
| int index = 0; |
| int ch; |
| Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)), |
| Normalizer.NFKC,0); |
| StringBuffer got = new StringBuffer(); |
| for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next()) |
| { |
| if (index >= expected.length()) { |
| errln("FAIL: " + "Unexpected character '" + (char)ch + |
| "' (" + hex(ch) + ")" + " at index " + index); |
| break; |
| } |
| got.append(UCharacter.toString(ch)); |
| index++; |
| } |
| if (!expected.equals(got.toString())) { |
| errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" |
| + " but expected '" + expected + "' (" |
| + hex(expected) + ")"); |
| } |
| if (got.length() < expected.length()) { |
| errln("FAIL: " + "Only got " + index + " chars, expected " |
| + expected.length()); |
| } |
| |
| logln("Reverse Iteration\n"); |
| iter.setIndexOnly(iter.endIndex()); |
| got.setLength(0); |
| for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){ |
| if (index >= expected.length()) { |
| errln("FAIL: " + "Unexpected character '" + (char)ch |
| + "' (" + hex(ch) + ")" + " at index " + index); |
| break; |
| } |
| got.append(UCharacter.toString(ch)); |
| } |
| if (!expectedReverse.equals(got.toString())) { |
| errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" |
| + " but expected '" + expected |
| + "' (" + hex(expected) + ")"); |
| } |
| if (got.length() < expected.length()) { |
| errln("FAIL: " + "Only got " + index + " chars, expected " |
| + expected.length()); |
| } |
| |
| } |
| //-------------------------------------------------------------------------- |
| // helper class for TestPreviousNext() |
| // simple UTF-32 character iterator |
/**
 * Minimal bidirectional cursor over an array of code points.
 * Every accessor returns -1 when there is nothing to return in the
 * requested direction.
 */
class UCharIterator {

    private int[] codePoints;
    private int limit, pos;

    /**
     * @param src   backing array of code points
     * @param len   number of leading elements of {@code src} to iterate over
     * @param index initial cursor position
     */
    public UCharIterator(int[] src, int len, int index){
        codePoints = src;
        limit = len;
        pos = index;
    }

    /** Returns the element at the cursor without moving, or -1 at the end. */
    public int current() {
        return pos < limit ? codePoints[pos] : -1;
    }

    /** Returns the element at the cursor and advances past it, or -1 at the end. */
    public int next() {
        if (pos >= limit) {
            return -1;
        }
        int value = codePoints[pos];
        pos++;
        return value;
    }

    /** Steps back one element and returns it, or -1 when already at the start. */
    public int previous() {
        if (pos <= 0) {
            return -1;
        }
        pos--;
        return codePoints[pos];
    }

    /** Returns the current cursor position. */
    public int getIndex() {
        return pos;
    }
}
| @Test |
| public void TestPreviousNext() { |
| // src and expect strings |
| char src[]={ |
| UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), |
| UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), |
| 0xc4, |
| 0x1ed0 |
| }; |
| int expect[]={ |
| 0x831d, |
| 0x1d158, 0x1d165, |
| 0x41, 0x308, |
| 0x4f, 0x302, 0x301 |
| }; |
| |
| // expected src indexes corresponding to expect indexes |
| int expectIndex[]={ |
| 0, |
| 2, 2, |
| 4, 4, |
| 5, 5, 5, |
| 6 // behind last character |
| }; |
| |
| // initial indexes into the src and expect strings |
| |
| final int SRC_MIDDLE=4; |
| final int EXPECT_MIDDLE=3; |
| |
| |
| // movement vector |
| // - for previous(), 0 for current(), + for next() |
| // not const so that we can terminate it below for the error message |
| String moves="0+0+0--0-0-+++0--+++++++0--------"; |
| |
| // iterators |
| Normalizer iter = new Normalizer(new String(src), |
| Normalizer.NFD,0); |
| UCharIterator iter32 = new UCharIterator(expect, expect.length, |
| EXPECT_MIDDLE); |
| |
| int c1, c2; |
| char m; |
| |
| // initially set the indexes into the middle of the strings |
| iter.setIndexOnly(SRC_MIDDLE); |
| |
| // move around and compare the iteration code points with |
| // the expected ones |
| int movesIndex =0; |
| while(movesIndex<moves.length()) { |
| m=moves.charAt(movesIndex++); |
| if(m=='-') { |
| c1=iter.previous(); |
| c2=iter32.previous(); |
| } else if(m=='0') { |
| c1=iter.current(); |
| c2=iter32.current(); |
| } else /* m=='+' */ { |
| c1=iter.next(); |
| c2=iter32.next(); |
| } |
| |
| // compare results |
| if(c1!=c2) { |
| // copy the moves until the current (m) move, and terminate |
| String history = moves.substring(0,movesIndex); |
| errln("error: mismatch in Normalizer iteration at "+history+": " |
| +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); |
| break; |
| } |
| |
| // compare indexes |
| if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { |
| // copy the moves until the current (m) move, and terminate |
| String history = moves.substring(0,movesIndex); |
| errln("error: index mismatch in Normalizer iteration at " |
| +history+ " : "+ "Normalizer index " +iter.getIndex() |
| +" expected "+ expectIndex[iter32.getIndex()]); |
| break; |
| } |
| } |
| } |
| // Only in ICU4j |
| @Test |
| public void TestPreviousNextJCI() { |
| // src and expect strings |
| char src[]={ |
| UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), |
| UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), |
| 0xc4, |
| 0x1ed0 |
| }; |
| int expect[]={ |
| 0x831d, |
| 0x1d158, 0x1d165, |
| 0x41, 0x308, |
| 0x4f, 0x302, 0x301 |
| }; |
| |
| // expected src indexes corresponding to expect indexes |
| int expectIndex[]={ |
| 0, |
| 2, 2, |
| 4, 4, |
| 5, 5, 5, |
| 6 // behind last character |
| }; |
| |
| // initial indexes into the src and expect strings |
| |
| final int SRC_MIDDLE=4; |
| final int EXPECT_MIDDLE=3; |
| |
| |
| // movement vector |
| // - for previous(), 0 for current(), + for next() |
| // not const so that we can terminate it below for the error message |
| String moves="0+0+0--0-0-+++0--+++++++0--------"; |
| |
| // iterators |
| StringCharacterIterator text = new StringCharacterIterator(new String(src)); |
| Normalizer iter = new Normalizer(text,Normalizer.NFD,0); |
| UCharIterator iter32 = new UCharIterator(expect, expect.length, |
| EXPECT_MIDDLE); |
| |
| int c1, c2; |
| char m; |
| |
| // initially set the indexes into the middle of the strings |
| iter.setIndexOnly(SRC_MIDDLE); |
| |
| // move around and compare the iteration code points with |
| // the expected ones |
| int movesIndex =0; |
| while(movesIndex<moves.length()) { |
| m=moves.charAt(movesIndex++); |
| if(m=='-') { |
| c1=iter.previous(); |
| c2=iter32.previous(); |
| } else if(m=='0') { |
| c1=iter.current(); |
| c2=iter32.current(); |
| } else /* m=='+' */ { |
| c1=iter.next(); |
| c2=iter32.next(); |
| } |
| |
| // compare results |
| if(c1!=c2) { |
| // copy the moves until the current (m) move, and terminate |
| String history = moves.substring(0,movesIndex); |
| errln("error: mismatch in Normalizer iteration at "+history+": " |
| +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); |
| break; |
| } |
| |
| // compare indexes |
| if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { |
| // copy the moves until the current (m) move, and terminate |
| String history = moves.substring(0,movesIndex); |
| errln("error: index mismatch in Normalizer iteration at " |
| +history+ " : "+ "Normalizer index " +iter.getIndex() |
| +" expected "+ expectIndex[iter32.getIndex()]); |
| break; |
| } |
| } |
| } |
| |
| // test APIs that are not otherwise used - improve test coverage |
| @Test |
| public void TestNormalizerAPI() throws Exception { |
| try{ |
| // instantiate a Normalizer from a CharacterIterator |
| String s=Utility.unescape("a\u0308\uac00\\U0002f800"); |
| // make s a bit longer and more interesting |
| UCharacterIterator iter = UCharacterIterator.getInstance(s+s); |
| Normalizer norm = new Normalizer(iter, Normalizer.NFC,0); |
| if(norm.next()!=0xe4) { |
| errln("error in Normalizer(CharacterIterator).next()"); |
| } |
| |
| // test clone(), ==, and hashCode() |
| Normalizer clone=(Normalizer)norm.clone(); |
| if(clone.equals(norm)) { |
| errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm"); |
| } |
| |
| if(clone.getLength()!= norm.getLength()){ |
| errln("error in Normalizer.getBeginIndex()"); |
| } |
| // clone must have the same hashCode() |
| //if(clone.hashCode()!=norm.hashCode()) { |
| // errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()"); |
| //} |
| if(clone.next()!=0xac00) { |
| errln("error in Normalizer(Normalizer(CharacterIterator)).next()"); |
| } |
| int ch = clone.next(); |
| if(ch!=0x4e3d) { |
| errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()"); |
| } |
| // position changed, must change hashCode() |
| if(clone.hashCode()==norm.hashCode()) { |
| errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()"); |
| } |
| |
| // test compose() and decompose() |
| StringBuffer tel; |
| String nfkc, nfkd; |
| tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121"); |
| tel.insert(1,(char)0x0301); |
| |
| nfkc=Normalizer.compose(tel.toString(), true); |
| nfkd=Normalizer.decompose(tel.toString(), true); |
| if( |
| !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))|| |
| !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL")) |
| ) { |
| errln("error in Normalizer::(de)compose(): wrong result(s)"); |
| } |
| |
| // test setIndex() |
| ch=norm.setIndex(3); |
| if(ch!=0x4e3d) { |
| errln("error in Normalizer(CharacterIterator).setIndex(3)"); |
| } |
| |
| // test setText(CharacterIterator) and getText() |
| String out, out2; |
| clone.setText(iter); |
| |
| out = clone.getText(); |
| out2 = iter.getText(); |
| if( !out.equals(out2) || |
| clone.startIndex()!=0|| |
| clone.endIndex()!=iter.getLength() |
| ) { |
| errln("error in Normalizer::setText() or Normalizer::getText()"); |
| } |
| |
| char[] fillIn1 = new char[clone.getLength()]; |
| char[] fillIn2 = new char[iter.getLength()]; |
| int len = clone.getText(fillIn1); |
| iter.getText(fillIn2,0); |
| if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ |
| errln("error in Normalizer.getText(). Normalizer: "+ |
| Utility.hex(new String(fillIn1))+ |
| " Iter: " + Utility.hex(new String(fillIn2))); |
| } |
| |
| clone.setText(fillIn1); |
| len = clone.getText(fillIn2); |
| if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ |
| errln("error in Normalizer.setText() or Normalizer.getText()"+ |
| Utility.hex(new String(fillIn1))+ |
| " Iter: " + Utility.hex(new String(fillIn2))); |
| } |
| |
| // test setText(UChar *), getUMode() and setMode() |
| clone.setText(s); |
| clone.setIndexOnly(1); |
| clone.setMode(Normalizer.NFD); |
| if(clone.getMode()!=Normalizer.NFD) { |
| errln("error in Normalizer::setMode() or Normalizer::getMode()"); |
| } |
| if(clone.next()!=0x308 || clone.next()!=0x1100) { |
| errln("error in Normalizer::setText() or Normalizer::setMode()"); |
| } |
| |
| // test last()/previous() with an internal buffer overflow |
| StringBuffer buf = new StringBuffer("aaaaaaaaaa"); |
| buf.setCharAt(10-1,'\u0308'); |
| clone.setText(buf); |
| if(clone.last()!=0x308) { |
| errln("error in Normalizer(10*U+0308).last()"); |
| } |
| |
| // test UNORM_NONE |
| norm.setMode(Normalizer.NONE); |
| if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) { |
| errln("error in Normalizer(UNORM_NONE).first()/next()/last()"); |
| } |
| out=Normalizer.normalize(s, Normalizer.NONE); |
| if(!out.equals(s)) { |
| errln("error in Normalizer::normalize(UNORM_NONE)"); |
| } |
| ch = 0x1D15E; |
| String exp = "\\U0001D157\\U0001D165"; |
| String ns = Normalizer.normalize(ch,Normalizer.NFC); |
| if(!ns.equals(Utility.unescape(exp))){ |
| errln("error in Normalizer.normalize(int,Mode)"); |
| } |
| ns = Normalizer.normalize(ch,Normalizer.NFC,0); |
| if(!ns.equals(Utility.unescape(exp))){ |
| errln("error in Normalizer.normalize(int,Mode,int)"); |
| } |
| }catch(Exception e){ |
| throw e; |
| } |
| } |
| |
| @Test |
| public void TestConcatenate() { |
| |
| Object[][]cases=new Object[][]{ |
| /* mode, left, right, result */ |
| { |
| Normalizer.NFC, |
| "re", |
| "\u0301sum\u00e9", |
| "r\u00e9sum\u00e9" |
| }, |
| { |
| Normalizer.NFC, |
| "a\u1100", |
| "\u1161bcdefghijk", |
| "a\uac00bcdefghijk" |
| }, |
| /* ### TODO: add more interesting cases */ |
| { |
| Normalizer.NFD, |
| "\u03B1\u0345", |
| "\u0C4D\uD804\uDCBA\uD834\uDD69", // 0C4D 110BA 1D169 |
| "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345" // 03B1 1D169 110BA 0C4D 0345 |
| } |
| }; |
| |
| String left, right, expect, result; |
| Normalizer.Mode mode; |
| int i; |
| |
| /* test concatenation */ |
| for(i=0; i<cases.length; ++i) { |
| mode = (Normalizer.Mode)cases[i][0]; |
| |
| left=(String)cases[i][1]; |
| right=(String)cases[i][2]; |
| expect=(String)cases[i][3]; |
| { |
| result=Normalizer.concatenate(left, right, mode,0); |
| if(!result.equals(expect)) { |
| errln("error in Normalizer.concatenate(), cases[] failed" |
| +", result==expect: expected: " |
| + hex(expect)+" =========> got: " + hex(result)); |
| } |
| } |
| { |
| result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0); |
| if(!result.equals(expect)) { |
| errln("error in Normalizer.concatenate(), cases[] failed" |
| +", result==expect: expected: " |
| + hex(expect)+" =========> got: " + hex(result)); |
| } |
| } |
| } |
| |
| mode= Normalizer.NFC; // (Normalizer.Mode)cases2[0][0]; |
| char[] destination = "My resume is here".toCharArray(); |
| left = "resume"; |
| right = "re\u0301sum\u00e9 is HERE"; |
| expect = "My r\u00e9sum\u00e9 is HERE"; |
| |
| // Concatenates 're' with '\u0301sum\u00e9 is HERE' and places the result at |
| // position 3 of string 'My resume is here'. |
| Normalizer.concatenate(left.toCharArray(), 0, 2, right.toCharArray(), 2, 15, |
| destination, 3, 17, mode, 0); |
| if(!String.valueOf(destination).equals(expect)) { |
| errln("error in Normalizer.concatenate(), cases2[] failed" |
| +", result==expect: expected: " |
| + hex(expect) + " =========> got: " + hex(destination)); |
| } |
| |
| // Error case when result of concatenation won't fit into destination array. |
| try { |
| Normalizer.concatenate(left.toCharArray(), 0, 2, right.toCharArray(), 2, 15, |
| destination, 3, 16, mode, 0); |
| } catch (IndexOutOfBoundsException e) { |
| assertTrue("Normalizer.concatenate() failed", e.getMessage().equals("14")); |
| return; |
| } |
| fail("Normalizer.concatenate() tested for failure but passed"); |
| } |
| |
| private final int RAND_MAX = 0x7fff; |
| |
| @Test |
| public void TestCheckFCD() |
| { |
| char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, |
| 0x0008, 0x0009, 0x000A}; |
| |
| char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301, |
| 0x02B9, 0x0314, 0x0315, 0x0316}; |
| |
| char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7, |
| 0x0050, 0x0730, 0x09EE, 0x1E10}; |
| |
| char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0}, |
| {0x0061, 0x030A, 0x00E2, 0x0323, 0}, |
| {0x0061, 0x0323, 0x00E2, 0x0323, 0}, |
| {0x0061, 0x0323, 0x1E05, 0x0302, 0} |
| }; |
| Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES}; |
| |
| char[] datachar= { 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, |
| 0x6a, |
| 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, |
| 0xea, |
| 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, |
| 0x0307, 0x0308, 0x0309, 0x030a, |
| 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, |
| 0x0327, 0x0328, 0x0329, 0x032a, |
| 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06, |
| 0x1e07, 0x1e08, 0x1e09, 0x1e0a |
| }; |
| |
| int count = 0; |
| |
| if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES) |
| errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n"); |
| if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO) |
| errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n"); |
| if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES) |
| errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n"); |
| |
| |
| while (count < 4) |
| { |
| Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0); |
| if (result[count] != fcdresult) { |
| errln("Normalizer.quickCheck(FCD) failed: Data set "+ count |
| + " expected value "+ result[count]); |
| } |
| count ++; |
| } |
| |
| /* random checks of long strings */ |
| //srand((unsigned)time( NULL )); |
| Random rand = createRandom(); // use test framework's random |
| |
| for (count = 0; count < 50; count ++) |
| { |
| int size = 0; |
| Normalizer.QuickCheckResult testresult = Normalizer.YES; |
| char[] data= new char[20]; |
| char[] norm= new char[100]; |
| char[] nfd = new char[100]; |
| int normStart = 0; |
| int nfdsize = 0; |
| while (size != 19) { |
| data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX]; |
| logln("0x"+data[size]); |
| normStart += Normalizer.normalize(data,size,size+1, |
| norm,normStart,100, |
| Normalizer.NFD,0); |
| size ++; |
| } |
| logln("\n"); |
| |
| nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0); |
| // nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL, |
| // nfd, 100, &status); |
| if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) { |
| testresult = Normalizer.NO; |
| } |
| if (testresult == Normalizer.YES) { |
| logln("result Normalizer.YES\n"); |
| } |
| else { |
| logln("result Normalizer.NO\n"); |
| } |
| |
| if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) { |
| errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) ); |
| } |
| } |
| } |
| |
| |
| // reference implementation of Normalizer::compare |
| private int ref_norm_compare(String s1, String s2, int options) { |
| String t1, t2,r1,r2; |
| |
| int normOptions=options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT; |
| |
| if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) { |
| // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) |
| r1 = Normalizer.decompose(s1,false,normOptions); |
| r2 = Normalizer.decompose(s2,false,normOptions); |
| r1 = UCharacter.foldCase(r1,options); |
| r2 = UCharacter.foldCase(r2,options); |
| }else{ |
| r1 = s1; |
| r2 = s2; |
| } |
| |
| t1 = Normalizer.decompose(r1, false, normOptions); |
| t2 = Normalizer.decompose(r2, false, normOptions); |
| |
| if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { |
| UTF16.StringComparator comp |
| = new UTF16.StringComparator(true, false, |
| UTF16.StringComparator.FOLD_CASE_DEFAULT); |
| return comp.compare(t1,t2); |
| } else { |
| return t1.compareTo(t2); |
| } |
| |
| } |
| |
| // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately |
| private int norm_compare(String s1, String s2, int options) { |
| int normOptions=options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT; |
| |
| if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) && |
| Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) { |
| options|=Normalizer.INPUT_IS_FCD; |
| } |
| |
| int cmpStrings = Normalizer.compare(s1, s2, options); |
| int cmpArrays = Normalizer.compare( |
| s1.toCharArray(), 0, s1.length(), |
| s2.toCharArray(), 0, s2.length(), options); |
| assertEquals("compare strings == compare char arrays", cmpStrings, cmpArrays); |
| return cmpStrings; |
| } |
| |
| // reference implementation of UnicodeString::caseCompare |
| private int ref_case_compare(String s1, String s2, int options) { |
| String t1, t2; |
| |
| t1=s1; |
| t2=s2; |
| |
| t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); |
| t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); |
| |
| if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { |
| UTF16.StringComparator comp |
| = new UTF16.StringComparator(true, false, |
| UTF16.StringComparator.FOLD_CASE_DEFAULT); |
| return comp.compare(t1,t2); |
| } else { |
| return t1.compareTo(t2); |
| } |
| |
| } |
| |
| // reduce an integer to -1/0/1 |
| private static int sign(int value) { |
| if(value==0) { |
| return 0; |
| } else { |
| return (value>>31)|1; |
| } |
| } |
| private static String signString(int value) { |
| if(value<0) { |
| return "<0"; |
| } else if(value==0) { |
| return "=0"; |
| } else /* value>0 */ { |
| return ">0"; |
| } |
| } |
| // test Normalizer::compare and unorm_compare (thinly wrapped by the former) |
| // by comparing it with its semantic equivalent |
| // since we trust the pieces, this is sufficient |
| |
| // test each string with itself and each other |
| // each time with all options |
| private String strings[]=new String[]{ |
| // some cases from NormalizationTest.txt |
| // 0..3 |
| "D\u031B\u0307\u0323", |
| "\u1E0C\u031B\u0307", |
| "D\u031B\u0323\u0307", |
| "d\u031B\u0323\u0307", |
| |
| // 4..6 |
| "\u00E4", |
| "a\u0308", |
| "A\u0308", |
| |
| // Angstrom sign = A ring |
| // 7..10 |
| "\u212B", |
| "\u00C5", |
| "A\u030A", |
| "a\u030A", |
| |
| // 11.14 |
| "a\u059A\u0316\u302A\u032Fb", |
| "a\u302A\u0316\u032F\u059Ab", |
| "a\u302A\u0316\u032F\u059Ab", |
| "A\u059A\u0316\u302A\u032Fb", |
| |
| // from ICU case folding tests |
| // 15..20 |
| "A\u00df\u00b5\ufb03\\U0001040c\u0131", |
| "ass\u03bcffi\\U00010434i", |
| "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff", |
| "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff", |
| "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff", |
| "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd", |
| |
| // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold |
| // vs. U+10000 at bottom - code point order |
| // 21..22 |
| "\ud800\ud800\udc01", |
| "\ud800\udc00", |
| |
| // other code point order tests from ustrtest.cpp |
| // 23..31 |
| "\u20ac\ud801", |
| "\u20ac\ud800\udc00", |
| "\ud800", |
| "\ud800\uff61", |
| "\udfff", |
| "\uff61\udfff", |
| "\uff61\ud800\udc02", |
| "\ud800\udc02", |
| "\ud84d\udc56", |
| |
| // long strings, see cnormtst.c/TestNormCoverage() |
| // equivalent if case-insensitive |
| // 32..33 |
| "\uAD8B\uAD8B\uAD8B\uAD8B"+ |
| "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ |
| "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ |
| "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ |
| "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ |
| "\uAD8B\uAD8B\uAD8B\uAD8B"+ |
| "d\u031B\u0307\u0323", |
| |
| "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ |
| "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ |
| "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ |
| "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ |
| "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ |
| "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ |
| "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ |
| "\u1E0C\u031B\u0307", |
| |
| // some strings that may make a difference whether the compare function |
| // case-folds or decomposes first |
| // 34..41 |
| "\u0360\u0345\u0334", |
| "\u0360\u03b9\u0334", |
| |
| "\u0360\u1f80\u0334", |
| "\u0360\u03b1\u0313\u03b9\u0334", |
| |
| "\u0360\u1ffc\u0334", |
| "\u0360\u03c9\u03b9\u0334", |
| |
| "a\u0360\u0345\u0360\u0345b", |
| "a\u0345\u0360\u0345\u0360b", |
| |
| // interesting cases for canonical caseless match with turkic i handling |
| // 42..43 |
| "\u00cc", |
| "\u0069\u0300", |
| |
| // strings with post-Unicode 3.2 normalization or normalization corrections |
| // 44..45 |
| "\u00e4\u193b\\U0002f868", |
| "\u0061\u193b\u0308\u36fc", |
| |
| |
| }; |
| |
| // all combinations of options |
| // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions |
/**
 * Pairs a set of comparison option bits with a readable label.
 * The label is what the compare tests print when a check fails.
 */
final class Temp {
    /** Option bits handed to the comparison functions. */
    int options;
    /** Human-readable description of {@link #options}. */
    String name;

    public Temp(int optionBits, String label) {
        this.options = optionBits;
        this.name = label;
    }
}
| // set UNORM_UNICODE_3_2 in one additional combination |
| |
| private Temp[] opt = new Temp[]{ |
| new Temp(0,"default"), |
| new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ), |
| new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ), |
| new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ), |
| new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"), |
| new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"), |
| new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2") |
| }; |
| |
| |
| @Test |
| public void TestCompareDebug(){ |
| |
| String[] s = new String[100]; // at least as many items as in strings[] ! |
| |
| |
| int i, j, k, count=strings.length; |
| int result, refResult; |
| |
| // create the UnicodeStrings |
| for(i=0; i<count; ++i) { |
| s[i]=Utility.unescape(strings[i]); |
| } |
| UTF16.StringComparator comp = new UTF16.StringComparator(true, false, |
| UTF16.StringComparator.FOLD_CASE_DEFAULT); |
| // test them each with each other |
| |
| i = 42; |
| j = 43; |
| k = 2; |
| // test Normalizer::compare |
| result=norm_compare(s[i], s[j], opt[k].options); |
| refResult=ref_norm_compare(s[i], s[j], opt[k].options); |
| if(sign(result)!=sign(refResult)) { |
| errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); |
| } |
| |
| // test UnicodeString::caseCompare - same internal implementation function |
| if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { |
| // result=s[i]. (s[j], opt[k].options); |
| if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) |
| { |
| comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); |
| } |
| else { |
| comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); |
| } |
| |
| result=comp.compare(s[i],s[j]); |
| refResult=ref_case_compare(s[i], s[j], opt[k].options); |
| if(sign(result)!=sign(refResult)) { |
| errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); |
| } |
| } |
| String value1 = "\u00dater\u00fd"; |
| String value2 = "\u00fater\u00fd"; |
| if(Normalizer.compare(value1,value2,0)!=0){ |
| if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){ |
| |
| } |
| } |
| } |
| |
| @Test |
| public void TestCompare() { |
| |
| String[] s = new String[100]; // at least as many items as in strings[] ! |
| |
| int i, j, k, count=strings.length; |
| int result, refResult; |
| |
| // create the UnicodeStrings |
| for(i=0; i<count; ++i) { |
| s[i]=Utility.unescape(strings[i]); |
| } |
| UTF16.StringComparator comp = new UTF16.StringComparator(); |
| // test them each with each other |
| for(i=0; i<count; ++i) { |
| for(j=i; j<count; ++j) { |
| for(k=0; k<opt.length; ++k) { |
| // test Normalizer::compare |
| result=norm_compare(s[i], s[j], opt[k].options); |
| refResult=ref_norm_compare(s[i], s[j], opt[k].options); |
| if(sign(result)!=sign(refResult)) { |
| errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); |
| } |
| |
| // test UnicodeString::caseCompare - same internal implementation function |
| if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { |
| // result=s[i]. (s[j], opt[k].options); |
| if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) |
| { |
| comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); |
| } |
| else { |
| comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); |
| } |
| |
| comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); |
| // result=comp.caseCompare(s[i],s[j], opt[k].options); |
| result=comp.compare(s[i],s[j]); |
| refResult=ref_case_compare(s[i], s[j], opt[k].options); |
| if(sign(result)!=sign(refResult)) { |
| errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); |
| } |
| } |
| } |
| } |
| } |
| |
| // test cases with i and I to make sure Turkic works |
| char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 }; |
| UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet(); |
| Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl; |
| nfcImpl.ensureCanonIterData(); |
| |
| String s1, s2; |
| |
| // collect all sets into one for contiguous output |
| for(i=0; i<iI.length; ++i) { |
| if(nfcImpl.getCanonStartSet(iI[i], iSet)) { |
| set.addAll(iSet); |
| } |
| } |
| |
| // test all of these precomposed characters |
| Normalizer2 nfcNorm2 = Normalizer2.getNFCInstance(); |
| UnicodeSetIterator it = new UnicodeSetIterator(set); |
| int c; |
| while(it.next() && (c=it.codepoint)!=UnicodeSetIterator.IS_STRING) { |
| s1 = UTF16.valueOf(c); |
| s2 = nfcNorm2.getDecomposition(c); |
| for(k=0; k<opt.length; ++k) { |
| // test Normalizer::compare |
| |
| result= norm_compare(s1, s2, opt[k].options); |
| refResult=ref_norm_compare(s1, s2, opt[k].options); |
| if(sign(result)!=sign(refResult)) { |
| errln("Normalizer.compare(U+"+hex(c)+" with its NFD, "+opt[k].name+")" |
| + signString(result)+" should be "+signString(refResult)); |
| } |
| |
| // test UnicodeString::caseCompare - same internal implementation function |
| if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) { |
| if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) |
| { |
| comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); |
| } |
| else { |
| comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); |
| } |
| |
| comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); |
| |
| result=comp.compare(s1,s2); |
| refResult=ref_case_compare(s1, s2, opt[k].options); |
| if(sign(result)!=sign(refResult)) { |
| errln("UTF16.compare(U+"+hex(c)+" with its NFD, " |
| +opt[k].name+")"+signString(result) +" should be "+signString(refResult)); |
| } |
| } |
| } |
| } |
| |
| // test getDecomposition() for some characters that do not decompose |
| if( nfcNorm2.getDecomposition(0x20)!=null || |
| nfcNorm2.getDecomposition(0x4e00)!=null || |
| nfcNorm2.getDecomposition(0x20002)!=null |
| ) { |
| errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions"); |
| } |
| |
| // test getRawDecomposition() for some characters that do not decompose |
| if( nfcNorm2.getRawDecomposition(0x20)!=null || |
| nfcNorm2.getRawDecomposition(0x4e00)!=null || |
| nfcNorm2.getRawDecomposition(0x20002)!=null |
| ) { |
| errln("getRawDecomposition() returns TRUE for characters which do not have decompositions"); |
| } |
| |
| // test composePair() for some pairs of characters that do not compose |
| if( nfcNorm2.composePair(0x20, 0x301)>=0 || |
| nfcNorm2.composePair(0x61, 0x305)>=0 || |
| nfcNorm2.composePair(0x1100, 0x1160)>=0 || |
| nfcNorm2.composePair(0xac00, 0x11a7)>=0 |
| ) { |
| errln("NFC.composePair() incorrectly composes some pairs of characters"); |
| } |
| |
| // test FilteredNormalizer2.getDecomposition() |
| UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]"); |
| FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); |
| if(fn2.getDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getDecomposition(0x100))) { |
| errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed"); |
| } |
| |
| // test FilteredNormalizer2.getRawDecomposition() |
| if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) { |
| errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed"); |
| } |
| |
| // test FilteredNormalizer2::composePair() |
| if( 0x100!=fn2.composePair(0x41, 0x304) || |
| fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08 |
| ) { |
| errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed"); |
| } |
| } |
| |
| // verify that case-folding does not un-FCD strings |
| int countFoldFCDExceptions(int foldingOptions) { |
| String s, d; |
| int c; |
| int count; |
| int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC; |
| Normalizer.QuickCheckResult qcResult; |
| int category; |
| boolean isNFD; |
| |
| |
| logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions)); |
| |
| count=0; |
| for(c=0; c<=0x10ffff; ++c) { |
| category=UCharacter.getType(c); |
| if(category==UCharacterCategory.UNASSIGNED) { |
| continue; // skip unassigned code points |
| } |
| if(c==0xac00) { |
| c=0xd7a3; // skip Hangul - no case folding there |
| continue; |
| } |
| // skip Han blocks - no case folding there either |
| if(c==0x3400) { |
| c=0x4db5; |
| continue; |
| } |
| if(c==0x4e00) { |
| c=0x9fa5; |
| continue; |
| } |
| if(c==0x20000) { |
| c=0x2a6d6; |
| continue; |
| } |
| |
| s= UTF16.valueOf(c); |
| |
| // get leading and trailing cc for c |
| d= Normalizer.decompose(s,false); |
| isNFD= s==d; |
| cc=UCharacter.getCombiningClass(UTF16.charAt(d,0)); |
| trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); |
| |
| // get leading and trailing cc for the case-folding of c |
| UCharacter.foldCase(s,(foldingOptions==0)); |
| d = Normalizer.decompose(s, false); |
| foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0)); |
| foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); |
| |
| qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0); |
| |
| |
| // bad: |
| // - character maps to empty string: adjacent characters may then need reordering |
| // - folding has different leading/trailing cc's, and they don't become just 0 |
| // - folding itself is not FCD |
| if( qcResult!=Normalizer.YES || |
| s.length()==0 || |
| (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0) |
| ) { |
| ++count; |
| errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); |
| //errln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult); |
| continue; |
| } |
| |
| // also bad: |
| // if a code point is in NFD but its case folding is not, then |
| // unorm_compare will also fail |
| if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) { |
| ++count; |
| errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); |
| } |
| } |
| |
| logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" ); |
| return count; |
| } |
| |
| @Test |
| public void TestFindFoldFCDExceptions() { |
| int count; |
| |
| count=countFoldFCDExceptions(0); |
| count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I); |
| if(count>0) { |
| //* |
| //* If case-folding un-FCDs any strings, then unorm_compare() must be |
| //* re-implemented. |
| //* It currently assumes that one can check for FCD then case-fold |
| //* and then still have FCD strings for raw decomposition without reordering. |
| //* |
| errln("error: There are "+count+" code points for which case-folding"+ |
| " may un-FCD a string for all folding options.\n See comment"+ |
| " in BasicNormalizerTest::FindFoldFCDExceptions()!"); |
| } |
| } |
| |
| @Test |
| public void TestCombiningMarks(){ |
| String src = "\u0f71\u0f72\u0f73\u0f74\u0f75"; |
| String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74"; |
| String result = Normalizer.decompose(src,false); |
| if(!expected.equals(result)){ |
| errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result)); |
| } |
| } |
| |
| /* |
| * Re-enable this test when UTC fixes UAX 21 |
| @Test |
| public void TestUAX21Failure(){ |
| final String[][] cases = new String[][]{ |
| {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"}, |
| {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"}, |
| {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, |
| {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, |
| {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"}, |
| {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"}, |
| }; |
| for(int i = 0; i< cases.length; i++){ |
| String s1 =cases[0][0]; |
| String s2 = cases[0][1]; |
| if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare |
| && |
| (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){ |
| errln("Normalizer.compare() failed for s1: " |
| + Utility.hex(s1) +" s2: " + Utility.hex(s2)); |
| } |
| } |
| } |
| */ |
| @Test |
| public void TestFCNFKCClosure() { |
| final class TestStruct{ |
| int c; |
| String s; |
| TestStruct(int cp, String src){ |
| c=cp; |
| s=src; |
| } |
| } |
| |
| TestStruct[] tests= new TestStruct[]{ |
| new TestStruct( 0x00C4, "" ), |
| new TestStruct( 0x00E4, "" ), |
| new TestStruct( 0x037A, "\u0020\u03B9" ), |
| new TestStruct( 0x03D2, "\u03C5" ), |
| new TestStruct( 0x20A8, "\u0072\u0073" ) , |
| new TestStruct( 0x210B, "\u0068" ), |
| new TestStruct( 0x210C, "\u0068" ), |
| new TestStruct( 0x2121, "\u0074\u0065\u006C" ), |
| new TestStruct( 0x2122, "\u0074\u006D" ), |
| new TestStruct( 0x2128, "\u007A" ), |
| new TestStruct( 0x1D5DB,"\u0068" ), |
| new TestStruct( 0x1D5ED,"\u007A" ), |
| new TestStruct( 0x0061, "" ) |
| }; |
| |
| |
| for(int i = 0; i < tests.length; ++ i) { |
| String result=Normalizer.getFC_NFKC_Closure(tests[i].c); |
| if(!result.equals(new String(tests[i].s))) { |
| errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong"); |
| } |
| } |
| |
| /* error handling */ |
| |
| int length=Normalizer.getFC_NFKC_Closure(0x5c, null); |
| if(length!=0){ |
| errln("getFC_NFKC_Closure did not perform error handling correctly"); |
| } |
| } |
| @Test |
| public void TestBugJ2324(){ |
| /* String[] input = new String[]{ |
| //"\u30FD\u3099", |
| "\u30FA\u309A", |
| "\u30FB\u309A", |
| "\u30FC\u309A", |
| "\u30FE\u309A", |
| "\u30FD\u309A", |
| |
| };*/ |
| String troublesome = "\u309A"; |
| for(int i=0x3000; i<0x3100;i++){ |
| String input = ((char)i)+troublesome; |
| try{ |
| /* String result =*/ Normalizer.compose(input,false); |
| }catch(IndexOutOfBoundsException e){ |
| errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString()); |
| } |
| } |
| |
| } |
| |
| static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5; |
| |
| private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) { |
| skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false); |
| skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); |
| skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false); |
| skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); |
| |
| // Remove from the NFC and NFKC sets all those characters that change |
| // when a back-combining character is added. |
| // First, get all of the back-combining characters and their combining classes. |
| UnicodeSet combineBack=new UnicodeSet("[:NFC_QC=Maybe:]"); |
| int numCombineBack=combineBack.size(); |
| int[] combineBackCharsAndCc=new int[numCombineBack*2]; |
| UnicodeSetIterator iter=new UnicodeSetIterator(combineBack); |
| for(int i=0; i<numCombineBack; ++i) { |
| iter.next(); |
| int c=iter.codepoint; |
| combineBackCharsAndCc[2*i]=c; |
| combineBackCharsAndCc[2*i+1]=UCharacter.getCombiningClass(c); |
| } |
| |
| // We need not look at control codes, Han characters nor Hangul LVT syllables because they |
| // do not combine forward. LV syllables are already removed. |
| UnicodeSet notInteresting=new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]"); |
| UnicodeSet unsure=((UnicodeSet)(skipSets[C].clone())).removeAll(notInteresting); |
| // System.out.format("unsure.size()=%d\n", unsure.size()); |
| |
| // For each character about which we are unsure, see if it changes when we add |
| // one of the back-combining characters. |
| Normalizer2 norm2=Normalizer2.getNFCInstance(); |
| StringBuilder s=new StringBuilder(); |
| iter.reset(unsure); |
| while(iter.next()) { |
| int c=iter.codepoint; |
| s.delete(0, 0x7fffffff).appendCodePoint(c); |
| int cLength=s.length(); |
| int tccc=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); |
| for(int i=0; i<numCombineBack; ++i) { |
| // If c's decomposition ends with a character with non-zero combining class, then |
| // c can only change if it combines with a character with a non-zero combining class. |
| int cc2=combineBackCharsAndCc[2*i+1]; |
| if(tccc==0 || cc2!=0) { |
| int c2=combineBackCharsAndCc[2*i]; |
| s.appendCodePoint(c2); |
| if(!norm2.isNormalized(s)) { |
| // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2); |
| skipSets[C].remove(c); |
| skipSets[KC].remove(c); |
| break; |
| } |
| s.delete(cLength, 0x7fffffff); |
| } |
| } |
| } |
| return skipSets; |
| } |
| |
| private static String[] kModeStrings = { |
| "D", "C", "KD", "KC" |
| }; |
| |
| @Test |
| public void TestSkippable() { |
| UnicodeSet[] skipSets = new UnicodeSet[] { |
| new UnicodeSet(), //NFD |
| new UnicodeSet(), //NFC |
| new UnicodeSet(), //NFKD |
| new UnicodeSet() //NFKC |
| }; |
| UnicodeSet[] expectSets = new UnicodeSet[] { |
| new UnicodeSet(), |
| new UnicodeSet(), |
| new UnicodeSet(), |
| new UnicodeSet() |
| }; |
| StringBuilder s, pattern; |
| |
| // build NF*Skippable sets from runtime data |
| skipSets[D].applyPattern("[:NFD_Inert:]"); |
| skipSets[C].applyPattern("[:NFC_Inert:]"); |
| skipSets[KD].applyPattern("[:NFKD_Inert:]"); |
| skipSets[KC].applyPattern("[:NFKC_Inert:]"); |
| |
| expectSets = initSkippables(expectSets); |
| if(expectSets[D].contains(0x0350)){ |
| errln("expectSets[D] contains 0x0350"); |
| } |
| for(int i=0; i<expectSets.length; ++i) { |
| if(!skipSets[i].equals(expectSets[i])) { |
| String ms = kModeStrings[i]; |
| errln("error: TestSkippable skipSets["+ms+"]!=expectedSets["+ms+"]\n"); |
| // Note: This used to depend on hardcoded UnicodeSet patterns generated by |
| // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by |
| // running com.ibm.text.UCD.Main with the option NFSkippable. |
| // Since ICU 4.6/Unicode 6, we are generating the |
| // expectSets ourselves in initSkippables(). |
| |
| s=new StringBuilder(); |
| |
| s.append("\n\nskip= "); |
| s.append(skipSets[i].toPattern(true)); |
| s.append("\n\n"); |
| |
| s.append("skip-expect="); |
| pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true)); |
| s.append(pattern); |
| |
| pattern.delete(0,pattern.length()); |
| s.append("\n\nexpect-skip="); |
| pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true)); |
| s.append(pattern); |
| s.append("\n\n"); |
| |
| pattern.delete(0,pattern.length()); |
| s.append("\n\nintersection(expect,skip)="); |
| UnicodeSet intersection = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]); |
| pattern = new StringBuilder(intersection.toPattern(true)); |
| s.append(pattern); |
| // Special: test coverage for append(char). |
| s.append('\n'); |
| s.append('\n'); |
| |
| errln(s.toString()); |
| } |
| } |
| } |
| |
| @Test |
| public void TestBugJ2068(){ |
| String sample = "The quick brown fox jumped over the lazy dog"; |
| UCharacterIterator text = UCharacterIterator.getInstance(sample); |
| Normalizer norm = new Normalizer(text,Normalizer.NFC,0); |
| text.setIndex(4); |
| if(text.current() == norm.current()){ |
| errln("Normalizer is not cloning the UCharacterIterator"); |
| } |
| } |
| @Test |
| public void TestGetCombiningClass(){ |
| for(int i=0;i<0x10FFFF;i++){ |
| int cc = UCharacter.getCombiningClass(i); |
| if(0xD800<= i && i<=0xDFFF && cc >0 ){ |
| cc = UCharacter.getCombiningClass(i); |
| errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8)); |
| } |
| } |
| } |
| |
| @Test |
| public void TestSerializedSet(){ |
| USerializedSet sset=new USerializedSet(); |
| UnicodeSet set = new UnicodeSet(); |
| int start, end; |
| |
| char[] serialized = { |
| 0x8007, // length |
| 3, // bmpLength |
| 0xc0, 0xfe, 0xfffc, |
| 1, 9, 0x10, 0xfffc |
| }; |
| sset.getSet(serialized, 0); |
| |
| // collect all sets into one for contiguous output |
| int[] startEnd = new int[2]; |
| int count=sset.countRanges(); |
| for(int j=0; j<count; ++j) { |
| sset.getRange(j, startEnd); |
| set.add(startEnd[0], startEnd[1]); |
| } |
| |
| // test all of these characters |
| UnicodeSetIterator it = new UnicodeSetIterator(set); |
| while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) { |
| start=it.codepoint; |
| end=it.codepointEnd; |
| while(start<=end) { |
| if(!sset.contains(start)){ |
| errln("USerializedSet.contains failed for "+Utility.hex(start,8)); |
| } |
| ++start; |
| } |
| } |
| } |
| |
| @Test |
| public void TestReturnFailure(){ |
| char[] term = {'r','\u00e9','s','u','m','\u00e9' }; |
| char[] decomposed_term = new char[10 + term.length + 2]; |
| int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0); |
| int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0); |
| if(rc!=rc1){ |
| errln("Normalizer decompose did not return correct length"); |
| } |
| } |
| |
| private final static class TestCompositionCase { |
| public Normalizer.Mode mode; |
| public int options; |
| public String input, expect; |
| TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) { |
| this.mode=mode; |
| this.options=options; |
| this.input=input; |
| this.expect=expect; |
| } |
| } |
| |
| @Test |
| public void TestComposition() { |
| final TestCompositionCase cases[]=new TestCompositionCase[]{ |
| /* |
| * special cases for UAX #15 bug |
| * see Unicode Corrigendum #5: Normalization Idempotency |
| * at http://unicode.org/versions/corrigendum5.html |
| * (was Public Review Issue #29) |
| */ |
| new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"), |
| new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"), |
| new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"), |
| new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"), |
| |
| /* TODO: add test cases for UNORM_FCC here (j2151) */ |
| }; |
| |
| String output; |
| int i; |
| |
| for(i=0; i<cases.length; ++i) { |
| output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options); |
| if(!output.equals(cases[i].expect)) { |
| errln("unexpected result for case "+i); |
| } |
| } |
| } |
| |
| @Test |
| public void TestGetDecomposition() { |
| Normalizer2 n2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE_CONTIGUOUS); |
| String decomp=n2.getDecomposition(0x20); |
| assertEquals("fcc.getDecomposition(space) failed", null, decomp); |
| decomp=n2.getDecomposition(0xe4); |
| assertEquals("fcc.getDecomposition(a-umlaut) failed", "a\u0308", decomp); |
| decomp=n2.getDecomposition(0xac01); |
| assertEquals("fcc.getDecomposition(Hangul syllable U+AC01) failed", "\u1100\u1161\u11a8", decomp); |
| } |
| |
| @Test |
| public void TestGetRawDecomposition() { |
| Normalizer2 n2=Normalizer2.getNFKCInstance(); |
| /* |
| * Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values, |
| * without recursive decomposition. |
| */ |
| |
| String decomp=n2.getRawDecomposition(0x20); |
| assertEquals("nfkc.getRawDecomposition(space) failed", null, decomp); |
| decomp=n2.getRawDecomposition(0xe4); |
| assertEquals("nfkc.getRawDecomposition(a-umlaut) failed", "a\u0308", decomp); |
| /* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */ |
| decomp=n2.getRawDecomposition(0x1e08); |
| assertEquals("nfkc.getRawDecomposition(c-cedilla-acute) failed", "\u00c7\u0301", decomp); |
| /* U+212B ANGSTROM SIGN */ |
| decomp=n2.getRawDecomposition(0x212b); |
| assertEquals("nfkc.getRawDecomposition(angstrom sign) failed", "\u00c5", decomp); |
| decomp=n2.getRawDecomposition(0xac00); |
| assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC00) failed", "\u1100\u1161", decomp); |
| /* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. */ |
| decomp=n2.getRawDecomposition(0xac01); |
| assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC01) failed", "\uac00\u11a8", decomp); |
| } |
| |
| @Test |
| public void TestCustomComp() { |
| String [][] pairs={ |
| // ICU 63 normalization with CodePointTrie requires inert surrogate code points. |
| // { "\\uD801\\uE000\\uDFFE", "" }, |
| // { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, |
| // { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, |
| { "\\uD801\\uE000\\uDFFE", "\\uD801\\uDFFE" }, |
| { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD800\\uD801\\uDFFE\\uDFFF" }, |
| { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD800\\U000107FE\\uDFFF" }, |
| |
| { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" }, |
| { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, |
| { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, |
| { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, |
| { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } |
| }; |
| Normalizer2 customNorm2; |
| customNorm2= |
| Normalizer2.getInstance( |
| BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"), |
| "testnorm", |
| Normalizer2.Mode.COMPOSE); |
| for(int i=0; i<pairs.length; ++i) { |
| String[] pair=pairs[i]; |
| String input=Utility.unescape(pair[0]); |
| String expected=Utility.unescape(pair[1]); |
| String result=customNorm2.normalize(input); |
| if(!result.equals(expected)) { |
| errln("custom compose Normalizer2 did not normalize input "+i+" as expected"); |
| } |
| } |
| } |
| |
| @Test |
| public void TestCustomFCC() { |
| String[][] pairs={ |
| // ICU 63 normalization with CodePointTrie requires inert surrogate code points. |
| // { "\\uD801\\uE000\\uDFFE", "" }, |
| // { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, |
| // { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, |
| { "\\uD801\\uE000\\uDFFE", "\\uD801\\uDFFE" }, |
| { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD800\\uD801\\uDFFE\\uDFFF" }, |
| { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD800\\U000107FE\\uDFFF" }, |
| |
| // The following expected result is different from CustomComp |
| // because of only-contiguous composition. |
| { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" }, |
| { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, |
| { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, |
| { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, |
| { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } |
| }; |
| Normalizer2 customNorm2; |
| customNorm2= |
| Normalizer2.getInstance( |
| BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"), |
| "testnorm", |
| Normalizer2.Mode.COMPOSE_CONTIGUOUS); |
| for(int i=0; i<pairs.length; ++i) { |
| String[] pair=pairs[i]; |
| String input=Utility.unescape(pair[0]); |
| String expected=Utility.unescape(pair[1]); |
| String result=customNorm2.normalize(input); |
| if(!result.equals(expected)) { |
| errln("custom FCC Normalizer2 did not normalize input "+i+" as expected"); |
| } |
| } |
| } |
| |
| @Test |
| public void TestCanonIterData() { |
| // For now, just a regression test. |
| Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData(); |
| // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character |
| // in some decomposition mappings where there is a composition exclusion. |
| // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0) |
| // but it is not a segment starter because it occurs in a decomposition mapping. |
| if(impl.isCanonSegmentStarter(0xfb5)) { |
| errln("isCanonSegmentStarter(U+0fb5)=true is wrong"); |
| } |
| // For [:Segment_Starter:] to work right, not just the property function has to work right, |
| // UnicodeSet also needs a correct range starts set. |
| UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze(); |
| if(segStarters.contains(0xfb5)) { |
| errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong"); |
| } |
| // Try characters up to Kana and miscellaneous CJK but below Han (for expediency). |
| for(int c=0; c<=0x33ff; ++c) { |
| boolean isStarter=impl.isCanonSegmentStarter(c); |
| boolean isContained=segStarters.contains(c); |
| if(isStarter!=isContained) { |
| errln(String.format( |
| "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " + |
| "[:Segment_Starter:].contains(same)", |
| c, isStarter)); |
| } |
| } |
| } |
| |
| @Test |
| public void TestFilteredNormalizer2() { |
| Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); |
| UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); |
| FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); |
| int c; |
| for(c=0; c<=0x3ff; ++c) { |
| int expectedCC= filter.contains(c) ? nfcNorm2.getCombiningClass(c) : 0; |
| int cc=fn2.getCombiningClass(c); |
| assertEquals( |
| "FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+"+hex(c)+ |
| ")==filtered NFC.getCC()", |
| expectedCC, cc); |
| } |
| |
| // More coverage. |
| StringBuilder sb=new StringBuilder(); |
| assertEquals("filtered normalize()", "ää\u0304", |
| fn2.normalize("a\u0308ä\u0304", (Appendable)sb).toString()); |
| assertTrue("filtered hasBoundaryAfter()", fn2.hasBoundaryAfter('ä')); |
| assertTrue("filtered isInert()", fn2.isInert(0x0313)); |
| } |
| |
| @Test |
| public void TestFilteredAppend() { |
| Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); |
| UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); |
| FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); |
| |
| // Append two strings that each contain a character outside the filter set. |
| StringBuilder sb = new StringBuilder("a\u0313a"); |
| String second = "\u0301\u0313"; |
| assertEquals("append()", "a\u0313á\u0313", fn2.append(sb, second).toString()); |
| |
| // Same, and also normalize the second string. |
| sb.replace(0, 0x7fffffff, "a\u0313a"); |
| assertEquals( |
| "normalizeSecondAndAppend()", |
| "a\u0313á\u0313", fn2.normalizeSecondAndAppend(sb, second).toString()); |
| |
| // Normalizer2.normalize(String) uses spanQuickCheckYes() and normalizeSecondAndAppend(). |
| assertEquals("normalize()", "a\u0313á\u0313", fn2.normalize("a\u0313a\u0301\u0313")); |
| } |
| |
| @Test |
| public void TestGetEasyToUseInstance() { |
| // Test input string: |
| // U+00A0 -> <noBreak> 0020 |
| // U+00C7 0301 = 1E08 = 0043 0327 0301 |
| String in="\u00A0\u00C7\u0301"; |
| Normalizer2 n2=Normalizer2.getNFCInstance(); |
| String out=n2.normalize(in); |
| assertEquals( |
| "getNFCInstance() did not return an NFC instance " + |
| "(normalizes to " + prettify(out) + ')', |
| "\u00A0\u1E08", out); |
| |
| n2=Normalizer2.getNFDInstance(); |
| out=n2.normalize(in); |
| assertEquals( |
| "getNFDInstance() did not return an NFD instance " + |
| "(normalizes to " + prettify(out) + ')', |
| "\u00A0C\u0327\u0301", out); |
| |
| n2=Normalizer2.getNFKCInstance(); |
| out=n2.normalize(in); |
| assertEquals( |
| "getNFKCInstance() did not return an NFKC instance " + |
| "(normalizes to " + prettify(out) + ')', |
| " \u1E08", out); |
| |
| n2=Normalizer2.getNFKDInstance(); |
| out=n2.normalize(in); |
| assertEquals( |
| "getNFKDInstance() did not return an NFKD instance " + |
| "(normalizes to " + prettify(out) + ')', |
| " C\u0327\u0301", out); |
| |
| n2=Normalizer2.getNFKCCasefoldInstance(); |
| out=n2.normalize(in); |
| assertEquals( |
| "getNFKCCasefoldInstance() did not return an NFKC_Casefold instance " + |
| "(normalizes to " + prettify(out) + ')', |
| " \u1E09", out); |
| } |
| |
| @Test |
| public void TestLowMappingToEmpty_D() { |
| Normalizer2 n2 = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.DECOMPOSE); |
| checkLowMappingToEmpty(n2); |
| |
| String sh = "\u00AD"; |
| assertFalse("soft hyphen is not normalized", n2.isNormalized(sh)); |
| String result = n2.normalize(sh); |
| assertTrue("soft hyphen normalizes to empty", result.isEmpty()); |
| assertEquals("soft hyphen QC=No", Normalizer.NO, n2.quickCheck(sh)); |
| assertEquals("soft hyphen spanQuickCheckYes", 0, n2.spanQuickCheckYes(sh)); |
| |
| String s = "\u00ADÄ\u00AD\u0323"; |
| result = n2.normalize(s); |
| assertEquals("normalize string with soft hyphens", "a\u0323\u0308", result); |
| } |
| |
| @Test |
| public void TestLowMappingToEmpty_FCD() { |
| Normalizer2 n2 = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.FCD); |
| checkLowMappingToEmpty(n2); |
| |
| String sh = "\u00AD"; |
| assertTrue("soft hyphen is FCD", n2.isNormalized(sh)); |
| |
| String s = "\u00ADÄ\u00AD\u0323"; |
| String result = n2.normalize(s); |
| assertEquals("normalize string with soft hyphens", "\u00ADa\u0323\u0308", result); |
| } |
| |
| private void checkLowMappingToEmpty(Normalizer2 n2) { |
| String mapping = n2.getDecomposition(0xad); |
| assertNotNull("getDecomposition(soft hyphen)", mapping); |
| assertTrue("soft hyphen maps to empty", mapping.isEmpty()); |
| assertFalse("soft hyphen has no boundary before", n2.hasBoundaryBefore(0xad)); |
| assertFalse("soft hyphen has no boundary after", n2.hasBoundaryAfter(0xad)); |
| assertFalse("soft hyphen is not inert", n2.isInert(0xad)); |
| } |
| |
| @Test |
| public void TestNormalizeIllFormedText() { |
| Normalizer2 nfkc_cf = Normalizer2.getNFKCCasefoldInstance(); |
| // Normalization behavior for ill-formed text is not defined. |
| // ICU currently treats ill-formed sequences as normalization-inert |
| // and copies them unchanged. |
| String src = " A\uD800ÄA\u0308\uD900A\u0308\u00ad\u0323\uDBFFÄ\u0323," + |
| "\u00ad\uDC00\u1100\u1161가\u11A8가\u3133 \uDFFF"; |
| String expected = " a\uD800ää\uD900ạ\u0308\uDBFFạ\u0308,\uDC00가각갃 \uDFFF"; |
| String result = nfkc_cf.normalize(src); |
| assertEquals("normalize", expected, result); |
| } |
| |
| @Test |
| public void TestComposeJamoTBase() { |
| // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7 |
| // which is not a conjoining Jamo Trailing consonant. |
| Normalizer2 nfkc = Normalizer2.getNFKCInstance(); |
| String s = "\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7"; |
| String expected = "가\u11A7가\u11A7가\u11A7"; |
| String result = nfkc.normalize(s); |
| assertEquals("normalize(LV+11A7)", expected, result); |
| assertFalse("isNormalized(LV+11A7)", nfkc.isNormalized(s)); |
| assertTrue("isNormalized(normalized)", nfkc.isNormalized(result)); |
| } |
| |
| @Test |
| public void TestComposeBoundaryAfter() { |
| Normalizer2 nfkc = Normalizer2.getNFKCInstance(); |
| // U+02DA and U+FB2C do not have compose-boundaries-after. |
| String s = "\u02DA\u0339 \uFB2C\u05B6"; |
| String expected = " \u0339\u030A \u05E9\u05B6\u05BC\u05C1"; |
| String result = nfkc.normalize(s); |
| assertEquals("nfkc", expected, result); |
| assertFalse("U+02DA boundary-after", nfkc.hasBoundaryAfter(0x2DA)); |
| assertFalse("U+FB2C boundary-after", nfkc.hasBoundaryAfter(0xFB2C)); |
| } |
| |
| @Test |
| public void TestNFC() { |
| // Coverage tests. |
| Normalizer2 nfc = Normalizer2.getNFCInstance(); |
| assertTrue("nfc.hasBoundaryAfter(space)", nfc.hasBoundaryAfter(' ')); |
| assertFalse("nfc.hasBoundaryAfter(ä)", nfc.hasBoundaryAfter('ä')); |
| } |
| |
| @Test |
| public void TestNFD() { |
| // Coverage tests. |
| Normalizer2 nfd = Normalizer2.getNFDInstance(); |
| assertTrue("nfd.hasBoundaryAfter(space)", nfd.hasBoundaryAfter(' ')); |
| assertFalse("nfd.hasBoundaryAfter(ä)", nfd.hasBoundaryAfter('ä')); |
| } |
| |
| @Test |
| public void TestFCD() { |
| // Coverage tests. |
| Normalizer2 fcd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.FCD); |
| assertTrue("fcd.hasBoundaryAfter(space)", fcd.hasBoundaryAfter(' ')); |
| assertFalse("fcd.hasBoundaryAfter(ä)", fcd.hasBoundaryAfter('ä')); |
| assertTrue("fcd.isInert(space)", fcd.isInert(' ')); |
| assertFalse("fcd.isInert(ä)", fcd.isInert('ä')); |
| |
| // This implementation method is unreachable via public API. |
| Norm2AllModes.FCDNormalizer2 impl = (Norm2AllModes.FCDNormalizer2)fcd; |
| assertEquals("fcd impl.getQuickCheck(space)", 1, impl.getQuickCheck(' ')); |
| assertEquals("fcd impl.getQuickCheck(ä)", 0, impl.getQuickCheck('ä')); |
| } |
| |
| @Test |
| public void TestNoneNormalizer() { |
| // Use the deprecated Mode Normalizer.NONE for coverage of the internal NoopNormalizer2 |
| // as far as its methods are reachable that way. |
| assertEquals("NONE.concatenate()", "ä\u0327", |
| Normalizer.concatenate("ä", "\u0327", Normalizer.NONE, 0)); |
| assertTrue("NONE.isNormalized()", Normalizer.isNormalized("ä\u0327", Normalizer.NONE, 0)); |
| } |
| |
| @Test |
| public void TestNoopNormalizer2() { |
| // Use the internal class directly for coverage of methods that are not publicly reachable. |
| Normalizer2 noop = Norm2AllModes.NOOP_NORMALIZER2; |
| assertEquals("noop.normalizeSecondAndAppend()", "ä\u0327", |
| noop.normalizeSecondAndAppend(new StringBuilder("ä"), "\u0327").toString()); |
| assertEquals("noop.getDecomposition()", null, noop.getDecomposition('ä')); |
| assertTrue("noop.hasBoundaryAfter()", noop.hasBoundaryAfter(0x0308)); |
| assertTrue("noop.isInert()", noop.isInert(0x0308)); |
| } |
| |
| /* |
| * Abstract class Normalizer2 has non-abstract methods which are overwritten by |
| * its derived classes. To test these methods a derived class is defined here. |
| */ |
| public class TestNormalizer2 extends Normalizer2 { |
| |
| public TestNormalizer2() {} |
| @Override |
| public StringBuilder normalize(CharSequence src, StringBuilder dest) { return null; } |
| @Override |
| public Appendable normalize(CharSequence src, Appendable dest) { return null; } |
| @Override |
| public StringBuilder normalizeSecondAndAppend( |
| StringBuilder first, CharSequence second) { return null; } |
| @Override |
| public StringBuilder append(StringBuilder first, CharSequence second) { return null; } |
| @Override |
| public String getDecomposition(int c) { return null; } |
| @Override |
| public boolean isNormalized(CharSequence s) { return false; } |
| @Override |
| public Normalizer.QuickCheckResult quickCheck(CharSequence s) { return null; } |
| @Override |
| public int spanQuickCheckYes(CharSequence s) { return 0; } |
| @Override |
| public boolean hasBoundaryBefore(int c) { return false; } |
| @Override |
| public boolean hasBoundaryAfter(int c) { return false; } |
| @Override |
| public boolean isInert(int c) { return false; } |
| } |
| |
| final TestNormalizer2 tnorm2 = new TestNormalizer2(); |
| @Test |
| public void TestGetRawDecompositionBase() { |
| int c = 'à'; |
| assertEquals("Unexpected value returned from Normalizer2.getRawDecomposition()", |
| null, tnorm2.getRawDecomposition(c)); |
| } |
| |
| @Test |
| public void TestComposePairBase() { |
| int a = 'a'; |
| int b = '\u0300'; |
| assertEquals("Unexpected value returned from Normalizer2.composePair()", |
| -1, tnorm2.composePair(a, b)); |
| } |
| |
| @Test |
| public void TestGetCombiningClassBase() { |
| int c = '\u00e0'; |
| assertEquals("Unexpected value returned from Normalizer2.getCombiningClass()", |
| 0, tnorm2.getCombiningClass(c)); |
| } |
| } |