blob: d220e7a06472db28517fb08387e016f0a5970e1c [file] [log] [blame]
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.dev.test.translit;
import java.util.Map.Entry;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import junitparams.JUnitParamsRunner;
import junitparams.Parameters;
/**
* @test
* @summary Disordered marks test of Transliterator
*/
@RunWith(JUnitParamsRunner.class)
public class TransliteratorDisorderedMarksTest extends TestFmwk {
private static UnicodeSet disorderedMarks;
@AfterClass
public static void disorderedMarksNull() {
disorderedMarks = null;
}
@BeforeClass
public static void disorderedMarksAddAll() {
Normalizer2 nfc = Normalizer2.getNFCInstance();
Normalizer2 nfd = Normalizer2.getNFDInstance();
// Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE);
// UnicodeSet nfkdSource = new UnicodeSet();
// UnicodeSet nfkdTarget = new UnicodeSet();
// for (int i = 0; i <= 0x10FFFF; ++i) {
// if (nfkd.isInert(i)) {
// continue;
// }
// nfkdSource.add(i);
// String t = nfkd.getDecomposition(i);
// if (t != null) {
// nfkdTarget.addAll(t);
// } else {
// nfkdTarget.add(i);
// }
// }
// nfkdSource.freeze();
// nfkdTarget.freeze();
// logln("NFKD Source: " + nfkdSource.toPattern(false));
// logln("NFKD Target: " + nfkdTarget.toPattern(false));
UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
CanonicalIterator can = new CanonicalIterator("");
disorderedMarks = new UnicodeSet();
for (int i = 0; i <= 0x10FFFF; ++i) {
String s = nfd.getDecomposition(i);
if (s == null) {
continue;
}
can.setSource(s);
for (String t = can.next(); t != null; t = can.next()) {
disorderedMarks.add(t);
}
// if s has two code points, (or more), add the lead/trail information
int first = s.codePointAt(0);
int firstCount = Character.charCount(first);
if (s.length() == firstCount) continue;
String trailString = s.substring(firstCount);
// add all the trail characters
if (!nonStarters.containsSome(trailString)) {
continue;
}
UnicodeSet trailSet = leadToTrail.get(first);
if (trailSet == null) {
leadToTrail.put(first, trailSet = new UnicodeSet());
}
trailSet.addAll(trailString); // add remaining trails
// add the sources
UnicodeSet sourcesSet = leadToSources.get(first);
if (sourcesSet == null) {
leadToSources.put(first, sourcesSet = new UnicodeSet());
}
sourcesSet.add(i);
}
for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
String lead = x.getKey();
UnicodeSet sources = x.getValue();
UnicodeSet trailSet = leadToTrail.get(lead);
for (String source : sources) {
for (String trail : trailSet) {
can.setSource(source + trail);
for (String t = can.next(); t != null; t = can.next()) {
if (t.endsWith(trail)) continue;
disorderedMarks.add(t);
}
}
}
}
for (String s : nonStarters) {
disorderedMarks.add("\u0345" + s);
disorderedMarks.add(s+"\u0323");
String xx = nfc.normalize("\u01EC" + s);
if (!xx.startsWith("\u01EC")) {
logln("??");
}
}
// for (int i = 0; i <= 0x10FFFF; ++i) {
// String s = nfkd.getDecomposition(i);
// if (s != null) {
// disorderedMarks.add(s);
// disorderedMarks.add(nfc.normalize(s));
// addDerivedStrings(nfc, disorderedMarks, s);
// }
// s = nfd.getDecomposition(i);
// if (s != null) {
// disorderedMarks.add(s);
// }
// if (!nfc.isInert(i)) {
// if (i == 0x00C0) {
// logln("\u00C0");
// }
// can.setSource(s+"\u0334");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// can.setSource(s+"\u0345");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// can.setSource(s+"\u0323");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// }
// }
logln("Test cases: " + disorderedMarks.size());
disorderedMarks.addAll(0,0x10FFFF).freeze();
logln("isInert \u0104 " + nfc.isInert('\u0104'));
}
@Test
@Parameters({
":: [:sc=COMMON:] any-name;",
":: [:Greek:] hex-any/C;",
":: [:Greek:] any-hex/C;",
":: [[:Mn:][:Me:]] remove;",
":: [[:Mn:][:Me:]] null;",
":: lower;",
":: upper;",
":: title;",
":: CaseFold;",
":: NFD;",
":: NFC;",
":: NFKD;",
":: NFKC;",
":: [[:Mn:][:Me:]] NFKD;",
":: Latin-Greek;",
":: [:Latin:] NFKD;",
":: NFKD;",
":: NFKD;\n" +
":: [[:Mn:][:Me:]] remove;\n" +
":: NFC;",
})
public void TestSourceTargetSet2(String rule) {
Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
UnicodeSet actualSource = trans.getSourceSet();
UnicodeSet actualTarget = trans.getTargetSet();
UnicodeSet empiricalSource = new UnicodeSet();
UnicodeSet empiricalTarget = new UnicodeSet();
String ruleDisplay = rule.replace("\n", "\t\t");
UnicodeSet toTest = disorderedMarks;
Normalizer2 nfd = Normalizer2.getNFDInstance();
String test = nfd.normalize("\u0104");
boolean DEBUG = true;
@SuppressWarnings("unused")
int count = 0; // for debugging
for (String s : toTest) {
if (s.equals(test)) {
logln(test);
}
String t = trans.transform(s);
if (!s.equals(t)) {
if (!TransliteratorTest.isAtomic(s, t, trans)) {
TransliteratorTest.isAtomic(s, t, trans);
continue;
}
// only keep the part that changed; so skip the front and end.
// int start = findSharedStartLength(s,t);
// int end = findSharedEndLength(s,t);
// if (start != 0 || end != 0) {
// s = s.substring(start, s.length() - end);
// t = t.substring(start, t.length() - end);
// }
if (DEBUG) {
if (!actualSource.containsAll(s)) {
count++;
}
if (!actualTarget.containsAll(t)) {
count++;
}
}
TransliteratorTest.addSourceTarget(s, empiricalSource, t, empiricalTarget);
}
}
if (rule.contains("title")) {
// See the comment in TestCasing() about the iota subscript.
empiricalSource.remove(0x345);
}
TransliteratorTest.assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, TransliteratorTest.SetAssert.MISSING_OK);
TransliteratorTest.assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, TransliteratorTest.SetAssert.MISSING_OK);
}
}