icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorDisorderedMarksTest.java - external/github.com/unicode-org/icu - Git at Google

 // © 2018 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html

 package com.ibm.icu.dev.test.translit;

 import java.util.Map.Entry;

 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;

 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.dev.util.UnicodeMap;
 import com.ibm.icu.text.CanonicalIterator;
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.Transliterator;
 import com.ibm.icu.text.UnicodeSet;

 import junitparams.JUnitParamsRunner;
 import junitparams.Parameters;

 /**
  * @test
  * @summary Disordered marks test of Transliterator
  */
 @RunWith(JUnitParamsRunner.class)
 public class TransliteratorDisorderedMarksTest extends TestFmwk {
     private static UnicodeSet disorderedMarks;

     @AfterClass
     public static void disorderedMarksNull() {
         disorderedMarks = null;
     }

     @BeforeClass
     public static void disorderedMarksAddAll() {
         Normalizer2 nfc = Normalizer2.getNFCInstance();
         Normalizer2 nfd = Normalizer2.getNFDInstance();

         //        Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE);
         //        UnicodeSet nfkdSource = new UnicodeSet();
         //        UnicodeSet nfkdTarget = new UnicodeSet();
         //        for (int i = 0; i <= 0x10FFFF; ++i) {
         //            if (nfkd.isInert(i)) {
         //                continue;
         //            }
         //            nfkdSource.add(i);
         //            String t = nfkd.getDecomposition(i);
         //            if (t != null) {
         //                nfkdTarget.addAll(t);
         //            } else {
         //                nfkdTarget.add(i);
         //            }
         //        }
         //        nfkdSource.freeze();
         //        nfkdTarget.freeze();
         //        logln("NFKD Source: " + nfkdSource.toPattern(false));
         //        logln("NFKD Target: " + nfkdTarget.toPattern(false));

         UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
         UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
         UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
         CanonicalIterator can = new CanonicalIterator("");

         disorderedMarks = new UnicodeSet();

         for (int i = 0; i <= 0x10FFFF; ++i) {
             String s = nfd.getDecomposition(i);
             if (s == null) {
                 continue;
             }

             can.setSource(s);
             for (String t = can.next(); t != null; t = can.next()) {
                 disorderedMarks.add(t);
             }

             // if s has two code points, (or more), add the lead/trail information
             int first = s.codePointAt(0);
             int firstCount = Character.charCount(first);
             if (s.length() == firstCount) continue;
             String trailString = s.substring(firstCount);

             // add all the trail characters
             if (!nonStarters.containsSome(trailString)) {
                 continue;
             }
             UnicodeSet trailSet = leadToTrail.get(first);
             if (trailSet == null) {
                 leadToTrail.put(first, trailSet = new UnicodeSet());
             }
             trailSet.addAll(trailString); // add remaining trails

             // add the sources
             UnicodeSet sourcesSet = leadToSources.get(first);
             if (sourcesSet == null) {
                 leadToSources.put(first, sourcesSet = new UnicodeSet());
             }
             sourcesSet.add(i);
         }


         for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
             String lead = x.getKey();
             UnicodeSet sources = x.getValue();
             UnicodeSet trailSet = leadToTrail.get(lead);
             for (String source : sources) {
                 for (String trail : trailSet) {
                     can.setSource(source + trail);
                     for (String t = can.next(); t != null; t = can.next()) {
                         if (t.endsWith(trail)) continue;
                         disorderedMarks.add(t);
                     }
                 }
             }
         }


         for (String s : nonStarters) {
             disorderedMarks.add("\u0345" + s);
             disorderedMarks.add(s+"\u0323");
             String xx = nfc.normalize("\u01EC" + s);
             if (!xx.startsWith("\u01EC")) {
                 logln("??");
             }
         }

         //        for (int i = 0; i <= 0x10FFFF; ++i) {
         //            String s = nfkd.getDecomposition(i);
         //            if (s != null) {
         //                disorderedMarks.add(s);
         //                disorderedMarks.add(nfc.normalize(s));
         //                addDerivedStrings(nfc, disorderedMarks, s);
         //            }
         //            s = nfd.getDecomposition(i);
         //            if (s != null) {
         //                disorderedMarks.add(s);
         //            }
         //            if (!nfc.isInert(i)) {
         //                if (i == 0x00C0) {
         //                    logln("\u00C0");
         //                }
         //                can.setSource(s+"\u0334");
         //                for (String t = can.next(); t != null; t = can.next()) {
         //                    addDerivedStrings(nfc, disorderedMarks, t);
         //                }
         //                can.setSource(s+"\u0345");
         //                for (String t = can.next(); t != null; t = can.next()) {
         //                    addDerivedStrings(nfc, disorderedMarks, t);
         //                }
         //                can.setSource(s+"\u0323");
         //                for (String t = can.next(); t != null; t = can.next()) {
         //                    addDerivedStrings(nfc, disorderedMarks, t);
         //                }
         //            }
         //        }
         logln("Test cases: " + disorderedMarks.size());
         disorderedMarks.addAll(0,0x10FFFF).freeze();
         logln("isInert \u0104 " + nfc.isInert('\u0104'));
     }

     @Test
     @Parameters({
             ":: [:sc=COMMON:] any-name;",

             ":: [:Greek:] hex-any/C;",
             ":: [:Greek:] any-hex/C;",

             ":: [[:Mn:][:Me:]] remove;",
             ":: [[:Mn:][:Me:]] null;",


             ":: lower;",
             ":: upper;",
             ":: title;",
             ":: CaseFold;",

             ":: NFD;",
             ":: NFC;",
             ":: NFKD;",
             ":: NFKC;",

             ":: [[:Mn:][:Me:]] NFKD;",
             ":: Latin-Greek;",
             ":: [:Latin:] NFKD;",
             ":: NFKD;",
             ":: NFKD;\n" +
                 ":: [[:Mn:][:Me:]] remove;\n" +
                 ":: NFC;",
     })
     public void TestSourceTargetSet2(String rule) {
         Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
         UnicodeSet actualSource = trans.getSourceSet();
         UnicodeSet actualTarget = trans.getTargetSet();
         UnicodeSet empiricalSource = new UnicodeSet();
         UnicodeSet empiricalTarget = new UnicodeSet();
         String ruleDisplay = rule.replace("\n", "\t\t");
         UnicodeSet toTest = disorderedMarks;
         Normalizer2 nfd = Normalizer2.getNFDInstance();

         String test = nfd.normalize("\u0104");
         boolean DEBUG = true;
         @SuppressWarnings("unused")
         int count = 0; // for debugging
         for (String s : toTest) {
             if (s.equals(test)) {
                 logln(test);
             }
             String t = trans.transform(s);
             if (!s.equals(t)) {
                 if (!TransliteratorTest.isAtomic(s, t, trans)) {
                     TransliteratorTest.isAtomic(s, t, trans);
                     continue;
                 }

                 // only keep the part that changed; so skip the front and end.
                 //                    int start = findSharedStartLength(s,t);
                 //                    int end = findSharedEndLength(s,t);
                 //                    if (start != 0 || end != 0) {
                 //                        s = s.substring(start, s.length() - end);
                 //                        t = t.substring(start, t.length() - end);
                 //                    }
                 if (DEBUG) {
                     if (!actualSource.containsAll(s)) {
                         count++;
                     }
                     if (!actualTarget.containsAll(t)) {
                         count++;
                     }
                 }
                 TransliteratorTest.addSourceTarget(s, empiricalSource, t, empiricalTarget);
             }
         }
         if (rule.contains("title")) {
             // See the comment in TestCasing() about the iota subscript.
             empiricalSource.remove(0x345);
         }
         TransliteratorTest.assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, TransliteratorTest.SetAssert.MISSING_OK);
         TransliteratorTest.assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, TransliteratorTest.SetAssert.MISSING_OK);
     }
 }
	// © 2018 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html

	package com.ibm.icu.dev.test.translit;

	import java.util.Map.Entry;

	import org.junit.AfterClass;
	import org.junit.BeforeClass;
	import org.junit.Test;
	import org.junit.runner.RunWith;

	import com.ibm.icu.dev.test.TestFmwk;
	import com.ibm.icu.dev.util.UnicodeMap;
	import com.ibm.icu.text.CanonicalIterator;
	import com.ibm.icu.text.Normalizer2;
	import com.ibm.icu.text.Transliterator;
	import com.ibm.icu.text.UnicodeSet;

	import junitparams.JUnitParamsRunner;
	import junitparams.Parameters;

	/**
	* @test
	* @summary Disordered marks test of Transliterator
	*/
	@RunWith(JUnitParamsRunner.class)
	public class TransliteratorDisorderedMarksTest extends TestFmwk {
	private static UnicodeSet disorderedMarks;

	@AfterClass
	public static void disorderedMarksNull() {
	disorderedMarks = null;
	}

	@BeforeClass
	public static void disorderedMarksAddAll() {
	Normalizer2 nfc = Normalizer2.getNFCInstance();
	Normalizer2 nfd = Normalizer2.getNFDInstance();

	// Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE);
	// UnicodeSet nfkdSource = new UnicodeSet();
	// UnicodeSet nfkdTarget = new UnicodeSet();
	// for (int i = 0; i <= 0x10FFFF; ++i) {
	// if (nfkd.isInert(i)) {
	// continue;
	// }
	// nfkdSource.add(i);
	// String t = nfkd.getDecomposition(i);
	// if (t != null) {
	// nfkdTarget.addAll(t);
	// } else {
	// nfkdTarget.add(i);
	// }
	// }
	// nfkdSource.freeze();
	// nfkdTarget.freeze();
	// logln("NFKD Source: " + nfkdSource.toPattern(false));
	// logln("NFKD Target: " + nfkdTarget.toPattern(false));

	UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
	UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
	UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
	CanonicalIterator can = new CanonicalIterator("");

	disorderedMarks = new UnicodeSet();

	for (int i = 0; i <= 0x10FFFF; ++i) {
	String s = nfd.getDecomposition(i);
	if (s == null) {
	continue;
	}

	can.setSource(s);
	for (String t = can.next(); t != null; t = can.next()) {
	disorderedMarks.add(t);
	}

	// if s has two code points, (or more), add the lead/trail information
	int first = s.codePointAt(0);
	int firstCount = Character.charCount(first);
	if (s.length() == firstCount) continue;
	String trailString = s.substring(firstCount);

	// add all the trail characters
	if (!nonStarters.containsSome(trailString)) {
	continue;
	}
	UnicodeSet trailSet = leadToTrail.get(first);
	if (trailSet == null) {
	leadToTrail.put(first, trailSet = new UnicodeSet());
	}
	trailSet.addAll(trailString); // add remaining trails

	// add the sources
	UnicodeSet sourcesSet = leadToSources.get(first);
	if (sourcesSet == null) {
	leadToSources.put(first, sourcesSet = new UnicodeSet());
	}
	sourcesSet.add(i);
	}


	for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
	String lead = x.getKey();
	UnicodeSet sources = x.getValue();
	UnicodeSet trailSet = leadToTrail.get(lead);
	for (String source : sources) {
	for (String trail : trailSet) {
	can.setSource(source + trail);
	for (String t = can.next(); t != null; t = can.next()) {
	if (t.endsWith(trail)) continue;
	disorderedMarks.add(t);
	}
	}
	}
	}


	for (String s : nonStarters) {
	disorderedMarks.add("\u0345" + s);
	disorderedMarks.add(s+"\u0323");
	String xx = nfc.normalize("\u01EC" + s);
	if (!xx.startsWith("\u01EC")) {
	logln("??");
	}
	}

	// for (int i = 0; i <= 0x10FFFF; ++i) {
	// String s = nfkd.getDecomposition(i);
	// if (s != null) {
	// disorderedMarks.add(s);
	// disorderedMarks.add(nfc.normalize(s));
	// addDerivedStrings(nfc, disorderedMarks, s);
	// }
	// s = nfd.getDecomposition(i);
	// if (s != null) {
	// disorderedMarks.add(s);
	// }
	// if (!nfc.isInert(i)) {
	// if (i == 0x00C0) {
	// logln("\u00C0");
	// }
	// can.setSource(s+"\u0334");
	// for (String t = can.next(); t != null; t = can.next()) {
	// addDerivedStrings(nfc, disorderedMarks, t);
	// }
	// can.setSource(s+"\u0345");
	// for (String t = can.next(); t != null; t = can.next()) {
	// addDerivedStrings(nfc, disorderedMarks, t);
	// }
	// can.setSource(s+"\u0323");
	// for (String t = can.next(); t != null; t = can.next()) {
	// addDerivedStrings(nfc, disorderedMarks, t);
	// }
	// }
	// }
	logln("Test cases: " + disorderedMarks.size());
	disorderedMarks.addAll(0,0x10FFFF).freeze();
	logln("isInert \u0104 " + nfc.isInert('\u0104'));
	}

	@Test
	@Parameters({
	":: [:sc=COMMON:] any-name;",

	":: [:Greek:] hex-any/C;",
	":: [:Greek:] any-hex/C;",

	":: [[:Mn:][:Me:]] remove;",
	":: [[:Mn:][:Me:]] null;",


	":: lower;",
	":: upper;",
	":: title;",
	":: CaseFold;",

	":: NFD;",
	":: NFC;",
	":: NFKD;",
	":: NFKC;",

	":: [[:Mn:][:Me:]] NFKD;",
	":: Latin-Greek;",
	":: [:Latin:] NFKD;",
	":: NFKD;",
	":: NFKD;\n" +
	":: [[:Mn:][:Me:]] remove;\n" +
	":: NFC;",
	})
	public void TestSourceTargetSet2(String rule) {
	Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
	UnicodeSet actualSource = trans.getSourceSet();
	UnicodeSet actualTarget = trans.getTargetSet();
	UnicodeSet empiricalSource = new UnicodeSet();
	UnicodeSet empiricalTarget = new UnicodeSet();
	String ruleDisplay = rule.replace("\n", "\t\t");
	UnicodeSet toTest = disorderedMarks;
	Normalizer2 nfd = Normalizer2.getNFDInstance();

	String test = nfd.normalize("\u0104");
	boolean DEBUG = true;
	@SuppressWarnings("unused")
	int count = 0; // for debugging
	for (String s : toTest) {
	if (s.equals(test)) {
	logln(test);
	}
	String t = trans.transform(s);
	if (!s.equals(t)) {
	if (!TransliteratorTest.isAtomic(s, t, trans)) {
	TransliteratorTest.isAtomic(s, t, trans);
	continue;
	}

	// only keep the part that changed; so skip the front and end.
	// int start = findSharedStartLength(s,t);
	// int end = findSharedEndLength(s,t);
	// if (start != 0 \|\| end != 0) {
	// s = s.substring(start, s.length() - end);
	// t = t.substring(start, t.length() - end);
	// }
	if (DEBUG) {
	if (!actualSource.containsAll(s)) {
	count++;
	}
	if (!actualTarget.containsAll(t)) {
	count++;
	}
	}
	TransliteratorTest.addSourceTarget(s, empiricalSource, t, empiricalTarget);
	}
	}
	if (rule.contains("title")) {
	// See the comment in TestCasing() about the iota subscript.
	empiricalSource.remove(0x345);
	}
	TransliteratorTest.assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, TransliteratorTest.SetAssert.MISSING_OK);
	TransliteratorTest.assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, TransliteratorTest.SetAssert.MISSING_OK);
	}
	}