// blob: 1811028984ae161ff8a7026c58665ba40cfe7345
/*
*******************************************************************************
* Copyright (C) 2009-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.text;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.SpoofChecker;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
public class SpoofCheckerTest extends TestFmwk {
/**
 * Command-line entry point: creates a new SpoofCheckerTest and passes the arguments to its run method.
 *
 * @param args arguments handed unchanged to run
 * @throws Exception whatever run throws
 */
public static void main(String[] args) throws Exception {
    SpoofCheckerTest test = new SpoofCheckerTest();
    test.run(args);
}
/**
 * Reports an error through errln when the given condition is false; does nothing otherwise.
 *
 * @param expr the condition that is expected to hold
 */
void TEST_ASSERT(boolean expr) {
    if (expr) {
        return;
    }
    errln("Assertion Failure.\n");
}
/**
 * Reports an error through errln, quoting both values, when the two ints differ.
 *
 * @param a first value, printed on the left of the message
 * @param b second value, printed on the right of the message
 */
void TEST_ASSERT_EQ(int a, int b) {
    if (a == b) {
        return;
    }
    String message = String.format("Test Failure: %d != %d\n", a, b);
    errln(message);
}
/**
 * Reports an error through errln when the two references are identical (compared with ==, not equals).
 * Two null references count as identical and are reported as well.
 *
 * @param a first reference; may be null
 * @param b second reference; may be null
 */
void TEST_ASSERT_NE(Object a, Object b) {
    if (a == b) {
        // Pass the objects themselves: %s prints "null" for a null argument, whereas calling
        // toString() here would throw when both references are null.
        errln(String.format("Test Failure: (%s) == (%s) \n", a, b));
    }
}
/*
 * Shared fixture. Most test methods call setup() first and teardown() last and put their test code in
 * between. After setup(), "builder" is a new SpoofChecker.Builder and "sc" is the SpoofChecker built
 * from it before any option has been set.
 */
SpoofChecker sc;
SpoofChecker.Builder builder;

/** Creates a new Builder and builds the SpoofChecker "sc" from it. */
void setup() {
    this.builder = new SpoofChecker.Builder();
    this.sc = this.builder.build();
}

/** Drops the reference to the current SpoofChecker. */
void teardown() {
    this.sc = null;
}
/*
 * Identifiers for verifying that spoof checking is minimally alive and working.
 */

/* "uz", all ASCII (not confusable). */
char[] goodLatinChars = { 'u', 'z' };
String goodLatin = String.valueOf(goodLatinChars);

/* "sc" whose second letter is Cyrillic U+0441 (mixed script, confusable). */
char[] scMixedChars = { 's', '\u0441' };
String scMixed = String.valueOf(scMixedChars);

/* "sc", plain ASCII. */
char[] scLatinChars = { 's', 'c' };
String scLatin = String.valueOf(scLatinChars);

/* Plain lower case Cyrillic letters, no Latin confusables. */
char[] goodCyrlChars = { '\u0438', '\u043B' };
String goodCyrl = String.valueOf(goodCyrlChars);

/* Plain lower case Greek letters. */
char[] goodGreekChars = { '\u03c0', '\u03c6' };
String goodGreek = String.valueOf(goodGreekChars);

/* "lI1", all ASCII. */
char[] lll_Latin_aChars = { 'l', 'I', '1' };
String lll_Latin_a = String.valueOf(lll_Latin_aChars);

/* Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA. */
char[] lll_Latin_bChars = { '\uff29', '\u217c', '\u0196' };
String lll_Latin_b = String.valueOf(lll_Latin_bChars);

/* U+0406, U+04C0 and the ASCII digit one. */
char[] lll_CyrlChars = { '\u0406', '\u04C0', '1' };
String lll_Cyrl = String.valueOf(lll_CyrlChars);

/* The skeleton transform for all of these 'lll' lookalikes is three ASCII lower case letters l. */
char[] lll_SkelChars = { 'l', 'l', 'l' };
String lll_Skel = String.valueOf(lll_SkelChars);
/**
 * Smoke test of basic construction: setup() builds a SpoofChecker and teardown() releases it,
 * with nothing in between.
 */
public void TestUSpoof() {
    this.setup();
    this.teardown();
}
/**
 * Builds a SpoofChecker from the source rule files unicode/confusables.txt and
 * unicode/confusablesWholeScript.txt. An IOException or ParseException, or a null result, is
 * reported through errln. Both readers are closed before the method returns.
 */
public void TestOpenFromSourceRules() {
    setup();
    Reader confusables = null;
    Reader confusablesWholeScript = null;
    try {
        confusables = TestUtil.getDataReader("unicode/confusables.txt", "UTF-8");
        confusablesWholeScript = TestUtil.getDataReader("unicode/confusablesWholeScript.txt", "UTF-8");
        SpoofChecker rsc = builder.setData(confusables, confusablesWholeScript).build();
        if (rsc == null) {
            errln("FAIL: null SpoofChecker");
        }
    } catch (IOException e) {
        errln(e.toString());
    } catch (ParseException e) {
        errln(e.toString());
    } finally {
        // Release the data files on every path; a reader is null here if opening it failed.
        Reader[] opened = { confusables, confusablesWholeScript };
        for (Reader reader : opened) {
            if (reader == null) {
                continue;
            }
            try {
                reader.close();
            } catch (IOException e) {
                errln(e.toString());
            }
        }
    }
    teardown();
}
/**
 * Set and get check flags: for three flag values (all checks, none, and a combination of three
 * flags) the value given to Builder.setChecks() must be the value getChecks() returns.
 */
public void TestGetSetChecks1() {
    setup();
    final int combination = SpoofChecker.WHOLE_SCRIPT_CONFUSABLE | SpoofChecker.MIXED_SCRIPT_CONFUSABLE
            | SpoofChecker.ANY_CASE;

    // Every check enabled.
    sc = builder.setChecks(SpoofChecker.ALL_CHECKS).build();
    TEST_ASSERT_EQ(sc.getChecks(), SpoofChecker.ALL_CHECKS);

    // No check enabled.
    sc = builder.setChecks(0).build();
    TEST_ASSERT_EQ(0, sc.getChecks());

    // A hand-picked combination.
    sc = builder.setChecks(combination).build();
    TEST_ASSERT_EQ(combination, sc.getChecks());

    teardown();
}
/**
 * get and setAllowedChars: the set returned by the default checker must be frozen, and after
 * setAllowedChars([A-Z]) the checker must return a set that is equal to, but not the same object as,
 * the set that was passed in.
 */
public void TestGetSetAllowedChars() {
    setup();

    UnicodeSet defaultAllowed = sc.getAllowedChars();
    TEST_ASSERT(defaultAllowed.isFrozen());

    UnicodeSet upperCaseAscii = new UnicodeSet(0x41, 0x5A); /* [A-Z] */
    sc = builder.setAllowedChars(upperCaseAscii).build();
    // Different object ...
    TEST_ASSERT_NE(upperCaseAscii, sc.getAllowedChars());
    // ... with the same contents.
    TEST_ASSERT(upperCaseAscii.equals(sc.getAllowedChars()));

    teardown();
}
/**
 * get and set checks: the default checker must report ALL_CHECKS; after SINGLE_SCRIPT and
 * MIXED_SCRIPT_CONFUSABLE are switched off, getChecks() must report the reduced value and the
 * "scMixed" identifier must no longer be flagged.
 */
public void TestGetSetChecks() {
    setup();

    int defaultChecks = sc.getChecks();
    TEST_ASSERT_EQ(SpoofChecker.ALL_CHECKS, defaultChecks);

    int switchedOff = SpoofChecker.SINGLE_SCRIPT | SpoofChecker.MIXED_SCRIPT_CONFUSABLE;
    int reducedChecks = defaultChecks & ~switchedOff;
    sc = builder.setChecks(reducedChecks).build();
    TEST_ASSERT_EQ(reducedChecks, sc.getChecks());

    /*
     * The checks switched off just above are the ones the "scMixed" identifier fails, so with them
     * gone the identifier should now pass.
     */
    boolean flagged = sc.check(scMixed);
    TEST_ASSERT(!flagged);

    teardown();
}
/**
 * Allowed locales: the default list must be empty; after allowing "en" and "ru_RU" the checker must
 * report both locales, accept the Latin and Cyrillic identifiers and flag the Greek one with
 * CHAR_LIMIT; after resetting to an empty list the Greek identifier must pass again.
 */
public void TestAllowedLoacles() {
    setup();
    boolean checkResults;
    /* Default allowed locales list should be empty */
    TEST_ASSERT(sc.getAllowedLocales().isEmpty());
    /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
    ULocale enloc = new ULocale("en");
    ULocale ruloc = new ULocale("ru_RU");
    // Collect the locales in a set owned by this test instead of adding to the set handed out by
    // the checker's getter.
    Set<ULocale> allowedLocales = new LinkedHashSet<ULocale>();
    allowedLocales.add(enloc);
    allowedLocales.add(ruloc);
    sc = builder.setAllowedLocales(allowedLocales).build();
    Set<ULocale> reportedLocales = sc.getAllowedLocales();
    TEST_ASSERT(reportedLocales.contains(enloc));
    TEST_ASSERT(reportedLocales.contains(ruloc));
    /*
     * Limit checks to SpoofChecker.CHAR_LIMIT. Some of the test data has whole script confusables also, which we
     * don't want to see in this test.
     */
    sc = builder.setChecks(SpoofChecker.CHAR_LIMIT).build();
    SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    checkResults = sc.check(goodLatin);
    TEST_ASSERT(false == checkResults);
    checkResults = sc.check(goodGreek, result);
    // The return value and the reported flags must agree that Greek is rejected.
    TEST_ASSERT(checkResults);
    TEST_ASSERT_EQ(SpoofChecker.CHAR_LIMIT, result.checks);
    checkResults = sc.check(goodCyrl);
    TEST_ASSERT(false == checkResults);
    /* Reset with an empty locale list, which should allow all characters to pass */
    sc = builder.setAllowedLocales(new LinkedHashSet<ULocale>()).build();
    checkResults = sc.check(goodGreek);
    TEST_ASSERT(false == checkResults);
    teardown();
}
/**
 * AllowedChars: set/get the UnicodeSet of allowed characters. The default set must equal the full
 * code point range; after one letter of "goodLatin" is removed from the allowed set, that
 * identifier must be flagged with CHAR_LIMIT.
 */
public void TestAllowedChars() {
    setup();

    /* By default, we should see no restriction; the UnicodeSet should allow all characters. */
    UnicodeSet defaultAllowed = sc.getAllowedChars();
    UnicodeSet restricted = new UnicodeSet(0, 0x10ffff);
    TEST_ASSERT(restricted.equals(defaultAllowed));

    /* Setting the allowed chars should enable the check. */
    int withoutCharLimit = SpoofChecker.ALL_CHECKS & ~SpoofChecker.CHAR_LIMIT;
    sc = builder.setChecks(withoutCharLimit).build();

    /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
    restricted.remove(goodLatin.charAt(1));
    sc = builder.setAllowedChars(restricted).build();

    /* Latin Identifier should now fail; other non-latin test cases should still be OK */
    SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    boolean flagged = sc.check(goodLatin, result);
    TEST_ASSERT(flagged);
    TEST_ASSERT_EQ(SpoofChecker.CHAR_LIMIT, result.checks);

    flagged = sc.check(goodGreek, result);
    if (false) { // Ticket 8054. Understand why this is different from ICU4C.
        TEST_ASSERT(flagged);
        TEST_ASSERT_EQ(SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, result.checks);
    }

    teardown();
}
/**
 * Exercises check() with a result object: identifiers that pass must leave the preset position
 * untouched, and the mixed-script identifier must be flagged with MIXED_SCRIPT_CONFUSABLE and
 * SINGLE_SCRIPT at position 2.
 */
public void TestCheck() {
    setup();
    final int untouched = 666; // marker value stored in result.position before a call
    SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();

    result.position = untouched;
    boolean latinFlagged = sc.check(goodLatin, result);
    TEST_ASSERT(!latinFlagged);
    TEST_ASSERT_EQ(untouched, result.position);

    boolean cyrillicFlagged = sc.check(goodCyrl, result);
    TEST_ASSERT(!cyrillicFlagged);

    result.position = untouched;
    boolean mixedFlagged = sc.check(scMixed, result);
    TEST_ASSERT(mixedFlagged);
    TEST_ASSERT_EQ(SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks);
    TEST_ASSERT_EQ(2, result.position);

    teardown();
}
/**
 * Checks areConfusable() on three pairs of the short test identifiers: a mixed-script confusable
 * pair, a pair that is not confusable, and a single-script confusable pair.
 */
public void TestAreConfusable1() {
    setup();

    int mixedPair = sc.areConfusable(scLatin, scMixed);
    TEST_ASSERT_EQ(SpoofChecker.MIXED_SCRIPT_CONFUSABLE, mixedPair);

    int unrelatedPair = sc.areConfusable(goodGreek, scLatin);
    TEST_ASSERT_EQ(0, unrelatedPair);

    int latinPair = sc.areConfusable(lll_Latin_a, lll_Latin_b);
    TEST_ASSERT_EQ(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, latinPair);

    teardown();
}
/**
 * Checks that the ANY_CASE skeleton of "lI1" is the three-letter string "lll".
 */
public void TestGetSkeleton() {
    setup();
    String skeleton = sc.getSkeleton(SpoofChecker.ANY_CASE, lll_Latin_a);
    TEST_ASSERT(lll_Skel.equals(skeleton));
    int skeletonLength = skeleton.length();
    TEST_ASSERT_EQ(lll_Skel.length(), skeletonLength);
    TEST_ASSERT_EQ(3, skeleton.length());
    teardown();
}
/**
 * Three short API checks, each run between its own setup() and teardown():
 * check() on a plain Latin identifier, areConfusable() on a Latin/Cyrillic lookalike pair, and
 * getSkeleton() on a string of lookalike digits and letters.
 */
public void TestSpoofAPI() {
    // check(): "xyz" must pass and the preset position must stay as it was.
    setup();
    String latinId = "xyz"; // Many latin ranges are whole-script confusable with other scripts.
    // If this test starts failing, consult confusablesWholeScript.txt
    SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    result.position = 666;
    boolean flagged = sc.check(latinId, result);
    TEST_ASSERT(!flagged);
    TEST_ASSERT_EQ(666, result.position); // not changed
    teardown();

    // areConfusable(): Latin "cxs" against its Cyrillic lookalike.
    setup();
    String latin = "cxs";
    String cyrillic = Utility.unescape("\\u0441\\u0445\\u0455"); // Cyrillic "cxs"
    int confusability = sc.areConfusable(latin, cyrillic);
    TEST_ASSERT_EQ(SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, confusability);
    teardown();

    // getSkeleton(): in ANY_CASE mode "I1l0O" must map to "lllOO".
    setup();
    String skeleton = sc.getSkeleton(SpoofChecker.ANY_CASE, "I1l0O");
    TEST_ASSERT(skeleton.equals("lllOO"));
    teardown();
}
/**
 * Spot checks a number of confusable skeleton substitutions taken from the Unicode data file
 * confusables.txt. The cases cover substitutions of various lengths and membership in the
 * different mapping tables (SL, SA, ML, MA).
 */
public void TestSkeleton() {
    // One skeleton type per mapping table.
    final int mixedLower = 0;                                                           // ML
    final int singleLower = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE;                       // SL
    final int mixedAny = SpoofChecker.ANY_CASE;                                          // MA
    final int singleAny = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE; // SA
    setup();

    // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
    String firstInputPart =
            " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations.";
    String inputPart =
            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.";
    String expectedPart =
            " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.";
    String longInput = firstInputPart + inputPart + inputPart + inputPart;
    String longExpected = expectedPart + expectedPart + expectedPart + expectedPart;
    checkSkeleton(sc, singleLower, longInput, longExpected);

    // FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
    // ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA
    // This character NFKD normalizes to U+064D U+0651, so its confusable mapping
    // is never used in creating a skeleton.
    checkSkeleton(sc, singleLower, "\\uFC5F", " \\u064d\\u0651");

    checkSkeleton(sc, singleLower, "nochange", "nochange");
    checkSkeleton(sc, mixedAny, "love", "love");
    checkSkeleton(sc, mixedAny, "1ove", "love"); // Digit 1 to letter l
    checkSkeleton(sc, mixedLower, "OOPS", "OOPS");
    checkSkeleton(sc, mixedLower, "00PS", "00PS"); // Digit 0 unchanged in lower case mode.
    checkSkeleton(sc, mixedAny, "OOPS", "OOPS");
    checkSkeleton(sc, mixedAny, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only
    checkSkeleton(sc, singleLower, "\\u059c", "\\u0301");
    checkSkeleton(sc, singleLower, "\\u2A74", "\\u003A\\u003A\\u003D");
    checkSkeleton(sc, singleLower, "\\u247E", "\\u0028\\u006c\\u006c\\u0029"); // "(ll)"
    checkSkeleton(sc, singleLower, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");

    // This mapping exists in the ML and MA tables, does not exist in SL, SA
    // 0C83 ; 0983 ; ML # KANNADA SIGN VISARGA to
    for (int type : new int[] { singleLower, singleAny }) {
        checkSkeleton(sc, type, "\\u0C83", "\\u0C83");
    }
    for (int type : new int[] { mixedLower, mixedAny }) {
        checkSkeleton(sc, type, "\\u0C83", "\\u0983");
    }

    // 0391 ; 0041 ; MA # GREEK CAPITAL LETTER ALPHA to LATIN CAPITAL LETTER A
    // This mapping exists only in the MA table.
    checkSkeleton(sc, mixedAny, "\\u0391", "A");
    for (int type : new int[] { singleAny, mixedLower, singleLower }) {
        checkSkeleton(sc, type, "\\u0391", "\\u0391");
    }

    // 13CF ; 0062 ; MA # CHEROKEE LETTER SI to LATIN SMALL LETTER B
    // This mapping exists in the ML and MA tables
    for (int type : new int[] { mixedLower, mixedAny }) {
        checkSkeleton(sc, type, "\\u13CF", "b");
    }
    for (int type : new int[] { singleLower, singleAny }) {
        checkSkeleton(sc, type, "\\u13CF", "\\u13CF");
    }

    // 0022 ; 0027 0027 ;
    // all tables
    for (int type : new int[] { singleLower, singleAny, mixedLower, mixedAny }) {
        checkSkeleton(sc, type, "\"", "\\u0027\\u0027");
    }

    teardown();
}
/**
 * Runs a single confusable skeleton transformation test case. Both strings are passed through
 * Utility.unescape() first; if the skeleton computed for the input differs from the expected
 * string, the mismatch is reported through errln.
 *
 * @param sc       the checker that computes the skeleton
 * @param type     the skeleton type handed to getSkeleton()
 * @param input    the escaped input string
 * @param expected the escaped expected skeleton
 */
void checkSkeleton(SpoofChecker sc, int type, String input, String expected) {
    String uInput = Utility.unescape(input);
    String uExpected = Utility.unescape(expected);
    String actual = sc.getSkeleton(type, uInput);
    if (uExpected.equals(actual)) {
        return;
    }
    errln("Actual and Expected skeletons differ.");
    StringBuilder detail = new StringBuilder();
    detail.append(" Actual Skeleton: \"").append(actual).append("\"\n");
    detail.append(" Expected Skeleton: \"").append(uExpected).append("\"");
    errln(detail.toString());
}
/**
 * Checks areConfusable() on two long strings that differ only in "will" versus "wi11"; the pair
 * must be reported as SINGLE_SCRIPT_CONFUSABLE.
 */
public void TestAreConfusable() {
    setup();
    // Each string is the same two-sentence half repeated twice.
    String plainHalf = "A long string that will overflow stack buffers. A long string that will overflow stack buffers. ";
    String digitHalf = "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ";
    String plain = plainHalf + plainHalf;
    String withDigits = digitHalf + digitHalf;
    int confusability = sc.areConfusable(plain, withDigits);
    TEST_ASSERT_EQ(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, confusability);
    teardown();
}
/**
 * Exercises the INVISIBLE check with combining marks: a single combining acute accent passes,
 * while a mark sequence that repeats the acute accent is flagged with INVISIBLE at position 7.
 */
public void TestInvisible() {
    setup();
    final int untouched = -42; // marker value stored in result.position before a call
    SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();

    // One combining acute accent: nothing is reported and the position stays as it was.
    String singleMark = Utility.unescape("abcd\\u0301ef");
    result.position = untouched;
    boolean flagged = sc.check(singleMark, result);
    TEST_ASSERT(!flagged);
    TEST_ASSERT_EQ(0, result.checks);
    TEST_ASSERT(result.position == -42); // unchanged

    // Acute, circumflex, acute on the same base letter.
    String repeatedMark = Utility.unescape("abcd\\u0301\\u0302\\u0301ef");
    flagged = sc.check(repeatedMark, result);
    TEST_ASSERT(flagged);
    TEST_ASSERT_EQ(SpoofChecker.INVISIBLE, result.checks);
    TEST_ASSERT_EQ(7, result.position);

    // Two acute accents, one from the composed a with acute accent, U+00E1,
    // and one separate.
    result.position = untouched;
    String composedPlusMark = Utility.unescape("abcd\\u00e1\\u0301xyz");
    flagged = sc.check(composedPlusMark, result);
    TEST_ASSERT(flagged);
    TEST_ASSERT_EQ(SpoofChecker.INVISIBLE, result.checks);
    TEST_ASSERT_EQ(7, result.position);

    teardown();
}
/**
 * Converts whitespace-separated hexadecimal code point values into the string they spell.
 * Empty tokens, such as the one produced by leading whitespace, are skipped.
 *
 * @param in hexadecimal code point values separated by whitespace
 * @return the string made of those code points, in order
 * @throws NumberFormatException if a token is not a valid hexadecimal integer
 */
private String parseHex(String in) {
    String[] tokens = in.split("\\s+");
    StringBuilder decoded = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
        String token = tokens[i];
        if (token.length() == 0) {
            continue;
        }
        int codePoint = Integer.parseInt(token, 16);
        decoded.appendCodePoint(codePoint);
    }
    return decoded.toString();
}
/**
 * Returns a printable ASCII form of a string for use in failure messages. Code points up to 0x7f
 * are copied as they are; other code points up to 0xffff become a backslash, a lower case 'u' and
 * four hex digits; supplementary code points become a backslash, an upper case 'U' and eight hex
 * digits, the digit count conventionally used with the upper case form.
 *
 * @param in the string to escape
 * @return the escaped string
 */
private String escapeString(String in) {
    StringBuilder out = new StringBuilder();
    int i = 0;
    while (i < in.length()) {
        int c = in.codePointAt(i);
        // Advance by one or two chars, so a surrogate pair is consumed as a single code point.
        i += Character.charCount(c);
        if (c <= 0x7f) {
            out.append((char) c);
        } else if (c <= 0xffff) {
            out.append(String.format("\\u%04x", c));
        } else {
            out.append(String.format("\\U%08x", c));
        }
    }
    return out.toString();
}
/**
 * Verifies that each item from the Unicode confusables.txt file transforms into the expected
 * skeleton. Lines whose source string is not already normalized are skipped. A line that does not
 * match the expected syntax is reported and stops the test. The data file reader is closed on
 * every path.
 */
public void testConfData() {
    BufferedReader confusablesRdr = null;
    try {
        // Read in the confusables.txt file. (Distributed by Unicode.org)
        String fileName = "unicode/confusables.txt";
        confusablesRdr = TestUtil.getDataReader(fileName, "UTF-8");
        // Create a default spoof checker to use in this test.
        SpoofChecker sc = new SpoofChecker.Builder().build();
        // Parse lines from the confusables.txt file. Example Line:
        // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
        // Lines have three fields. The hex fields can contain more than one character,
        // and each character may be more than 4 digits (for supplementals)
        // This regular expression matches lines and splits the fields into capture groups.
        // Capture group 1: map from chars
        // 2: map to chars
        // 3: table type, SL, ML, SA or MA
        // 4: Comment Lines Only
        // A line that matches neither alternative is a syntax error.
        Matcher parseLine = Pattern.compile(
                "\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
                        + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
                matcher("");
        Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
        int lineNum = 0;
        String inputLine;
        while ((inputLine = confusablesRdr.readLine()) != null) {
            lineNum++;
            parseLine.reset(inputLine);
            if (!parseLine.matches()) {
                errln("Syntax error in confusable data file at line " + lineNum);
                errln(inputLine);
                break;
            }
            if (parseLine.group(4) != null) {
                continue; // comment line
            }
            String from = parseHex(parseLine.group(1));
            if (!normalizer.isNormalized(from)) {
                // The source character was not NFKD.
                // Skip this case; the first step in obtaining a skeleton is to NFKD the input,
                // so the mapping in this line of confusables.txt will never be applied.
                continue;
            }
            String rawExpected = parseHex(parseLine.group(2));
            String expected = normalizer.normalize(rawExpected);
            // The pattern only admits these four table names; ML maps to type 0.
            int skeletonType = 0;
            String tableType = parseLine.group(3);
            if (tableType.equals("SL")) {
                skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE;
            } else if (tableType.equals("SA")) {
                skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE;
            } else if (tableType.equals("ML")) {
                skeletonType = 0;
            } else if (tableType.equals("MA")) {
                skeletonType = SpoofChecker.ANY_CASE;
            }
            String actual = sc.getSkeleton(skeletonType, from);
            if (!actual.equals(expected)) {
                errln("confusables.txt: " + lineNum + ": " + parseLine.group(0));
                errln("Actual: " + escapeString(actual));
                // Show the expected skeleton too, so a failure can be diagnosed from the log alone.
                errln("Expected: " + escapeString(expected));
            }
        }
    } catch (IOException e) {
        errln(e.toString());
    } finally {
        // Close the data file whether the loop finished, stopped early or threw.
        if (confusablesRdr != null) {
            try {
                confusablesRdr.close();
            } catch (IOException e) {
                errln(e.toString());
            }
        }
    }
}
}