// source/test/intltest/itspoof.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 * Copyright (C) 2009, International Business Machines Corporation
 * and others.  All Rights Reserved.
 **********************************************************************
 */
 /**
  * IntlTestSpoof: tests for the USpoofChecker API declared in unicode/uspoof.h
  */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_SPOOF_DETECT

 #include "itspoof.h"
 #include "unicode/uspoof.h"

 /*
  *   Assertion macros.  Each one reports a failure through errln(); the macros
  *   themselves never return from or abort the calling test.
  *
  *   TEST_ASSERT_SUCCESS(status)  fails when U_FAILURE(status) is true.
  *   TEST_ASSERT(expr)            fails when expr evaluates to false.
  *   TEST_ASSERT_EQ(a, b)         fails when a != b.
  *   TEST_ASSERT_NE(a, b)         fails when a == b.
  *
  *   TEST_ASSERT_EQ and TEST_ASSERT_NE compare and print their operands as int32_t
  *   (the messages use "%d").  Each operand is evaluated exactly once, so an operand
  *   may be a function call with side effects, and the value printed in the failure
  *   message is the same value that was compared.
  */
 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
     errln("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));}}

 #define TEST_ASSERT(expr) {if (!(expr)) { \
     errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);}}

 #define TEST_ASSERT_EQ(a, b) { \
     const int32_t testAssertLhs_ = (a); \
     const int32_t testAssertRhs_ = (b); \
     if (testAssertLhs_ != testAssertRhs_) { \
         errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
               __FILE__, __LINE__, #a, testAssertLhs_, #b, testAssertRhs_); }}

 #define TEST_ASSERT_NE(a, b) { \
     const int32_t testAssertLhs_ = (a); \
     const int32_t testAssertRhs_ = (b); \
     if (testAssertLhs_ == testAssertRhs_) { \
         errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
               __FILE__, __LINE__, #a, testAssertLhs_, #b, testAssertRhs_); }}

 /*
  *   TEST_SETUP / TEST_TEARDOWN
  *         Paired macros that wrap the body of one test case.
  *         TEST_SETUP opens a scope, declares "status" (initially U_ZERO_ERROR) and
  *         "sc" (the result of uspoof_open()), reports an open failure, and runs the
  *         code that follows only when the open succeeded.
  *         TEST_TEARDOWN ends that code, reports any failure left in "status",
  *         closes "sc" and closes the scope.
  *         Because each pair is its own scope, several pairs may appear in one function.
  */
 #define TEST_SETUP { \
     UErrorCode status = U_ZERO_ERROR; \
     USpoofChecker *sc = uspoof_open(&status); \
     TEST_ASSERT_SUCCESS(status); \
     if (U_SUCCESS(status)) {

 #define TEST_TEARDOWN \
     } \
     TEST_ASSERT_SUCCESS(status); \
     uspoof_close(sc); \
 }


 // runIndexedTest.  Maps a test index to its test.
 //     index   selects the test (0..3)
 //     exec    when true, a log line is written and the selected test is run;
 //             when false, only the name is reported
 //     name    receives the test name, or "" for an index with no test
 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
 {
     if (exec) {
         logln("TestSuite spoof: ");
     }

     if (index == 0) {
         name = "TestSpoofAPI";
         if (exec) {
             testSpoofAPI();
         }
     } else if (index == 1) {
         name = "TestSkeleton";
         if (exec) {
             testSkeleton();
         }
     } else if (index == 2) {
         name = "TestAreConfusable";
         if (exec) {
             testAreConfusable();
         }
     } else if (index == 3) {
         name = "TestInvisible";
         if (exec) {
             testInvisible();
         }
     } else {
         name = "";
     }
 }

 void IntlTestSpoof::testSpoofAPI() {

     TEST_SETUP
         UnicodeString s("uvw");
         int32_t position = 666;
         int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT_EQ(0, checkResults);
         TEST_ASSERT_EQ(666, position);
     TEST_TEARDOWN;

     TEST_SETUP
         UnicodeString s1("cxs");
         UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape();  // Cyrillic "cxs"
         int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);

     TEST_TEARDOWN;

     TEST_SETUP
         UnicodeString s("I1l0O");
         UnicodeString dest;
         UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT(UnicodeString("11100") == dest);
         TEST_ASSERT(&dest == &retStr);
     TEST_TEARDOWN;
 }


 // CHECK_SKELETON.  Shorthand for a checkSkeleton() call that passes the in-scope
 //                  checker "sc" and the line number of the invoking test case.
 //                  Expands to a braced block, so a trailing ';' is optional.
 #define CHECK_SKELETON(type, input, expected) \
     { checkSkeleton(sc, type, input, expected, __LINE__); }


 // testSkeleton.   Spot checks of skeleton generation, using individual mappings quoted
 //                 from the Unicode data file confusables.txt.
 //                 The cases cover replacements of different lengths, and mappings that
 //                 are present in some of the mapping tables but not in others.
 void IntlTestSpoof::testSkeleton() {
     // Skeleton "type" arguments.  The S* values set USPOOF_SINGLE_SCRIPT_CONFUSABLE and
     // the *A values set USPOOF_ANY_CASE; M* and *L leave the corresponding flag clear.
     // The names match the table labels quoted in the comments below.
     const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
     const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
     const uint32_t ML = 0;
     const uint32_t MA = USPOOF_ANY_CASE;

     // A long "identifier" that will overflow implementation stack buffers, forcing heap
     // allocations, together with the skeleton expected for it.
     const char *longInput =
         " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
         " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
         " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
         " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations.";
     const char *longExpected =
         " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
         " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
         " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
         " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations.";

     TEST_SETUP
         CHECK_SKELETON(SL, longInput, longExpected);

         // FC5F ;	FE74 0651 ;   ML  #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
         //                                ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA
         //    NFKD normalization turns this character into \u0020 \u064d \u0651, so the
         //    confusable mapping listed for it never takes part in building a skeleton.
         CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");

         // Assorted single mappings, with replacements of one to several characters.
         CHECK_SKELETON(SL, "nochange", "nochange");
         CHECK_SKELETON(MA, "love", "1ove");   // lower case l to digit 1
         CHECK_SKELETON(ML, "OOPS", "OOPS");
         CHECK_SKELETON(MA, "OOPS", "00PS");   // Letter O to digit 0 in any case mode only
         CHECK_SKELETON(SL, "\\u059c", "\\u0301");
         CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
         CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u0031\\u0031\\u0029");
         CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");

         // Present in the ML and MA tables, absent from SL and SA.
         //0C83 ;	0C03 ;	ML	# ( ಃ → ః ) KANNADA SIGN VISARGA → TELUGU SIGN VISARGA	# {source:513}
         CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
         CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
         CHECK_SKELETON(ML, "\\u0C83", "\\u0C03");
         CHECK_SKELETON(MA, "\\u0C83", "\\u0C03");

         // Present in the MA table only.
         // 0391 ; 0041 ; MA # ( Α → A ) GREEK CAPITAL LETTER ALPHA to LATIN CAPITAL LETTER A
         CHECK_SKELETON(MA, "\\u0391", "A");
         CHECK_SKELETON(SA, "\\u0391", "\\u0391");
         CHECK_SKELETON(ML, "\\u0391", "\\u0391");
         CHECK_SKELETON(SL, "\\u0391", "\\u0391");

         // Present in the ML and MA tables.
         // 13CF ;  0062 ;  MA  #  CHEROKEE LETTER SI to LATIN SMALL LETTER B
         CHECK_SKELETON(ML, "\\u13CF", "b");
         CHECK_SKELETON(MA, "\\u13CF", "b");
         CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
         CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");

         // Present in all tables.
         // 0022 ;  02B9 02B9 ;  SA  #*  QUOTATION MARK to MODIFIER LETTER PRIME, MODIFIER LETTER PRIME
         CHECK_SKELETON(SL, "\\u0022", "\\u02B9\\u02B9");
         CHECK_SKELETON(SA, "\\u0022", "\\u02B9\\u02B9");
         CHECK_SKELETON(ML, "\\u0022", "\\u02B9\\u02B9");
         CHECK_SKELETON(MA, "\\u0022", "\\u02B9\\u02B9");

     TEST_TEARDOWN;
 }


 //
 //  Run a single confusable skeleton transformation test case.
 //
 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
                                   const char *input, const char *expected, int32_t lineNum) {
     UnicodeString uInput = UnicodeString(input).unescape();
     UnicodeString uExpected = UnicodeString(expected).unescape();

     UErrorCode status = U_ZERO_ERROR;
     UnicodeString actual;
     uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
     if (U_FAILURE(status)) {
         errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
               u_errorName(status));
         return;
     }
     if (uExpected != actual) {
         errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
                __FILE__, __LINE__, lineNum);
         errln(UnicodeString(" Actual   Skeleton: \"") + actual + UnicodeString("\"\n") +
               UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
     }
 }

 // testAreConfusable.  Compare two long strings that differ only in "will" versus "wi11".
 //                     The pair is expected to be reported as single-script confusable.
 void IntlTestSpoof::testAreConfusable() {
     TEST_SETUP
         UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
                          "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
         UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
                          "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");

         // Make the call exactly once, as its own statement, and check the status before
         // looking at the result; the assertion macro is then handed plain values only.
         int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);

     TEST_TEARDOWN;
 }

 // testInvisible.  Exercise uspoof_checkUnicodeString() on text with combining marks.
 //                 Every call is made once, as its own statement, and its status is
 //                 checked before the result and the reported position are examined;
 //                 the assertion macros are handed plain values only.
 void IntlTestSpoof::testInvisible() {
     TEST_SETUP
         // A single combining acute accent: nothing is expected to be flagged, and the
         // caller's position value is expected to stay as it was.
         UnicodeString  s = UnicodeString("abcd\\u0301ef").unescape();
         int32_t position = -42;
         int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT_EQ(0, checkResults);
         TEST_ASSERT(position == -42);

         // The same combining mark, \u0301, twice in one sequence: expected to be
         // flagged as USPOOF_INVISIBLE with position 7.
         position = -42;
         UnicodeString  s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
         checkResults = uspoof_checkUnicodeString(sc, s2, &position, &status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT_EQ(USPOOF_INVISIBLE, checkResults);
         TEST_ASSERT_EQ(7, position);

         // Two acute accents, one from the composed a with acute accent, \u00e1,
         // and one separate.
         position = -42;
         UnicodeString  s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
         checkResults = uspoof_checkUnicodeString(sc, s3, &position, &status);
         TEST_ASSERT_SUCCESS(status);
         TEST_ASSERT_EQ(USPOOF_INVISIBLE, checkResults);
         TEST_ASSERT_EQ(7, position);
     TEST_TEARDOWN;
 }

 #endif /* #if !UCONFIG_NO_SPOOF_DETECT*/
	/*
	**********************************************************************
	* Copyright (C) 2009, International Business Machines Corporation
	* and others. All Rights Reserved.
	**********************************************************************
	*/
	/**
	* IntlTestSpoof: tests for the USpoofChecker API declared in unicode/uspoof.h
	*/

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_SPOOF_DETECT

	#include "itspoof.h"
	#include "unicode/uspoof.h"

	/*
	 * Assertion macros.  Each one reports a failure through errln(); the macros
	 * themselves never return from or abort the calling test.
	 *
	 * TEST_ASSERT_SUCCESS(status)  fails when U_FAILURE(status) is true.
	 * TEST_ASSERT(expr)            fails when expr evaluates to false.
	 * TEST_ASSERT_EQ(a, b)         fails when a != b.
	 * TEST_ASSERT_NE(a, b)         fails when a == b.
	 *
	 * TEST_ASSERT_EQ and TEST_ASSERT_NE compare and print their operands as int32_t
	 * (the messages use "%d").  Each operand is evaluated exactly once, so an operand
	 * may be a function call with side effects, and the value printed in the failure
	 * message is the same value that was compared.
	 */
	#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
	    errln("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));}}

	#define TEST_ASSERT(expr) {if (!(expr)) { \
	    errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);}}

	#define TEST_ASSERT_EQ(a, b) { \
	    const int32_t testAssertLhs_ = (a); \
	    const int32_t testAssertRhs_ = (b); \
	    if (testAssertLhs_ != testAssertRhs_) { \
	        errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
	              __FILE__, __LINE__, #a, testAssertLhs_, #b, testAssertRhs_); }}

	#define TEST_ASSERT_NE(a, b) { \
	    const int32_t testAssertLhs_ = (a); \
	    const int32_t testAssertRhs_ = (b); \
	    if (testAssertLhs_ == testAssertRhs_) { \
	        errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
	              __FILE__, __LINE__, #a, testAssertLhs_, #b, testAssertRhs_); }}

	/*
	 * TEST_SETUP / TEST_TEARDOWN
	 *     Paired macros that wrap the body of one test case.
	 *     TEST_SETUP opens a scope, declares "status" (initially U_ZERO_ERROR) and
	 *     "sc" (the result of uspoof_open()), reports an open failure, and runs the
	 *     code that follows only when the open succeeded.
	 *     TEST_TEARDOWN ends that code, reports any failure left in "status",
	 *     closes "sc" and closes the scope.
	 *     Because each pair is its own scope, several pairs may appear in one function.
	 */
	#define TEST_SETUP { \
	    UErrorCode status = U_ZERO_ERROR; \
	    USpoofChecker *sc = uspoof_open(&status); \
	    TEST_ASSERT_SUCCESS(status); \
	    if (U_SUCCESS(status)) {

	#define TEST_TEARDOWN \
	    } \
	    TEST_ASSERT_SUCCESS(status); \
	    uspoof_close(sc); \
	}




	// runIndexedTest.  Maps a test index to its test.
	//     index   selects the test (0..3)
	//     exec    when true, a log line is written and the selected test is run;
	//             when false, only the name is reported
	//     name    receives the test name, or "" for an index with no test
	// The last parameter is unused, so its name is commented out.
	void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
	{
	    if (exec) logln("TestSuite spoof: ");
	    switch (index) {
	        case 0:
	            name = "TestSpoofAPI";
	            if (exec) {
	                testSpoofAPI();
	            }
	            break;
	        case 1:
	            name = "TestSkeleton";
	            if (exec) {
	                testSkeleton();
	            }
	            break;
	        case 2:
	            name = "TestAreConfusable";
	            if (exec) {
	                testAreConfusable();
	            }
	            break;
	        case 3:
	            name = "TestInvisible";
	            if (exec) {
	                testInvisible();
	            }
	            break;
	        default:
	            name = "";
	            break;
	    }
	}

	// testSpoofAPI.  One basic call to each of the UnicodeString flavored functions:
	//                uspoof_checkUnicodeString(), uspoof_areConfusableUnicodeString()
	//                and uspoof_getSkeletonUnicodeString().
	void IntlTestSpoof::testSpoofAPI() {

	    // check: "uvw" is expected to raise no check bits and to leave the
	    // caller's position value as it was.
	    TEST_SETUP
	        UnicodeString s("uvw");
	        int32_t position = 666;
	        int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
	        TEST_ASSERT_SUCCESS(status);
	        TEST_ASSERT_EQ(0, checkResults);
	        TEST_ASSERT_EQ(666, position);
	    TEST_TEARDOWN;

	    // areConfusable: Latin "cxs" against its Cyrillic look-alike is expected to be
	    // reported as both mixed-script and whole-script confusable.
	    // The status of the call is reported by TEST_TEARDOWN.
	    TEST_SETUP
	        UnicodeString s1("cxs");
	        UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs"
	        int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
	        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);

	    TEST_TEARDOWN;

	    // getSkeleton: the skeleton is written to "dest", and the reference that is
	    // returned must be that same "dest" object.
	    TEST_SETUP
	        UnicodeString s("I1l0O");
	        UnicodeString dest;
	        UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
	        TEST_ASSERT_SUCCESS(status);
	        TEST_ASSERT(UnicodeString("11100") == dest);
	        TEST_ASSERT(&dest == &retStr);
	    TEST_TEARDOWN;
	}


	// CHECK_SKELETON.  Shorthand for a checkSkeleton() call that passes the in-scope
	//                  checker "sc" and the line number of the invoking test case.
	//                  Expands to a braced block, so a trailing ';' is optional.
	#define CHECK_SKELETON(type, input, expected) \
	    { checkSkeleton(sc, type, input, expected, __LINE__); }


	// testSkeleton. Spot check a number of confusable skeleton substitutions from the
	//               Unicode data file confusables.txt
	//               Test cases chosen for substitutions of various lengths, and
	//               membership in different mapping tables.
	void IntlTestSpoof::testSkeleton() {
	    // Skeleton "type" arguments.  The S* values set USPOOF_SINGLE_SCRIPT_CONFUSABLE and
	    // the *A values set USPOOF_ANY_CASE; M* and *L leave the corresponding flag clear.
	    // The names match the table labels quoted in the comments below.
	    const uint32_t ML = 0;
	    const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
	    const uint32_t MA = USPOOF_ANY_CASE;
	    const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;

	    TEST_SETUP
	        // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
	        CHECK_SKELETON(SL, " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
	                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
	                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
	                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",

	               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
	               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
	               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
	               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations.")

	        // FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
	        //                          ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA
	        //    This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping
	        //    is never used in creating a skeleton.
	        CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");

	        CHECK_SKELETON(SL, "nochange", "nochange");
	        CHECK_SKELETON(MA, "love", "1ove"); // lower case l to digit 1
	        CHECK_SKELETON(ML, "OOPS", "OOPS");
	        CHECK_SKELETON(MA, "OOPS", "00PS"); // Letter O to digit 0 in any case mode only
	        CHECK_SKELETON(SL, "\\u059c", "\\u0301");
	        CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
	        CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u0031\\u0031\\u0029");
	        CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");

	        // This mapping exists in the ML and MA tables, does not exist in SL, SA
	        //0C83 ; 0C03 ; ML # ( ಃ → ః ) KANNADA SIGN VISARGA → TELUGU SIGN VISARGA # {source:513}
	        CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
	        CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
	        CHECK_SKELETON(ML, "\\u0C83", "\\u0C03");
	        CHECK_SKELETON(MA, "\\u0C83", "\\u0C03");

	        // 0391 ; 0041 ; MA # ( Α → A ) GREEK CAPITAL LETTER ALPHA to LATIN CAPITAL LETTER A
	        // This mapping exists only in the MA table.
	        CHECK_SKELETON(MA, "\\u0391", "A");
	        CHECK_SKELETON(SA, "\\u0391", "\\u0391");
	        CHECK_SKELETON(ML, "\\u0391", "\\u0391");
	        CHECK_SKELETON(SL, "\\u0391", "\\u0391");

	        // 13CF ; 0062 ; MA # CHEROKEE LETTER SI to LATIN SMALL LETTER B
	        // This mapping exists in the ML and MA tables
	        CHECK_SKELETON(ML, "\\u13CF", "b");
	        CHECK_SKELETON(MA, "\\u13CF", "b");
	        CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
	        CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");

	        // 0022 ; 02B9 02B9 ; SA #* QUOTATION MARK to MODIFIER LETTER PRIME, MODIFIER LETTER PRIME
	        // all tables.
	        CHECK_SKELETON(SL, "\\u0022", "\\u02B9\\u02B9");
	        CHECK_SKELETON(SA, "\\u0022", "\\u02B9\\u02B9");
	        CHECK_SKELETON(ML, "\\u0022", "\\u02B9\\u02B9");
	        CHECK_SKELETON(MA, "\\u0022", "\\u02B9\\u02B9");

	    TEST_TEARDOWN;
	}


	//
	// checkSkeleton.  Run a single confusable skeleton transformation test case.
	//     sc, type    passed straight through to uspoof_getSkeletonUnicodeString()
	//     input       text to transform; escapes are expanded with UnicodeString::unescape()
	//     expected    the skeleton that should result, unescaped the same way
	//     lineNum     line of the test case, echoed in every failure message
	// "input" and "expected" are C strings (const char *), which is what the
	// CHECK_SKELETON call sites pass.
	// Failures are reported through errln(); nothing is returned.
	//
	void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
	                                  const char *input, const char *expected, int32_t lineNum) {
	    UnicodeString uInput = UnicodeString(input).unescape();
	    UnicodeString uExpected = UnicodeString(expected).unescape();

	    UErrorCode status = U_ZERO_ERROR;
	    UnicodeString actual;
	    uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
	    if (U_FAILURE(status)) {
	        // An API failure is reported on its own; no comparison is attempted.
	        errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
	              u_errorName(status));
	        return;
	    }
	    if (uExpected != actual) {
	        errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
	              __FILE__, __LINE__, lineNum);
	        errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") +
	              UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
	    }
	}

	// testAreConfusable.  Compare two long strings that differ only in "will" versus "wi11".
	//                     The pair is expected to be reported as single-script confusable.
	void IntlTestSpoof::testAreConfusable() {
	    TEST_SETUP
	        UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
	                         "A long string that will overflow stack buffers. A long string that will overflow stack buffers. ");
	        UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
	                         "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ");

	        // Make the call exactly once, as its own statement, and check the status before
	        // looking at the result; the assertion macro is then handed plain values only.
	        int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
	        TEST_ASSERT_SUCCESS(status);
	        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);

	    TEST_TEARDOWN;
	}

	// testInvisible.  Exercise uspoof_checkUnicodeString() on text with combining marks.
	//                 Every call is made once, as its own statement, and its status is
	//                 checked before the result and the reported position are examined;
	//                 the assertion macros are handed plain values only.
	void IntlTestSpoof::testInvisible() {
	    TEST_SETUP
	        // A single combining acute accent: nothing is expected to be flagged, and the
	        // caller's position value is expected to stay as it was.
	        UnicodeString s = UnicodeString("abcd\\u0301ef").unescape();
	        int32_t position = -42;
	        int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
	        TEST_ASSERT_SUCCESS(status);
	        TEST_ASSERT_EQ(0, checkResults);
	        TEST_ASSERT(position == -42);

	        // The same combining mark, \u0301, twice in one sequence: expected to be
	        // flagged as USPOOF_INVISIBLE with position 7.
	        position = -42;
	        UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
	        checkResults = uspoof_checkUnicodeString(sc, s2, &position, &status);
	        TEST_ASSERT_SUCCESS(status);
	        TEST_ASSERT_EQ(USPOOF_INVISIBLE, checkResults);
	        TEST_ASSERT_EQ(7, position);

	        // Two acute accents, one from the composed a with acute accent, \u00e1,
	        // and one separate.
	        position = -42;
	        UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
	        checkResults = uspoof_checkUnicodeString(sc, s3, &position, &status);
	        TEST_ASSERT_SUCCESS(status);
	        TEST_ASSERT_EQ(USPOOF_INVISIBLE, checkResults);
	        TEST_ASSERT_EQ(7, position);
	    TEST_TEARDOWN;
	}

	#endif /* #if !UCONFIG_NO_SPOOF_DETECT*/