blob: ca2c8a0d6f435208fa5023d2cae63ee7151897d2 [file] [log] [blame]
/*
**********************************************************************
* Copyright (C) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 05/23/00 aliu Creation.
**********************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/translit.h"
#include "unicode/rbt.h"
#include "unicode/uniset.h"
#include "unicode/unicode.h"
#include "unicode/normlzr.h"
#include "unicode/uchar.h"
#include "transrt.h"
#include "testutil.h"
// Expands to one `case` label of the switch in runIndexedTest(): it always
// stores the stringized test name in `name` and, only when `exec` is true,
// logs a header line plus an empty line and then invokes the test method.
// The expansion ends in `break` without a semicolon, so each use is written
// as a statement: CASE(n, TestFoo);
#define CASE(id,test) case id: \
name = #test; \
if (exec) { \
logln(#test "---"); \
logln((UnicodeString)""); \
test(); \
} \
break
// #define ENABLE_FAILING_TESTS
// Dispatches test number `index` through the CASE macro.
// `name` is always set: to the selected test's name, or to "" when `index`
// matches no case. The test method itself runs only when `exec` is true.
// The last parameter is unused.
void
TransliteratorRoundTripTest::runIndexedTest(int32_t index, UBool exec,
const char* &name, char* /*par*/) {
switch (index) {
CASE(0,TestHiragana);
CASE(1,TestKatakana);
// TestArabic and TestHebrew are not dispatched; their bodies are commented out.
//CASE(2,TestArabic);
//CASE(3,TestHebrew);
CASE(2,TestGreek);
CASE(3,Testel);
CASE(4,TestCyrillic);
CASE(5,TestDevanagariLatin);
CASE(6,TestDevanagariBengali);
CASE(7,TestDevanagariGurmukhi);
CASE(8,TestDevanagariGujarati);
CASE(9,TestDevanagariOriya);
CASE(10,TestDevanagariTamil);
CASE(11,TestDevanagariTelugu);
CASE(12,TestDevanagariKannada);
CASE(13,TestDevanagariMalayalam);
// Indexes 14 and 15 exist only when ENABLE_FAILING_TESTS is defined.
#ifdef ENABLE_FAILING_TESTS
CASE(14,TestJamo);
CASE(15,TestJamoHangul);
#endif
default: name = ""; break;
}
}
//--------------------------------------------------------------------
// BitSet
//--------------------------------------------------------------------
/**
 * Tiny and incomplete BitSet. Hardcoded to support 0..FFFF.
 *
 * Indexes passed to set() and get() are masked with 0xFFFF, so any value
 * outside that range aliases onto a bit inside it.
 *
 * The words are unsigned and every mask is built from an unsigned 1, so
 * setting or testing bit 31 of a word never shifts into a sign bit.
 */
class BitSet {
    enum {
        BITS_PER_WORD = 32,
        WORD_COUNT = 65536 / BITS_PER_WORD
    };
    uint32_t bits[WORD_COUNT];
public:
    /** Creates a set with every bit cleared. */
    BitSet();
    ~BitSet();
    /** Clears every bit. */
    void clear();
    /** Sets bit (x & 0xFFFF). */
    void set(int32_t x);
    /** Returns whether bit (x & 0xFFFF) is set. */
    UBool get(int32_t x) const;
};

BitSet::BitSet() {
    clear();
}

BitSet::~BitSet() {
}

void BitSet::clear() {
    for (int32_t i = 0; i < WORD_COUNT; ++i) {
        bits[i] = 0;
    }
}

void BitSet::set(int32_t x) {
    x &= 0xFFFF;  // after masking x is non-negative and below 65536
    bits[x / BITS_PER_WORD] |= (uint32_t)1 << (x & 31);
}

UBool BitSet::get(int32_t x) const {
    x &= 0xFFFF;
    const uint32_t mask = (uint32_t)1 << (x & 31);
    return (bits[x / BITS_PER_WORD] & mask) != 0;
}
//--------------------------------------------------------------------
// Legal
//--------------------------------------------------------------------
/**
 * Predicate over source strings. RTTest consults it before reporting a
 * round-trip failure and stays silent for strings that are not "legal".
 * This base implementation accepts every string; subclasses override is().
 */
class Legal {
public:
    Legal() {}
    virtual ~Legal() {}

    /** Returns TRUE for every input; the argument is not examined. */
    virtual UBool is(const UnicodeString& /*sourceString*/) const {
        return TRUE;
    }
};
/**
 * Legality predicate for Greek source strings.
 *
 * Constructed with TRUE, is() applies the breathing-mark rules; constructed
 * with FALSE, is() instead rejects strings containing any of a fixed list of
 * combining accents (see LegalGreek::is for both rule sets).
 */
class LegalGreek : public Legal {
    UBool full;  // TRUE: breathing-mark rules; FALSE: accent-exclusion rules
public:
    LegalGreek(UBool _full) : full(_full) {}
    virtual ~LegalGreek() {}

    virtual UBool is(const UnicodeString& sourceString) const;

    /** TRUE if c is one of the Greek vowel letters listed in isVowel(). */
    static UBool isVowel(UChar c);
    /** TRUE if c is U+03C1 or U+03A1. */
    static UBool isRho(UChar c);
};
/*
 * Decides whether sourceString is legal Greek. All checks run on the
 * canonical decomposition of the string; the status code returned by the
 * normalizer is not inspected.
 *
 * full == FALSE: illegal if the string is exactly U+039C U+03C0, or if its
 * decomposition contains any of U+0300, U+0302, U+0313, U+0314, U+0342,
 * U+0345. Legal otherwise.
 *
 * full == TRUE: the string has breathing marks (U+0313 / U+0314) IFF its
 * first letter is a vowel or a rho, and then it has exactly one. When the
 * first letter is a rho, the breathing mark must come before the second
 * letter. Greek has no surrogates, so code units are examined individually.
 */
UBool LegalGreek::is(const UnicodeString& sourceString) const {
    UErrorCode ec = U_ZERO_ERROR;
    UnicodeString nfd;
    Normalizer::decompose(sourceString, FALSE, 0, nfd, ec);
    const int32_t len = nfd.length();

    if (!full) {
        if (sourceString == CharsToUnicodeString("\\u039C\\u03C0")) {
            return FALSE;
        }
        for (int32_t pos = 0; pos < len; ++pos) {
            switch (nfd.charAt(pos)) {
            case 0x0300:
            case 0x0302:
            case 0x0313:
            case 0x0314:
            case 0x0342:
            case 0x0345:
                // one of the excluded accents
                return FALSE;
            default:
                break;
            }
        }
        return TRUE;
    }

    UBool startsWithVowel = FALSE;
    UBool startsWithRho = FALSE;
    int32_t breathings = 0;
    int32_t letters = 0;
    for (int32_t pos = 0; pos < len; ++pos) {
        const UChar ch = nfd.charAt(pos);
        if (u_isalpha(ch)) {
            if (++letters == 1) {
                // classify the very first letter only
                startsWithVowel = isVowel(ch);
                startsWithRho = isRho(ch);
            }
            if (startsWithRho && letters == 2 && breathings == 0) {
                // second letter reached after an initial rho with no breathing
                return FALSE;
            }
        }
        if (ch == 0x0313 || ch == 0x0314) {
            ++breathings;
        }
    }

    const int32_t expected = (startsWithVowel || startsWithRho) ? 1 : 0;
    return breathings == expected;
}
/*
 * TRUE if c is one of the fourteen unaccented Greek vowel letters
 * (alpha, epsilon, eta, iota, omicron, upsilon, omega in either case).
 */
UBool LegalGreek::isVowel(UChar c) {
    static const UChar VOWELS[] = {
        0x03B1, 0x03B5, 0x03B7, 0x03B9, 0x03BF, 0x03C5, 0x03C9,  // small
        0x0391, 0x0395, 0x0397, 0x0399, 0x039F, 0x03A5, 0x03A9   // capital
    };
    const int32_t count = (int32_t)(sizeof(VOWELS) / sizeof(VOWELS[0]));
    for (int32_t i = 0; i < count; ++i) {
        if (VOWELS[i] == c) {
            return TRUE;
        }
    }
    return FALSE;
}
/* TRUE if c is U+03C1 (small rho) or U+03A1 (capital rho). */
UBool LegalGreek::isRho(UChar c) {
    return (c == 0x03C1 || c == 0x03A1) ? TRUE : FALSE;
}
class LegalDeleter {
Legal* obj;
Legal*& zeroMe;
public:
LegalDeleter(Legal* adopted, Legal*& ptrToClean) :
obj(adopted),
zeroMe(ptrToClean) {}
~LegalDeleter() { delete obj; zeroMe = NULL; }
};
//--------------------------------------------------------------------
// RTTest Interface
//--------------------------------------------------------------------
/*
 * Round-trip tester for one transliterator ID.
 * test() configures the character sets and then runs test2(), which checks
 * source->target conversion and target->source->target round trips for
 * single characters and for pairs, counting failures in errorCount.
 */
class RTTest {
// PrintWriter out;
UnicodeString transliteratorID;
int8_t sourceScript;
int8_t targetScript;
int32_t errorLimit;   // test2() returns once errorCount reaches this value
int32_t errorCount;   // incremented by the log* failure reporters
int32_t pairLimit;    // cap on first characters tried in the target "Doubles" pass
UnicodeSet sourceRange;
UnicodeSet targetRange;   // when empty, isTarget() applies no range restriction
UnicodeSet roundtripExclusions;   // characters exempt from the round-trip comparison
IntlTest* log;   // assigned in test(); not set by the constructor
Legal* legalSource; // NOT owned by this object; test() deletes the adopted Legal through LegalDeleter and resets this to NULL
UnicodeSet badCharacters;   // transliteration output containing any of these is reported
public:
/*
* create a test for the given script transliterator.
*/
RTTest(const UnicodeString& transliteratorIDStr,
int8_t sourceScriptVal, int8_t targetScriptVal);
virtual ~RTTest();
void setErrorLimit(int32_t limit);
void setPairLimit(int32_t limit);
/*
* Run the round-trip test.
* sourceRange / targetRange are UnicodeSet patterns; an empty sourceRange
* selects "[a-zA-Z]", and an empty targetRange leaves the target unrestricted.
* roundtripExclusions is a UnicodeSet pattern or NULL.
* log receives all output. adoptedLegal is deleted before test() returns.
*/
void test(const UnicodeString& sourceRange,
const UnicodeString& targetRange,
const char* roundtripExclusions,
IntlTest* log,
Legal* adoptedLegal);
private:
// Added to do better equality check.
static UBool isSame(const UnicodeString& a, const UnicodeString& b);
// TRUE if any code point of a is contained in set.
UBool includesSome(const UnicodeSet& set, const UnicodeString& a);
// TRUE if a lowercase or titlecase letter is later followed by an
// uppercase or titlecase letter.
static UBool isCamel(const UnicodeString& a);
void test2();
void logWrongScript(const UnicodeString& label,
const UnicodeString& from,
const UnicodeString& to);
void logRoundTripFailure(const UnicodeString& from,
const UnicodeString& to,
const UnicodeString& back);
void logNotCanonical(const UnicodeString& label,
const UnicodeString& from,
const UnicodeString& to,
const UnicodeString& toCan);
protected:
/*
* Characters to filter for source-target mapping completeness:
* alphabetic characters of the source script that are in sourceRange.
*/
virtual UBool isSource(UChar c);
/*
* Characters accepted as output of the target-to-source direction:
* those whose script is the source script or the common script.
*/
inline UBool isReceivingSource(UChar c);
/*
* Characters to filter for target-source mapping:
* alphabetic characters of the target script that are in targetRange
* (any such character when targetRange is empty).
*/
inline UBool isTarget(UChar c);
/*
* Characters accepted as output of the source-to-target direction:
* those whose script is the target script or the common script.
*/
inline UBool isReceivingTarget(UChar c);
// String overloads: TRUE when every UTF-16 code unit of s passes the
// corresponding single-character predicate (so TRUE for an empty string).
UBool isSource(const UnicodeString& s);
UBool isTarget(const UnicodeString& s);
UBool isReceivingSource(const UnicodeString& s);
UBool isReceivingTarget(const UnicodeString& s);
};
//--------------------------------------------------------------------
// RTTest Implementation
//--------------------------------------------------------------------
/*
 * Creates a tester for the given transliterator ID and script pair.
 * Defaults: effectively no error limit (0x7FFFFFFF), a pair limit of
 * 0x10000, and no Legal predicate. The log pointer is left unset here;
 * test() assigns it.
 */
RTTest::RTTest(const UnicodeString& transliteratorIDStr,
               int8_t sourceScriptVal, int8_t targetScriptVal)
    : transliteratorID(transliteratorIDStr),
      sourceScript(sourceScriptVal),
      targetScript(targetScriptVal),
      errorLimit((int32_t)0x7FFFFFFFL),
      errorCount(0),
      pairLimit(0x10000),
      legalSource(NULL) {
}
// Performs no explicit cleanup; members release themselves.
RTTest::~RTTest() {
}
void RTTest::setErrorLimit(int32_t limit) {
errorLimit = limit;
}
/*
 * Sets how many first characters the target "Doubles" pass of test2() may
 * try before it stops with a "Test truncated" message.
 */
void RTTest::setPairLimit(int32_t limit) {
    this->pairLimit = limit;
}
/*
 * Lenient equality used for round-trip comparison. Two strings are "the same"
 * if they are identical, or equal under default case folding while the first
 * one is camel-cased (see isCamel). If neither holds for the raw strings,
 * both tests are repeated on their canonical decompositions.
 */
UBool RTTest::isSame(const UnicodeString& a, const UnicodeString& b) {
    if (a == b) {
        return TRUE;
    }
    if (a.caseCompare(b, U_FOLD_CASE_DEFAULT) == 0 && isCamel(a)) {
        return TRUE;
    }

    // Retry on the decomposed forms; the normalizer status is not inspected.
    UErrorCode ec = U_ZERO_ERROR;
    UnicodeString aDecomp;
    UnicodeString bDecomp;
    Normalizer::decompose(a, FALSE, 0, aDecomp, ec);
    Normalizer::decompose(b, FALSE, 0, bDecomp, ec);

    if (aDecomp == bDecomp) {
        return TRUE;
    }
    const UBool foldedCamel =
        aDecomp.caseCompare(bDecomp, U_FOLD_CASE_DEFAULT) == 0 && isCamel(aDecomp);
    return foldedCamel ? TRUE : FALSE;
}
/* TRUE if at least one code point of a is a member of set. */
UBool RTTest::includesSome(const UnicodeSet& set, const UnicodeString& a) {
    int32_t pos = 0;
    while (pos < a.length()) {
        const UChar32 cp = a.char32At(pos);
        if (set.contains(cp)) {
            return TRUE;
        }
        pos += UTF_CHAR_LENGTH(cp);  // advance by one whole code point
    }
    return FALSE;
}
/*
 * TRUE if the string is of the form aB: some lowercase (or titlecase) letter
 * is followed, anywhere later in the string, by an uppercase or titlecase
 * letter. A titlecase letter counts as "lower" for whatever follows it,
 * because its second letter is lowercase.
 */
UBool RTTest::isCamel(const UnicodeString& a) {
    UBool seenLower = FALSE;
    int32_t pos = 0;
    while (pos < a.length()) {
        const UChar32 cp = a.char32At(pos);
        pos += UTF_CHAR_LENGTH(cp);

        const int8_t category = u_charType(cp);
        const UBool upper = (category == U_UPPERCASE_LETTER);
        const UBool title = (category == U_TITLECASE_LETTER);

        if (seenLower && (upper || title)) {
            return TRUE;
        }
        if (title || category == U_LOWERCASE_LETTER) {
            seenLower = TRUE;
        }
    }
    return FALSE;
}
void RTTest::test(const UnicodeString& sourceRangeVal,
const UnicodeString& targetRangeVal,
const char* roundtripExclusions,
IntlTest* logVal,
Legal* adoptedLegal) {
UErrorCode status = U_ZERO_ERROR;
this->log = logVal;
this->legalSource = adoptedLegal;
LegalDeleter cleaner(adoptedLegal, this->legalSource);
if (sourceRangeVal.length() > 0) {
this->sourceRange.applyPattern(sourceRangeVal, status);
if (U_FAILURE(status)) {
log->errln("FAIL: UnicodeSet::applyPattern(" +
sourceRangeVal + ")");
return;
}
} else {
this->sourceRange.applyPattern("[a-zA-Z]", status);
if (U_FAILURE(status)) {
log->errln("FAIL: UnicodeSet::applyPattern([a-z])");
return;
}
}
this->targetRange.clear();
if (targetRangeVal.length() > 0) {
this->targetRange.applyPattern(targetRangeVal, status);
if (U_FAILURE(status)) {
log->errln("FAIL: UnicodeSet::applyPattern(" +
targetRangeVal + ")");
return;
}
}
this->roundtripExclusions.clear();
if (roundtripExclusions != NULL) {
UErrorCode ec = U_ZERO_ERROR;
this->roundtripExclusions.applyPattern(roundtripExclusions, ec);
}
if (badCharacters.isEmpty()) {
UErrorCode ec = U_ZERO_ERROR;
badCharacters.applyPattern("[:Other:]", ec);
}
test2();
if (errorCount > 0) {
log->errln(transliteratorID + " errors: " + errorCount); // + ", see " + logFileName);
} else {
log->logln(transliteratorID + " ok");
}
}
/*
 * Reports that transliterating `from` produced `to`, which failed the
 * script / bad-character check, and counts one error. Both strings are shown
 * followed by their hex form.
 */
void RTTest::logWrongScript(const UnicodeString& label,
                            const UnicodeString& from,
                            const UnicodeString& to) {
    const UnicodeString fromPart = from + "(" + TestUtility::hex(from) + ") => ";
    const UnicodeString toPart = to + "(" + TestUtility::hex(to) + ")";
    log->errln((UnicodeString)"Fail " + label + ": " + fromPart + toPart);
    ++errorCount;
}
/*
 * Reports a canonical-equivalence failure and counts one error.
 *
 * label  direction being tested ("Source-Target" or "Target-Source").
 * from   the original input.
 * to     the result obtained from the input as given.
 * toCan  the string obtained via the canonically decomposed form, as supplied
 *        by the caller.
 *
 * Each string is followed by its own hex form; in particular the dump after
 * toCan is that of toCan itself, not a second copy of `to`.
 */
void RTTest::logNotCanonical(const UnicodeString& label,
                             const UnicodeString& from,
                             const UnicodeString& to,
                             const UnicodeString& toCan) {
    log->errln((UnicodeString)"Fail (can.equiv)" +
               label + ": " +
               from + "(" + TestUtility::hex(from) + ") => " +
               to + "(" + TestUtility::hex(to) + ")" +
               toCan + " (" +
               TestUtility::hex(toCan) + ")"
               );
    ++errorCount;
}
/*
 * Reports that `from` became `to` and then `back`, where `back` does not
 * match `from`, and counts one error. Nothing is reported or counted when
 * the Legal predicate rejects `from`.
 */
void RTTest::logRoundTripFailure(const UnicodeString& from,
                                 const UnicodeString& to,
                                 const UnicodeString& back) {
    if (legalSource->is(from)) {
        const UnicodeString fromPart = from + "(" + TestUtility::hex(from) + ") => ";
        const UnicodeString toPart = to + "(" + TestUtility::hex(to) + ") => ";
        const UnicodeString backPart = back + "(" + TestUtility::hex(back) + ") => ";
        log->errln((UnicodeString)"Fail Roundtrip: " + fromPart + toPart + backPart);
        ++errorCount;
    }
    // otherwise: illegal source string, skipped silently
}
/*
 * Source-side filter for mapping completeness: TRUE when c belongs to the
 * source script, is alphabetic, and lies inside sourceRange.
 */
UBool RTTest::isSource(UChar c) {
    if (TestUtility::getScript(c) != sourceScript) {
        return FALSE;
    }
    if (!u_isalpha(c)) {
        return FALSE;
    }
    return sourceRange.contains(c) ? TRUE : FALSE;
}
/*
 * Acceptance check for output of the target-to-source direction: TRUE when
 * the script of c is the source script or the common script.
 */
inline UBool
RTTest::isReceivingSource(UChar c) {
    const int8_t script = TestUtility::getScript(c);
    if (script == sourceScript) {
        return TRUE;
    }
    return script == TestUtility::COMMON_SCRIPT;
}
/*
 * Target-side filter: TRUE when c belongs to the target script, is
 * alphabetic, and either targetRange is empty (no restriction) or
 * targetRange contains c.
 */
inline UBool
RTTest::isTarget(UChar c) {
    if (TestUtility::getScript(c) != targetScript || !u_isalpha(c)) {
        return FALSE;
    }
    if (targetRange.isEmpty()) {
        return TRUE;
    }
    return targetRange.contains(c) ? TRUE : FALSE;
}
/*
 * Acceptance check for output of the source-to-target direction: TRUE when
 * the script of c is the target script or the common script.
 */
inline UBool
RTTest::isReceivingTarget(UChar c) {
    const int8_t script = TestUtility::getScript(c);
    if (script == targetScript) {
        return TRUE;
    }
    return script == TestUtility::COMMON_SCRIPT;
}
/* TRUE when every code unit of s passes isSource(UChar); TRUE for "". */
UBool RTTest::isSource(const UnicodeString& s) {
    const int32_t n = s.length();
    int32_t pos = 0;
    while (pos < n && isSource(s.charAt(pos))) {
        ++pos;
    }
    return pos == n;  // stopped early only if some code unit failed
}
/* TRUE when every code unit of s passes isTarget(UChar); TRUE for "". */
UBool RTTest::isTarget(const UnicodeString& s) {
    const int32_t n = s.length();
    int32_t pos = 0;
    while (pos < n && isTarget(s.charAt(pos))) {
        ++pos;
    }
    return pos == n;  // stopped early only if some code unit failed
}
/* TRUE when every code unit of s passes isReceivingSource(UChar); TRUE for "". */
UBool RTTest::isReceivingSource(const UnicodeString& s) {
    const int32_t n = s.length();
    int32_t pos = 0;
    while (pos < n && isReceivingSource(s.charAt(pos))) {
        ++pos;
    }
    return pos == n;  // stopped early only if some code unit failed
}
/* TRUE when every code unit of s passes isReceivingTarget(UChar); TRUE for "". */
UBool RTTest::isReceivingTarget(const UnicodeString& s) {
    const int32_t n = s.length();
    int32_t pos = 0;
    while (pos < n && isReceivingTarget(s.charAt(pos))) {
        ++pos;
    }
    return pos == n;  // stopped early only if some code unit failed
}
//--------------------------------------------------------------------
// Specific Tests
//--------------------------------------------------------------------
void TransliteratorRoundTripTest::TestHiragana() {
RTTest test("Latin-Hiragana",
TestUtility::LATIN_SCRIPT, TestUtility::HIRAGANA_SCRIPT);
test.test("[a-z]", UnicodeString("[\\u3040-\\u3094]", ""), NULL, this, new Legal());
}
void TransliteratorRoundTripTest::TestKatakana() {
RTTest test("Latin-Katakana",
TestUtility::LATIN_SCRIPT, TestUtility::KATAKANA_SCRIPT);
test.test("[a-z]", UnicodeString("[\\u30A1-\\u30FA\\u30FC]", ""), NULL, this, new Legal());
}
// Disabled: the body is commented out, so this test does nothing.
// NOTE: the commented-out test.test(...) call passes four arguments, while
// RTTest::test() takes five (a roundtripExclusions argument comes before the
// log); it would need updating before being re-enabled.
void TransliteratorRoundTripTest::TestArabic() {
// RTTest test("Latin-Arabic",
// TestUtility::LATIN_SCRIPT, TestUtility::ARABIC_SCRIPT);
// test.test("[a-z]", UnicodeString("[\\u0620-\\u065F-[\\u0640]]", ""), this, new Legal());
}
// Disabled: the body is commented out, so this test does nothing.
// NOTE: the commented-out test.test(...) call passes four arguments, while
// RTTest::test() takes five (a roundtripExclusions argument comes before the
// log); it would need updating before being re-enabled.
void TransliteratorRoundTripTest::TestHebrew() {
// RTTest test("Latin-Hebrew",
// TestUtility::LATIN_SCRIPT, TestUtility::HEBREW_SCRIPT);
// test.test("", UnicodeString("[\\u05D0-\\u05EF]", ""), this, new Legal());
}
void TransliteratorRoundTripTest::TestJamo() {
RTTest t("Latin-Jamo",
TestUtility::LATIN_SCRIPT, TestUtility::JAMO_SCRIPT);
t.setErrorLimit(200); // Don't run full test -- too long
t.test("", "", NULL, this, new Legal());
}
void TransliteratorRoundTripTest::TestJamoHangul() {
RTTest t("Latin-Hangul",
TestUtility::LATIN_SCRIPT, TestUtility::HANGUL_SCRIPT);
t.setErrorLimit(50); // Don't run full test -- too long
t.test("", "", NULL, this, new Legal());
}
void TransliteratorRoundTripTest::TestGreek() {
RTTest test("Latin-Greek",
TestUtility::LATIN_SCRIPT, TestUtility::GREEK_SCRIPT);
test.test("", UnicodeString("[\\u003B\\u00B7[:Greek:]-[\\u03D7-\\u03EF]]", ""),
"[\\u037A\\u03D0-\\u03F5]", /* exclusions */
this, new LegalGreek(TRUE));
}
void TransliteratorRoundTripTest::Testel() {
RTTest test("Latin-el",
TestUtility::LATIN_SCRIPT, TestUtility::GREEK_SCRIPT);
test.test("", "[\\u003B\\u00B7[:Greek:]-[\\u03D7-\\u03EF]]",
"[\\u037A\\u03D0-\\u03F5]", /* exclusions */
this, new LegalGreek(FALSE));
}
void TransliteratorRoundTripTest::TestCyrillic() {
RTTest test("Latin-Cyrillic",
TestUtility::LATIN_SCRIPT, TestUtility::CYRILLIC_SCRIPT);
test.test("", UnicodeString("[\\u0400-\\u045F]", ""), NULL, this, new Legal());
}
//----------------------------------
// Inter-Indic Tests
//----------------------------------
void TransliteratorRoundTripTest::TestDevanagariLatin() {
RTTest test("Latin-DEVANAGARI",
TestUtility::LATIN_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("", "[:Devanagari:]", NULL, this, new Legal());
}
void TransliteratorRoundTripTest::TestDevanagariBengali() {
RTTest test("BENGALI-DEVANAGARI",
TestUtility::BENGALI_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("[:BENGALI:]", "[:Devanagari:]",
"[\\u0950\\u0935\\u0912\\u0933\\u090e\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]\\u09F0\\u09F1]", /*roundtrip exclusions*/
this, new Legal());
RTTest test1("DEVANAGARI-BENGALI",
TestUtility::DEVANAGARI_SCRIPT, TestUtility::BENGALI_SCRIPT );
test1.test( "[:Devanagari:]", "[:BENGALI:]",
"[\\u0950\\u0935\\u0912\\u0933\\u090e\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]\\u09F0\\u09F1]", /*roundtrip exclusions*/
this, new Legal());
}
void TransliteratorRoundTripTest::TestDevanagariGurmukhi() {
RTTest test("GURMUKHI-DEVANAGARI",
TestUtility::GURMUKHI_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("[:GURMUKHI:]", "[:Devanagari:]",
"[\\u0950\\u090D\\u090e\\u0912\\u0911\\u090b\\u090c\\u0934\\u0960\\u0961\\u0937\\u0a72\\u0a73\\u0a74\\u093d]", /*roundtrip exclusions*/
this, new Legal());
RTTest test1("DEVANAGARI-GURMUKHI",
TestUtility::DEVANAGARI_SCRIPT, TestUtility::GURMUKHI_SCRIPT );
test1.test( "[:Devanagari:]", "[:GURMUKHI:]",
"[\\u0950\\u090D\\u090e\\u0912\\u0911\\u090b\\u090c\\u0934\\u0960\\u0961\\u0937\\u0a72\\u0a73\\u0a74\\u093d]", /*roundtrip exclusions*/
this, new Legal());
}
void TransliteratorRoundTripTest::TestDevanagariGujarati() {
RTTest test("GUJARATI-DEVANAGARI",
TestUtility::GUJARATI_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("[:GUJARATI:]", "[:Devanagari:]",
"[\\u0961\\u090c\\u090e\\u0912]", /*roundtrip exclusions*/
this, new Legal());
RTTest test1("DEVANAGARI-GUJARATI",
TestUtility::DEVANAGARI_SCRIPT, TestUtility::GUJARATI_SCRIPT );
test1.test( "[:Devanagari:]", "[:GUJARATI:]",NULL,
this, new Legal());
}
void TransliteratorRoundTripTest::TestDevanagariOriya() {
RTTest test("ORIYA-DEVANAGARI",
TestUtility::ORIYA_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("[:ORIYA:]", "[:Devanagari:]",
"[\\u0950\\u090D\\u090e\\u0912\\u0911\\u0931\\u0935]", /*roundtrip exclusions*/
this, new Legal());
RTTest test1("DEVANAGARI-ORIYA",
TestUtility::DEVANAGARI_SCRIPT, TestUtility::ORIYA_SCRIPT );
test1.test( "[:Devanagari:]", "[:ORIYA:]",
"[\\u0950\\u090D\\u090e\\u0912\\u0911\\u0931\\u0935]", /*roundtrip exclusions*/
this, new Legal());
}
void TransliteratorRoundTripTest::TestDevanagariTamil() {
RTTest test("Tamil-DEVANAGARI",
TestUtility::TAMIL_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("[:tamil:]", "[:Devanagari:]",
"[\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]"
"\\u090B\\u090C\\u0916\\u0917\\u0918\\u091B\\u091D\\u0920\\u0921"
"\\u0922\\u0925\\u0926\\u0927\\u092B\\u092C\\u092D\\u0936\\u0960\\u0961]", /*roundtrip exclusions*/
this, new Legal());
RTTest test1("DEVANAGARI-Tamil",
TestUtility::DEVANAGARI_SCRIPT, TestUtility::TAMIL_SCRIPT );
test1.test( "[:Devanagari:]", "[:tamil:]",
"[\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/
this, new Legal());
}
void TransliteratorRoundTripTest::TestDevanagariTelugu() {
RTTest test("Telugu-DEVANAGARI",
TestUtility::TELUGU_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("[:telugu:]", "[:Devanagari:]",
"[\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/
this, new Legal());
RTTest test1("DEVANAGARI-TELUGU",
TestUtility::DEVANAGARI_SCRIPT, TestUtility::TELUGU_SCRIPT );
test1.test( "[:Devanagari:]", "[:TELUGU:]",
"[\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/
this, new Legal());
}
void TransliteratorRoundTripTest::TestDevanagariKannada() {
RTTest test("KANNADA-DEVANAGARI",
TestUtility::KANNADA_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("[:KANNADA:]", "[:Devanagari:]",
"[\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/
this, new Legal());
RTTest test1("DEVANAGARI-KANNADA",
TestUtility::DEVANAGARI_SCRIPT, TestUtility::KANNADA_SCRIPT );
test1.test( "[:Devanagari:]", "[:KANNADA:]",
"[\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/
this, new Legal());
}
void TransliteratorRoundTripTest::TestDevanagariMalayalam() {
RTTest test("MALAYALAM-DEVANAGARI",
TestUtility::MALAYALAM_SCRIPT, TestUtility::DEVANAGARI_SCRIPT);
test.test("[:MALAYALAM:]", "[:Devanagari:]",
"[\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/
this, new Legal());
RTTest test1("DEVANAGARI-MALAYALAM",
TestUtility::DEVANAGARI_SCRIPT, TestUtility::MALAYALAM_SCRIPT );
test1.test( "[:Devanagari:]", "[:MALAYALAM:]",
"[\\u0950\\u090D\\u0911\\u093d\\u0929\\u0934[\\u0958-\\u095f]]", /*roundtrip exclusions*/
this, new Legal());
}
//---------------
// End Indic
//---------------
void RTTest::test2() {
UChar c;
UnicodeString cs, targ, reverse;
int8_t *type = new int8_t[0xFFFF];
UParseError parseError;
UErrorCode status = U_ZERO_ERROR;
Transliterator* sourceToTarget = Transliterator::createInstance(transliteratorID, UTRANS_FORWARD, parseError, status);
if (sourceToTarget == NULL) {
log->errln("Fail: createInstance(" + transliteratorID +
") returned NULL");
return;
}
Transliterator* targetToSource = sourceToTarget->createInverse(status);
if (targetToSource == NULL) {
log->errln("Fail: " + transliteratorID +
".createInverse() returned NULL");
delete sourceToTarget;
return;
}
log->logln("Initializing type array");
for (c = 0; c < 0xFFFF; ++c) {
type[c] = u_charType(c);
}
BitSet failSourceTarg;
log->logln("Checking that all source characters convert to target - Singles");
for (c = 0; c < 0xFFFF; ++c) {
if (type[c] == U_UNASSIGNED || !isSource(c))
continue;
cs.remove();
cs.append(c);
targ = cs;
sourceToTarget->transliterate(targ);
if (!isReceivingTarget(targ) || includesSome(badCharacters, targ)) {
logWrongScript("Source-Target", cs, targ);
failSourceTarg.set(c);
if (errorCount >= errorLimit)
return;
} else {
UnicodeString cs2;
UErrorCode ec = U_ZERO_ERROR;
Normalizer::decompose(cs, FALSE, 0, cs2, ec);
UnicodeString targ2 = cs2;
sourceToTarget->transliterate(targ2);
if (targ != targ2) {
logNotCanonical("Source-Target", cs, targ, targ2);
if (errorCount >= errorLimit)
return;
}
}
}
log->logln("Checking that all source characters convert to target - Doubles");
for (c = 0; c < 0xFFFF; ++c) {
if (type[c] == U_UNASSIGNED ||
!isSource(c)) continue;
if (failSourceTarg.get(c)) continue;
for (UChar d = 0; d < 0xFFFF; ++d) {
if (type[d] == U_UNASSIGNED || !isSource(d))
continue;
if (failSourceTarg.get(d)) continue;
cs.remove();
cs.append(c).append(d);
targ = cs;
sourceToTarget->transliterate(targ);
if (!isReceivingTarget(targ) || includesSome(badCharacters, targ)) {
logWrongScript("Source-Target", cs, targ);
if (errorCount >= errorLimit)
return;
} else {
UnicodeString cs2;
UErrorCode ec = U_ZERO_ERROR;
Normalizer::decompose(cs, FALSE, 0, cs2, ec);
UnicodeString targ2 = cs2;
sourceToTarget->transliterate(targ2);
if (targ != targ2) {
logNotCanonical("Source-Target", cs, targ, targ2);
if (errorCount >= errorLimit)
return;
}
}
}
}
log->logln("Checking that target characters convert to source and back - Singles");
BitSet failTargSource;
BitSet failRound;
for (c = 0; c < 0xFFFF; ++c) {
if (type[c] == U_UNASSIGNED || !isTarget(c))
continue;
cs.remove();
cs.append(c);
targ = cs;
targetToSource->transliterate(targ);
reverse = targ;
sourceToTarget->transliterate(reverse);
if (!isReceivingSource(targ) || includesSome(badCharacters, targ)) {
logWrongScript("Target-Source", cs, targ);
failTargSource.set(c);
if (errorCount >= errorLimit)
return;
} else if (!isSame(cs, reverse) && !roundtripExclusions.contains(c)) {
logRoundTripFailure(cs, targ, reverse);
failRound.set(c);
if (errorCount >= errorLimit)
return;
} else {
UnicodeString targ2;
UErrorCode ec = U_ZERO_ERROR;
Normalizer::decompose(targ, FALSE, 0, targ2, ec);
UnicodeString reverse2 = targ2;
sourceToTarget->transliterate(reverse2);
if (reverse != reverse2) {
logNotCanonical("Target-Source", cs, targ, targ2);
if (errorCount >= errorLimit)
return;
}
}
}
log->logln("Checking that target characters convert to source and back - Doubles");
int32_t count = 0;
cs = UNICODE_STRING("aa", 2);
for (c = 0; c < 0xFFFF; ++c) {
if (type[c] == U_UNASSIGNED || !isTarget(c))
continue;
if (++count > pairLimit) {
//throw new TestTruncated("Test truncated at " + pairLimit + " x 64k pairs");
log->logln("");
log->logln((UnicodeString)"Test truncated at " + pairLimit + " x 64k pairs");
return;
}
cs.setCharAt(0, c);
log->log(TestUtility::hex(c));
for (UChar d = 0; d < 0xFFFF; ++d) {
if (type[d] == U_UNASSIGNED || !isTarget(d))
continue;
cs.setCharAt(1, d);
targ = cs;
targetToSource->transliterate(targ);
reverse = targ;
sourceToTarget->transliterate(reverse);
if (!isReceivingSource(targ) && !failTargSource.get(c) && !failTargSource.get(d)
|| includesSome(badCharacters, targ)) {
logWrongScript("Target-Source", cs, targ);
if (errorCount >= errorLimit)
return;
} else if (!isSame(cs, reverse) && !failRound.get(c) && !failRound.get(d)
&& !roundtripExclusions.contains(c) && !roundtripExclusions.contains(d)) {
logRoundTripFailure(cs, targ, reverse);
if (errorCount >= errorLimit)
return;
} else {
UnicodeString targ2;
UErrorCode ec = U_ZERO_ERROR;
Normalizer::decompose(targ, FALSE, 0, targ2, ec);
UnicodeString reverse2 = targ2;
sourceToTarget->transliterate(reverse2);
if (reverse != reverse2) {
logNotCanonical("Target-Source", cs, targ, targ2);
if (errorCount >= errorLimit)
return;
}
}
}
}
log->logln("");
delete []type;
delete sourceToTarget;
delete targetToSource;
}