source/test/intltest/csdetest.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
  **********************************************************************
  *   Copyright (C) 2005-2006, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  */


 #include "unicode/utypes.h"
 #include "unicode/ucsdet.h"
 #include "unicode/ucnv.h"
 #include "unicode/unistr.h"
 #include "unicode/putil.h"

 #include "intltest.h"
 #include "csdetest.h"

 #include "xmlparser.h"

 #include <stdlib.h>
 #include <string.h>

 #ifdef DEBUG_DETECT
 #include <stdio.h>
 #endif

 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

 #define CH_SPACE 0x0020
 #define CH_SLASH 0x002F

 //---------------------------------------------------------------------------
 //
 //  Test class boilerplate
 //
 //---------------------------------------------------------------------------
 CharsetDetectionTest::CharsetDetectionTest()
 {
 }


 CharsetDetectionTest::~CharsetDetectionTest()
 {
 }


 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
 {
     if (exec) logln("TestSuite CharsetDetectionTest: ");
     switch (index) {
        case 0: name = "ConstructionTest";
             if (exec) ConstructionTest();
             break;

        case 1: name = "UTF8Test";
             if (exec) UTF8Test();
             break;

        case 2: name = "UTF16Test";
             if (exec) UTF16Test();
             break;

        case 3: name = "C1BytesTest";
             if (exec) C1BytesTest();
             break;

        case 4: name = "InputFilterTest";
             if (exec) InputFilterTest();
             break;

        case 5: name = "DetectionTest";
             if (exec) DetectionTest();
             break;

         default: name = "";
             break; //needed to end loop
     }
 }

 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
 {
     int32_t offset = -1;

     splits = 1;
     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
         splits += 1;
     }

     UnicodeString *result = new UnicodeString[splits];

     int32_t start = 0;
     int32_t split = 0;
     int32_t end;

     while((end = src.indexOf(ch, start)) >= 0) {
         src.extractBetween(start, end, result[split++]);
         start = end + 1;
     }

     src.extractBetween(start, src.length(), result[split]);

     return result;
 }

 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
 {
     int32_t sLength = source.length();
     char *bytes = NULL;

     length = source.extract(0, sLength, NULL, codepage);

     if (length > 0) {
         bytes = NEW_ARRAY(char, length + 1);
         source.extract(0, sLength, bytes, codepage);
     }

     return bytes;
 }

 static void freeBytes(char *bytes)
 {
     DELETE_ARRAY(bytes);
 }

 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
 {
     int32_t splits = 0;
     int32_t testLength = testString.length();
     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
     UErrorCode status = U_ZERO_ERROR;
     int32_t cpLength = eSplit[0].length();
     char codepage[64];

     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
     codepage[cpLength] = '\0';

     UCharsetDetector *csd = ucsdet_open(&status);

     int32_t byteLength = 0;
     char *bytes = extractBytes(testString, codepage, byteLength);

     if (bytes == NULL) {
 #if !UCONFIG_NO_LEGACY_CONVERSION
         errln("Can't open a " + encoding + " converter for " + id);
 #endif
         return;
     }

     ucsdet_setText(csd, bytes, byteLength, &status);

     int32_t matchCount = 0;
     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);


     UnicodeString name(ucsdet_getName(matches[0], &status));
     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
     UChar *decoded = NULL;
     int32_t dLength = 0;

     if (matchCount == 0) {
         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
         goto bail;
     }

     if (name.compare(eSplit[0]) != 0) {
         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);

 #ifdef DEBUG_DETECT
         for (int32_t m = 0; m < matchCount; m += 1) {
             const char *name = ucsdet_getName(matches[m], &status);
             const char *lang = ucsdet_getLanguage(matches[m], &status);
             int32_t confidence = ucsdet_getConfidence(matches[m], &status);

             printf("%s (%s) %d\n", name, lang, confidence);
         }
 #endif
         goto bail;
     }

     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
         goto bail;
     }

     decoded = NEW_ARRAY(UChar, testLength);
     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);

     if (testString.compare(decoded, dLength) != 0) {
         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");

 #ifdef DEBUG_DETECT
         for(int32_t i = 0; i < testLength; i += 1) {
             if(testString[i] != decoded[i]) {
                 printf("Strings differ at byte %d\n", i);
                 break;
             }
         }
 #endif

     }

     DELETE_ARRAY(decoded);

 bail:
     freeBytes(bytes);
     ucsdet_close(csd);
     delete[] eSplit;
 }

 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
     UErrorCode status = U_ZERO_ERROR;
     const char *testDataDirectory = IntlTest::getSourceTestData(status);

     if (U_FAILURE(status)) {
         errln("ERROR: getPath() failed - %s", u_errorName(status));
         return NULL;
     }

     strcpy(buffer, testDataDirectory);
     strcat(buffer, filename);
     return buffer;
 }

 void CharsetDetectionTest::ConstructionTest()
 {
     UErrorCode status = U_ZERO_ERROR;
     UCharsetDetector *csd = ucsdet_open(&status);
     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
     int32_t count = uenum_count(e, &status);

 #ifdef DEBUG_DETECT
     printf("There are %d recognizers.\n", count);
 #endif

     for(int32_t i = 0; i < count; i += 1) {
         int32_t length;
         const char *name = uenum_next(e, &length, &status);

         if(name == NULL || length <= 0) {
             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
         }

 #ifdef DEBUG_DETECT
         printf("%s\n", name);
 #endif
     }

     uenum_close(e);
     ucsdet_close(csd);
 }

 void CharsetDetectionTest::UTF8Test()
 {
     UErrorCode status = U_ZERO_ERROR;
     UnicodeString ss = "This is a string with some non-ascii characters that will "
                        "be converted to UTF-8, then shoved through the detection process.  "
                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
                        "Sure would be nice if our source could contain Unicode directly!";
     UnicodeString s = ss.unescape();
     int32_t byteLength = 0, sLength = s.length();
     char *bytes = extractBytes(s, "UTF-8", byteLength);
     UCharsetDetector *csd = ucsdet_open(&status);
     const UCharsetMatch *match;
     UChar *detected = NEW_ARRAY(UChar, sLength);

     ucsdet_setText(csd, bytes, byteLength, &status);
     match = ucsdet_detect(csd, &status);

     if (match == NULL) {
         errln("Detection failure for UTF-8: got no matches.");
         goto bail;
     }

     ucsdet_getUChars(match, detected, sLength, &status);

     if (s.compare(detected, sLength) != 0) {
         errln("Round-trip test failed!");
     }

     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */

 bail:
     DELETE_ARRAY(detected);
     freeBytes(bytes);
     ucsdet_close(csd);
 }

 void CharsetDetectionTest::UTF16Test()
 {
     UErrorCode status = U_ZERO_ERROR;
     /* Notice the BOM on the start of this string */
     UChar chars[] = {
         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
         0x064a, 0x062a, 0x0000};
     UnicodeString s(chars);
     int32_t beLength = 0, leLength = 0;
     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
     UCharsetDetector *csd = ucsdet_open(&status);
     const UCharsetMatch *match;
     const char *name;
     int32_t conf;

     ucsdet_setText(csd, beBytes, beLength, &status);
     match = ucsdet_detect(csd, &status);

     if (match == NULL) {
         errln("Encoding detection failure for UTF-16BE: got no matches.");
         goto try_le;
     }

     name  = ucsdet_getName(match, &status);
     conf  = ucsdet_getConfidence(match, &status);

     if (strcmp(name, "UTF-16BE") != 0) {
         errln("Encoding detection failure for UTF-16BE: got %s", name);
         goto try_le; // no point in looking at confidence if we got the wrong character set.
     }

     if (conf != 100) {
         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
     }

 try_le:
     ucsdet_setText(csd, leBytes, leLength, &status);
     match = ucsdet_detect(csd, &status);

     if (match == NULL) {
         errln("Encoding detection failure for UTF-16LE: got no matches.");
         goto bail;
     }

     name  = ucsdet_getName(match, &status);
     conf = ucsdet_getConfidence(match, &status);


     if (strcmp(name, "UTF-16LE") != 0) {
         errln("Enconding detection failure for UTF-16LE: got %s", name);
         goto bail; // no point in looking at confidence if we got the wrong character set.
     }

     if (conf != 100) {
         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
     }

 bail:
     freeBytes(leBytes);
     freeBytes(beBytes);
     ucsdet_close(csd);
 }

 void CharsetDetectionTest::InputFilterTest()
 {
     UErrorCode status = U_ZERO_ERROR;
     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
     UnicodeString s  = ss.unescape();
     int32_t byteLength = 0;
     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
     UCharsetDetector *csd = ucsdet_open(&status);
     const UCharsetMatch *match;
     const char *lang, *name;

     ucsdet_enableInputFilter(csd, TRUE);

     if (!ucsdet_isInputFilterEnabled(csd)) {
         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
     }


     ucsdet_setText(csd, bytes, byteLength, &status);
     match = ucsdet_detect(csd, &status);

     if (match == NULL) {
         errln("Turning on the input filter resulted in no matches.");
         goto turn_off;
     }

     name = ucsdet_getName(match, &status);

     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
     } else {
         lang = ucsdet_getLanguage(match, &status);

         if (lang == NULL || strcmp(lang, "fr") != 0) {
             errln("Input filter did not strip markup!");
         }
     }

 turn_off:
     ucsdet_enableInputFilter(csd, FALSE);
     ucsdet_setText(csd, bytes, byteLength, &status);
     match = ucsdet_detect(csd, &status);

     if (match == NULL) {
         errln("Turning off the input filter resulted in no matches.");
         goto bail;
     }

     name = ucsdet_getName(match, &status);

     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
     } else {
         lang = ucsdet_getLanguage(match, &status);

         if (lang == NULL || strcmp(lang, "en") != 0) {
             errln("Unfiltered input did not detect as English!");
         }
     }

 bail:
     freeBytes(bytes);
     ucsdet_close(csd);
 }

 void CharsetDetectionTest::C1BytesTest()
 {
 #if !UCONFIG_NO_LEGACY_CONVERSION
     UErrorCode status = U_ZERO_ERROR;
     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
     UnicodeString ssWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
     UnicodeString sWindows  = ssWindows.unescape();
     int32_t lISO = 0, lWindows = 0;
     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
     UCharsetDetector *csd = ucsdet_open(&status);
     const UCharsetMatch *match;
     const char *name;

     ucsdet_setText(csd, bWindows, lWindows, &status);
     match = ucsdet_detect(csd, &status);

     if (match == NULL) {
         errln("English test with C1 bytes got no matches.");
         goto bail;
     }

     name  = ucsdet_getName(match, &status);

     if (strcmp(name, "windows-1252") != 0) {
         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
     }

     ucsdet_setText(csd, bISO, lISO, &status);
     match = ucsdet_detect(csd, &status);

     if (match == NULL) {
         errln("English text without C1 bytes got no matches.");
         goto bail;
     }

     name  = ucsdet_getName(match, &status);

     if (strcmp(name, "ISO-8859-1") != 0) {
         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
     }

 bail:
     freeBytes(bWindows);
     freeBytes(bISO);

     ucsdet_close(csd);
 #endif
 }

 void CharsetDetectionTest::DetectionTest()
 {
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     UErrorCode status = U_ZERO_ERROR;
     char path[2048];
     const char *testFilePath = getPath(path, "csdetest.xml");

     if (testFilePath == NULL) {
         return; /* Couldn't get path: error message already output. */
     }

     UXMLParser  *parser = UXMLParser::createParser(status);
     if (!assertSuccess("UXMLParser::createParser",status)) return;
     UXMLElement *root   = parser->parseFile(testFilePath, status);
     if (!assertSuccess( "parseFile",status)) return;

     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");

     const UXMLElement *testCase;
     int32_t tc = 0;

     while((testCase = root->nextChildElement(tc)) != NULL) {
         if (testCase->getTagName().compare(test_case) == 0) {
             const UnicodeString *id = testCase->getAttribute(id_attr);
             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
             const UnicodeString  text = testCase->getText(TRUE);
             int32_t encodingCount;
             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);

             for(int32_t e = 0; e < encodingCount; e += 1) {
                 checkEncoding(text, encodingList[e], *id);
             }

             delete[] encodingList;
         }
     }

     delete root;
     delete parser;
 #endif
 }
	/*
	**********************************************************************
	* Copyright (C) 2005-2006, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	*/


	#include "unicode/utypes.h"
	#include "unicode/ucsdet.h"
	#include "unicode/ucnv.h"
	#include "unicode/unistr.h"
	#include "unicode/putil.h"

	#include "intltest.h"
	#include "csdetest.h"

	#include "xmlparser.h"

	#include <stdlib.h>
	#include <string.h>

	#ifdef DEBUG_DETECT
	#include <stdio.h>
	#endif

	#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

	#define NEW_ARRAY(type,count) (type ) /uprv_/malloc((count) sizeof(type))
	#define DELETE_ARRAY(array) /uprv_/free((void *) (array))

	#define CH_SPACE 0x0020
	#define CH_SLASH 0x002F

	//---------------------------------------------------------------------------
	//
	// Test class boilerplate
	//
	//---------------------------------------------------------------------------
	CharsetDetectionTest::CharsetDetectionTest()
	{
	}


	CharsetDetectionTest::~CharsetDetectionTest()
	{
	}



	void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /par/ )
	{
	if (exec) logln("TestSuite CharsetDetectionTest: ");
	switch (index) {
	case 0: name = "ConstructionTest";
	if (exec) ConstructionTest();
	break;

	case 1: name = "UTF8Test";
	if (exec) UTF8Test();
	break;

	case 2: name = "UTF16Test";
	if (exec) UTF16Test();
	break;

	case 3: name = "C1BytesTest";
	if (exec) C1BytesTest();
	break;

	case 4: name = "InputFilterTest";
	if (exec) InputFilterTest();
	break;

	case 5: name = "DetectionTest";
	if (exec) DetectionTest();
	break;

	default: name = "";
	break; //needed to end loop
	}
	}

	static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
	{
	int32_t offset = -1;

	splits = 1;
	while((offset = src.indexOf(ch, offset + 1)) >= 0) {
	splits += 1;
	}

	UnicodeString *result = new UnicodeString[splits];

	int32_t start = 0;
	int32_t split = 0;
	int32_t end;

	while((end = src.indexOf(ch, start)) >= 0) {
	src.extractBetween(start, end, result[split++]);
	start = end + 1;
	}

	src.extractBetween(start, src.length(), result[split]);

	return result;
	}

	static char extractBytes(const UnicodeString &source, const char codepage, int32_t &length)
	{
	int32_t sLength = source.length();
	char *bytes = NULL;

	length = source.extract(0, sLength, NULL, codepage);

	if (length > 0) {
	bytes = NEW_ARRAY(char, length + 1);
	source.extract(0, sLength, bytes, codepage);
	}

	return bytes;
	}

	static void freeBytes(char *bytes)
	{
	DELETE_ARRAY(bytes);
	}

	void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
	{
	int32_t splits = 0;
	int32_t testLength = testString.length();
	UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
	UErrorCode status = U_ZERO_ERROR;
	int32_t cpLength = eSplit[0].length();
	char codepage[64];

	u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
	codepage[cpLength] = '\0';

	UCharsetDetector *csd = ucsdet_open(&status);

	int32_t byteLength = 0;
	char *bytes = extractBytes(testString, codepage, byteLength);

	if (bytes == NULL) {
	#if !UCONFIG_NO_LEGACY_CONVERSION
	errln("Can't open a " + encoding + " converter for " + id);
	#endif
	return;
	}

	ucsdet_setText(csd, bytes, byteLength, &status);

	int32_t matchCount = 0;
	const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);


	UnicodeString name(ucsdet_getName(matches[0], &status));
	UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
	UChar *decoded = NULL;
	int32_t dLength = 0;

	if (matchCount == 0) {
	errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
	goto bail;
	}

	if (name.compare(eSplit[0]) != 0) {
	errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);

	#ifdef DEBUG_DETECT
	for (int32_t m = 0; m < matchCount; m += 1) {
	const char *name = ucsdet_getName(matches[m], &status);
	const char *lang = ucsdet_getLanguage(matches[m], &status);
	int32_t confidence = ucsdet_getConfidence(matches[m], &status);

	printf("%s (%s) %d\n", name, lang, confidence);
	}
	#endif
	goto bail;
	}

	if (splits > 1 && lang.compare(eSplit[1]) != 0) {
	errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
	goto bail;
	}

	decoded = NEW_ARRAY(UChar, testLength);
	dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);

	if (testString.compare(decoded, dLength) != 0) {
	errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");

	#ifdef DEBUG_DETECT
	for(int32_t i = 0; i < testLength; i += 1) {
	if(testString[i] != decoded[i]) {
	printf("Strings differ at byte %d\n", i);
	break;
	}
	}
	#endif

	}

	DELETE_ARRAY(decoded);

	bail:
	freeBytes(bytes);
	ucsdet_close(csd);
	delete[] eSplit;
	}

	const char CharsetDetectionTest::getPath(char buffer[2048], const char filename) {
	UErrorCode status = U_ZERO_ERROR;
	const char *testDataDirectory = IntlTest::getSourceTestData(status);

	if (U_FAILURE(status)) {
	errln("ERROR: getPath() failed - %s", u_errorName(status));
	return NULL;
	}

	strcpy(buffer, testDataDirectory);
	strcat(buffer, filename);
	return buffer;
	}

	void CharsetDetectionTest::ConstructionTest()
	{
	UErrorCode status = U_ZERO_ERROR;
	UCharsetDetector *csd = ucsdet_open(&status);
	UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
	int32_t count = uenum_count(e, &status);

	#ifdef DEBUG_DETECT
	printf("There are %d recognizers.\n", count);
	#endif

	for(int32_t i = 0; i < count; i += 1) {
	int32_t length;
	const char *name = uenum_next(e, &length, &status);

	if(name == NULL \|\| length <= 0) {
	errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
	}

	#ifdef DEBUG_DETECT
	printf("%s\n", name);
	#endif
	}

	uenum_close(e);
	ucsdet_close(csd);
	}

	void CharsetDetectionTest::UTF8Test()
	{
	UErrorCode status = U_ZERO_ERROR;
	UnicodeString ss = "This is a string with some non-ascii characters that will "
	"be converted to UTF-8, then shoved through the detection process. "
	"\\u0391\\u0392\\u0393\\u0394\\u0395"
	"Sure would be nice if our source could contain Unicode directly!";
	UnicodeString s = ss.unescape();
	int32_t byteLength = 0, sLength = s.length();
	char *bytes = extractBytes(s, "UTF-8", byteLength);
	UCharsetDetector *csd = ucsdet_open(&status);
	const UCharsetMatch *match;
	UChar *detected = NEW_ARRAY(UChar, sLength);

	ucsdet_setText(csd, bytes, byteLength, &status);
	match = ucsdet_detect(csd, &status);

	if (match == NULL) {
	errln("Detection failure for UTF-8: got no matches.");
	goto bail;
	}

	ucsdet_getUChars(match, detected, sLength, &status);

	if (s.compare(detected, sLength) != 0) {
	errln("Round-trip test failed!");
	}

	ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */

	bail:
	DELETE_ARRAY(detected);
	freeBytes(bytes);
	ucsdet_close(csd);
	}

	void CharsetDetectionTest::UTF16Test()
	{
	UErrorCode status = U_ZERO_ERROR;
	/* Notice the BOM on the start of this string */
	UChar chars[] = {
	0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
	0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
	0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
	0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
	0x064a, 0x062a, 0x0000};
	UnicodeString s(chars);
	int32_t beLength = 0, leLength = 0;
	char *beBytes = extractBytes(s, "UTF-16BE", beLength);
	char *leBytes = extractBytes(s, "UTF-16LE", leLength);
	UCharsetDetector *csd = ucsdet_open(&status);
	const UCharsetMatch *match;
	const char *name;
	int32_t conf;

	ucsdet_setText(csd, beBytes, beLength, &status);
	match = ucsdet_detect(csd, &status);

	if (match == NULL) {
	errln("Encoding detection failure for UTF-16BE: got no matches.");
	goto try_le;
	}

	name = ucsdet_getName(match, &status);
	conf = ucsdet_getConfidence(match, &status);

	if (strcmp(name, "UTF-16BE") != 0) {
	errln("Encoding detection failure for UTF-16BE: got %s", name);
	goto try_le; // no point in looking at confidence if we got the wrong character set.
	}

	if (conf != 100) {
	errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
	}

	try_le:
	ucsdet_setText(csd, leBytes, leLength, &status);
	match = ucsdet_detect(csd, &status);

	if (match == NULL) {
	errln("Encoding detection failure for UTF-16LE: got no matches.");
	goto bail;
	}

	name = ucsdet_getName(match, &status);
	conf = ucsdet_getConfidence(match, &status);


	if (strcmp(name, "UTF-16LE") != 0) {
	errln("Enconding detection failure for UTF-16LE: got %s", name);
	goto bail; // no point in looking at confidence if we got the wrong character set.
	}

	if (conf != 100) {
	errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
	}

	bail:
	freeBytes(leBytes);
	freeBytes(beBytes);
	ucsdet_close(csd);
	}

	void CharsetDetectionTest::InputFilterTest()
	{
	UErrorCode status = U_ZERO_ERROR;
	UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
	UnicodeString s = ss.unescape();
	int32_t byteLength = 0;
	char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
	UCharsetDetector *csd = ucsdet_open(&status);
	const UCharsetMatch *match;
	const char lang, name;

	ucsdet_enableInputFilter(csd, TRUE);

	if (!ucsdet_isInputFilterEnabled(csd)) {
	errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
	}


	ucsdet_setText(csd, bytes, byteLength, &status);
	match = ucsdet_detect(csd, &status);

	if (match == NULL) {
	errln("Turning on the input filter resulted in no matches.");
	goto turn_off;
	}

	name = ucsdet_getName(match, &status);

	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
	errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
	} else {
	lang = ucsdet_getLanguage(match, &status);

	if (lang == NULL \|\| strcmp(lang, "fr") != 0) {
	errln("Input filter did not strip markup!");
	}
	}

	turn_off:
	ucsdet_enableInputFilter(csd, FALSE);
	ucsdet_setText(csd, bytes, byteLength, &status);
	match = ucsdet_detect(csd, &status);

	if (match == NULL) {
	errln("Turning off the input filter resulted in no matches.");
	goto bail;
	}

	name = ucsdet_getName(match, &status);

	if (name == NULL \|\| strcmp(name, "ISO-8859-1") != 0) {
	errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
	} else {
	lang = ucsdet_getLanguage(match, &status);

	if (lang == NULL \|\| strcmp(lang, "en") != 0) {
	errln("Unfiltered input did not detect as English!");
	}
	}

	bail:
	freeBytes(bytes);
	ucsdet_close(csd);
	}

	void CharsetDetectionTest::C1BytesTest()
	{
	#if !UCONFIG_NO_LEGACY_CONVERSION
	UErrorCode status = U_ZERO_ERROR;
	UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
	UnicodeString ssWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
	UnicodeString sWindows = ssWindows.unescape();
	int32_t lISO = 0, lWindows = 0;
	char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
	char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
	UCharsetDetector *csd = ucsdet_open(&status);
	const UCharsetMatch *match;
	const char *name;

	ucsdet_setText(csd, bWindows, lWindows, &status);
	match = ucsdet_detect(csd, &status);

	if (match == NULL) {
	errln("English test with C1 bytes got no matches.");
	goto bail;
	}

	name = ucsdet_getName(match, &status);

	if (strcmp(name, "windows-1252") != 0) {
	errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
	}

	ucsdet_setText(csd, bISO, lISO, &status);
	match = ucsdet_detect(csd, &status);

	if (match == NULL) {
	errln("English text without C1 bytes got no matches.");
	goto bail;
	}

	name = ucsdet_getName(match, &status);

	if (strcmp(name, "ISO-8859-1") != 0) {
	errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
	}

	bail:
	freeBytes(bWindows);
	freeBytes(bISO);

	ucsdet_close(csd);
	#endif
	}

	void CharsetDetectionTest::DetectionTest()
	{
	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
	UErrorCode status = U_ZERO_ERROR;
	char path[2048];
	const char *testFilePath = getPath(path, "csdetest.xml");

	if (testFilePath == NULL) {
	return; /* Couldn't get path: error message already output. */
	}

	UXMLParser *parser = UXMLParser::createParser(status);
	if (!assertSuccess("UXMLParser::createParser",status)) return;
	UXMLElement *root = parser->parseFile(testFilePath, status);
	if (!assertSuccess( "parseFile",status)) return;

	UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
	UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
	UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");

	const UXMLElement *testCase;
	int32_t tc = 0;

	while((testCase = root->nextChildElement(tc)) != NULL) {
	if (testCase->getTagName().compare(test_case) == 0) {
	const UnicodeString *id = testCase->getAttribute(id_attr);
	const UnicodeString *encodings = testCase->getAttribute(enc_attr);
	const UnicodeString text = testCase->getText(TRUE);
	int32_t encodingCount;
	UnicodeString encodingList = split(encodings, CH_SPACE, encodingCount);

	for(int32_t e = 0; e < encodingCount; e += 1) {
	checkEncoding(text, encodingList[e], *id);
	}

	delete[] encodingList;
	}
	}

	delete root;
	delete parser;
	#endif
	}