blob: 6a238a54c7bda2889184e39b0045a717eca993b6 [file] [log] [blame]
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*******************************************************************************
*
* File CCONVTST.C
*
* Modification History:
* Name Description
* Steven R. Loomis 7/8/1999 Adding input buffer test
********************************************************************************
*/
#include <stdio.h>
#include "cstring.h"
#include "unicode/uloc.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "unicode/ucnv_cb.h"
#include "cintltst.h"
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/ucol.h"
#include "cmemory.h"
#include "nucnvtst.h"
/*
 * Forward declarations of the file-local helpers and test functions.
 * A declaration placed inside a preprocessor guard is only visible when that
 * guard's condition holds.
 */
/* Helpers that call ucnv_getNextUChar() on a byte range and check the outcome. */
static void TestNextUChar(UConverter* cnv, const char* source, const char* limit, const int32_t results[], const char* message);
static void TestNextUCharError(UConverter* cnv, const char* source, const char* limit, UErrorCode expected, const char* message);
#if !UCONFIG_NO_COLLATION
static void TestJitterbug981(void);
#endif
static void TestJitterbug1293(void);
/* Note: the definition names these parameters (outsize, insize). */
static void TestNewConvertWithBufferSizes(int32_t osize, int32_t isize) ;
static void TestConverterTypesAndStarters(void);
static void TestAmbiguous(void);
static void TestSignatureDetection(void);
/* Unicode encoding forms. */
static void TestUTF7(void);
static void TestIMAP(void);
static void TestUTF8(void);
static void TestCESU8(void);
static void TestUTF16(void);
static void TestUTF16BE(void);
static void TestUTF16LE(void);
static void TestUTF32(void);
static void TestUTF32BE(void);
static void TestUTF32LE(void);
static void TestLATIN1(void);
/* Tests declared only when legacy conversion is not configured out. */
#if !UCONFIG_NO_LEGACY_CONVERSION
static void TestSBCS(void);
static void TestDBCS(void);
static void TestMBCS(void);
#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FILE_IO
static void TestICCRunout(void);
#endif
#ifdef U_ENABLE_GENERIC_ISO_2022
static void TestISO_2022(void);
#endif
static void TestISO_2022_JP(void);
static void TestISO_2022_JP_1(void);
static void TestISO_2022_JP_2(void);
static void TestISO_2022_KR(void);
static void TestISO_2022_KR_1(void);
static void TestISO_2022_CN(void);
static void TestISO_2022_CN_EXT(void);
static void TestJIS(void);
static void TestHZ(void);
#endif
static void TestSCSU(void);
#if !UCONFIG_NO_LEGACY_CONVERSION
static void TestEBCDIC_STATEFUL(void);
static void TestGB18030(void);
static void TestLMBCS(void);
static void TestJitterbug255(void);
static void TestEBCDICUS4XML(void);
static void TestJitterbug915(void);
static void TestISCII(void);
static void TestCoverageMBCS(void);
static void TestJitterbug2346(void);
static void TestJitterbug2411(void);
static void TestJB5275(void);
static void TestJB5275_1(void);
static void TestJitterbug6175(void);
#endif
static void TestInBufSizes(void);
static void TestRoundTrippingAllUTF(void);
/* NOTE(review): no definition or caller of TestConv is visible in this part of the file. */
static void TestConv(const uint16_t in[],
int len,
const char* conv,
const char* lang,
char byteArr[],
int byteArrLen);
/* open a converter, using test data if it begins with '@' */
static UConverter *my_ucnv_open(const char *cnv, UErrorCode *err);
/* Capacity, in elements, of the scratch output buffers used by the conversion helpers. */
#define NEW_MAX_BUFFER 999
/* Slice sizes used when feeding input to / draining output from a converter. */
static int32_t gInBufferSize = NEW_MAX_BUFFER;
static int32_t gOutBufferSize = NEW_MAX_BUFFER;
/* Human-readable description of the conversion currently under test. */
static char gNuConvTestName[1024];
/*
 * Smaller of two values. Each argument is parenthesized so that expressions
 * containing low-precedence operators (?:, &, |, ...) expand correctly.
 * Arguments are still evaluated more than once: do not pass expressions with
 * side effects.
 */
#define nct_min(x,y) (((x) < (y)) ? (x) : (y))
/*
 * Open a converter by name.
 * A name that starts with '@' is opened from the test data package, using the
 * text after the '@'; every other name (including NULL) is handed to
 * ucnv_open() unchanged.
 */
static UConverter *my_ucnv_open(const char *cnv, UErrorCode *err)
{
    if(!cnv || cnv[0] != '@') {
        return ucnv_open(cnv, err);
    }
    /* skip the '@' marker */
    return ucnv_openPackage(loadTestData(err), cnv + 1, err);
}
/* Write len bytes of a to the verbose log as "{0xNN 0xNN ... }". */
static void printSeq(const unsigned char* a, int len)
{
    int idx;

    log_verbose("{");
    for(idx = 0; idx < len; ++idx) {
        log_verbose("0x%02x ", a[idx]);
    }
    log_verbose("}\n");
}
/* Write len UChars of a to the verbose log as "{U+0xNNNN 0xNNNN ... }". */
static void printUSeq(const UChar* a, int len)
{
    int idx;

    log_verbose("{U+");
    for(idx = 0; idx < len; ++idx) {
        log_verbose("0x%04x ", a[idx]);
    }
    log_verbose("}\n");
}
/* Write len bytes of a to stderr as "{0xNN 0xNN ... }". */
static void printSeqErr(const unsigned char* a, int len)
{
    int idx;

    fprintf(stderr, "{");
    for(idx = 0; idx < len; ++idx) {
        fprintf(stderr, "0x%02x ", a[idx]);
    }
    fprintf(stderr, "}\n");
}
/* Write len UChars of a to stderr as "{U+0xNNNN 0xNNNN ... }". */
static void printUSeqErr(const UChar* a, int len)
{
    int idx;

    fprintf(stderr, "{U+");
    for(idx = 0; idx < len; ++idx) {
        fprintf(stderr, "0x%04x ", a[idx]);
    }
    fprintf(stderr,"}\n");
}
/*
 * Step through the bytes in [source, limit) with ucnv_getNextUChar() and check
 * every step against results[].
 *
 * results[] is read as consecutive pairs
 *     { expected number of consumed bytes, expected code point }
 * one pair per call. A negative byte count disables the length check for that
 * step. The caller must supply at least as many pairs as there are steps.
 *
 * The walk ends at limit, on U_INDEX_OUTOFBOUNDS_ERROR (no more significant
 * input), or after the first failure or mismatch, which is reported through
 * log_err() prefixed with message.
 */
static void
TestNextUChar(UConverter* cnv, const char* source, const char* limit, const int32_t results[], const char* message)
{
    const char* s0;
    const char* s=source;
    const int32_t *r=results;
    UErrorCode errorCode=U_ZERO_ERROR;
    UChar32 c;
    while(s<limit) {
        s0=s;
        c=ucnv_getNextUChar(cnv, &s, limit, &errorCode);
        if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
            break; /* no more significant input */
        } else if(U_FAILURE(errorCode)) {
            log_err("%s ucnv_getNextUChar() failed: %s\n", message, u_errorName(errorCode));
            break;
        } else if(
            /* test the expected number of input bytes only if >=0 */
            (*r>=0 && (int32_t)(s-s0)!=*r) ||
            c!=*(r+1)
        ) {
            /* cast every argument to the exact type its conversion specifier reads */
            log_err("%s ucnv_getNextUChar() result %lx from %d bytes, should have been %lx from %d bytes.\n",
                message, (unsigned long)(uint32_t)c, (int)(s-s0), (unsigned long)(uint32_t)*(r+1), (int)*r);
            break;
        }
        r+=2;
    }
}
/*
 * Call ucnv_getNextUChar() once on [source, limit) and verify that it fails
 * as expected: the error code must equal expected and the returned value must
 * be 0xFFFD or 0xFFFF. Deviations are reported through log_err(); message
 * describes the scenario in the report.
 */
static void
TestNextUCharError(UConverter* cnv, const char* source, const char* limit, UErrorCode expected, const char* message)
{
    const char* s=source;
    UErrorCode errorCode=U_ZERO_ERROR;
    uint32_t c;
    c=ucnv_getNextUChar(cnv, &s, limit, &errorCode);
    if(errorCode != expected){
        log_err("FAIL: Expected:%s when %s-----Got:%s\n", myErrorName(expected), message, myErrorName(errorCode));
    }
    if(c != 0xFFFD && c != 0xffff){
        /* %lx reads an unsigned long, so widen the 32-bit value explicitly */
        log_err("FAIL: Expected return value of 0xfffd or 0xffff when %s-----Got 0x%lx\n", message, (unsigned long)c);
    }
}
/*
 * Run the buffer-size conversion test with a series of small input slice
 * sizes, followed by a few combinations where the output slice is small too.
 */
static void TestInBufSizes(void)
{
    /* { output slice size, input slice size }, in execution order */
    static const int32_t sizePairs[][2] = {
        { NEW_MAX_BUFFER, 1 },
        { NEW_MAX_BUFFER, 2 },
        { NEW_MAX_BUFFER, 3 },
        { NEW_MAX_BUFFER, 4 },
        { NEW_MAX_BUFFER, 5 },
        { NEW_MAX_BUFFER, 6 },
        { 1, 1 },
        { 2, 3 },
        { 3, 2 }
    };
    const int32_t pairCount = (int32_t)(sizeof(sizePairs)/sizeof(sizePairs[0]));
    int32_t k;

    for(k = 0; k < pairCount; ++k) {
        TestNewConvertWithBufferSizes(sizePairs[k][0], sizePairs[k][1]);
    }
}
/*
 * Run the buffer-size conversion test with the full-size input slice: first
 * with the full-size output slice, then with output slices of 1 to 5 units.
 */
static void TestOutBufSizes(void)
{
    int32_t outSize;

    TestNewConvertWithBufferSizes(NEW_MAX_BUFFER, NEW_MAX_BUFFER);
    for(outSize = 1; outSize <= 5; ++outSize) {
        TestNewConvertWithBufferSizes(outSize, NEW_MAX_BUFFER);
    }
}
/*
 * Register the tests of this file with addTest(), each under a
 * "tsconv/nucnvtst/..." path. root is passed through to addTest() unchanged.
 * A test inside a preprocessor guard is only registered when that guard's
 * condition holds.
 */
void addTestNewConvert(TestNode** root)
{
/* the buffer-size tests are only registered when file I/O is available */
#if !UCONFIG_NO_FILE_IO
addTest(root, &TestInBufSizes, "tsconv/nucnvtst/TestInBufSizes");
addTest(root, &TestOutBufSizes, "tsconv/nucnvtst/TestOutBufSizes");
#endif
addTest(root, &TestConverterTypesAndStarters, "tsconv/nucnvtst/TestConverterTypesAndStarters");
addTest(root, &TestAmbiguous, "tsconv/nucnvtst/TestAmbiguous");
addTest(root, &TestSignatureDetection, "tsconv/nucnvtst/TestSignatureDetection");
addTest(root, &TestUTF7, "tsconv/nucnvtst/TestUTF7");
addTest(root, &TestIMAP, "tsconv/nucnvtst/TestIMAP");
addTest(root, &TestUTF8, "tsconv/nucnvtst/TestUTF8");
/* test ucnv_getNextUChar() for charsets that encode single surrogates with complete byte sequences */
addTest(root, &TestCESU8, "tsconv/nucnvtst/TestCESU8");
addTest(root, &TestUTF16, "tsconv/nucnvtst/TestUTF16");
addTest(root, &TestUTF16BE, "tsconv/nucnvtst/TestUTF16BE");
addTest(root, &TestUTF16LE, "tsconv/nucnvtst/TestUTF16LE");
addTest(root, &TestUTF32, "tsconv/nucnvtst/TestUTF32");
addTest(root, &TestUTF32BE, "tsconv/nucnvtst/TestUTF32BE");
addTest(root, &TestUTF32LE, "tsconv/nucnvtst/TestUTF32LE");
#if !UCONFIG_NO_LEGACY_CONVERSION
addTest(root, &TestLMBCS, "tsconv/nucnvtst/TestLMBCS");
#endif
addTest(root, &TestLATIN1, "tsconv/nucnvtst/TestLATIN1");
#if !UCONFIG_NO_LEGACY_CONVERSION
addTest(root, &TestSBCS, "tsconv/nucnvtst/TestSBCS");
#if !UCONFIG_NO_FILE_IO
addTest(root, &TestDBCS, "tsconv/nucnvtst/TestDBCS");
addTest(root, &TestICCRunout, "tsconv/nucnvtst/TestICCRunout");
#endif
addTest(root, &TestMBCS, "tsconv/nucnvtst/TestMBCS");
#ifdef U_ENABLE_GENERIC_ISO_2022
addTest(root, &TestISO_2022, "tsconv/nucnvtst/TestISO_2022");
#endif
addTest(root, &TestISO_2022_JP, "tsconv/nucnvtst/TestISO_2022_JP");
addTest(root, &TestJIS, "tsconv/nucnvtst/TestJIS");
addTest(root, &TestISO_2022_JP_1, "tsconv/nucnvtst/TestISO_2022_JP_1");
addTest(root, &TestISO_2022_JP_2, "tsconv/nucnvtst/TestISO_2022_JP_2");
addTest(root, &TestISO_2022_KR, "tsconv/nucnvtst/TestISO_2022_KR");
addTest(root, &TestISO_2022_KR_1, "tsconv/nucnvtst/TestISO_2022_KR_1");
addTest(root, &TestISO_2022_CN, "tsconv/nucnvtst/TestISO_2022_CN");
/*
* ICU 4.4 (ticket #7314) removes mappings for CNS 11643 planes 3..7
addTest(root, &TestISO_2022_CN_EXT, "tsconv/nucnvtst/TestISO_2022_CN_EXT");
addTest(root, &TestJitterbug915, "tsconv/nucnvtst/TestJitterbug915");
*/
addTest(root, &TestHZ, "tsconv/nucnvtst/TestHZ");
#endif
addTest(root, &TestSCSU, "tsconv/nucnvtst/TestSCSU");
#if !UCONFIG_NO_LEGACY_CONVERSION
addTest(root, &TestEBCDIC_STATEFUL, "tsconv/nucnvtst/TestEBCDIC_STATEFUL");
addTest(root, &TestGB18030, "tsconv/nucnvtst/TestGB18030");
addTest(root, &TestJitterbug255, "tsconv/nucnvtst/TestJitterbug255");
addTest(root, &TestEBCDICUS4XML, "tsconv/nucnvtst/TestEBCDICUS4XML");
addTest(root, &TestISCII, "tsconv/nucnvtst/TestISCII");
addTest(root, &TestJB5275, "tsconv/nucnvtst/TestJB5275");
addTest(root, &TestJB5275_1, "tsconv/nucnvtst/TestJB5275_1");
#if !UCONFIG_NO_COLLATION
addTest(root, &TestJitterbug981, "tsconv/nucnvtst/TestJitterbug981");
#endif
addTest(root, &TestJitterbug1293, "tsconv/nucnvtst/TestJitterbug1293");
#endif
#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FILE_IO
addTest(root, &TestCoverageMBCS, "tsconv/nucnvtst/TestCoverageMBCS");
#endif
addTest(root, &TestRoundTrippingAllUTF, "tsconv/nucnvtst/TestRoundTrippingAllUTF");
#if !UCONFIG_NO_LEGACY_CONVERSION
addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346");
addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411");
addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175");
#endif
}
/*
 * Store a description of the conversion under test in gNuConvTestName so that
 * log messages can identify the converter, the direction and the current
 * buffer sizes.
 *
 * codepage   converter name to show
 * direction  direction label to show (the callers pass "FROM" or "TO")
 *
 * This writes to a file-level static buffer, so it is not thread safe.
 */
static void setNuConvTestName(const char *codepage, const char *direction)
{
    /*
     * The fixed text, both integers and the terminator need fewer than 100
     * bytes. Capping the two strings at 900 and 16 characters therefore keeps
     * the result inside the 1024-byte gNuConvTestName buffer even for an
     * unexpectedly long name; names within the caps are printed unchanged.
     */
    sprintf(gNuConvTestName, "[Testing %.900s %.16s Unicode, InputBufSiz=%d, OutputBufSiz=%d]",
        codepage,
        direction,
        (int)gInBufferSize,
        (int)gOutBufferSize);
}
/* Outcome reported by the conversion comparison helpers. */
typedef enum
{
    /* 0: test was OK */
    TC_OK,
    /* 1: match failed - err was printed */
    TC_MISMATCH,
    /* 2: test failed, don't print an err because it was already printed */
    TC_FAIL
} ETestConvertResult;
/*
 * Convert a UTF-16 string to codepage with ucnv_fromUnicode() and compare the
 * resulting bytes (and optionally the offsets) with the expected values.
 *
 * The input is fed in slices of gInBufferSize UChars and the output drained
 * in slices of gOutBufferSize bytes. Offsets are only requested and compared
 * when both globals equal NEW_MAX_BUFFER.
 *
 * source, sourceLen   input text and its length in UChars
 * expect, expectLen   expected output bytes and their count
 * codepage            converter name; a leading '@' selects the test data package
 * expectOffsets       expected offset for every output byte, or NULL to skip
 * useFallback         when TRUE, fallback mappings are enabled on the converter
 *
 * Returns TC_OK when the bytes match, TC_MISMATCH when length or content
 * differ, and TC_FAIL when the converter cannot be opened or the conversion
 * fails. An offsets mismatch is reported with log_err() but does not change
 * the return value. The converter is closed on every path.
 */
static ETestConvertResult testConvertFromU( const UChar *source, int sourceLen, const uint8_t *expect, int expectLen,
                const char *codepage, const int32_t *expectOffsets , UBool useFallback)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    char junkout[NEW_MAX_BUFFER]; /* converted bytes */
    int32_t junokout[NEW_MAX_BUFFER]; /* offsets, one per converted byte */
    char *p;
    const UChar *src;
    char *end;
    char *targ;
    int32_t *offs;
    int i;
    int32_t realBufferSize;
    char *realBufferEnd;
    const UChar *realSourceEnd;
    const UChar *sourceLimit;
    UBool checkOffsets = TRUE;
    UBool doFlush;

    /* pre-fill both buffers with known filler values */
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = (char)0xF0;
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = 0xFF;

    setNuConvTestName(codepage, "FROM");
    log_verbose("\n========= %s\n", gNuConvTestName);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",codepage);
        return TC_FAIL;
    }
    if(useFallback){
        ucnv_setFallback(conv,useFallback);
    }
    log_verbose("Converter opened..\n");

    src = source;
    targ = junkout;
    offs = junokout;
    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = source + sourceLen;

    /* offsets are only meaningful when the whole conversion is not sliced */
    if ( gOutBufferSize != realBufferSize || gInBufferSize != NEW_MAX_BUFFER )
        checkOffsets = FALSE;

    do
    {
        end = nct_min(targ + gOutBufferSize, realBufferEnd);
        sourceLimit = nct_min(src + gInBufferSize, realSourceEnd);
        doFlush = (UBool)(sourceLimit == realSourceEnd);

        if(targ == realBufferEnd) {
            log_err("Error, overflowed the real buffer while about to call fromUnicode! targ=%p %s", (void *)targ, gNuConvTestName);
            ucnv_close(conv); /* do not leak the converter on this early exit */
            return TC_FAIL;
        }
        log_verbose("calling fromUnicode @ SOURCE:%p to %p TARGET: %p to %p, flush=%s\n",
            (const void *)src, (const void *)sourceLimit, (void *)targ, (void *)end, doFlush?"TRUE":"FALSE");

        status = U_ZERO_ERROR;
        ucnv_fromUnicode (conv,
            &targ,
            end,
            &src,
            sourceLimit,
            checkOffsets ? offs : NULL,
            doFlush, /* flush if we're at the end of the input data */
            &status);
    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (U_SUCCESS(status) && sourceLimit < realSourceEnd) );

    if(U_FAILURE(status)) {
        log_err("Problem doing fromUnicode to %s, errcode %s %s\n", codepage, myErrorName(status), gNuConvTestName);
        ucnv_close(conv); /* do not leak the converter on this early exit */
        return TC_FAIL;
    }

    log_verbose("\nConversion done [%d uchars in -> %d chars out]. \nResult :",
        sourceLen, (int)(targ-junkout));

    if(VERBOSITY)
    {
        char junk[9999];
        char offset_str[9999];
        char *ptr;

        junk[0] = 0;
        offset_str[0] = 0;
        for(ptr = junkout;ptr<targ;ptr++) {
            sprintf(junk + strlen(junk), "0x%02x, ", (int)(0xFF & *ptr));
            sprintf(offset_str + strlen(offset_str), "0x%02x, ", (int)(0xFF & junokout[ptr-junkout]));
        }
        /* print the buffers as data, never as a format string */
        log_verbose("%s", junk);
        printSeq((const uint8_t *)expect, expectLen);
        if ( checkOffsets ) {
            log_verbose("\nOffsets:");
            log_verbose("%s", offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);

    if(expectLen != targ-junkout) {
        log_err("Expected %d chars out, got %d %s\n", expectLen, (int)(targ-junkout), gNuConvTestName);
        log_verbose("Expected %d chars out, got %d %s\n", expectLen, (int)(targ-junkout), gNuConvTestName);
        printf("\nGot:");
        printSeqErr((const unsigned char*)junkout, (int32_t)(targ-junkout));
        printf("\nExpected:");
        printSeqErr((const unsigned char*)expect, expectLen);
        return TC_MISMATCH;
    }

    if (checkOffsets && (expectOffsets != 0) ) {
        log_verbose("comparing %d offsets..\n", (int)(targ-junkout));
        if(memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t) )){
            log_err("did not get the expected offsets. %s\n", gNuConvTestName);
            printSeqErr((const unsigned char*)junkout, (int32_t)(targ-junkout));
            log_err("\n");
            log_err("Got : ");
            for(p=junkout;p<targ;p++) {
                log_err("%d,", junokout[p-junkout]);
            }
            log_err("\n");
            log_err("Expected: ");
            for(i=0; i<(targ-junkout); i++) {
                log_err("%d,", expectOffsets[i]);
            }
            log_err("\n");
        }
    }

    log_verbose("comparing..\n");
    if(!memcmp(junkout, expect, expectLen)) {
        log_verbose("Matches!\n");
        return TC_OK;
    } else {
        log_err("String does not match u->%s\n", gNuConvTestName);
        printUSeqErr(source, sourceLen);
        printf("\nGot:");
        printSeqErr((const unsigned char *)junkout, expectLen);
        printf("\nExpected:");
        printSeqErr((const unsigned char *)expect, expectLen);
        return TC_MISMATCH;
    }
}
/*
 * Convert bytes in codepage to UTF-16 with ucnv_toUnicode() and compare the
 * resulting UChars (and optionally the offsets) with the expected values.
 *
 * The input is fed in slices of gInBufferSize bytes and the output drained in
 * slices of gOutBufferSize UChars. Offsets are only requested and compared
 * when both globals equal NEW_MAX_BUFFER.
 *
 * source, sourcelen   input bytes and their count
 * expect, expectlen   expected text and its length in UChars
 * codepage            converter name; a leading '@' selects the test data package
 * expectOffsets       expected offset for every output UChar, or NULL to skip
 * useFallback         when TRUE, fallback mappings are enabled on the converter
 *
 * Returns TC_OK when the first expectlen UChars match, TC_MISMATCH when they
 * differ, and TC_FAIL when the converter cannot be opened or the conversion
 * fails. An offsets mismatch is reported with log_err() but does not change
 * the return value. The converter is closed on every path.
 *
 * NOTE(review): unlike testConvertFromU(), the produced length is not compared
 * with expectlen, so extra trailing output would go unnoticed - confirm
 * whether that is intended before tightening the check.
 */
static ETestConvertResult testConvertToU( const uint8_t *source, int sourcelen, const UChar *expect, int expectlen,
                const char *codepage, const int32_t *expectOffsets, UBool useFallback)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    UChar junkout[NEW_MAX_BUFFER]; /* converted UChars */
    int32_t junokout[NEW_MAX_BUFFER]; /* offsets, one per converted UChar */
    const char *src;
    const char *realSourceEnd;
    const char *srcLimit;
    UChar *p;
    UChar *targ;
    UChar *end;
    int32_t *offs;
    int i;
    UBool checkOffsets = TRUE;
    int32_t realBufferSize;
    UChar *realBufferEnd;

    /* pre-fill both buffers with known filler values */
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = 0xFFFE;
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = -1;

    setNuConvTestName(codepage, "TO");
    log_verbose("\n========= %s\n", gNuConvTestName);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",gNuConvTestName);
        return TC_FAIL;
    }
    if(useFallback){
        ucnv_setFallback(conv,useFallback);
    }
    log_verbose("Converter opened..\n");

    src = (const char *)source;
    targ = junkout;
    offs = junokout;
    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = src + sourcelen;

    /* offsets are only meaningful when the whole conversion is not sliced */
    if ( gOutBufferSize != realBufferSize || gInBufferSize != NEW_MAX_BUFFER )
        checkOffsets = FALSE;

    do
    {
        end = nct_min( targ + gOutBufferSize, realBufferEnd);
        srcLimit = nct_min(realSourceEnd, src + gInBufferSize);

        if(targ == realBufferEnd)
        {
            log_err("Error, the end would overflow the real output buffer while about to call toUnicode! tarjet=%p %s",(void *)targ,gNuConvTestName);
            ucnv_close(conv); /* do not leak the converter on this early exit */
            return TC_FAIL;
        }
        log_verbose("calling toUnicode @ %p to %p\n", (void *)targ, (void *)end);

        status = U_ZERO_ERROR;
        ucnv_toUnicode (conv,
            &targ,
            end,
            &src,
            srcLimit,
            checkOffsets ? offs : NULL,
            (UBool)(srcLimit == realSourceEnd), /* flush if we're at the end of the source data */
            &status);
    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (U_SUCCESS(status) && (srcLimit < realSourceEnd)) ); /* while we just need another buffer */

    if(U_FAILURE(status))
    {
        log_err("Problem doing %s toUnicode, errcode %s %s\n", codepage, myErrorName(status), gNuConvTestName);
        ucnv_close(conv); /* do not leak the converter on this early exit */
        return TC_FAIL;
    }

    log_verbose("\nConversion done. %d bytes -> %d chars.\nResult :",
        sourcelen, (int)(targ-junkout));

    if(VERBOSITY)
    {
        char junk[9999];
        char offset_str[9999];
        UChar *ptr;

        junk[0] = 0;
        offset_str[0] = 0;
        for(ptr = junkout;ptr<targ;ptr++)
        {
            sprintf(junk + strlen(junk), "0x%04x, ", (0xFFFF) & (unsigned int)*ptr);
            sprintf(offset_str + strlen(offset_str), "0x%04x, ", (0xFFFF) & (unsigned int)junokout[ptr-junkout]);
        }
        /* print the buffers as data, never as a format string */
        log_verbose("%s", junk);
        printUSeq(expect, expectlen);
        if ( checkOffsets )
        {
            log_verbose("\nOffsets:");
            log_verbose("%s", offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);

    log_verbose("comparing %d uchars (%d bytes)..\n",expectlen,expectlen*2);

    if (checkOffsets && (expectOffsets != 0))
    {
        if(memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t))){
            log_err("did not get the expected offsets. %s\n",gNuConvTestName);
            log_err("Got: ");
            for(p=junkout;p<targ;p++) {
                log_err("%d,", junokout[p-junkout]);
            }
            log_err("\n");
            log_err("Expected: ");
            for(i=0; i<(targ-junkout); i++) {
                log_err("%d,", expectOffsets[i]);
            }
            log_err("\n");
            log_err("output: ");
            for(i=0; i<(targ-junkout); i++) {
                log_err("%X,", junkout[i]);
            }
            log_err("\n");
            log_err("input: ");
            for(i=0; i<(src-(const char *)source); i++) {
                log_err("%X,", (unsigned char)source[i]);
            }
            log_err("\n");
        }
    }

    /* the byte count assumes 2-byte UChars */
    if(!memcmp(junkout, expect, expectlen*2))
    {
        log_verbose("Matches!\n");
        return TC_OK;
    }
    else
    {
        log_err("String does not match. %s\n", gNuConvTestName);
        log_verbose("String does not match. %s\n", gNuConvTestName);
        printf("\nGot:");
        printUSeqErr(junkout, expectlen);
        printf("\nExpected:");
        printUSeqErr(expect, expectlen);
        return TC_MISMATCH;
    }
}
static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize )
{
/** test chars #1 */
/* 1 2 3 1Han 2Han 3Han . */
static const UChar sampleText[] =
{ 0x0031, 0x0032, 0x0033, 0x0000, 0x4e00, 0x4e8c, 0x4e09, 0x002E, 0xD840, 0xDC21 };
static const UChar sampleTextRoundTripUnmappable[] =
{ 0x0031, 0x0032, 0x0033, 0x0000, 0x4e00, 0x4e8c, 0x4e09, 0x002E, 0xfffd };
static const uint8_t expectedUTF8[] =
{ 0x31, 0x32, 0x33, 0x00, 0xe4, 0xb8, 0x80, 0xe4, 0xba, 0x8c, 0xe4, 0xb8, 0x89, 0x2E, 0xf0, 0xa0, 0x80, 0xa1 };
static const int32_t toUTF8Offs[] =
{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, 0x06, 0x06, 0x07, 0x08, 0x08, 0x08, 0x08 };
static const int32_t fmUTF8Offs[] =
{ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0007, 0x000a, 0x000d, 0x000e, 0x000e };
#ifdef U_ENABLE_GENERIC_ISO_2022
/* Same as UTF8, but with ^[%B preceeding */
static const const uint8_t expectedISO2022[] =
{ 0x1b, 0x25, 0x42, 0x31, 0x32, 0x33, 0x00, 0xe4, 0xb8, 0x80, 0xe4, 0xba, 0x8c, 0xe4, 0xb8, 0x89, 0x2E };
static const int32_t toISO2022Offs[] =
{ -1, -1, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x04,
0x04, 0x05, 0x05, 0x05, 0x06, 0x06, 0x06, 0x07 }; /* right? */
static const int32_t fmISO2022Offs[] =
{ 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x000a, 0x000d, 0x0010 }; /* is this right? */
#endif
/* 1 2 3 0, <SO> h1 h2 h3 <SI> . EBCDIC_STATEFUL */
static const uint8_t expectedIBM930[] =
{ 0xF1, 0xF2, 0xF3, 0x00, 0x0E, 0x45, 0x41, 0x45, 0x42, 0x45, 0x43, 0x0F, 0x4B, 0x0e, 0xfe, 0xfe, 0x0f };
static const int32_t toIBM930Offs[] =
{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x08, -1 };
static const int32_t fmIBM930Offs[] =
{ 0x0000, 0x0001, 0x0002, 0x0003, 0x0005, 0x0007, 0x0009, 0x000c, 0x000e };
/* 1 2 3 0 h1 h2 h3 . MBCS*/
static const uint8_t expectedIBM943[] =
{ 0x31, 0x32, 0x33, 0x00, 0x88, 0xea, 0x93, 0xf1, 0x8e, 0x4f, 0x2e, 0xfc, 0xfc };
static const int32_t toIBM943Offs [] =
{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08 };
static const int32_t fmIBM943Offs[] =
{ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0006, 0x0008, 0x000a, 0x000b };
/* 1 2 3 0 h1 h2 h3 . DBCS*/
static const uint8_t expectedIBM9027[] =
{ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0x4c, 0x41, 0x4c, 0x48, 0x4c, 0x55, 0xfe, 0xfe, 0xfe, 0xfe };
static const int32_t toIBM9027Offs [] =
{ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08 };
/* 1 2 3 0 <?> <?> <?> . SBCS*/
static const uint8_t expectedIBM920[] =
{ 0x31, 0x32, 0x33, 0x00, 0x1a, 0x1a, 0x1a, 0x2e, 0x1a };
static const int32_t toIBM920Offs [] =
{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 };
/* 1 2 3 0 <?> <?> <?> . SBCS*/
static const uint8_t expectedISO88593[] =
{ 0x31, 0x32, 0x33, 0x00, 0x1a, 0x1a, 0x1a, 0x2E, 0x1a };
static const int32_t toISO88593Offs[] =
{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 };
/* 1 2 3 0 <?> <?> <?> . <?> LATIN_1*/
static const uint8_t expectedLATIN1[] =
{ 0x31, 0x32, 0x33, 0x00, 0x1a, 0x1a, 0x1a, 0x2E, 0x1a };
static const int32_t toLATIN1Offs[] =
{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 };
/* etc */
static const uint8_t expectedUTF16BE[] =
{ 0x00, 0x31, 0x00, 0x32, 0x00, 0x33, 0x00, 0x00, 0x4e, 0x00, 0x4e, 0x8c, 0x4e, 0x09, 0x00, 0x2e, 0xd8, 0x40, 0xdc, 0x21 };
static const int32_t toUTF16BEOffs[]=
{ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x08, 0x08 };
static const int32_t fmUTF16BEOffs[] =
{ 0x0000, 0x0002, 0x0004, 0x0006, 0x0008, 0x000a, 0x000c, 0x000e, 0x0010, 0x0010 };
static const uint8_t expectedUTF16LE[] =
{ 0x31, 0x00, 0x32, 0x00, 0x33, 0x00, 0x00, 0x00, 0x00, 0x4e, 0x8c, 0x4e, 0x09, 0x4e, 0x2e, 0x00, 0x40, 0xd8, 0x21, 0xdc };
static const int32_t toUTF16LEOffs[]=
{ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x08, 0x08 };
static const int32_t fmUTF16LEOffs[] =
{ 0x0000, 0x0002, 0x0004, 0x0006, 0x0008, 0x000a, 0x000c, 0x000e, 0x0010, 0x0010 };
static const uint8_t expectedUTF32BE[] =
{ 0x00, 0x00, 0x00, 0x31,
0x00, 0x00, 0x00, 0x32,
0x00, 0x00, 0x00, 0x33,
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x4e, 0x00,
0x00, 0x00, 0x4e, 0x8c,
0x00, 0x00, 0x4e, 0x09,
0x00, 0x00, 0x00, 0x2e,
0x00, 0x02, 0x00, 0x21 };
static const int32_t toUTF32BEOffs[]=
{ 0x00, 0x00, 0x00, 0x00,
0x01, 0x01, 0x01, 0x01,
0x02, 0x02, 0x02, 0x02,
0x03, 0x03, 0x03, 0x03,
0x04, 0x04, 0x04, 0x04,
0x05, 0x05, 0x05, 0x05,
0x06, 0x06, 0x06, 0x06,
0x07, 0x07, 0x07, 0x07,
0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08 };
static const int32_t fmUTF32BEOffs[] =
{ 0x0000, 0x0004, 0x0008, 0x000c, 0x0010, 0x0014, 0x0018, 0x001c, 0x0020, 0x0020 };
static const uint8_t expectedUTF32LE[] =
{ 0x31, 0x00, 0x00, 0x00,
0x32, 0x00, 0x00, 0x00,
0x33, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
0x00, 0x4e, 0x00, 0x00,
0x8c, 0x4e, 0x00, 0x00,
0x09, 0x4e, 0x00, 0x00,
0x2e, 0x00, 0x00, 0x00,
0x21, 0x00, 0x02, 0x00 };
static const int32_t toUTF32LEOffs[]=
{ 0x00, 0x00, 0x00, 0x00,
0x01, 0x01, 0x01, 0x01,
0x02, 0x02, 0x02, 0x02,
0x03, 0x03, 0x03, 0x03,
0x04, 0x04, 0x04, 0x04,
0x05, 0x05, 0x05, 0x05,
0x06, 0x06, 0x06, 0x06,
0x07, 0x07, 0x07, 0x07,
0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08 };
static const int32_t fmUTF32LEOffs[] =
{ 0x0000, 0x0004, 0x0008, 0x000c, 0x0010, 0x0014, 0x0018, 0x001c, 0x0020, 0x0020 };
/** Test chars #2 **/
/* Sahha [health], slashed h's */
static const UChar malteseUChars[] = { 0x0053, 0x0061, 0x0127, 0x0127, 0x0061 };
static const uint8_t expectedMaltese913[] = { 0x53, 0x61, 0xB1, 0xB1, 0x61 };
/* LMBCS */
static const UChar LMBCSUChars[] = { 0x0027, 0x010A, 0x0000, 0x0127, 0x2666, 0x0220 };
static const uint8_t expectedLMBCS[] = { 0x27, 0x06, 0x04, 0x00, 0x01, 0x73, 0x01, 0x04, 0x14, 0x02, 0x20 };
static const int32_t toLMBCSOffs[] = { 0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x04 , 0x05, 0x05, 0x05 };
static const int32_t fmLMBCSOffs[] = { 0x0000, 0x0001, 0x0003, 0x0004, 0x0006, 0x0008};
/*********************************** START OF CODE finally *************/
gInBufferSize = insize;
gOutBufferSize = outsize;
log_verbose("\n\n\nTesting conversions with InputBufferSize = %d, OutputBufferSize = %d\n", gInBufferSize, gOutBufferSize);
/*UTF-8*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedUTF8, sizeof(expectedUTF8), "UTF8", toUTF8Offs,FALSE );
log_verbose("Test surrogate behaviour for UTF8\n");
{
static const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01 };
static const uint8_t expectedUTF8test2[]= { 0xe2, 0x82, 0xac,
0xf0, 0x90, 0x90, 0x81,
0xef, 0xbf, 0xbd
};
static const int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3 };
testConvertFromU(testinput, sizeof(testinput)/sizeof(testinput[0]),
expectedUTF8test2, sizeof(expectedUTF8test2), "UTF8", offsets,FALSE );
}
#if !UCONFIG_NO_LEGACY_CONVERSION && defined(U_ENABLE_GENERIC_ISO_2022)
/*ISO-2022*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedISO2022, sizeof(expectedISO2022), "ISO_2022", toISO2022Offs,FALSE );
#endif
/*UTF16 LE*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedUTF16LE, sizeof(expectedUTF16LE), "utf-16le", toUTF16LEOffs,FALSE );
/*UTF16 BE*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedUTF16BE, sizeof(expectedUTF16BE), "utf-16be", toUTF16BEOffs,FALSE );
/*UTF32 LE*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedUTF32LE, sizeof(expectedUTF32LE), "utf-32le", toUTF32LEOffs,FALSE );
/*UTF32 BE*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedUTF32BE, sizeof(expectedUTF32BE), "utf-32be", toUTF32BEOffs,FALSE );
/*LATIN_1*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedLATIN1, sizeof(expectedLATIN1), "LATIN_1", toLATIN1Offs,FALSE );
#if !UCONFIG_NO_LEGACY_CONVERSION
/*EBCDIC_STATEFUL*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedIBM930, sizeof(expectedIBM930), "ibm-930", toIBM930Offs,FALSE );
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedISO88593, sizeof(expectedISO88593), "iso-8859-3", toISO88593Offs,FALSE );
/*MBCS*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedIBM943, sizeof(expectedIBM943), "ibm-943", toIBM943Offs,FALSE );
/*DBCS*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedIBM9027, sizeof(expectedIBM9027), "@ibm9027", toIBM9027Offs,FALSE );
/*SBCS*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedIBM920, sizeof(expectedIBM920), "ibm-920", toIBM920Offs,FALSE );
/*SBCS*/
testConvertFromU(sampleText, sizeof(sampleText)/sizeof(sampleText[0]),
expectedISO88593, sizeof(expectedISO88593), "iso-8859-3", toISO88593Offs,FALSE );
#endif
/****/
/*UTF-8*/
testConvertToU(expectedUTF8, sizeof(expectedUTF8),
sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf8", fmUTF8Offs,FALSE);
#if !UCONFIG_NO_LEGACY_CONVERSION && defined(U_ENABLE_GENERIC_ISO_2022)
/*ISO-2022*/
testConvertToU(expectedISO2022, sizeof(expectedISO2022),
sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "ISO_2022", fmISO2022Offs,FALSE);
#endif
/*UTF16 LE*/
testConvertToU(expectedUTF16LE, sizeof(expectedUTF16LE),
sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-16le", fmUTF16LEOffs,FALSE);
/*UTF16 BE*/
testConvertToU(expectedUTF16BE, sizeof(expectedUTF16BE),
sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-16be", fmUTF16BEOffs,FALSE);
/*UTF32 LE*/
testConvertToU(expectedUTF32LE, sizeof(expectedUTF32LE),
sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-32le", fmUTF32LEOffs,FALSE);
/*UTF32 BE*/
testConvertToU(expectedUTF32BE, sizeof(expectedUTF32BE),
sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-32be", fmUTF32BEOffs,FALSE);
#if !UCONFIG_NO_LEGACY_CONVERSION
/*EBCDIC_STATEFUL*/
testConvertToU(expectedIBM930, sizeof(expectedIBM930), sampleTextRoundTripUnmappable,
sizeof(sampleTextRoundTripUnmappable)/sizeof(sampleTextRoundTripUnmappable[0]), "ibm-930", fmIBM930Offs,FALSE);
/*MBCS*/
testConvertToU(expectedIBM943, sizeof(expectedIBM943),sampleTextRoundTripUnmappable,
sizeof(sampleTextRoundTripUnmappable)/sizeof(sampleTextRoundTripUnmappable[0]), "ibm-943", fmIBM943Offs,FALSE);
#endif
/* Try it again to make sure it still works */
testConvertToU(expectedUTF16LE, sizeof(expectedUTF16LE),
sampleText, sizeof(sampleText)/sizeof(sampleText[0]), "utf-16le", fmUTF16LEOffs,FALSE);
#if !UCONFIG_NO_LEGACY_CONVERSION
testConvertToU(expectedMaltese913, sizeof(expectedMaltese913),
malteseUChars, sizeof(malteseUChars)/sizeof(malteseUChars[0]), "latin3", NULL,FALSE);
testConvertFromU(malteseUChars, sizeof(malteseUChars)/sizeof(malteseUChars[0]),
expectedMaltese913, sizeof(expectedMaltese913), "iso-8859-3", NULL,FALSE );
/*LMBCS*/
testConvertFromU(LMBCSUChars, sizeof(LMBCSUChars)/sizeof(LMBCSUChars[0]),
expectedLMBCS, sizeof(expectedLMBCS), "LMBCS-1", toLMBCSOffs,FALSE );
testConvertToU(expectedLMBCS, sizeof(expectedLMBCS),
LMBCSUChars, sizeof(LMBCSUChars)/sizeof(LMBCSUChars[0]), "LMBCS-1", fmLMBCSOffs,FALSE);
#endif
/* UTF-7 examples are mostly from http://www.imc.org/rfc2152 */
{
/* encode directly set D and set O */
static const uint8_t utf7[] = {
/*
Hi Mom -+Jjo--!
A+ImIDkQ.
+-
+ZeVnLIqe
*/
0x48, 0x69, 0x20, 0x4d, 0x6f, 0x6d, 0x20, 0x2d, 0x2b, 0x4a, 0x6a, 0x6f, 0x2d, 0x2d, 0x21,
0x41, 0x2b, 0x49, 0x6d, 0x49, 0x44, 0x6b, 0x51, 0x2e,
0x2b, 0x2d,
0x2b, 0x5a, 0x65, 0x56, 0x6e, 0x4c, 0x49, 0x71, 0x65
};
static const UChar unicode[] = {
/*
Hi Mom -<WHITE SMILING FACE>-!
A<NOT IDENTICAL TO><ALPHA>.
+
[Japanese word "nihongo"]
*/
0x48, 0x69, 0x20, 0x4d, 0x6f, 0x6d, 0x20, 0x2d, 0x263a, 0x2d, 0x21,
0x41, 0x2262, 0x0391, 0x2e,
0x2b,
0x65e5, 0x672c, 0x8a9e
};
static const int32_t toUnicodeOffsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 9, 13, 14,
15, 17, 19, 23,
24,
27, 29, 32
};
static const int32_t fromUnicodeOffsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 9, 10,
11, 12, 12, 12, 13, 13, 13, 13, 14,
15, 15,
16, 16, 16, 17, 17, 17, 18, 18, 18
};
/* same but escaping set O (the exclamation mark) */
static const uint8_t utf7Restricted[] = {
/*
Hi Mom -+Jjo--+ACE-
A+ImIDkQ.
+-
+ZeVnLIqe
*/
0x48, 0x69, 0x20, 0x4d, 0x6f, 0x6d, 0x20, 0x2d, 0x2b, 0x4a, 0x6a, 0x6f, 0x2d, 0x2d, 0x2b, 0x41, 0x43, 0x45, 0x2d,
0x41, 0x2b, 0x49, 0x6d, 0x49, 0x44, 0x6b, 0x51, 0x2e,
0x2b, 0x2d,
0x2b, 0x5a, 0x65, 0x56, 0x6e, 0x4c, 0x49, 0x71, 0x65
};
static const int32_t toUnicodeOffsetsR[] = {
0, 1, 2, 3, 4, 5, 6, 7, 9, 13, 15,
19, 21, 23, 27,
28,
31, 33, 36
};
static const int32_t fromUnicodeOffsetsR[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 9, 10, 10, 10, 10, 10,
11, 12, 12, 12, 13, 13, 13, 13, 14,
15, 15,
16, 16, 16, 17, 17, 17, 18, 18, 18
};
testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, utf7, sizeof(utf7), "UTF-7", fromUnicodeOffsets,FALSE);
testConvertToU(utf7, sizeof(utf7), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "UTF-7", toUnicodeOffsets,FALSE);
testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, utf7Restricted, sizeof(utf7Restricted), "UTF-7,version=1", fromUnicodeOffsetsR,FALSE);
testConvertToU(utf7Restricted, sizeof(utf7Restricted), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "UTF-7,version=1", toUnicodeOffsetsR,FALSE);
}
/*
* IMAP-mailbox-name examples are mostly from http://www.imc.org/rfc2152,
* modified according to RFC 2060,
* and supplemented with the one example in RFC 2060 itself.
*/
{
static const uint8_t imap[] = {
/* Hi Mom -&Jjo--!
A&ImIDkQ-.
&-
&ZeVnLIqe-
\
~peter
/mail
/&ZeVnLIqe-
/&U,BTFw-
*/
0x48, 0x69, 0x20, 0x4d, 0x6f, 0x6d, 0x20, 0x2d, 0x26, 0x4a, 0x6a, 0x6f, 0x2d, 0x2d, 0x21,
0x41, 0x26, 0x49, 0x6d, 0x49, 0x44, 0x6b, 0x51, 0x2d, 0x2e,
0x26, 0x2d,
0x26, 0x5a, 0x65, 0x56, 0x6e, 0x4c, 0x49, 0x71, 0x65, 0x2d,
0x5c,
0x7e, 0x70, 0x65, 0x74, 0x65, 0x72,
0x2f, 0x6d, 0x61, 0x69, 0x6c,
0x2f, 0x26, 0x5a, 0x65, 0x56, 0x6e, 0x4c, 0x49, 0x71, 0x65, 0x2d,
0x2f, 0x26, 0x55, 0x2c, 0x42, 0x54, 0x46, 0x77, 0x2d
};
static const UChar unicode[] = {
/* Hi Mom -<WHITE SMILING FACE>-!
A<NOT IDENTICAL TO><ALPHA>.
&
[Japanese word "nihongo"]
\
~peter
/mail
/<65e5, 672c, 8a9e>
/<53f0, 5317>
*/
0x48, 0x69, 0x20, 0x4d, 0x6f, 0x6d, 0x20, 0x2d, 0x263a, 0x2d, 0x21,
0x41, 0x2262, 0x0391, 0x2e,
0x26,
0x65e5, 0x672c, 0x8a9e,
0x5c,
0x7e, 0x70, 0x65, 0x74, 0x65, 0x72,
0x2f, 0x6d, 0x61, 0x69, 0x6c,
0x2f, 0x65e5, 0x672c, 0x8a9e,
0x2f, 0x53f0, 0x5317
};
static const int32_t toUnicodeOffsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 9, 13, 14,
15, 17, 19, 24,
25,
28, 30, 33,
37,
38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48,
49, 51, 53, 56,
60, 62, 64
};
static const int32_t fromUnicodeOffsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 9, 10,
11, 12, 12, 12, 13, 13, 13, 13, 13, 14,
15, 15,
16, 16, 16, 17, 17, 17, 18, 18, 18, 18,
19,
20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30,
31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34,
35, 36, 36, 36, 37, 37, 37, 37, 37
};
testConvertFromU(unicode, sizeof(unicode)/U_SIZEOF_UCHAR, imap, sizeof(imap), "IMAP-mailbox-name", fromUnicodeOffsets,FALSE);
testConvertToU(imap, sizeof(imap), unicode, sizeof(unicode)/U_SIZEOF_UCHAR, "IMAP-mailbox-name", toUnicodeOffsets,FALSE);
}
/* Test UTF-8 bad data handling*/
{
static const uint8_t utf8[]={
0x61,
0xf7, 0xbf, 0xbf, 0xbf, /* > 10FFFF */
0x00,
0x62,
0xfb, 0xbf, 0xbf, 0xbf, 0xbf, /* > 10FFFF */
0xfb, 0xbf, 0xbf, 0xbf, 0xbf, /* > 10FFFF */
0xf4, 0x8f, 0xbf, 0xbf, /* 10FFFF */
0xdf, 0xbf, /* 7ff */
0xbf, /* truncated tail */
0xf4, 0x90, 0x80, 0x80, /* 11FFFF */
0x02
};
static const uint16_t utf8Expected[]={
0x0061,
0xfffd,
0x0000,
0x0062,
0xfffd,
0xfffd,
0xdbff, 0xdfff,
0x07ff,
0xfffd,
0xfffd,
0x0002
};
static const int32_t utf8Offsets[]={
0, 1, 5, 6, 7, 12, 17, 17, 21, 23, 24, 28
};
testConvertToU(utf8, sizeof(utf8),
utf8Expected, sizeof(utf8Expected)/sizeof(utf8Expected[0]), "utf-8", utf8Offsets ,FALSE);
}
/* Test UTF-32BE bad data handling*/
{
static const uint8_t utf32[]={
0x00, 0x00, 0x00, 0x61,
0x00, 0x11, 0x00, 0x00, /* 0x110000 out of range */
0x00, 0x10, 0xff, 0xff, /* 0x10FFFF in range */
0x00, 0x00, 0x00, 0x62,
0xff, 0xff, 0xff, 0xff, /* 0xffffffff out of range */
0x7f, 0xff, 0xff, 0xff, /* 0x7fffffff out of range */
0x00, 0x00, 0x01, 0x62,
0x00, 0x00, 0x02, 0x62
};
static const uint16_t utf32Expected[]={
0x0061,
0xfffd, /* 0x110000 out of range */
0xDBFF, /* 0x10FFFF in range */
0xDFFF,
0x0062,
0xfffd, /* 0xffffffff out of range */
0xfffd, /* 0x7fffffff out of range */
0x0162,
0x0262
};
static const int32_t utf32Offsets[]={
0, 4, 8, 8, 12, 16, 20, 24, 28
};
static const uint8_t utf32ExpectedBack[]={
0x00, 0x00, 0x00, 0x61,
0x00, 0x00, 0xff, 0xfd, /* 0x110000 out of range */
0x00, 0x10, 0xff, 0xff, /* 0x10FFFF in range */
0x00, 0x00, 0x00, 0x62,
0x00, 0x00, 0xff, 0xfd, /* 0xffffffff out of range */
0x00, 0x00, 0xff, 0xfd, /* 0x7fffffff out of range */
0x00, 0x00, 0x01, 0x62,
0x00, 0x00, 0x02, 0x62
};
static const int32_t utf32OffsetsBack[]={
0,0,0,0,
1,1,1,1,
2,2,2,2,
4,4,4,4,
5,5,5,5,
6,6,6,6,
7,7,7,7,
8,8,8,8
};
testConvertToU(utf32, sizeof(utf32),
utf32Expected, sizeof(utf32Expected)/sizeof(utf32Expected[0]), "utf-32be", utf32Offsets ,FALSE);
testConvertFromU(utf32Expected, sizeof(utf32Expected)/sizeof(utf32Expected[0]),
utf32ExpectedBack, sizeof(utf32ExpectedBack), "utf-32be", utf32OffsetsBack, FALSE);
}
/* Test UTF-32LE bad data handling*/
{
static const uint8_t utf32[]={
0x61, 0x00, 0x00, 0x00,
0x00, 0x00, 0x11, 0x00, /* 0x110000 out of range */
0xff, 0xff, 0x10, 0x00, /* 0x10FFFF in range */
0x62, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, /* 0xffffffff out of range */
0xff, 0xff, 0xff, 0x7f, /* 0x7fffffff out of range */
0x62, 0x01, 0x00, 0x00,
0x62, 0x02, 0x00, 0x00,
};
static const uint16_t utf32Expected[]={
0x0061,
0xfffd, /* 0x110000 out of range */
0xDBFF, /* 0x10FFFF in range */
0xDFFF,
0x0062,
0xfffd, /* 0xffffffff out of range */
0xfffd, /* 0x7fffffff out of range */
0x0162,
0x0262
};
static const int32_t utf32Offsets[]={
0, 4, 8, 8, 12, 16, 20, 24, 28
};
static const uint8_t utf32ExpectedBack[]={
0x61, 0x00, 0x00, 0x00,
0xfd, 0xff, 0x00, 0x00, /* 0x110000 out of range */
0xff, 0xff, 0x10, 0x00, /* 0x10FFFF in range */
0x62, 0x00, 0x00, 0x00,
0xfd, 0xff, 0x00, 0x00, /* 0xffffffff out of range */
0xfd, 0xff, 0x00, 0x00, /* 0x7fffffff out of range */
0x62, 0x01, 0x00, 0x00,
0x62, 0x02, 0x00, 0x00
};
static const int32_t utf32OffsetsBack[]={
0,0,0,0,
1,1,1,1,
2,2,2,2,
4,4,4,4,
5,5,5,5,
6,6,6,6,
7,7,7,7,
8,8,8,8
};
testConvertToU(utf32, sizeof(utf32),
utf32Expected, sizeof(utf32Expected)/sizeof(utf32Expected[0]), "utf-32le", utf32Offsets,FALSE );
testConvertFromU(utf32Expected, sizeof(utf32Expected)/sizeof(utf32Expected[0]),
utf32ExpectedBack, sizeof(utf32ExpectedBack), "utf-32le", utf32OffsetsBack, FALSE);
}
}
/*
 * Additional MBCS code coverage.
 * Feeds small hand-built inputs through testConvertFromU()/testConvertToU()
 * for the "@test1", "@test3" and "@test4" test converters (built from
 * test1.ucm, test3.ucm and test4.ucm respectively).
 */
static void TestCoverageMBCS(){
#if 0
    /* Currently compiled out: points the ICU data directory at the test data. */
    UErrorCode status = U_ZERO_ERROR;
    const char *directory = loadTestData(&status);
    char* tdpath = NULL;
    char* saveDirectory = (char*)malloc(sizeof(char) *(strlen(u_getDataDirectory())+1));
    int len = strlen(directory);
    char* index=NULL;
    tdpath = (char*) malloc(sizeof(char) * (len * 2));
    uprv_strcpy(saveDirectory,u_getDataDirectory());
    log_verbose("Retrieved data directory %s \n",saveDirectory);
    uprv_strcpy(tdpath,directory);
    index=strrchr(tdpath,(char)U_FILE_SEP_CHAR);
    if((unsigned int)(index-tdpath) != (strlen(tdpath)-1)){
        *(index+1)=0;
    }
    u_setDataDirectory(tdpath);
    log_verbose("ICU data directory is set to: %s \n" ,tdpath);
#endif

    /* test1.ucm: MBCS conversion with single-byte codepage data (from Unicode only) */
    {
        const UChar uniText[] = { 0x20ac, 0x0005, 0x0006, 0xdbc4, 0xde34, 0x0003 };
        const uint8_t test1Bytes[] = { 0x00, 0x05, 0xff, 0x07, 0xff };
        int32_t fromUOffsets[] = { 0, 1, 2, 3, 5 };

        testConvertFromU(uniText, sizeof(uniText)/sizeof(uniText[0]),
                         test1Bytes, sizeof(test1Bytes),
                         "@test1", fromUOffsets, FALSE);
    }

    /* test3.ucm: MBCS conversion with three-byte codepage data (both directions) */
    {
        /* from Unicode */
        const UChar uniText[] = {
            0x20ac, 0x0005, 0x0006, 0x000b, 0xdbc4, 0xde34, 0xd84d, 0xdc56, 0x000e
        };
        const uint8_t test3Bytes[] = {
            0x00, 0x05, 0xff, 0x01, 0x02, 0x0b, 0x07, 0x01, 0x02, 0x0a, 0xff
        };
        int32_t fromUOffsets[] = { 0, 1, 2, 3, 3, 3, 4, 6, 6, 6, 8 };
        /* to Unicode */
        const uint8_t byteText[] = {
            0x00, 0x05, 0x06, 0x01, 0x02, 0x0b, 0x07, 0x01, 0x02, 0x0a, 0x01, 0x02, 0x0c
        };
        const UChar uniExpected[] = {
            0x20ac, 0x0005, 0x0006, 0x000b, 0xdbc4, 0xde34, 0xd84d, 0xdc56, 0xfffd
        };
        int32_t toUOffsets[] = { 0, 1, 2, 3, 6, 6, 7, 7, 10 };

        testConvertFromU(uniText, sizeof(uniText)/sizeof(uniText[0]),
                         test3Bytes, sizeof(test3Bytes),
                         "@test3", fromUOffsets, FALSE);
        testConvertToU(byteText, sizeof(byteText),
                       uniExpected, sizeof(uniExpected)/sizeof(uniExpected[0]),
                       "@test3", toUOffsets, FALSE);
    }

    /* test4.ucm: MBCS conversion with four-byte codepage data (both directions) */
    {
        /* from Unicode */
        static const UChar uniText[] = {
            0x20ac, 0x0005, 0x0006, 0x000b, 0xdbc4, 0xde34, 0xd84d, 0xdc56, 0x000e
        };
        static const uint8_t test4Bytes[] = {
            0x00, 0x05, 0xff, 0x01, 0x02, 0x03, 0x0b, 0x07, 0x01, 0x02, 0x03, 0x0a, 0xff
        };
        static const int32_t fromUOffsets[] = { 0, 1, 2, 3, 3, 3, 3, 4, 6, 6, 6, 6, 8 };
        /* to Unicode */
        static const uint8_t byteText[] = {
            0x00, 0x05, 0x06, 0x01, 0x02, 0x03, 0x0b, 0x07, 0x01, 0x02, 0x03, 0x0a, 0x01, 0x02, 0x03, 0x0c
        };
        static const UChar uniExpected[] = {
            0x20ac, 0x0005, 0x0006, 0x000b, 0xdbc4, 0xde34, 0xd84d, 0xdc56, 0xfffd
        };
        static const int32_t toUOffsets[] = { 0, 1, 2, 3, 7, 7, 8, 8, 12 };

        testConvertFromU(uniText, sizeof(uniText)/sizeof(uniText[0]),
                         test4Bytes, sizeof(test4Bytes),
                         "@test4", fromUOffsets, FALSE);
        testConvertToU(byteText, sizeof(byteText),
                       uniExpected, sizeof(uniExpected)/sizeof(uniExpected[0]),
                       "@test4", toUOffsets, FALSE);
    }
#if 0
    free(tdpath);
    /* restore the original data directory */
    log_verbose("Setting the data directory to %s \n", saveDirectory);
    u_setDataDirectory(saveDirectory);
    free(saveDirectory);
#endif
}
/*
 * Opens the converter named convName and checks that ucnv_getType()
 * returns convType for it.
 *
 * convName  name passed to my_ucnv_open()
 * convType  the converter type the caller expects
 *
 * A converter that cannot be opened is reported with log_data_err() and the
 * check is skipped; a type mismatch is reported with log_err().
 */
static void TestConverterType(const char *convName, UConverterType convType) {
    UErrorCode err = U_ZERO_ERROR;
    UConverter *myConverter = my_ucnv_open(convName, &err);
    UConverterType actualType;

    if (U_FAILURE(err)) {
        log_data_err("Failed to create an %s converter\n", convName);
        return;
    }

    actualType = ucnv_getType(myConverter);
    if (actualType != convType) {
        /* report the value that was actually returned, not the expected one */
        log_err("ucnv_getType Failed for %s. Got enum value 0x%X\n",
                convName, (unsigned int)actualType);
    } else {
        log_verbose("ucnv_getType %s ok\n", convName);
    }
    ucnv_close(myConverter);
}
/*
 * Checks ucnv_getType() for a list of converters via TestConverterType(),
 * and (when legacy conversion is available) opens "ksc" to check its type
 * and to call ucnv_getStarters() on it.
 *
 * The comparison of the starters against a fixed expected table is disabled;
 * only the success of the ucnv_getStarters() call itself is checked.
 */
static void TestConverterTypesAndStarters()
{
#if !UCONFIG_NO_LEGACY_CONVERSION
    UConverter* myConverter;
    UErrorCode err = U_ZERO_ERROR;
    UBool mystarters[256];
/* const UBool expectedKSCstarters[256] = {
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE};*/
    log_verbose("Testing KSC, ibm-930, ibm-878 for starters and their conversion types.\n");
    myConverter = ucnv_open("ksc", &err);
    if (U_FAILURE(err)) {
        log_data_err("Failed to create an ibm-ksc converter\n");
        return;
    }
    else
    {
        if (ucnv_getType(myConverter)!=UCNV_MBCS)
            log_err("ucnv_getType Failed for ibm-949\n");
        else
            log_verbose("ucnv_getType ibm-949 ok\n");
        /* the converter opened successfully, so it can be queried directly */
        ucnv_getStarters(myConverter, mystarters, &err);
        if (U_FAILURE(err)) {
            /* do not silently ignore a failing ucnv_getStarters() call */
            log_err("ucnv_getStarters failed for ksc: %s\n", u_errorName(err));
        }
        /*if (memcmp(expectedKSCstarters, mystarters, sizeof(expectedKSCstarters)))
        log_err("Failed ucnv_getStarters for ksc\n");
        else
        log_verbose("ucnv_getStarters ok\n");*/
    }
    ucnv_close(myConverter);
    TestConverterType("ibm-930", UCNV_EBCDIC_STATEFUL);
    TestConverterType("ibm-878", UCNV_SBCS);
#endif
    TestConverterType("iso-8859-1", UCNV_LATIN_1);
    TestConverterType("ibm-1208", UCNV_UTF8);
    TestConverterType("utf-8", UCNV_UTF8);
    TestConverterType("UTF-16BE", UCNV_UTF16_BigEndian);
    TestConverterType("UTF-16LE", UCNV_UTF16_LittleEndian);
    TestConverterType("UTF-32BE", UCNV_UTF32_BigEndian);
    TestConverterType("UTF-32LE", UCNV_UTF32_LittleEndian);
#if !UCONFIG_NO_LEGACY_CONVERSION
#if defined(U_ENABLE_GENERIC_ISO_2022)
    TestConverterType("iso-2022", UCNV_ISO_2022);
#endif
    TestConverterType("hz", UCNV_HZ);
#endif
    TestConverterType("scsu", UCNV_SCSU);
#if !UCONFIG_NO_LEGACY_CONVERSION
    TestConverterType("x-iscii-de", UCNV_ISCII);
#endif
    TestConverterType("ascii", UCNV_US_ASCII);
    TestConverterType("utf-7", UCNV_UTF7);
    TestConverterType("IMAP-mailbox-name", UCNV_IMAP_MAILBOX);
    TestConverterType("bocu-1", UCNV_BOCU1);
}
/*
 * Checks one converter for consistency between ucnv_isAmbiguous() and what
 * the converter actually produces for the byte 0x5c, and, where the byte
 * does not map to U+005C, checks that ucnv_fixFileSeparator() repairs it.
 *
 * Converters that fail on the probe bytes, or that do not map 0x61/0x5B to
 * the same code points (or map 0x5c to U+FFFD), are silently skipped.
 */
static void
TestAmbiguousConverter(UConverter *cnv) {
    /* probe bytes: 'a', a square bracket, and a US-ASCII backslash */
    static const char probeBytes[3]={ 0x61, 0x5B, 0x5c };
    UChar converted[20]={ 0, 0, 0, 0 };
    const char *src=probeBytes;
    UChar *dest=converted;
    UErrorCode errorCode=U_ZERO_ERROR;
    UBool reportedAmbiguous;
    UBool needsFix;

    ucnv_toUnicode(cnv, &dest, converted+20, &src, probeBytes+3, NULL, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        /* general failures are not interesting here; the input may just not be mappable */
        return;
    }

    /*
     * Not applicable unless the first two bytes map like ASCII and 0x5c is
     * assigned. Partially ASCII-based encodings (e.g. the ISO-7 and GSM
     * series of codepages) are ignored this way.
     */
    if(converted[0]!=0x61 || converted[1]!=0x5B || converted[2]==0xfffd) {
        return;
    }

    /* "converted[2] is not a backslash" must agree exactly with ucnv_isAmbiguous() */
    reportedAmbiguous=ucnv_isAmbiguous(cnv);
    needsFix=(UBool)(converted[2]!=0x5c);
    if(needsFix!=reportedAmbiguous) {
        log_err("error: converter \"%s\" needs a backslash fix: %d but ucnv_isAmbiguous()==%d\n",
            ucnv_getName(cnv, &errorCode), needsFix, reportedAmbiguous);
        return;
    }

    if(!needsFix) {
        return;
    }

    /* apply the fixup and verify that the third UChar became a backslash */
    ucnv_fixFileSeparator(cnv, converted, (int32_t)(dest-converted));
    if(converted[2]!=0x5c) {
        log_err("error: ucnv_fixFileSeparator(%s) failed\n", ucnv_getName(cnv, &errorCode));
    }
}
/*
 * Two-part test of ambiguous (backslash) mappings:
 *  1. runs TestAmbiguousConverter() on every name returned by
 *     ucnv_getAvailableName();
 *  2. when legacy conversion is available, converts a backslash-separated
 *     path with "ibm-943" and "LATIN-1" and checks that the results differ
 *     before ucnv_fixFileSeparator() and match after it.
 */
static void TestAmbiguous()
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *ascii_cnv = 0, *sjis_cnv = 0, *cnv;
    static const char target[] = {
        /* "\\usr\\local\\share\\data\\icutest.txt" */
        0x5c, 0x75, 0x73, 0x72,
        0x5c, 0x6c, 0x6f, 0x63, 0x61, 0x6c,
        0x5c, 0x73, 0x68, 0x61, 0x72, 0x65,
        0x5c, 0x64, 0x61, 0x74, 0x61,
        0x5c, 0x69, 0x63, 0x75, 0x74, 0x65, 0x73, 0x74, 0x2e, 0x74, 0x78, 0x74,
        0
    };
    UChar asciiResult[200], sjisResult[200];
    int32_t /*asciiLength = 0,*/ sjisLength = 0, i;
    const char *name;

    /* part 1: walk the list of available converters */
    status=U_ZERO_ERROR;
    i=0;
    while((name=ucnv_getAvailableName(i))!=NULL) {
        cnv=ucnv_open(name, &status);
        if(U_FAILURE(status)) {
            log_err("error: unable to open available converter \"%s\"\n", name);
            status=U_ZERO_ERROR;
        } else {
            TestAmbiguousConverter(cnv);
            ucnv_close(cnv);
        }
        ++i;
    }

#if !UCONFIG_NO_LEGACY_CONVERSION
    /* part 2: compare SJIS against Latin-1 on a path with backslashes */
    sjis_cnv = ucnv_open("ibm-943", &status);
    if (U_FAILURE(status))
    {
        log_data_err("Failed to create a SJIS converter\n");
        return;
    }
    ascii_cnv = ucnv_open("LATIN-1", &status);
    if (U_FAILURE(status))
    {
        log_data_err("Failed to create a LATIN-1 converter\n");
        ucnv_close(sjis_cnv);
        return;
    }

    /* SJIS bytes -> Unicode */
    sjisLength = ucnv_toUChars(sjis_cnv, sjisResult, sizeof(sjisResult)/U_SIZEOF_UCHAR,
                               target, (int32_t)strlen(target), &status);
    if (U_FAILURE(status))
    {
        log_err("Failed to convert the SJIS string.\n");
        goto closeBoth;
    }

    /* Latin-1 bytes -> Unicode; the returned length is not needed */
    ucnv_toUChars(ascii_cnv, asciiResult, sizeof(asciiResult)/U_SIZEOF_UCHAR,
                  target, (int32_t)strlen(target), &status);
    if (U_FAILURE(status))
    {
        log_err("Failed to convert the Latin-1 string.\n");
        goto closeBoth;
    }

    if (!ucnv_isAmbiguous(sjis_cnv))
    {
        log_err("SJIS converter should contain ambiguous character mappings.\n");
        goto closeBoth;
    }

    /* before the fix the two results are expected to differ */
    if (u_strcmp(sjisResult, asciiResult) == 0)
    {
        log_err("File separators for SJIS don't need to be fixed.\n");
    }

    /* after the fix they are expected to be identical */
    ucnv_fixFileSeparator(sjis_cnv, sjisResult, sjisLength);
    if (u_strcmp(sjisResult, asciiResult) != 0)
    {
        log_err("Fixing file separator for SJIS failed.\n");
    }

closeBoth:
    ucnv_close(sjis_cnv);
    ucnv_close(ascii_cnv);
#endif
}
/*
 * Helper for TestSignatureDetection(): runs ucnv_detectUnicodeSignature()
 * on one input and reports any deviation with log_err().
 *
 * label           text inserted after the function name in messages
 *                 ("" or " test2"), so both data sets keep distinct output
 * index           index of the test case, for messages only
 * source          input bytes
 * sourceLength    number of input bytes, or -1 for NUL-terminated input
 * expectedName    expected charset name, or NULL if no signature is expected
 * expectedLength  expected signature length in bytes
 */
static void
checkSignatureDetection(const char *label, int index,
                        const char *source, int32_t sourceLength,
                        const char *expectedName, int32_t expectedLength) {
    UErrorCode err = U_ZERO_ERROR;
    int32_t signatureLength = -1;
    const char *enc = ucnv_detectUnicodeSignature(source, sourceLength, &signatureLength, &err);
    UBool nameMatches;

    if(U_FAILURE(err)){
        log_err("ucnv_detectUnicodeSignature%s failed for source : %s at index :%i. Error: %s\n",
                label, source, index, u_errorName(err));
        return;
    }

    /* compare names without ever handing a NULL pointer to strcmp() */
    if(enc == NULL || expectedName == NULL){
        nameMatches = (UBool)(enc == expectedName);
    } else {
        nameMatches = (UBool)(strcmp(enc, expectedName) == 0);
    }
    if(!nameMatches){
        /* substitute a printable marker for NULL; NULL with %s is undefined */
        log_err("ucnv_detectUnicodeSignature%s failed for source : %s at index :%i. Expected: %s. Got: %s\n",
                label, source, index,
                expectedName != NULL ? expectedName : "(null)",
                enc != NULL ? enc : "(null)");
        return;
    }

    if(signatureLength != expectedLength){
        /* arguments are in the order the message names them: expected, then actual */
        log_err("ucnv_detectUnicodeSignature%s failed for source : %s at index :%i.Expected Length: %i. Got length: %i\n",
                label, source, index, expectedLength, signatureLength);
    }
}

/*
 * Tests ucnv_detectUnicodeSignature() with two data sets:
 * first with NUL-terminated input (length -1), then with explicit lengths,
 * which allows signatures that contain zero bytes and a case where no
 * signature is expected.
 */
static void
TestSignatureDetection(){
    /* with null terminated strings */
    {
        static const char* data[] = {
            "\xFE\xFF\x00\x00", /* UTF-16BE */
            "\xFF\xFE\x00\x00", /* UTF-16LE */
            "\xEF\xBB\xBF\x00", /* UTF-8 */
            "\x0E\xFE\xFF\x00", /* SCSU */
            "\xFE\xFF", /* UTF-16BE */
            "\xFF\xFE", /* UTF-16LE */
            "\xEF\xBB\xBF", /* UTF-8 */
            "\x0E\xFE\xFF", /* SCSU */
            "\xFE\xFF\x41\x42", /* UTF-16BE */
            "\xFF\xFE\x41\x41", /* UTF-16LE */
            "\xEF\xBB\xBF\x41", /* UTF-8 */
            "\x0E\xFE\xFF\x41", /* SCSU */
            "\x2B\x2F\x76\x38\x2D", /* UTF-7 */
            "\x2B\x2F\x76\x38\x41", /* UTF-7 */
            "\x2B\x2F\x76\x39\x41", /* UTF-7 */
            "\x2B\x2F\x76\x2B\x41", /* UTF-7 */
            "\x2B\x2F\x76\x2F\x41", /* UTF-7 */
            "\xDD\x73\x66\x73" /* UTF-EBCDIC */
        };
        static const char* expected[] = {
            "UTF-16BE",
            "UTF-16LE",
            "UTF-8",
            "SCSU",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-8",
            "SCSU",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-8",
            "SCSU",
            "UTF-7",
            "UTF-7",
            "UTF-7",
            "UTF-7",
            "UTF-7",
            "UTF-EBCDIC"
        };
        static const int32_t expectedLength[] ={
            2,
            2,
            3,
            3,
            2,
            2,
            3,
            3,
            2,
            2,
            3,
            3,
            5,
            4,
            4,
            4,
            4,
            4
        };
        const int count = (int)(sizeof(data)/sizeof(data[0]));
        int i;

        for(i=0; i<count; i++){
            checkSignatureDetection("", i, data[i], -1, expected[i], expectedLength[i]);
        }
    }
    /* with explicit source lengths */
    {
        static const char* data[] = {
            "\xFE\xFF\x00", /* UTF-16BE */
            "\xFF\xFE\x00", /* UTF-16LE */
            "\xEF\xBB\xBF\x00", /* UTF-8 */
            "\x0E\xFE\xFF\x00", /* SCSU */
            "\x00\x00\xFE\xFF", /* UTF-32BE */
            "\xFF\xFE\x00\x00", /* UTF-32LE */
            "\xFE\xFF", /* UTF-16BE */
            "\xFF\xFE", /* UTF-16LE */
            "\xEF\xBB\xBF", /* UTF-8 */
            "\x0E\xFE\xFF", /* SCSU */
            "\x00\x00\xFE\xFF", /* UTF-32BE */
            "\xFF\xFE\x00\x00", /* UTF-32LE */
            "\xFE\xFF\x41\x42", /* UTF-16BE */
            "\xFF\xFE\x41\x41", /* UTF-16LE */
            "\xEF\xBB\xBF\x41", /* UTF-8 */
            "\x0E\xFE\xFF\x41", /* SCSU */
            "\x00\x00\xFE\xFF\x41", /* UTF-32BE */
            "\xFF\xFE\x00\x00\x42", /* UTF-32LE */
            "\xFB\xEE\x28", /* BOCU-1 */
            "\xFF\x41\x42" /* NULL */
        };
        static const int len[] = {
            3,
            3,
            4,
            4,
            4,
            4,
            2,
            2,
            3,
            3,
            4,
            4,
            4,
            4,
            4,
            4,
            5,
            5,
            3,
            3
        };
        static const char* expected[] = {
            "UTF-16BE",
            "UTF-16LE",
            "UTF-8",
            "SCSU",
            "UTF-32BE",
            "UTF-32LE",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-8",
            "SCSU",
            "UTF-32BE",
            "UTF-32LE",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-8",
            "SCSU",
            "UTF-32BE",
            "UTF-32LE",
            "BOCU-1",
            NULL
        };
        static const int32_t expectedLength[] ={
            2,
            2,
            3,
            3,
            4,
            4,
            2,
            2,
            3,
            3,
            4,
            4,
            2,
            2,
            3,
            3,
            4,
            4,
            3,
            0
        };
        const int count = (int)(sizeof(data)/sizeof(data[0]));
        int i;

        for(i=0; i<count; i++){
            checkSignatureDetection(" test2", i, data[i], len[i], expected[i], expectedLength[i]);
        }
    }
}
/*
 * UTF-7 test: checks code-point-at-a-time decoding through TestNextUChar(),
 * the source>=sourceLimit error case, and the name reported by ucnv_getName().
 */
static void TestUTF7() {
    /* input bytes */
    static const uint8_t utf7Bytes[]={
        /* H - +Jjo- - ! +- +2AHcAQ */
        0x48,
        0x2d,
        0x2b, 0x4a, 0x6a, 0x6f,
        0x2d, 0x2d,
        0x21,
        0x2b, 0x2d,
        0x2b, 0x32, 0x41, 0x48, 0x63, 0x41, 0x51
    };
    /* expected outcome as pairs: number of bytes read, code point */
    static const int32_t expected[]={
        1, 0x48,
        1, 0x2d,
        4, 0x263a, /* <WHITE SMILING FACE> */
        2, 0x2d,
        1, 0x21,
        2, 0x2b,
        7, 0x10401
    };
    const char *begin=(const char *)utf7Bytes;
    const char *end=(const char *)utf7Bytes+sizeof(utf7Bytes);
    const char *reportedName;
    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("UTF-7", &errorCode);

    if(U_FAILURE(errorCode)) {
        /* reported with log_err, not as a data error */
        log_err("Unable to open a UTF-7 converter: %s\n", u_errorName(errorCode));
        return;
    }

    TestNextUChar(cnv, begin, end, expected, "UTF-7");

    /* an empty range (source >= sourceLimit) must give an index-out-of-bounds error */
    TestNextUCharError(cnv, begin, begin, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

    /* the converter must report its name as "UTF-7" */
    reportedName = ucnv_getName(cnv, &errorCode);
    if (!(U_SUCCESS(errorCode) && uprv_strcmp(reportedName, "UTF-7") == 0)) {
        log_err("UTF-7 converter is called %s: %s\n", reportedName, u_errorName(errorCode));
    }
    ucnv_close(cnv);
}
/*
 * IMAP-mailbox-name test: checks code-point-at-a-time decoding through
 * TestNextUChar(), the source>=sourceLimit error case, and the name
 * reported by ucnv_getName().
 */
static void TestIMAP() {
    /* input bytes */
    static const uint8_t imapBytes[]={
        /* H - &Jjo- - ! &- &2AHcAQ- */
        0x48,
        0x2d,
        0x26, 0x4a, 0x6a, 0x6f,
        0x2d, 0x2d,
        0x21,
        0x26, 0x2d,
        0x26, 0x32, 0x41, 0x48, 0x63, 0x41, 0x51, 0x2d
    };
    /* expected outcome as pairs: number of bytes read, code point */
    static const int32_t expected[]={
        1, 0x48,
        1, 0x2d,
        4, 0x263a, /* <WHITE SMILING FACE> */
        2, 0x2d,
        1, 0x21,
        2, 0x26,
        7, 0x10401
    };
    const char *begin=(const char *)imapBytes;
    const char *end=(const char *)imapBytes+sizeof(imapBytes);
    const char *reportedName;
    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("IMAP-mailbox-name", &errorCode);

    if(U_FAILURE(errorCode)) {
        /* reported with log_err, not as a data error */
        log_err("Unable to open a IMAP-mailbox-name converter: %s\n", u_errorName(errorCode));
        return;
    }

    TestNextUChar(cnv, begin, end, expected, "IMAP-mailbox-name");

    /* an empty range (source >= sourceLimit) must give an index-out-of-bounds error */
    TestNextUCharError(cnv, begin, begin, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

    /* the converter must report its name as "IMAP-mailbox-name" */
    reportedName = ucnv_getName(cnv, &errorCode);
    if (!(U_SUCCESS(errorCode) && uprv_strcmp(reportedName, "IMAP-mailbox-name") == 0)) {
        log_err("IMAP-mailbox-name converter is called %s: %s\n", reportedName, u_errorName(errorCode));
    }
    ucnv_close(cnv);
}
/*
 * UTF-8 test: decodes well-formed sequences of 1 to 4 bytes through
 * TestNextUChar(), checks the source>=sourceLimit error case, and then
 * decodes a run of ill-formed sequences with the SKIP callback installed.
 */
static void TestUTF8() {
    /* well-formed input */
    static const uint8_t goodBytes[]={
        0x61,
        0xc2, 0x80,
        0xe0, 0xa0, 0x80,
        0xf0, 0x90, 0x80, 0x80,
        0xf4, 0x84, 0x8c, 0xa1,
        0xf0, 0x90, 0x90, 0x81
    };
    /* pairs: number of bytes read, code point */
    static const int32_t goodExpected[]={
        1, 0x61,
        2, 0x80,
        3, 0x800,
        4, 0x10000,
        4, 0x104321,
        4, 0x10401
    };
    /* ill-formed input between 'a' and 'b' */
    static const uint8_t badBytes[]={
        0x61,
        0xc0, 0x80, /* illegal non-shortest form */
        0xe0, 0x80, 0x80, /* illegal non-shortest form */
        0xf0, 0x80, 0x80, 0x80, /* illegal non-shortest form */
        0xc0, 0xc0, /* illegal trail byte */
        0xf4, 0x90, 0x80, 0x80, /* 0x110000 out of range */
        0xf8, 0x80, 0x80, 0x80, 0x80, /* too long */
        0xfe, /* illegal byte altogether */
        0x62
    };
    /* pairs: number of bytes read, code point; all bad bytes count towards 'b' */
    static const int32_t badExpected[]={
        1, 0x61,
        22, 0x62
    };
    UConverterToUCallback previousCallback;
    const void *previousContext;
    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("UTF-8", &errorCode);

    if(U_FAILURE(errorCode)) {
        log_err("Unable to open a UTF-8 converter: %s\n", u_errorName(errorCode));
        return;
    }

    TestNextUChar(cnv, (const char *)goodBytes, (const char *)goodBytes+sizeof(goodBytes),
                  goodExpected, "UTF-8");

    /* an empty range (source >= sourceLimit) must give an index-out-of-bounds error */
    TestNextUCharError(cnv, (const char *)goodBytes, (const char *)goodBytes,
                       U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

    /* error behavior with a skip callback */
    ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_SKIP, NULL,
                        &previousCallback, &previousContext, &errorCode);
    TestNextUChar(cnv, (const char *)badBytes, (const char *)(badBytes+sizeof(badBytes)),
                  badExpected, "UTF-8");

    ucnv_close(cnv);
}
/*
 * CESU-8 test: decodes valid input (including surrogates encoded as 3-byte
 * sequences) through TestNextUChar(), checks the source>=sourceLimit error
 * case, and then decodes a run of invalid sequences with the SKIP callback.
 */
static void TestCESU8() {
    /* valid input */
    static const uint8_t goodBytes[]={
        0x61,
        0xc2, 0x80,
        0xe0, 0xa0, 0x80,
        0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80,
        0xed, 0xb0, 0x81, 0xed, 0xa0, 0x82,
        0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf,
        0xef, 0xbf, 0xbc
    };
    /* pairs: number of bytes read, code point */
    static const int32_t goodExpected[]={
        1, 0x61,
        2, 0x80,
        3, 0x800,
        6, 0x10000,
        3, 0xdc01,
        -1,0xd802, /* may read 3 or 6 bytes */
        -1,0x10ffff,/* may read 0 or 3 bytes */
        3, 0xfffc
    };
    /* invalid input between 'a' and 'b' */
    static const uint8_t badBytes[]={
        0x61,
        0xc0, 0x80, /* illegal non-shortest form */
        0xe0, 0x80, 0x80, /* illegal non-shortest form */
        0xf0, 0x80, 0x80, 0x80, /* illegal non-shortest form */
        0xc0, 0xc0, /* illegal trail byte */
        0xf0, 0x90, 0x80, 0x80, /* illegal 4-byte supplementary code point */
        0xf4, 0x84, 0x8c, 0xa1, /* illegal 4-byte supplementary code point */
        0xf0, 0x90, 0x90, 0x81, /* illegal 4-byte supplementary code point */
        0xf4, 0x90, 0x80, 0x80, /* 0x110000 out of range */
        0xf8, 0x80, 0x80, 0x80, 0x80, /* too long */
        0xfe, /* illegal byte altogether */
        0x62
    };
    /* pairs: number of bytes read, code point; all bad bytes count towards 'b' */
    static const int32_t badExpected[]={
        1, 0x61,
        34, 0x62
    };
    UConverterToUCallback previousCallback;
    const void *previousContext;
    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("CESU-8", &errorCode);

    if(U_FAILURE(errorCode)) {
        log_err("Unable to open a CESU-8 converter: %s\n", u_errorName(errorCode));
        return;
    }

    TestNextUChar(cnv, (const char *)goodBytes, (const char *)goodBytes+sizeof(goodBytes),
                  goodExpected, "CESU-8");

    /* an empty range (source >= sourceLimit) must give an index-out-of-bounds error */
    TestNextUCharError(cnv, (const char *)goodBytes, (const char *)goodBytes,
                       U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

    /* error behavior with a skip callback */
    ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_SKIP, NULL,
                        &previousCallback, &previousContext, &errorCode);
    TestNextUChar(cnv, (const char *)badBytes, (const char *)(badBytes+sizeof(badBytes)),
                  badExpected, "CESU-8");

    ucnv_close(cnv);
}
static void TestUTF16() {
/* test input */
static const uint8_t in1[]={
0xfe, 0xff, 0x4e, 0x00, 0xfe, 0xff
};
static const uint8_t in2[]={
0xff, 0xfe, 0x4e, 0x00, 0xfe, 0xff
};
static const uint8_t in3[]={
0xfe, 0xfe, 0x4e, 0x00, 0xfe, 0xff, 0xd8, 0x40, 0xdc, 0x01
};
/* expected test results */
static const int32_t results1[]={
/* number of bytes read, code point */
4, 0x4e00,
2, 0xfeff
};
static const int32_t results2[]={
/* number of bytes read, code point */
4, 0x004e,
2, 0xfffe
};
static const int32_t results3[]={
/* number of bytes read, code point */
2, 0xfefe,
2, 0x4e00,
2, 0xfeff,
4, 0x20001
};
const char *source, *limit;
UErrorCode errorCode=U_ZERO_ERROR;
UConverter *cnv=ucnv_open("UTF-16", &errorCode);
if(U_FAILURE(errorCode)) {
log_err("Unable to open a UTF-16 converter: %s\n", u_errorName(errorCode));
return;
}
source=(const char *)in1, limit=(const char *)in1+sizeof(in1);
TestNextUChar(cnv, source, limit, results1, "UTF-16");
source=(const char *)in2, limit=(const char *)in2+sizeof(in2);
ucnv_resetToUnicode(cnv);
TestNextUChar(cnv, source, limit, results2, "UTF-16");
source=(const char *)in3, limit=(const char *)in3+sizeof(in3);
ucnv_resetToUnicode(cnv);
TestNextUChar(cnv, source, limit, results3, "UTF-16");
/* Test the condition when source >= sourceLimit */
ucnv_resetToUnicode(cnv);
TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
ucnv_close(cnv);
}
/*
 * UTF-16BE test: decodes BMP code units and one surrogate pair through
 * TestNextUChar(), checks the source>=sourceLimit error case, and checks
 * that a single leftover byte yields U_TRUNCATED_CHAR_FOUND when the STOP
 * callback is installed.
 */
static void TestUTF16BE() {
    /* input bytes */
    static const uint8_t beBytes[]={
        0x00, 0x61,
        0x00, 0xc0,
        0x00, 0x31,
        0x00, 0xf4,
        0xce, 0xfe,
        0xd8, 0x01, 0xdc, 0x01
    };
    /* expected outcome as pairs: number of bytes read, code point */
    static const int32_t expected[]={
        2, 0x61,
        2, 0xc0,
        2, 0x31,
        2, 0xf4,
        2, 0xcefe,
        4, 0x10401
    };
    const char *begin=(const char *)beBytes;
    const char *end=(const char *)beBytes+sizeof(beBytes);
    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("utf-16be", &errorCode);

    if(U_FAILURE(errorCode)) {
        log_err("Unable to open a UTF16-BE converter: %s\n", u_errorName(errorCode));
        return;
    }

    TestNextUChar(cnv, begin, end, expected, "UTF-16BE");

    /* an empty range (source >= sourceLimit) must give an index-out-of-bounds error */
    TestNextUCharError(cnv, begin, begin, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

    /* invalid input: a single byte, with the STOP callback installed */
    {
        static const uint8_t oneByte[]={0x61};
        ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
        TestNextUCharError(cnv, (const char*)oneByte, (const char*)oneByte+sizeof(oneByte), U_TRUNCATED_CHAR_FOUND, "an invalid character");
    }
#if 0
    /*
     * Test disabled because currently the UTF-16BE/LE converters are supposed
     * to not set errors for unpaired surrogates.
     * This may change with
     * Jitterbug 1838 - forbid converting surrogate code points in UTF-16/32
     */
    /*Test for the condition where there is a surrogate pair*/
    {
        const uint8_t source2[]={0xd8, 0x01};
        TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_TRUNCATED_CHAR_FOUND, "an truncated surrogate character");
    }
#endif
    ucnv_close(cnv);
}
/*
 * UTF-16LE test: decodes BMP code units and one surrogate pair through
 * TestNextUChar(), checks the source>=sourceLimit error case, and checks
 * that a single leftover byte yields U_TRUNCATED_CHAR_FOUND when the STOP
 * callback is installed.
 */
static void
TestUTF16LE() {
    /* input bytes */
    static const uint8_t leBytes[]={
        0x61, 0x00,
        0x31, 0x00,
        0x4e, 0x2e,
        0x4e, 0x00,
        0x01, 0xd8, 0x01, 0xdc
    };
    /* expected outcome as pairs: number of bytes read, code point */
    static const int32_t expected[]={
        2, 0x61,
        2, 0x31,
        2, 0x2e4e,
        2, 0x4e,
        4, 0x10401
    };
    const char *begin=(const char *)leBytes;
    const char *end=(const char *)leBytes+sizeof(leBytes);
    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("utf-16le", &errorCode);

    if(U_FAILURE(errorCode)) {
        log_err("Unable to open a UTF16-LE converter: %s\n", u_errorName(errorCode));
        return;
    }

    TestNextUChar(cnv, begin, end, expected, "UTF-16LE");

    /* an empty range (source >= sourceLimit) must give an index-out-of-bounds error */
    TestNextUCharError(cnv, begin, begin, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

    /* invalid input: a single byte, with the STOP callback installed */
    {
        static const uint8_t oneByte[]={0x61};
        ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
        TestNextUCharError(cnv, (const char*)oneByte, (const char*)oneByte+sizeof(oneByte), U_TRUNCATED_CHAR_FOUND, "an invalid character");
    }
#if 0
    /*
     * Test disabled because currently the UTF-16BE/LE converters are supposed
     * to not set errors for unpaired surrogates.
     * This may change with
     * Jitterbug 1838 - forbid converting surrogate code points in UTF-16/32
     */
    /*Test for the condition where there is a surrogate character*/
    {
        static const uint8_t source2[]={0x01, 0xd8};
        TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_TRUNCATED_CHAR_FOUND, "an truncated surrogate character");
    }
#endif
    ucnv_close(cnv);
}
/*
 * Runs the "UTF-32" converter over three byte sequences through
 * TestNextUChar() and then checks the error reported for an empty
 * source range through TestNextUCharError().
 */
static void TestUTF32() {
    /* begins with 00 00 FE FF; the first read is expected to span 8 bytes */
    static const uint8_t bytes1[]={
        0x00, 0x00, 0xfe, 0xff,
        0x00, 0x10, 0x0f, 0x00,
        0x00, 0x00, 0xfe, 0xff
    };
    /* begins with FF FE 00 00; the first read is expected to span 8 bytes */
    static const uint8_t bytes2[]={
        0xff, 0xfe, 0x00, 0x00,
        0x00, 0x10, 0x0f, 0x00,
        0xfe, 0xff, 0x00, 0x00
    };
    /* begins with 00 00 FE FE and ends with two surrogate code points */
    static const uint8_t bytes3[]={
        0x00, 0x00, 0xfe, 0xfe,
        0x00, 0x10, 0x0f, 0x00,
        0x00, 0x00, 0xd8, 0x40,
        0x00, 0x00, 0xdc, 0x01
    };

    /* expected results: pairs of (number of bytes read, code point) */
    static const int32_t expected1[]={
        8, 0x100f00,
        4, 0xfeff
    };
    static const int32_t expected2[]={
        8, 0x0f1000,
        4, 0xfffe
    };
    static const int32_t expected3[]={
        4, 0xfefe,
        4, 0x100f00,
        4, 0xfffd, /* unmatched surrogate */
        4, 0xfffd  /* unmatched surrogate */
    };

    /* one entry per input: [start, limit) byte range plus its expectations */
    static const struct {
        const char *start;
        const char *limit;
        const int32_t *expected;
    } cases[]={
        { (const char *)bytes1, (const char *)bytes1+sizeof(bytes1), expected1 },
        { (const char *)bytes2, (const char *)bytes2+sizeof(bytes2), expected2 },
        { (const char *)bytes3, (const char *)bytes3+sizeof(bytes3), expected3 }
    };
    const int32_t caseCount=(int32_t)(sizeof(cases)/sizeof(cases[0]));
    int32_t i;

    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("UTF-32", &errorCode);

    if(U_FAILURE(errorCode)) {
        log_err("Unable to open a UTF-32 converter: %s\n", u_errorName(errorCode));
        return;
    }

    for(i=0; i<caseCount; ++i) {
        /* the freshly opened converter is used as is for the first input;
         * it is reset before each later one */
        if(i>0) {
            ucnv_resetToUnicode(cnv);
        }
        TestNextUChar(cnv, cases[i].start, cases[i].limit, cases[i].expected, "UTF-32");
    }

    /* Test the condition when source >= sourceLimit */
    ucnv_resetToUnicode(cnv);
    TestNextUCharError(cnv, (const char *)bytes3, (const char *)bytes3, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

    ucnv_close(cnv);
}
/*
 * Runs the "UTF-32BE" converter over a well-formed byte sequence, checks
 * the error reported for an empty source range, and then converts a
 * sequence containing out-of-range units with the skip callback installed.
 */
static void
TestUTF32BE() {
    /* regular input, four bytes per unit */
    static const uint8_t plainBytes[]={
        0x00, 0x00, 0x00, 0x61,
        0x00, 0x00, 0x30, 0x61,
        0x00, 0x00, 0xdc, 0x00,
        0x00, 0x00, 0xd8, 0x00,
        0x00, 0x00, 0xdf, 0xff,
        0x00, 0x00, 0xff, 0xfe,
        0x00, 0x10, 0xab, 0xcd,
        0x00, 0x10, 0xff, 0xff
    };

    /* expectations for plainBytes: pairs of (number of bytes read, code point) */
    static const int32_t plainExpected[]={
        4, 0x61,
        4, 0x3061,
        4, 0xfffd,
        4, 0xfffd,
        4, 0xfffd,
        4, 0xfffe,
        4, 0x10abcd,
        4, 0x10ffff
    };

    /* input mixing valid units with out-of-range ones */
    static const uint8_t faultyBytes[]={
        0x00, 0x00, 0x00, 0x61,
        0x00, 0x11, 0x00, 0x00, /* 0x110000 out of range */
        0x00, 0x00, 0x00, 0x62,
        0xff, 0xff, 0xff, 0xff, /* 0xffffffff out of range */
        0x7f, 0xff, 0xff, 0xff, /* 0x7fffffff out of range */
        0x00, 0x00, 0x01, 0x62,
        0x00, 0x00, 0x02, 0x62
    };

    /*
     * expectations for faultyBytes with the skip callback:
     * pairs of (number of bytes read, code point); the byte counts of 8 and 12
     * include the out-of-range units preceding the returned code point
     */
    static const int32_t faultyExpected[]={
        4, 0x61,
        8, 0x62,
        12, 0x162,
        4, 0x262
    };

    /* receive the previously installed callback and context; not used afterwards */
    UConverterToUCallback previousAction;
    const void *previousContext;

    UErrorCode errorCode=U_ZERO_ERROR;
    UConverter *cnv=ucnv_open("UTF-32BE", &errorCode);

    if(U_FAILURE(errorCode)) {
        log_err("Unable to open a UTF-32BE converter: %s\n", u_errorName(errorCode));
        return;
    }

    TestNextUChar(cnv,
                  (const char *)plainBytes,
                  (const char *)plainBytes+sizeof(plainBytes),
                  plainExpected,
                  "UTF-32BE");

    /* Test the condition when source >= sourceLimit */
    TestNextUCharError(cnv,
                       (const char *)plainBytes,
                       (const char *)plainBytes,
                       U_INDEX_OUTOFBOUNDS_ERROR,
                       "sourceLimit <= source");

    /* test error behavior with a skip callback */
    ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_SKIP, NULL, &previousAction, &previousContext, &errorCode);
    TestNextUChar(cnv,
                  (const char *)faultyBytes,
                  (const char *)faultyBytes+sizeof(faultyBytes),
                  faultyExpected,
                  "UTF-32BE");

    ucnv_close(cnv);
}
static void
TestUTF32LE() {
/* test input */
static const uint8_t in[]={
0x61, 0x00, 0x00, 0x00,
0x61, 0x30, 0x00, 0x00,
0x00, 0xdc, 0x00, 0x00,
0x00, 0xd8, 0x00, 0x00,
0xff, 0xdf, 0x00, 0x00,
0xfe, 0xff, 0x00, 0x00,
0xcd, 0xab, 0x10, 0x00,
0xff, 0xff, 0x10, 0x00
};
/* expected test results */
static const int32_t results[]={
/* number of bytes read, code point */
4, 0x61,
4, 0x3061,
4, 0xfffd,
4, 0xfffd,
4, 0xfffd,
4, 0xfffe,
4, 0x10abcd,
4, 0x10ffff
};
/* error test input */
static const uint8_t in2[]={
0x61, 0x00, 0x00, 0x00,
0x00, 0x00, 0x11, 0x00, /* 0x110000 out of range */
0x62, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, /* 0xffffffff out of range */
0xff, 0xff, 0xff, 0x7f, /* 0x7fffffff out of range */
0x62, 0x01, 0x00, 0x00,
0x62, 0x02, 0x00, 0x00,
};
/* expected error test results */
static const int32_t results2[]={
/* number of bytes read, code point */
4, 0x61,
8, 0x62,
12, 0x162,
4, 0x262,
};
UConverterToUCallback cb;
const void *p;
const char *source=(const char *)in, *limit=(const char *)in+sizeof(in);
UErrorCode errorCode=U_ZERO_ERROR;
UConverter *cnv=ucnv_open("UTF-32LE", &errorCode);
if(U_FAILURE(errorCode)) {
log_err("Unable to open a UTF-32LE converter: %s\n", u_errorName(errorCode));
return;
}
TestNextUChar(cnv, source, limit, results, "UTF-32LE");
/* Test the condition when source >= sourceLimit */
TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
/* test error behavior with a skip callback */
ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_SKIP, NULL, &cb, &p, &errorCode);
source=(const char *)in2;
limit=(const char *)(in2+sizeof(in2));
TestNextUChar(cnv,