blob: 501c23ae8c5abac9c1740e48da00612e1d76e124 [file] [log] [blame]
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: testidn.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003-02-06
* created by: Ram Viswanadha
*
* This program reads the rfc3454_*.txt files,
* parses them, and extracts the data for Nameprep conformance.
* It then compares the extracted data against the preprocessed binary
* IDNA data file that is used in various IDNA conversion processes.
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA && !UCONFIG_NO_TRANSLITERATION
#include "unicode/uchar.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "utrie.h"
#include "umutex.h"
#include "sprpimpl.h"
#include "testidna.h"
UBool beVerbose=FALSE, haveCopyright=TRUE;
/* file-scope state and prototypes ------------------------------------------ */
/* TRUE once loadIDNData() has set up the globals below; reset by cleanup() */
static UBool isDataLoaded = FALSE;
/* trie unserialized from the loaded data; filled in by loadIDNData() */
static UTrie idnTrie={ 0,0,0,0,0,0,0 };
/* handle returned by udata_openChoice(); closed by cleanup() */
static UDataMemory *idnData=NULL;
/* error code recorded by loadIDNData(); checked by testData() */
static UErrorCode dataErrorCode =U_ZERO_ERROR;
/* points just past the serialized trie inside the loaded data */
static const uint16_t* mappingData = NULL;
/* copy of the first _IDNA_INDEX_TOP int32_t values of the loaded data */
static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };
static void
parseMappings(const char *filename, UBool withNorm, UBool reportError,TestIDNA& test, UErrorCode *pErrorCode);
static void
parseTable(const char *filename, UBool isUnassigned, TestIDNA& test, UErrorCode *pErrorCode);
static UBool loadIDNData(UErrorCode &errorCode);
static UBool cleanup();
static void
compareMapping(uint32_t codepoint, uint32_t* mapping, int32_t mapLength,
UBool withNorm);
static void
compareFlagsForRange(uint32_t start, uint32_t end,
UBool isUnassigned);
static void
testAllCodepoints(TestIDNA& test);
/* test object used by the compare*() functions to report errors; set by testData() */
static TestIDNA* pTestIDNA =NULL;
/* input files, looked up in the MISC_DIR subdirectory; testData() uses them by index */
static const char* fileNames[] = {
"rfc3454_A_1.txt", /* contains unassigned code points */
"rfc3454_C_X.txt", /* contains code points that are prohibited */
"rfc3454_B_1.txt", /* contains case mappings when normalization is turned off */
"rfc3454_B_2.txt", /* contains case mappings when normalization is turned on */
/* "NormalizationCorrections.txt",contains NFKC case mappings which are not included in UTR 21 */
};
/* -------------------------------------------------------------------------- */
/* file definitions */
/* name and type passed to udata_openChoice() in loadIDNData() */
#define DATA_NAME "uidna"
#define DATA_TYPE "icu"
/* subdirectory of the data directory that holds the files in fileNames[] */
#define MISC_DIR "misc"
/**
 * Entry point of the data conformance test.
 *
 * Loads the binary IDNA data, then parses the four rfc3454_*.txt files from
 * the MISC_DIR subdirectory of the test data directory and compares their
 * contents against the loaded data. If all files parse successfully, every
 * code point is additionally run through testAllCodepoints().
 *
 * @param test Test object used for logging and error reporting.
 * @return The UErrorCode (as int) of the first failure, or U_ZERO_ERROR.
 */
extern int
testData(TestIDNA& test) {
    //TODO get the srcDir dynamically
    const char *srcDir=IntlTest::pathToDataDirectory();
    UErrorCode errorCode=U_ZERO_ERROR;
    char *filename=NULL;
    char *basename=NULL;
    size_t longestName=0;
    size_t capacity=0;
    size_t i=0;

    loadIDNData(errorCode);
    if(U_FAILURE(dataErrorCode)){
        test.errln( "Could not load data. Error: %s\n",u_errorName(dataErrorCode));
        return dataErrorCode;
    }

    /*
     * Size the buffer for the longest path that is built below:
     * "./" + srcDir + separator + MISC_DIR + separator + file name + NUL
     */
    for(i=0; i<sizeof(fileNames)/sizeof(fileNames[0]); i++){
        size_t nameLength=uprv_strlen(fileNames[i]);
        if(nameLength>longestName){
            longestName=nameLength;
        }
    }
    capacity=2+uprv_strlen(srcDir)+1+uprv_strlen(MISC_DIR)+1+longestName+1;
    filename=(char*) malloc(capacity);
    if(filename==NULL){
        test.errln( "Could not allocate memory for the file name\n");
        cleanup();
        return U_MEMORY_ALLOCATION_ERROR;
    }

    //initialize
    pTestIDNA = &test;

    /* prepare the filename beginning with the source dir */
    if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL){
        filename[0] = 0x2E;
        filename[1] = U_FILE_SEP_CHAR;
        uprv_strcpy(filename+2,srcDir);
    }else{
        uprv_strcpy(filename, srcDir);
    }
    basename=filename+uprv_strlen(filename);
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        /* overwrites the NUL; the strcpy below terminates the string again */
        *basename++=U_FILE_SEP_CHAR;
    }

    /* first copy misc directory */
    uprv_strcpy(basename,MISC_DIR);
    basename = basename + uprv_strlen(MISC_DIR);
    *basename++=U_FILE_SEP_CHAR;

    /*
     * Each step runs only if the previous one succeeded, so after a failure
     * filename still names the file that could not be processed.
     */

    /* process unassigned */
    uprv_strcpy(basename,fileNames[0]);
    parseTable(filename,TRUE, test,&errorCode);

    /* process prohibited */
    if(U_SUCCESS(errorCode)) {
        uprv_strcpy(basename,fileNames[1]);
        parseTable(filename,FALSE, test, &errorCode);
    }

    /* process mappings */
    if(U_SUCCESS(errorCode)) {
        uprv_strcpy(basename,fileNames[2]);
        parseMappings(filename, FALSE, FALSE,test, &errorCode);
    }
    if(U_SUCCESS(errorCode)) {
        uprv_strcpy(basename,fileNames[3]);
        parseMappings(filename, TRUE, FALSE,test, &errorCode);
    }

    if(U_FAILURE(errorCode)) {
        test.errln( "Could not open file %s for reading \n", filename);
    }else{
        testAllCodepoints(test);
    }

    /* single exit: release everything on success and on failure alike */
    cleanup();
    pTestIDNA = NULL;
    free(filename);
    return errorCode;
}
U_CDECL_BEGIN
/**
 * Line callback for u_parseDelimitedFile() used by parseMappings().
 *
 * Field 0 holds one hexadecimal code point, field 1 the code points it maps
 * to. The parsed pair is handed to compareMapping(). On a parse error
 * *pErrorCode is set and the line is not compared.
 *
 * @param context    Points to a UBool: TRUE if the mappings include normalization.
 * @param fields     Start/limit pointers of the fields of the current line.
 * @param pErrorCode In/out ICU error code.
 */
static void U_CALLCONV
caseMapLineFn(void *context,
              char *fields[][2], int32_t /*fieldCount*/,
              UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end;
    uint32_t code;
    int32_t length;
    const UBool* mapWithNorm = (const UBool*) context;

    /* get the character code, field 0 */
    code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        /* nothing was parsed, or the number does not fill the whole field */
        *pErrorCode=U_PARSE_ERROR;
        return;
    }

    /* parse the mapping string, field 1 */
    length=u_parseCodePoints(fields[1][0], mapping,
                             (int32_t)(sizeof(mapping)/sizeof(mapping[0])), pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        /* do not compare against a mapping that could not be parsed */
        return;
    }

    /* compare the mapping against the loaded data */
    compareMapping(code,mapping, length, *mapWithNorm);
}
U_CDECL_END
static void
parseMappings(const char *filename,UBool withNorm, UBool reportError, TestIDNA& test, UErrorCode *pErrorCode) {
char *fields[3][2];
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
u_parseDelimitedFile(filename, ';', fields, 3, caseMapLineFn, &withNorm, pErrorCode);
//fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);
if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
test.errln( "testidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
}
}
/* parser for the rfc3454 code point tables --------------------------------- */
U_CDECL_BEGIN
/**
 * Line callback for u_parseDelimitedFile() used by parseTable().
 *
 * Field 0 is parsed as a code point range, which is then checked with
 * compareFlagsForRange(). Any failure of the range parser is reported to the
 * caller as U_PARSE_ERROR.
 *
 * @param context    Points to a UBool: TRUE if the table lists unassigned code points.
 * @param fields     Start/limit pointers of the fields of the current line.
 * @param pErrorCode In/out ICU error code.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t /*fieldCount*/,
                  UErrorCode *pErrorCode) {
    uint32_t first=0;
    uint32_t last=0;

    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_SUCCESS(*pErrorCode)){
        compareFlagsForRange(first, last, *(UBool*) context);
    }else{
        *pErrorCode = U_PARSE_ERROR;
    }
}
U_CDECL_END
static void
parseTable(const char *filename,UBool isUnassigned,TestIDNA& test, UErrorCode *pErrorCode) {
char *fields[2][2];
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
u_parseDelimitedFile(filename, ';', fields, 1, unicodeDataLineFn, &isUnassigned, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
test.errln( "testidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
}
}
/**
 * Looks up every code point from 0 to 0x10FFFF in idnTrie, counts the values
 * found per category and logs the totals. Does nothing if the data has not
 * been loaded.
 *
 * @param test Test object used for logging and error reporting.
 */
static void
testAllCodepoints(TestIDNA& test){
    if(!isDataLoaded){
        return;
    }

    int32_t unassigned = 0;
    int32_t prohibited = 0;
    int32_t mappedWithNorm = 0;
    int32_t mapped = 0;
    int32_t noValueInTrie = 0;

    for(uint32_t cp=0; cp<=0x10FFFF; ++cp){
        uint32_t value = 0;
        UTRIE_GET16(&idnTrie,cp, value);

        if(value == UIDNA_NO_VALUE){
            noValueInTrie++;
            if(value > 0){
                test.errln("The return value for 0x%06X is wrong. %i\n",cp,value);
            }
            continue;
        }

        /* the low 3 bits hold the flag, the bits from bit 5 up hold the index */
        const uint32_t flagBits = value & 0x07;
        if(flagBits == UIDNA_UNASSIGNED){
            unassigned++;
        }
        if(flagBits == UIDNA_PROHIBITED){
            prohibited++;
        }
        if((value>>5) == _IDNA_MAP_TO_NOTHING){
            mapped++;
        }
        if(flagBits == UIDNA_MAP_NFKC){
            mappedWithNorm++;
        }
    }

    test.logln("Number of Unassinged code points : %i \n",unassigned);
    test.logln("Number of Prohibited code points : %i \n",prohibited);
    test.logln("Number of Mapped code points : %i \n",mapped);
    test.logln("Number of Mapped with NFKC code points : %i \n",mappedWithNorm);
    test.logln("Number of code points that have no value in Trie: %i \n",noValueInTrie);
}
/**
 * Splits a trie value into its three bit fields.
 *
 * @param result Value read from the trie.
 * @param flag   Receives bits 0..2.
 * @param length Receives bits 3..4.
 * @param index  Receives the remaining bits, from bit 5 upward.
 */
static inline void getValues(uint32_t result, int8_t& flag,
                             int8_t& length, int32_t& index){
    const uint32_t flagBits   = result & 0x07;
    const uint32_t lengthBits = (result >> 3) & 0x03;
    const uint32_t indexBits  = result >> 5;

    flag   = (int8_t) flagBits;
    length = (int8_t) lengthBits;
    index  = (int32_t) indexBits;
}
/**
 * Compares the mapping parsed from an rfc3454 file against the mapping stored
 * in the loaded data for the same code point. Mismatches are reported through
 * pTestIDNA. Does nothing if the data has not been loaded.
 *
 * @param codepoint Code point whose mapping is checked.
 * @param mapping   Expected mapping as an array of code points.
 * @param mapLength Number of code points in mapping.
 * @param withNorm  TRUE if the mapping comes from the table that includes
 *                  normalization; the flag is then expected to be UIDNA_MAP_NFKC.
 */
static void
compareMapping(uint32_t codepoint, uint32_t* mapping,int32_t mapLength,
               UBool withNorm){
    if(!isDataLoaded){
        return;
    }

    uint32_t result = 0;
    UTRIE_GET16(&idnTrie,codepoint, result);

    int8_t flag, length;
    int32_t index;
    getValues(result,flag,length, index);

    if(withNorm){
        if(flag != UIDNA_MAP_NFKC){
            pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n",codepoint, UIDNA_MAP_NFKC, flag);
        }
    }else{
        if(flag==UIDNA_NO_VALUE || flag == UIDNA_PROHIBITED){
            if(index != _IDNA_MAP_TO_NOTHING ){
                pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n", codepoint, _IDNA_MAP_TO_NOTHING, index);
            }
        }
    }

    if(length ==_IDNA_LENGTH_IN_MAPPING_TABLE){
        /* the length is stored in the mapping table, in front of the mapping */
        length = (int8_t)mappingData[index];
        index++;
    }

    /* figure out the real length: mappingData holds 16-bit units */
    int32_t realLength =0;
    for(int32_t j=0; j<mapLength; j++){
        realLength += (mapping[j] > 0xFFFF) ? 2 : 1;
    }
    if(realLength != length){
        /* report the length in 16-bit units, which is what was compared */
        pTestIDNA->errln( "Did not get the expected length. Expected: %i Got: %i\n", realLength, length);
    }

    /*
     * Walk the stored units with their own position: a code point above
     * 0xFFFF occupies two units, so the position in mappingData runs ahead
     * of the position in mapping from there on.
     */
    int32_t unit = index;
    for(int32_t i =0; i< mapLength; i++){
        if(mapping[i] <= 0xFFFF){
            if(mappingData[unit] != (uint16_t)mapping[i]){
                pTestIDNA->errln("Did not get the expected result. Expected: 0x%04X Got: 0x%04X \n", mapping[i], mappingData[unit]);
            }
            unit++;
        }else{
            UChar lead = UTF16_LEAD(mapping[i]);
            UChar trail = UTF16_TRAIL(mapping[i]);
            if(mappingData[unit] != lead ||
               mappingData[unit+1] != trail){
                pTestIDNA->errln( "Did not get the expected result. Expected: 0x%04X 0x%04X Got: 0x%04X 0x%04X", lead, trail, mappingData[unit], mappingData[unit+1]);
            }
            unit+=2;
        }
    }
}
/**
 * Checks the trie value of every code point in [start, end] against the
 * category of the table it was read from. Mismatches are reported through
 * pTestIDNA. Does nothing if the data has not been loaded.
 *
 * @param start        First code point of the range.
 * @param end          Last code point of the range (inclusive).
 * @param isUnassigned TRUE: the whole trie value must equal UIDNA_UNASSIGNED.
 *                     FALSE: the flag bits must equal UIDNA_PROHIBITED.
 */
static void
compareFlagsForRange(uint32_t start, uint32_t end,
                     UBool isUnassigned){
    if(!isDataLoaded){
        return;
    }

    uint32_t result =0 ;
    while(start < end+1){
        UTRIE_GET16(&idnTrie,start, result);
        if(isUnassigned){
            if(result != UIDNA_UNASSIGNED){
                pTestIDNA->errln( "UIDNA_UASSIGNED flag failed for 0x%06X. Expected: %04X Got: %04X\n",start,UIDNA_UNASSIGNED, result);
            }
        }else{
            /* the flag is the low 3 bits of the value, as extracted by getValues() */
            if((result & 0x07) != UIDNA_PROHIBITED){
                pTestIDNA->errln( "UIDNA_PROHIBITED flag failed for 0x%06X. Expected: %04X Got: %04X\n\n",start,UIDNA_PROHIBITED, result);
            }
        }
        start++;
    }
}
/**
 * Closes the loaded data, if any, and resets the load state so that
 * loadIDNData() loads it again on its next call.
 *
 * @return Always TRUE.
 */
UBool
cleanup() {
    if(idnData!=NULL) {
        udata_close(idnData);
    }
    idnData=NULL;

    isDataLoaded=FALSE;
    dataErrorCode=U_ZERO_ERROR;
    return TRUE;
}
U_CDECL_BEGIN
/**
 * Acceptance callback passed to udata_openChoice().
 *
 * Accepts a data item whose header is at least 20 bytes, matches this
 * platform's endianness and charset family, has the data format "IDNA",
 * format version 2, and was built with the trie shift values this code was
 * compiled with.
 *
 * @param pInfo Data header to inspect.
 * @return TRUE if the data is acceptable, FALSE otherwise.
 */
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */, const char * /* name */,
             const UDataInfo *pInfo) {
    if(pInfo->size<20) {
        return FALSE;
    }
    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
       pInfo->charsetFamily!=U_CHARSET_FAMILY) {
        return FALSE;
    }
    /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */
    if(pInfo->dataFormat[0]!=0x49 ||
       pInfo->dataFormat[1]!=0x44 ||
       pInfo->dataFormat[2]!=0x4e ||
       pInfo->dataFormat[3]!=0x41) {
        return FALSE;
    }
    if(pInfo->formatVersion[0]!=2 ||
       pInfo->formatVersion[2]!=UTRIE_SHIFT ||
       pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT) {
        return FALSE;
    }
    return TRUE;
}
/*
 * idnTrie folding-offset callback: if bit 15 of the value is set, the offset
 * is the low 15 bits of the value; otherwise it is 0.
 */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    return (data&0x8000) ? (int32_t)(data&0x7fff) : 0;
}
U_CDECL_END
/**
 * Loads the binary IDNA data (DATA_NAME/DATA_TYPE) and sets up the file-scope
 * state used by the comparison functions: idnData, indexes, idnTrie and
 * mappingData. Does nothing if the data is already loaded.
 *
 * The result of opening/unserializing the data is also recorded in
 * dataErrorCode.
 *
 * @param errorCode In/out ICU error code; nothing is loaded if it already
 *                  indicates a failure.
 * @return TRUE if the data is loaded when the function returns.
 */
static UBool
loadIDNData(UErrorCode &errorCode) {
    /* load the IDNA data from file */
    if(isDataLoaded==FALSE) {
        UTrie _idnTrie={ 0,0,0,0,0,0,0 };
        UDataMemory *data;
        const int32_t *p=NULL;
        const uint8_t *pb;

        /* errorCode is a reference and cannot be NULL, so only its value is checked */
        if(U_FAILURE(errorCode)) {
            return FALSE;
        }

        /* open the data outside the mutex block */
        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
        dataErrorCode=errorCode;
        if(U_FAILURE(errorCode)) {
            return isDataLoaded=FALSE;
        }

        /* the data starts with _IDNA_INDEX_TOP int32_t indexes, followed by the serialized trie */
        p=(const int32_t *)udata_getMemory(data);
        pb=(const uint8_t *)(p+_IDNA_INDEX_TOP);
        utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode);
        _idnTrie.getFoldingOffset=getFoldingOffset;

        if(U_FAILURE(errorCode)) {
            dataErrorCode=errorCode;
            udata_close(data);
            return isDataLoaded=FALSE;
        }

        /* in the mutex block, set the data for this process */
        umtx_lock(NULL);
        if(idnData==NULL) {
            idnData=data;
            data=NULL;
            uprv_memcpy(&indexes, p, sizeof(indexes));
            uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie));
        } else {
            /* the data was already set: use the memory of the existing handle */
            p=(const int32_t *)udata_getMemory(idnData);
        }
        umtx_unlock(NULL);

        /* initialize some variables: the mapping table follows the serialized trie */
        mappingData=(uint16_t *)((uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]);
        isDataLoaded = TRUE;

        /* if a different thread set it first, then close the extra data */
        if(data!=NULL) {
            udata_close(data); /* NULL if it was set correctly */
        }
    }
    return isDataLoaded;
}
#endif /* #if !UCONFIG_NO_IDNA */
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/