| /* |
| ****************************************************************************** |
| * |
| * Copyright (C) 1999-2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ****************************************************************************** |
| * |
| * |
| * ucnv_io.c: |
| * initializes global variables and defines functions pertaining to file access, |
| * and name resolution aspect of the library. |
| * |
| * new implementation: |
| * |
| * created on: 1999nov22 |
| * created by: Markus W. Scherer |
| * |
| * Use the binary cnvalias.dat (created from convrtrs.txt) to work |
| * with aliases for converter names. |
| ******************************************************************************* |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/putil.h" |
| #include "unicode/ucnv.h" /* This file implements ucnv_xXXX() APIs */ |
| #include "umutex.h" |
| #include "cstring.h" |
| #include "cmemory.h" |
| #include "ucnv_io.h" |
| #include "unicode/udata.h" |
| #include "ucln_cmn.h" |
| |
| /* Format of cnvalias.dat ------------------------------------------------------ |
| * |
| * cnvalias.dat is a binary, memory-mappable form of convrtrs.txt . |
| * It contains two sorted tables and a block of zero-terminated strings. |
| * Each table is preceded by the number of table entries. |
| * |
| * The first table maps from aliases to converter indexes. |
| * The converter names themselves are listed as aliases in this table. |
| * Each entry in this table has an offset to the alias and |
| * an index of the converter in the converter table. |
| * |
| * The second table lists only the converters themselves. |
| * Each entry in this table has an offset to the converter name and |
| * the number of aliases, including the converter itself. |
| * A count of 1 means that there is no alias, only the converter name. |
| * |
| * In the block of strings after the tables, each converter name is directly |
| * followed by its aliases. All offsets to strings are offsets from the |
| * beginning of the data. |
| * |
| * More formal file data structure (data format 2.1): |
| * |
| * uint16_t aliasCount; |
| * uint16_t aliasOffsets[aliasCount]; |
| * uint16_t converterIndexes[aliasCount]; |
| * |
| * uint16_t converterCount; |
| * struct { |
| * uint16_t converterOffset; |
| * uint16_t aliasCount; |
| * } converters[converterCount]; |
| * |
| * uint16_t tagCount; |
| * uint16_t taggedAliasesOffsets[tagCount][converterCount]; |
| * char tags[] = { "Tag0\0Tag1\0..." }; |
| * |
| * char strings[]={ |
| * "Converter0\0Alias1\0Alias2\0...Converter1\0Converter2\0Alias0\0Alias1\0..." |
| * }; |
| * |
| * The code included here can read versions 2 and 2.1 of the data format. |
| * Version 2 does not have tag information, but since the code never refers |
| * to strings[] by its base offset, it's okay. |
| * |
| */ |
| |
| #define DATA_NAME "cnvalias" |
| #define DATA_TYPE "dat" |
| |
| static UDataMemory *aliasData=NULL; |
| static const uint16_t *aliasTable=NULL; |
| |
| static const uint16_t *converterTable = NULL; |
| static const uint16_t *tagTable = NULL; |
| |
| static char defaultConverterNameBuffer[100]; |
| static const char *defaultConverterName = NULL; |
| |
| static UBool |
| isAcceptable(void *context, |
| const char *type, const char *name, |
| const UDataInfo *pInfo) { |
| return (UBool)( |
| pInfo->size>=20 && |
| pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
| pInfo->charsetFamily==U_CHARSET_FAMILY && |
| pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */ |
| pInfo->dataFormat[1]==0x76 && |
| pInfo->dataFormat[2]==0x41 && |
| pInfo->dataFormat[3]==0x6c && |
| pInfo->formatVersion[0]==2); |
| } |
| |
| static UBool |
| haveAliasData(UErrorCode *pErrorCode) { |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return FALSE; |
| } |
| |
| /* load converter alias data from file if necessary */ |
| if(aliasData==NULL) { |
| UDataMemory *data; |
| UDataInfo info; |
| const uint16_t *table=NULL; |
| |
| /* open the data outside the mutex block */ |
| data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| return FALSE; |
| } |
| |
| table=(const uint16_t *)udata_getMemory(data); |
| info.size=sizeof(UDataInfo); |
| udata_getInfo(data, &info); |
| |
| /* in the mutex block, set the data for this process */ |
| umtx_lock(NULL); |
| if(aliasData==NULL) { |
| aliasData=data; |
| data=NULL; |
| aliasTable=table; |
| table=NULL; |
| converterTable = aliasTable + 1 + 2 * *aliasTable; |
| |
| if (info.formatVersion[0] == 2 && info.formatVersion[1] > 0) { |
| tagTable = converterTable + 1 + 2 * *converterTable; |
| } |
| } |
| umtx_unlock(NULL); |
| |
| /* if a different thread set it first, then close the extra data */ |
| if(data!=NULL) { |
| udata_close(data); /* NULL if it was set correctly */ |
| } |
| } |
| |
| return TRUE; |
| } |
| |
| static UBool |
| isAlias(const char *alias, UErrorCode *pErrorCode) { |
| if(alias==NULL) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return FALSE; |
| } else if(*alias==0) { |
| return FALSE; |
| } else { |
| return TRUE; |
| } |
| } |
| |
| UBool |
| ucnv_io_cleanup() |
| { |
| if (aliasData) { |
| udata_close(aliasData); |
| } |
| |
| aliasData = NULL; |
| aliasTable = NULL; |
| |
| converterTable = NULL; |
| tagTable = NULL; |
| |
| defaultConverterName = NULL; |
| |
| return TRUE; /* Everything was cleaned up */ |
| } |
| |
| |
| static int16_t getTagNumber(const char *tagname) { |
| if (tagTable) { |
| int16_t tag, count = (int16_t) *tagTable; |
| const char *tags = (const char *) (tagTable + 1 + count * *converterTable); |
| |
| #if 0 |
| |
| char name[100]; |
| int i; |
| |
| /* convert the tag name to lowercase to do case-insensitive comparisons */ |
| for(i = 0; i < sizeof(name) - 1 && *tagname; ++i) { |
| name[i] = (char)uprv_tolower(*tagname++); |
| } |
| name[i] = 0; |
| |
| #else |
| |
| const char *name = tagname; |
| |
| #endif |
| |
| for (tag = 0; count--; ++tag) { |
| if (!uprv_stricmp(name, tags)) { |
| return tag; |
| } |
| tags += strlen(tags) + 1; |
| } |
| } |
| |
| return -1; |
| } |
| |
| /** |
| * Do a fuzzy compare of a two converter/alias names. The comparison |
| * is case-insensitive. It also ignores the characters '-', '_', and |
| * ' ' (dash, underscore, and space). Thus the strings "UTF-8", |
| * "utf_8", and "Utf 8" are exactly equivalent. |
| * |
| * This is a symmetrical (commutative) operation; order of arguments |
| * is insignificant. This is an important property for sorting the |
| * list (when the list is preprocessed into binary form) and for |
| * performing binary searches on it at run time. |
| * |
| * @param name1 a converter name or alias, zero-terminated |
| * @param name2 a converter name or alias, zero-terminated |
| * @return 0 if the names match, or a negative value if the name1 |
| * lexically precedes name2, or a positive value if the name1 |
| * lexically follows name2. |
| */ |
| U_CAPI int U_EXPORT2 |
| ucnv_compareNames(const char *name1, const char *name2) { |
| int rc; |
| unsigned char c1, c2; |
| |
| for (;;) { |
| /* Ignore delimiters '-', '_', and ' ' */ |
| while ((c1 = (unsigned char)*name1) == '-' |
| || c1 == '_' || c1 == ' ') ++name1; |
| while ((c2 = (unsigned char)*name2) == '-' |
| || c2 == '_' || c2 == ' ') ++name2; |
| |
| /* If we reach the ends of both strings then they match */ |
| if ((c1|c2)==0) { |
| return 0; |
| } |
| |
| /* Case-insensitive comparison */ |
| rc = (int)(unsigned char)uprv_tolower(c1) - |
| (int)(unsigned char)uprv_tolower(c2); |
| if (rc!=0) { |
| return rc; |
| } |
| ++name1; |
| ++name2; |
| } |
| } |
| |
| /* |
| * search for an alias |
| * return NULL or a pointer to the converter table entry |
| */ |
| static const uint16_t * |
| findAlias(const char *alias) { |
| char name[100]; |
| const uint16_t *p=aliasTable; |
| uint16_t i, start, limit; |
| |
| limit=*p++; |
| if(limit==0) { |
| /* there are no aliases */ |
| return NULL; |
| } |
| |
| /* convert the alias name to lowercase to do case-insensitive comparisons */ |
| for(i=0; i<sizeof(name)-1 && *alias!=0; ++i) { |
| name[i]=(char)uprv_tolower(*alias++); |
| } |
| name[i]=0; |
| |
| /* do a binary search for the alias */ |
| start=0; |
| while(start<limit-1) { |
| i=(uint16_t)((start+limit)/2); |
| if(ucnv_compareNames(name, (const char *)aliasTable+p[i])<0) { |
| limit=i; |
| } else { |
| start=i; |
| } |
| } |
| |
| /* did we really find it? */ |
| if(ucnv_compareNames(name, (const char *)aliasTable+p[start])==0) { |
| limit=*(p-1); /* aliasCount */ |
| p+=limit; /* advance to the second column of the alias table */ |
| i=p[start]; /* converter index */ |
| return |
| p+limit+ /* beginning of converter table */ |
| 1+ /* skip its count */ |
| 2*i; /* go to this converter's entry and return a pointer to it */ |
| } else { |
| return NULL; |
| } |
| } |
| |
| U_CFUNC const char * |
| ucnv_io_getConverterName(const char *alias, UErrorCode *pErrorCode) { |
| if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { |
| const uint16_t *p=findAlias(alias); |
| if(p!=NULL) { |
| return (const char *)aliasTable+*p; |
| } |
| } |
| return NULL; |
| } |
| |
| U_CFUNC uint16_t |
| ucnv_io_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) { |
| if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { |
| const uint16_t *p=findAlias(alias); |
| if(p!=NULL) { |
| *aliases=(const char *)aliasTable+*p; |
| return *(p+1); |
| } |
| } |
| return 0; |
| } |
| |
| U_CFUNC const char * |
| ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { |
| if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { |
| const uint16_t *p=findAlias(alias); |
| if(p!=NULL) { |
| uint16_t count=*(p+1); |
| if(n<count) { |
| const char *aliases=(const char *)aliasTable+*p; |
| while(n>0) { |
| /* skip a name, first the canonical converter name */ |
| aliases+=uprv_strlen(aliases)+1; |
| --n; |
| } |
| return aliases; |
| } |
| } |
| } |
| return NULL; |
| } |
| |
| U_CFUNC uint16_t |
| ucnv_io_countStandards(UErrorCode *pErrorCode) { |
| if (haveAliasData(pErrorCode)) { |
| if (!tagTable) { |
| *pErrorCode = U_INVALID_FORMAT_ERROR; |
| return 0; |
| } |
| |
| return *tagTable; |
| } |
| |
| return 0; |
| } |
| |
| U_CAPI const char * U_EXPORT2 |
| ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { |
| if (haveAliasData(pErrorCode) && tagTable) { |
| int16_t count = (int16_t) *tagTable; |
| const char *tags = (const char *) (tagTable + 1 + count * *converterTable); |
| |
| while (n-- && count--) { |
| tags += strlen(tags) + 1; |
| } |
| |
| return count ? tags : NULL; |
| } |
| |
| return NULL; |
| } |
| |
| U_CFUNC const char * U_EXPORT2 |
| ucnv_getStandardName(const char *alias, const char *standard, UErrorCode *pErrorCode) { |
| if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { |
| const uint16_t *p = findAlias(alias); |
| if(p != NULL) { |
| int16_t tag = getTagNumber(standard); |
| |
| if (tag > -1) { |
| uint16_t offset = tagTable[1 + tag * *converterTable + (p - converterTable) / 2]; |
| return offset ? (const char *) aliasTable + offset : NULL; |
| } |
| } |
| } |
| |
| return NULL; |
| } |
| |
| U_CFUNC uint16_t |
| ucnv_io_countAvailableConverters(UErrorCode *pErrorCode) { |
| if(haveAliasData(pErrorCode)) { |
| return *converterTable; |
| } |
| return 0; |
| } |
| |
| U_CFUNC const char * |
| ucnv_io_getAvailableConverter(uint16_t n, UErrorCode *pErrorCode) { |
| if(haveAliasData(pErrorCode)) { |
| const uint16_t *p=converterTable; |
| if(n<*p) { |
| return (const char *)aliasTable+p[1+2*n]; |
| } |
| } |
| return NULL; |
| } |
| |
| U_CFUNC void |
| ucnv_io_fillAvailableConverters(const char **aliases, UErrorCode *pErrorCode) { |
| if(haveAliasData(pErrorCode)) { |
| const uint16_t *p=converterTable; |
| uint16_t count=*p++; |
| while(count>0) { |
| *aliases++=(const char *)aliasTable+*p; |
| p+=2; |
| --count; |
| } |
| } |
| } |
| |
| U_CFUNC uint16_t |
| ucnv_io_countAvailableAliases(UErrorCode *pErrorCode) { |
| if(haveAliasData(pErrorCode)) { |
| return *aliasTable; |
| } |
| return 0; |
| } |
| |
#if 0
/*
 * We are not currently using these functions, so they are compiled out
 * to reduce the binary file size and improve the code coverage.
 * They are kept in the file because they may be useful in the future and
 * also serve to some degree as another piece of documentation of the
 * data structure.
 */

/*
 * Get the n-th alias from the first column of the alias table,
 * or NULL if n is out of range or the data is not available.
 */
U_CFUNC const char *
ucnv_io_getAvailableAlias(uint16_t n, UErrorCode *pErrorCode) {
    if(!haveAliasData(pErrorCode) || n>=*aliasTable) {
        return NULL;
    }
    return (const char *)aliasTable+aliasTable[1+n];
}

/*
 * Write pointers to all aliases into the aliases array;
 * the array capacity is not checked.
 */
U_CFUNC void
ucnv_io_fillAvailableAliases(const char **aliases, UErrorCode *pErrorCode) {
    const uint16_t *offset;
    uint16_t remaining;

    if(!haveAliasData(pErrorCode)) {
        return;
    }

    /* the alias offsets directly follow the alias count */
    offset=aliasTable+1;
    for(remaining=aliasTable[0]; remaining>0; --remaining) {
        *aliases++=(const char *)aliasTable+*offset++;
    }
}
#endif
| |
| /* default converter name --------------------------------------------------- */ |
| |
| /* |
| * In order to be really thread-safe, the get function would have to take |
| * a buffer parameter and copy the current string inside a mutex block. |
| * This implementation only tries to be really thread-safe while |
| * setting the name. |
| * It assumes that setting a pointer is atomic. |
| */ |
| |
| U_CFUNC const char * |
| ucnv_io_getDefaultConverterName() { |
| /* local variable to be thread-safe */ |
| const char *name=defaultConverterName; |
| if(name==NULL) { |
| const char *codepage=0; |
| umtx_lock(NULL); |
| codepage = uprv_getDefaultCodepage(); |
| umtx_unlock(NULL); |
| if(codepage!=NULL) { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| name=ucnv_io_getConverterName(codepage, &errorCode); |
| if(U_FAILURE(errorCode) || name==NULL) { |
| name=codepage; |
| } |
| } |
| |
| /* if the name is there, test it out */ |
| if(name != NULL) { |
| UErrorCode errorCode = U_ZERO_ERROR; |
| UConverter *cnv; |
| cnv = ucnv_open(name, &errorCode); |
| if(U_FAILURE(errorCode) || (cnv == NULL)) { |
| /* Panic time, let's use a fallback. */ |
| #if (U_CHARSET_FAMILY == U_ASCII_FAMILY) |
| name = "US-ASCII"; |
| /* there is no 'algorithmic' converter for EBCDIC */ |
| #elif defined(OS390) |
| name = "ibm-1047-s390"; |
| #else |
| name = "ibm-37"; |
| #endif |
| } |
| ucnv_close(cnv); |
| } |
| |
| if(name != NULL) { |
| /* Did find a name. And it works.*/ |
| defaultConverterName=name; |
| } |
| } |
| |
| return name; |
| } |
| |
| U_CFUNC void |
| ucnv_io_setDefaultConverterName(const char *converterName) { |
| if(converterName==NULL) { |
| /* reset to the default codepage */ |
| defaultConverterName=NULL; |
| } else { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| const char *name=ucnv_io_getConverterName(converterName, &errorCode); |
| if(U_SUCCESS(errorCode) && name!=NULL) { |
| defaultConverterName=name; |
| } else { |
| /* do not set the name if the alias lookup failed and it is too long */ |
| int32_t length=uprv_strlen(converterName); |
| if(length<sizeof(defaultConverterNameBuffer)) { |
| /* it was not found as an alias, so copy it - accept an empty name */ |
| UBool didLock; |
| if(defaultConverterName==defaultConverterNameBuffer) { |
| umtx_lock(NULL); |
| didLock=TRUE; |
| } else { |
| didLock=FALSE; |
| } |
| uprv_memcpy(defaultConverterNameBuffer, converterName, length); |
| defaultConverterNameBuffer[length]=0; |
| defaultConverterName=defaultConverterNameBuffer; |
| if(didLock) { |
| umtx_unlock(NULL); |
| } |
| } |
| } |
| } |
| } |
| |
| /* |
| * Hey, Emacs, please set the following: |
| * |
| * Local Variables: |
| * indent-tabs-mode: nil |
| * End: |
| * |
| */ |
| |