| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ********************************************************************** |
| * Copyright (C) 2000-2016, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * file name: ucnvisci.c |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2001JUN26 |
| * created by: Ram Viswanadha |
| * |
| * Date Name Description |
| * 24/7/2001 Ram Added support for EXT character handling |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION |
| |
| #include "unicode/ucnv.h" |
| #include "unicode/ucnv_cb.h" |
| #include "unicode/utf16.h" |
| #include "cmemory.h" |
| #include "ucnv_bld.h" |
| #include "ucnv_cnv.h" |
| #include "cstring.h" |
| #include "uassert.h" |
| |
/*
 * Constants used by the ISCII converter: Unicode code points that need
 * special handling, ISCII control/sign bytes, and block boundaries.
 */
#define UCNV_OPTIONS_VERSION_MASK 0xf /* low nibble of the converter options selects the script version */
#define NUKTA               0x093c
#define HALANT              0x094d
#define ZWNJ                0x200c /* Zero Width Non Joiner */
#define ZWJ                 0x200d /* Zero width Joiner */
#define INVALID_CHAR        0xffff
#define ATR                 0xEF   /* Attribute code */
#define EXT                 0xF0   /* Extension code */
#define DANDA               0x0964
#define DOUBLE_DANDA        0x0965
#define ISCII_NUKTA         0xE9
#define ISCII_HALANT        0xE8
#define ISCII_DANDA         0xEA
#define ISCII_INV           0xD9
#define ISCII_VOWEL_SIGN_E  0xE0
#define INDIC_BLOCK_BEGIN   0x0900
#define INDIC_BLOCK_END     0x0D7F
#define INDIC_RANGE         (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)
#define VOCALLIC_RR         0x0931
#define LF                  0x0A
#define ASCII_END           0xA0
#define NO_CHAR_MARKER      0xFFFE
/*
 * Parenthesized so that the product stays intact when the macro is used
 * inside a larger expression (e.g. next to '/', '%' or a cast).
 */
#define TELUGU_DELTA        (DELTA * TELUGU)
#define DEV_ABBR_SIGN       0x0970
#define DEV_ANUDATTA        0x0952
#define EXT_RANGE_BEGIN     0xA1
#define EXT_RANGE_END       0xEE

/* Gurmukhi (PNJ) specific code points and the Gurmukhi block delta. */
#define PNJ_DELTA           0x0100
#define PNJ_BINDI           0x0A02
#define PNJ_TIPPI           0x0A70
#define PNJ_SIGN_VIRAMA     0x0A4D
#define PNJ_ADHAK           0x0A71
#define PNJ_HA              0x0A39
#define PNJ_RRA             0x0A5C
| |
/*
 * Index of each supported Unicode script block, in code point order.
 * A script's delta is its index multiplied by DELTA.
 */
typedef enum {
    DEVANAGARI = 0,
    BENGALI    = 1,
    GURMUKHI   = 2,
    GUJARATI   = 3,
    ORIYA      = 4,
    TAMIL      = 5,
    TELUGU     = 6,
    KANNADA    = 7,
    MALAYALAM  = 8,
    DELTA      = 0x80   /* multiplier applied to the script index, not a script */
} UniLang;
| |
| /** |
| * ISCII script selector bytes: code pages are switched when |
| * <ATR>+<one of below values> is encountered. |
| * lookupInitialData pairs DEV..PNJ with their Unicode script blocks. |
| */ |
| typedef enum { |
| DEF = 0x40, /* switch back to the default script */ |
| RMN = 0x41, /* presumably Roman -- not referenced in the visible code; confirm */ |
| DEV = 0x42, /* Devanagari */ |
| BNG = 0x43, /* Bengali */ |
| TML = 0x44, /* Tamil */ |
| TLG = 0x45, /* Telugu */ |
| ASM = 0x46, /* Assamese (shares the Bengali bit in validityTable) */ |
| ORI = 0x47, /* Oriya */ |
| KND = 0x48, /* Kannada */ |
| MLM = 0x49, /* Malayalam */ |
| GJR = 0x4A, /* Gujarati */ |
| PNJ = 0x4B, /* Gurmukhi (Punjabi) */ |
| ARB = 0x71, /* 0x71..0x76 have no entry in lookupInitialData; presumably Arabic-script languages -- confirm */ |
| PES = 0x72, |
| URD = 0x73, |
| SND = 0x74, |
| KSM = 0x75, |
| PST = 0x76 |
| }ISCIILang; |
| |
/*
 * One bit per script group, as stored in validityTable.
 * Listed from the least to the most significant bit.
 */
typedef enum {
    ZERO     = 0x00,    /* no script */
    TML_MASK = 0x01,
    MLM_MASK = 0x02,
    KND_MASK = 0x04,    /* also used for Telugu, see lookupInitialData */
    BNG_MASK = 0x08,
    ORI_MASK = 0x10,
    GJR_MASK = 0x20,
    PNJ_MASK = 0x40,
    DEV_MASK = 0x80
} MaskEnum;
| |
| #define ISCII_CNV_PREFIX "ISCII,version=" |
| |
/*
 * Per-converter ISCII state, allocated in _ISCIIOpen and stored in
 * UConverter.extraInfo. Fields ending in ToUnicode / FromUnicode belong to the
 * respective conversion direction.
 */
| typedef struct { |
| UChar contextCharToUnicode; /* toUnicode: previous Unicode codepoint for contextual analysis */ |
| UChar contextCharFromUnicode; /* fromUnicode: previous Unicode codepoint for contextual analysis */ |
| uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */ |
| uint16_t currentDeltaFromUnicode; /* fromUnicode: current delta in Indic block */ |
| uint16_t currentDeltaToUnicode; /* toUnicode: current delta in Indic block */ |
| MaskEnum currentMaskFromUnicode; /* mask for current state in fromUnicode */ |
| MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */ |
| MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */ |
| UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */ |
| UBool resetToDefaultToUnicode; /* boolean for resetting to default delta and mask when a newline is encountered */ |
| char name[sizeof(ISCII_CNV_PREFIX) + 1]; /* prefix + one version digit; sizeof already counts the terminating NUL */ |
| UChar32 prevToUnicodeStatus; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */ |
| } UConverterDataISCII; |
| |
/* One row of lookupInitialData: the three identifiers that describe a script. */
| typedef struct LookupDataStruct { |
| UniLang uniLang; /* script index; multiplied by DELTA to obtain the script's delta */ |
| MaskEnum maskEnum; /* validity bit used for this script */ |
| ISCIILang isciiLang; /* ISCII selector byte for this script */ |
| } LookupDataStruct; |
| |
/*
 * Initial state per script. _ISCIIOpen indexes this table with the
 * "version" option (options & UCNV_OPTIONS_VERSION_MASK), so the row order
 * defines the meaning of version 0..8.
 */
| static const LookupDataStruct lookupInitialData[]={ |
| { DEVANAGARI, DEV_MASK, DEV }, /* version 0 */ |
| { BENGALI, BNG_MASK, BNG }, /* version 1 */ |
| { GURMUKHI, PNJ_MASK, PNJ }, /* version 2 */ |
| { GUJARATI, GJR_MASK, GJR }, /* version 3 */ |
| { ORIYA, ORI_MASK, ORI }, /* version 4 */ |
| { TAMIL, TML_MASK, TML }, /* version 5 */ |
| { TELUGU, KND_MASK, TLG }, /* version 6: Telugu shares the Kannada validity bit */ |
| { KANNADA, KND_MASK, KND }, /* version 7 */ |
| { MALAYALAM, MLM_MASK, MLM } /* version 8 */ |
| }; |
| |
| /* |
| * For special handling of certain Gurmukhi characters. |
| * Bit 0 (value 1): PNJ consonant |
| * Bit 1 (value 2): PNJ Bindi Tippi |
| */ |
| static const uint8_t pnjMap[80] = { |
| /* 0A00..0A0F */ |
| 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, |
| /* 0A10..0A1F */ |
| 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| /* 0A20..0A2F */ |
| 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, |
| /* 0A30..0A3F */ |
| 3, 0, 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 2, |
| /* 0A40..0A4F */ |
| 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| }; |
| |
| static UBool |
| isPNJConsonant(UChar32 c) { |
| if (c < 0xa00 || 0xa50 <= c) { |
| return FALSE; |
| } else { |
| return (UBool)(pnjMap[c - 0xa00] & 1); |
| } |
| } |
| |
| static UBool |
| isPNJBindiTippi(UChar32 c) { |
| if (c < 0xa00 || 0xa50 <= c) { |
| return FALSE; |
| } else { |
| return (UBool)(pnjMap[c - 0xa00] >> 1); |
| } |
| } |
| U_CDECL_BEGIN |
| static void U_CALLCONV |
| _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode) { |
| if(pArgs->onlyTestIsLoadable) { |
| return; |
| } |
| |
| cnv->extraInfo = uprv_malloc(sizeof(UConverterDataISCII)); |
| |
| if (cnv->extraInfo != NULL) { |
| int32_t len=0; |
| UConverterDataISCII *converterData= |
| (UConverterDataISCII *) cnv->extraInfo; |
| converterData->contextCharToUnicode=NO_CHAR_MARKER; |
| cnv->toUnicodeStatus = missingCharMarker; |
| converterData->contextCharFromUnicode=0x0000; |
| converterData->resetToDefaultToUnicode=FALSE; |
| /* check if the version requested is supported */ |
| if ((pArgs->options & UCNV_OPTIONS_VERSION_MASK) < 9) { |
| /* initialize state variables */ |
| converterData->currentDeltaFromUnicode |
| = converterData->currentDeltaToUnicode |
| = converterData->defDeltaToUnicode = (uint16_t)(lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].uniLang * DELTA); |
| |
| converterData->currentMaskFromUnicode |
| = converterData->currentMaskToUnicode |
| = converterData->defMaskToUnicode = lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].maskEnum; |
| |
| converterData->isFirstBuffer=TRUE; |
| (void)uprv_strcpy(converterData->name, ISCII_CNV_PREFIX); |
| len = (int32_t)uprv_strlen(converterData->name); |
| converterData->name[len]= (char)((pArgs->options & UCNV_OPTIONS_VERSION_MASK) + '0'); |
| converterData->name[len+1]=0; |
| |
| converterData->prevToUnicodeStatus = 0x0000; |
| } else { |
| uprv_free(cnv->extraInfo); |
| cnv->extraInfo = NULL; |
| *errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| |
| } else { |
| *errorCode =U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| |
| static void U_CALLCONV |
| _ISCIIClose(UConverter *cnv) { |
| if (cnv->extraInfo!=NULL) { |
| if (!cnv->isExtraLocal) { |
| uprv_free(cnv->extraInfo); |
| } |
| cnv->extraInfo=NULL; |
| } |
| } |
| |
| static const char* U_CALLCONV |
| _ISCIIgetName(const UConverter* cnv) { |
| if (cnv->extraInfo) { |
| UConverterDataISCII* myData= (UConverterDataISCII*)cnv->extraInfo; |
| return myData->name; |
| } |
| return NULL; |
| } |
| |
| static void U_CALLCONV |
| _ISCIIReset(UConverter *cnv, UConverterResetChoice choice) { |
| UConverterDataISCII* data =(UConverterDataISCII *) (cnv->extraInfo); |
| if (choice<=UCNV_RESET_TO_UNICODE) { |
| cnv->toUnicodeStatus = missingCharMarker; |
| cnv->mode=0; |
| data->currentDeltaToUnicode=data->defDeltaToUnicode; |
| data->currentMaskToUnicode = data->defMaskToUnicode; |
| data->contextCharToUnicode=NO_CHAR_MARKER; |
| data->prevToUnicodeStatus = 0x0000; |
| } |
| if (choice!=UCNV_RESET_TO_UNICODE) { |
| cnv->fromUChar32=0x0000; |
| data->contextCharFromUnicode=0x00; |
| data->currentMaskFromUnicode=data->defMaskToUnicode; |
| data->currentDeltaFromUnicode=data->defDeltaToUnicode; |
| data->isFirstBuffer=TRUE; |
| data->resetToDefaultToUnicode=FALSE; |
| } |
| } |
| |
| /** |
| * The values in validity table are indexed by the lower bits of Unicode |
| * range 0x0900 - 0x097f (128 entries). The values have a structure like: |
| * --------------------------------------------------------------- |
| * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML | |
| * | | | | | ASM | KND | | | |
| * --------------------------------------------------------------- |
| * If a code point is valid in a particular script |
| * then that bit is turned on. The TLG/KND column is KND_MASK in the code. |
| * |
| * Unicode does not distinguish between Bengali and Assamese so we use 1 bit |
| * to represent these languages. |
| * |
| * Telugu and Kannada have the same codepoints except for Vocallic_RR which we special case, |
| * so we combine them and use 1 bit to represent these languages. |
| * |
| * TODO: It is probably easier to understand and maintain to change this |
| * to use uint16_t and give each of the 9 Unicode/script blocks its own bit. |
| */ |
| |
| static const uint8_t validityTable[128] = { |
| /* This state table is tool generated please do not edit unless you know exactly what you are doing */ |
| /* Note: This table was edited to mirror the Windows XP implementation */ |
| /*ISCII:Valid:Unicode -- NOTE: the "Valid" column is stale in places (e.g. rows 0x901, 0x92e, 0x935); the mask expression is authoritative */ |
| /*0xa0 : 0x00: 0x900 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xa1 : 0xb8: 0x901 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , |
| /*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xa3 : 0xbf: 0x903 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0x00 : 0x00: 0x904 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xa4 : 0xff: 0x905 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xa5 : 0xff: 0x906 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xa6 : 0xff: 0x907 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xa7 : 0xff: 0x908 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xa8 : 0xff: 0x909 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xa9 : 0xff: 0x90a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xaa : 0xfe: 0x90b */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0x00 : 0x00: 0x90c */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xab : 0x87: 0x90e */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xac : 0xff: 0x90f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xad : 0xff: 0x910 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xaf : 0x87: 0x912 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xb0 : 0xff: 0x913 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xb1 : 0xff: 0x914 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xb3 : 0xff: 0x915 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xb4 : 0xfe: 0x916 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xb5 : 0xfe: 0x917 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xb6 : 0xfe: 0x918 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xb7 : 0xff: 0x919 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xb8 : 0xff: 0x91a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xb9 : 0xfe: 0x91b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xba : 0xff: 0x91c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xbb : 0xfe: 0x91d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xbc : 0xff: 0x91e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xbd : 0xff: 0x91f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xbe : 0xfe: 0x920 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xbf : 0xfe: 0x921 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xc0 : 0xfe: 0x922 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xc1 : 0xff: 0x923 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xc2 : 0xff: 0x924 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xc3 : 0xfe: 0x925 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xc4 : 0xfe: 0x926 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xc5 : 0xfe: 0x927 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xc6 : 0xff: 0x928 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xc7 : 0x81: 0x929 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + TML_MASK , |
| /*0xc8 : 0xff: 0x92a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xc9 : 0xfe: 0x92b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xca : 0xfe: 0x92c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xcb : 0xfe: 0x92d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xcd : 0xff: 0x92f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xcf : 0xff: 0x930 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xd0 : 0x87: 0x931 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , |
| /*0xd1 : 0xff: 0x932 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xd2 : 0xb7: 0x933 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xd3 : 0x83: 0x934 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , |
| /*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xd5 : 0xfe: 0x936 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0xd6 : 0xbf: 0x937 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xd7 : 0xff: 0x938 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xd8 : 0xff: 0x939 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0x00 : 0x00: 0x93A */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x93B */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x93d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xda : 0xff: 0x93e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xdb : 0xff: 0x93f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xdc : 0xff: 0x940 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xdd : 0xff: 0x941 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xde : 0xff: 0x942 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xdf : 0xbe: 0x943 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + GJR_MASK + ZERO + BNG_MASK + KND_MASK + ZERO + ZERO , |
| /*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xe0 : 0x87: 0x946 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xe1 : 0xff: 0x947 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xe2 : 0xff: 0x948 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xe4 : 0x87: 0x94a */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xe5 : 0xff: 0x94b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xe6 : 0xff: 0x94c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xe8 : 0xff: 0x94d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xec : 0x00: 0x94e */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xed : 0x00: 0x94f */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x950 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x951 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x952 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x953 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x954 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x955 */ ZERO + ZERO + ZERO + ZERO + ZERO + KND_MASK + ZERO + ZERO , |
| /*0x00 : 0x00: 0x956 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + KND_MASK + ZERO + ZERO , |
| /*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO , |
| /*0x00 : 0x00: 0x958 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x959 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x95a */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x95b */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x95c */ DEV_MASK + PNJ_MASK + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x95d */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x95e */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xce : 0x98: 0x95f */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x960 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0x00 : 0x00: 0x961 */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , |
| /*0x00 : 0x00: 0x962 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , |
| /*0x00 : 0x00: 0x963 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , |
| /*0xea : 0xf8: 0x964 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xeaea : 0x00: 0x965*/ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /*0xf1 : 0xff: 0x966 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xf2 : 0xff: 0x967 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xf3 : 0xff: 0x968 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xf4 : 0xff: 0x969 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xf5 : 0xff: 0x96a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xf6 : 0xff: 0x96b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xf7 : 0xff: 0x96c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xf8 : 0xff: 0x96d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xf9 : 0xff: 0x96e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0xfa : 0xff: 0x96f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , |
| /*0x00 : 0x80: 0x970 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , |
| /* |
| * The length of the array is 128 to provide values for 0x900..0x97f. |
| * The last 15 entries for 0x971..0x97f of the validity table are all zero |
| * because no Indic script uses such Unicode code points. |
| * Only the first of them is written out; the remaining entries are zero-initialized. |
| */ |
| /*0x00 : 0x00: 0x9yz */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO |
| }; |
| |
| static const uint16_t fromUnicodeTable[128]={ |
| 0x00a0 ,/* 0x0900 */ |
| 0x00a1 ,/* 0x0901 */ |
| 0x00a2 ,/* 0x0902 */ |
| 0x00a3 ,/* 0x0903 */ |
| 0xa4e0 ,/* 0x0904 */ |
| 0x00a4 ,/* 0x0905 */ |
| 0x00a5 ,/* 0x0906 */ |
| 0x00a6 ,/* 0x0907 */ |
| 0x00a7 ,/* 0x0908 */ |
| 0x00a8 ,/* 0x0909 */ |
| 0x00a9 ,/* 0x090a */ |
| 0x00aa ,/* 0x090b */ |
| 0xA6E9 ,/* 0x090c */ |
| 0x00ae ,/* 0x090d */ |
| 0x00ab ,/* 0x090e */ |
| 0x00ac ,/* 0x090f */ |
| 0x00ad ,/* 0x0910 */ |
| 0x00b2 ,/* 0x0911 */ |
| 0x00af ,/* 0x0912 */ |
| 0x00b0 ,/* 0x0913 */ |
| 0x00b1 ,/* 0x0914 */ |
| 0x00b3 ,/* 0x0915 */ |
| 0x00b4 ,/* 0x0916 */ |
| 0x00b5 ,/* 0x0917 */ |
| 0x00b6 ,/* 0x0918 */ |
| 0x00b7 ,/* 0x0919 */ |
| 0x00b8 ,/* 0x091a */ |
| 0x00b9 ,/* 0x091b */ |
| 0x00ba ,/* 0x091c */ |
| 0x00bb ,/* 0x091d */ |
| 0x00bc ,/* 0x091e */ |
| 0x00bd ,/* 0x091f */ |
| 0x00be ,/* 0x0920 */ |
| 0x00bf ,/* 0x0921 */ |
| 0x00c0 ,/* 0x0922 */ |
| 0x00c1 ,/* 0x0923 */ |
| 0x00c2 ,/* 0x0924 */ |
| 0x00c3 ,/* 0x0925 */ |
| 0x00c4 ,/* 0x0926 */ |
| 0x00c5 ,/* 0x0927 */ |
| 0x00c6 ,/* 0x0928 */ |
| 0x00c7 ,/* 0x0929 */ |
| 0x00c8 ,/* 0x092a */ |
| 0x00c9 ,/* 0x092b */ |
| 0x00ca ,/* 0x092c */ |
| 0x00cb ,/* 0x092d */ |
| 0x00cc ,/* 0x092e */ |
| 0x00cd ,/* 0x092f */ |
| 0x00cf ,/* 0x0930 */ |
| 0x00d0 ,/* 0x0931 */ |
| 0x00d1 ,/* 0x0932 */ |
| 0x00d2 ,/* 0x0933 */ |
| 0x00d3 ,/* 0x0934 */ |
| 0x00d4 ,/* 0x0935 */ |
| 0x00d5 ,/* 0x0936 */ |
| 0x00d6 ,/* 0x0937 */ |
| 0x00d7 ,/* 0x0938 */ |
| 0x00d8 ,/* 0x0939 */ |
| 0xFFFF ,/* 0x093A */ |
| 0xFFFF ,/* 0x093B */ |
| 0x00e9 ,/* 0x093c */ |
| 0xEAE9 ,/* 0x093d */ |
| 0x00da ,/* 0x093e */ |
| 0x00db ,/* 0x093f */ |
| 0x00dc ,/* 0x0940 */ |
| 0x00dd ,/* 0x0941 */ |
| 0x00de ,/* 0x0942 */ |
| 0x00df ,/* 0x0943 */ |
| 0xDFE9 ,/* 0x0944 */ |
| 0x00e3 ,/* 0x0945 */ |
| 0x00e0 ,/* 0x0946 */ |
| 0x00e1 ,/* 0x0947 */ |
| 0x00e2 ,/* 0x0948 */ |
| 0x00e7 ,/* 0x0949 */ |
| 0x00e4 ,/* 0x094a */ |
| 0x00e5 ,/* 0x094b */ |
| 0x00e6 ,/* 0x094c */ |
| 0x00e8 ,/* 0x094d */ |
| 0x00ec ,/* 0x094e */ |
| 0x00ed ,/* 0x094f */ |
| 0xA1E9 ,/* 0x0950 */ /* OM Symbol */ |
| 0xFFFF ,/* 0x0951 */ |
| 0xF0B8 ,/* 0x0952 */ |
| 0xFFFF ,/* 0x0953 */ |
| 0xFFFF ,/* 0x0954 */ |
| 0xFFFF ,/* 0x0955 */ |
| 0xFFFF ,/* 0x0956 */ |
| 0xFFFF ,/* 0x0957 */ |
| 0xb3e9 ,/* 0x0958 */ |
| 0xb4e9 ,/* 0x0959 */ |
| 0xb5e9 ,/* 0x095a */ |
| 0xbae9 ,/* 0x095b */ |
| 0xbfe9 ,/* 0x095c */ |
| 0xC0E9 ,/* 0x095d */ |
| 0xc9e9 ,/* 0x095e */ |
| 0x00ce ,/* 0x095f */ |
| 0xAAe9 ,/* 0x0960 */ |
| 0xA7E9 ,/* 0x0961 */ |
| 0xDBE9 ,/* 0x0962 */ |
| 0xDCE9 ,/* 0x0963 */ |
| 0x00ea ,/* 0x0964 */ |
| 0xeaea ,/* 0x0965 */ |
| 0x00f1 ,/* 0x0966 */ |
| 0x00f2 ,/* 0x0967 */ |
| 0x00f3 ,/* 0x0968 */ |
| 0x00f4 ,/* 0x0969 */ |
| 0x00f5 ,/* 0x096a */ |
| 0x00f6 ,/* 0x096b */ |
| 0x00f7 ,/* 0x096c */ |
| 0x00f8 ,/* 0x096d */ |
| 0x00f9 ,/* 0x096e */ |
| 0x00fa ,/* 0x096f */ |
| 0xF0BF ,/* 0x0970 */ |
| 0xFFFF ,/* 0x0971 */ |
| 0xFFFF ,/* 0x0972 */ |
| 0xFFFF ,/* 0x0973 */ |
| 0xFFFF ,/* 0x0974 */ |
| 0xFFFF ,/* 0x0975 */ |
| 0xFFFF ,/* 0x0976 */ |
| 0xFFFF ,/* 0x0977 */ |
| 0xFFFF ,/* 0x0978 */ |
| 0xFFFF ,/* 0x0979 */ |
| 0xFFFF ,/* 0x097a */ |
| 0xFFFF ,/* 0x097b */ |
| 0xFFFF ,/* 0x097c */ |
| 0xFFFF ,/* 0x097d */ |
| 0xFFFF ,/* 0x097e */ |
| 0xFFFF ,/* 0x097f */ |
| }; |
| static const uint16_t toUnicodeTable[256]={ |
| 0x0000,/* 0x00 */ |
| 0x0001,/* 0x01 */ |
| 0x0002,/* 0x02 */ |
| 0x0003,/* 0x03 */ |
| 0x0004,/* 0x04 */ |
| 0x0005,/* 0x05 */ |
| 0x0006,/* 0x06 */ |
| 0x0007,/* 0x07 */ |
| 0x0008,/* 0x08 */ |
| 0x0009,/* 0x09 */ |
| 0x000a,/* 0x0a */ |
| 0x000b,/* 0x0b */ |
| 0x000c,/* 0x0c */ |
| 0x000d,/* 0x0d */ |
| 0x000e,/* 0x0e */ |
| 0x000f,/* 0x0f */ |
| 0x0010,/* 0x10 */ |
| 0x0011,/* 0x11 */ |
| 0x0012,/* 0x12 */ |
| 0x0013,/* 0x13 */ |
| 0x0014,/* 0x14 */ |
| 0x0015,/* 0x15 */ |
| 0x0016,/* 0x16 */ |
| 0x0017,/* 0x17 */ |
| 0x0018,/* 0x18 */ |
| 0x0019,/* 0x19 */ |
| 0x001a,/* 0x1a */ |
| 0x001b,/* 0x1b */ |
| 0x001c,/* 0x1c */ |
| 0x001d,/* 0x1d */ |
| 0x001e,/* 0x1e */ |
| 0x001f,/* 0x1f */ |
| 0x0020,/* 0x20 */ |
| 0x0021,/* 0x21 */ |
| 0x0022,/* 0x22 */ |
| 0x0023,/* 0x23 */ |
| 0x0024,/* 0x24 */ |
| 0x0025,/* 0x25 */ |
| 0x0026,/* 0x26 */ |
| 0x0027,/* 0x27 */ |
| 0x0028,/* 0x28 */ |
| 0x0029,/* 0x29 */ |
| 0x002a,/* 0x2a */ |
| 0x002b,/* 0x2b */ |
| 0x002c,/* 0x2c */ |
| 0x002d,/* 0x2d */ |
| 0x002e,/* 0x2e */ |
| 0x002f,/* 0x2f */ |
| 0x0030,/* 0x30 */ |
| 0x0031,/* 0x31 */ |
| 0x0032,/* 0x32 */ |
| 0x0033,/* 0x33 */ |
| 0x0034,/* 0x34 */ |
| 0x0035,/* 0x35 */ |
| 0x0036,/* 0x36 */ |
| 0x0037,/* 0x37 */ |
| 0x0038,/* 0x38 */ |
| 0x0039,/* 0x39 */ |
| 0x003A,/* 0x3A */ |
| 0x003B,/* 0x3B */ |
| 0x003c,/* 0x3c */ |
| 0x003d,/* 0x3d */ |
| 0x003e,/* 0x3e */ |
| 0x003f,/* 0x3f */ |
| 0x0040,/* 0x40 */ |
| 0x0041,/* 0x41 */ |
| 0x0042,/* 0x42 */ |
| 0x0043,/* 0x43 */ |
| 0x0044,/* 0x44 */ |
| 0x0045,/* 0x45 */ |
| 0x0046,/* 0x46 */ |
| 0x0047,/* 0x47 */ |
| 0x0048,/* 0x48 */ |
| 0x0049,/* 0x49 */ |
| 0x004a,/* 0x4a */ |
| 0x004b,/* 0x4b */ |
| 0x004c,/* 0x4c */ |
| 0x004d,/* 0x4d */ |
| 0x004e,/* 0x4e */ |
| 0x004f,/* 0x4f */ |
| 0x0050,/* 0x50 */ |
| 0x0051,/* 0x51 */ |
| 0x0052,/* 0x52 */ |
| 0x0053,/* 0x53 */ |
| 0x0054,/* 0x54 */ |
| 0x0055,/* 0x55 */ |
| 0x0056,/* 0x56 */ |
| 0x0057,/* 0x57 */ |
| 0x0058,/* 0x58 */ |
| 0x0059,/* 0x59 */ |
| 0x005a,/* 0x5a */ |
| 0x005b,/* 0x5b */ |
| 0x005c,/* 0x5c */ |
| 0x005d,/* 0x5d */ |
| 0x005e,/* 0x5e */ |
| 0x005f,/* 0x5f */ |
| 0x0060,/* 0x60 */ |
| 0x0061,/* 0x61 */ |
| 0x0062,/* 0x62 */ |
| 0x0063,/* 0x63 */ |
| 0x0064,/* 0x64 */ |
| 0x0065,/* 0x65 */ |
| 0x0066,/* 0x66 */ |
| 0x0067,/* 0x67 */ |
| 0x0068,/* 0x68 */ |
| 0x0069,/* 0x69 */ |
| 0x006a,/* 0x6a */ |
| 0x006b,/* 0x6b */ |
| 0x006c,/* 0x6c */ |
| 0x006d,/* 0x6d */ |
| 0x006e,/* 0x6e */ |
| 0x006f,/* 0x6f */ |
| 0x0070,/* 0x70 */ |
| 0x0071,/* 0x71 */ |
| 0x0072,/* 0x72 */ |
| 0x0073,/* 0x73 */ |
| 0x0074,/* 0x74 */ |
| 0x0075,/* 0x75 */ |
| 0x0076,/* 0x76 */ |
| 0x0077,/* 0x77 */ |
| 0x0078,/* 0x78 */ |
| 0x0079,/* 0x79 */ |
| 0x007a,/* 0x7a */ |
| 0x007b,/* 0x7b */ |
| 0x007c,/* 0x7c */ |
| 0x007d,/* 0x7d */ |
| 0x007e,/* 0x7e */ |
| 0x007f,/* 0x7f */ |
| 0x0080,/* 0x80 */ |
| 0x0081,/* 0x81 */ |
| 0x0082,/* 0x82 */ |
| 0x0083,/* 0x83 */ |
| 0x0084,/* 0x84 */ |
| 0x0085,/* 0x85 */ |
| 0x0086,/* 0x86 */ |
| 0x0087,/* 0x87 */ |
| 0x0088,/* 0x88 */ |
| 0x0089,/* 0x89 */ |
| 0x008a,/* 0x8a */ |
| 0x008b,/* 0x8b */ |
| 0x008c,/* 0x8c */ |
| 0x008d,/* 0x8d */ |
| 0x008e,/* 0x8e */ |
| 0x008f,/* 0x8f */ |
| 0x0090,/* 0x90 */ |
| 0x0091,/* 0x91 */ |
| 0x0092,/* 0x92 */ |
| 0x0093,/* 0x93 */ |
| 0x0094,/* 0x94 */ |
| 0x0095,/* 0x95 */ |
| 0x0096,/* 0x96 */ |
| 0x0097,/* 0x97 */ |
| 0x0098,/* 0x98 */ |
| 0x0099,/* 0x99 */ |
| 0x009a,/* 0x9a */ |
| 0x009b,/* 0x9b */ |
| 0x009c,/* 0x9c */ |
| 0x009d,/* 0x9d */ |
| 0x009e,/* 0x9e */ |
| 0x009f,/* 0x9f */ |
| 0x00A0,/* 0xa0 */ |
| 0x0901,/* 0xa1 */ |
| 0x0902,/* 0xa2 */ |
| 0x0903,/* 0xa3 */ |
| 0x0905,/* 0xa4 */ |
| 0x0906,/* 0xa5 */ |
| 0x0907,/* 0xa6 */ |
| 0x0908,/* 0xa7 */ |
| 0x0909,/* 0xa8 */ |
| 0x090a,/* 0xa9 */ |
| 0x090b,/* 0xaa */ |
| 0x090e,/* 0xab */ |
| 0x090f,/* 0xac */ |
| 0x0910,/* 0xad */ |
| 0x090d,/* 0xae */ |
| 0x0912,/* 0xaf */ |
| 0x0913,/* 0xb0 */ |
| 0x0914,/* 0xb1 */ |
| 0x0911,/* 0xb2 */ |
| 0x0915,/* 0xb3 */ |
| 0x0916,/* 0xb4 */ |
| 0x0917,/* 0xb5 */ |
| 0x0918,/* 0xb6 */ |
| 0x0919,/* 0xb7 */ |
| 0x091a,/* 0xb8 */ |
| 0x091b,/* 0xb9 */ |
| 0x091c,/* 0xba */ |
| 0x091d,/* 0xbb */ |
| 0x091e,/* 0xbc */ |
| 0x091f,/* 0xbd */ |
| 0x0920,/* 0xbe */ |
| 0x0921,/* 0xbf */ |
| 0x0922,/* 0xc0 */ |
| 0x0923,/* 0xc1 */ |
| 0x0924,/* 0xc2 */ |
| 0x0925,/* 0xc3 */ |
| 0x0926,/* 0xc4 */ |
| 0x0927,/* 0xc5 */ |
| 0x0928,/* 0xc6 */ |
| 0x0929,/* 0xc7 */ |
| 0x092a,/* 0xc8 */ |
| 0x092b,/* 0xc9 */ |
| 0x092c,/* 0xca */ |
| 0x092d,/* 0xcb */ |
| 0x092e,/* 0xcc */ |
| 0x092f,/* 0xcd */ |
| 0x095f,/* 0xce */ |
| 0x0930,/* 0xcf */ |
| 0x0931,/* 0xd0 */ |
| 0x0932,/* 0xd1 */ |
| 0x0933,/* 0xd2 */ |
| 0x0934,/* 0xd3 */ |
| 0x0935,/* 0xd4 */ |
| 0x0936,/* 0xd5 */ |
| 0x0937,/* 0xd6 */ |
| 0x0938,/* 0xd7 */ |
| 0x0939,/* 0xd8 */ |
| 0x200D,/* 0xd9 */ |
| 0x093e,/* 0xda */ |
| 0x093f,/* 0xdb */ |
| 0x0940,/* 0xdc */ |
| 0x0941,/* 0xdd */ |
| 0x0942,/* 0xde */ |
| 0x0943,/* 0xdf */ |
| 0x0946,/* 0xe0 */ |
| 0x0947,/* 0xe1 */ |
| 0x0948,/* 0xe2 */ |
| 0x0945,/* 0xe3 */ |
| 0x094a,/* 0xe4 */ |
| 0x094b,/* 0xe5 */ |
| 0x094c,/* 0xe6 */ |
| 0x0949,/* 0xe7 */ |
| 0x094d,/* 0xe8 */ |
| 0x093c,/* 0xe9 */ |
| 0x0964,/* 0xea */ |
| 0xFFFF,/* 0xeb */ |
| 0xFFFF,/* 0xec */ |
| 0xFFFF,/* 0xed */ |
| 0xFFFF,/* 0xee */ |
| 0xFFFF,/* 0xef */ |
| 0xFFFF,/* 0xf0 */ |
| 0x0966,/* 0xf1 */ |
| 0x0967,/* 0xf2 */ |
| 0x0968,/* 0xf3 */ |
| 0x0969,/* 0xf4 */ |
| 0x096a,/* 0xf5 */ |
| 0x096b,/* 0xf6 */ |
| 0x096c,/* 0xf7 */ |
| 0x096d,/* 0xf8 */ |
| 0x096e,/* 0xf9 */ |
| 0x096f,/* 0xfa */ |
| 0xFFFF,/* 0xfb */ |
| 0xFFFF,/* 0xfc */ |
| 0xFFFF,/* 0xfd */ |
| 0xFFFF,/* 0xfe */ |
| 0xFFFF /* 0xff */ |
| }; |
| |
| static const uint16_t vowelSignESpecialCases[][2]={ |
| { 2 /*length of array*/ , 0 }, |
| { 0xA4 , 0x0904 }, |
| }; |
| |
| static const uint16_t nuktaSpecialCases[][2]={ |
| { 16 /*length of array*/ , 0 }, |
| { 0xA6 , 0x090c }, |
| { 0xEA , 0x093D }, |
| { 0xDF , 0x0944 }, |
| { 0xA1 , 0x0950 }, |
| { 0xb3 , 0x0958 }, |
| { 0xb4 , 0x0959 }, |
| { 0xb5 , 0x095a }, |
| { 0xba , 0x095b }, |
| { 0xbf , 0x095c }, |
| { 0xC0 , 0x095d }, |
| { 0xc9 , 0x095e }, |
| { 0xAA , 0x0960 }, |
| { 0xA7 , 0x0961 }, |
| { 0xDB , 0x0962 }, |
| { 0xDC , 0x0963 }, |
| }; |
| |
| |
/*
 * WRITE_TO_TARGET_FROM_U: append the bytes held in targetByteUnit to target,
 * most significant byte first.
 *  - values <= 0xFF produce one byte, values <= 0xFFFF two bytes, larger values three
 *  - when offsets is non-NULL, one offset entry is stored per byte written
 *  - bytes that no longer fit below targetLimit are appended to
 *    args->converter->charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR
 * target and offsets are advanced in place. The parameters are expanded
 * unparenthesized and more than once, so pass plain variables without side effects.
 * NOTE(review): when the target is already full on entry, the bytes to save are chosen
 * by testing (targetByteUnit & 0xFF0000) and (targetByteUnit & 0xFF00), so a zero
 * middle byte would be skipped there although the normal path would emit it --
 * presumably such values never occur; confirm.
 */
| #define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err) UPRV_BLOCK_MACRO_BEGIN { \ |
| int32_t offset = (int32_t)(source - args->source-1); \ |
| /* write the targetUniChar to target; offset is the index just before the current source position */ \ |
| if(target < targetLimit){ \ |
| if(targetByteUnit <= 0xFF){ \ |
| *(target)++ = (uint8_t)(targetByteUnit); \ |
| if(offsets){ \ |
| *(offsets++) = offset; \ |
| } \ |
| }else{ \ |
| if (targetByteUnit > 0xFFFF) { \ |
| *(target)++ = (uint8_t)(targetByteUnit>>16); \ |
| if (offsets) { \ |
| --offset; \ |
| *(offsets++) = offset; \ |
| } \ |
| } \ |
| if (!(target < targetLimit)) { \ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ |
| (uint8_t)(targetByteUnit >> 8); \ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ |
| (uint8_t)targetByteUnit; \ |
| *err = U_BUFFER_OVERFLOW_ERROR; \ |
| } else { \ |
| *(target)++ = (uint8_t)(targetByteUnit>>8); \ |
| if(offsets){ \ |
| *(offsets++) = offset; \ |
| } \ |
| if(target < targetLimit){ \ |
| *(target)++ = (uint8_t) targetByteUnit; \ |
| if(offsets){ \ |
| *(offsets++) = offset ; \ |
| } \ |
| }else{ \ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =\ |
| (uint8_t) (targetByteUnit); \ |
| *err = U_BUFFER_OVERFLOW_ERROR; \ |
| } \ |
| } \ |
| } \ |
| }else{ \ |
| if (targetByteUnit & 0xFF0000) { \ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ |
| (uint8_t) (targetByteUnit >>16); \ |
| } \ |
| if(targetByteUnit & 0xFF00){ \ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ |
| (uint8_t) (targetByteUnit >>8); \ |
| } \ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ |
| (uint8_t) (targetByteUnit); \ |
| *err = U_BUFFER_OVERFLOW_ERROR; \ |
| } \ |
| } UPRV_BLOCK_MACRO_END |
| |
| /* Rules: |
| * Explicit Halant : |
| * <HALANT> + <ZWNJ> |
| * Soft Halant : |
| * <HALANT> + <ZWJ> |
| * |
| * Converts the UTF-16 text in [args->source, args->sourceLimit) to ISCII bytes written at |
| * args->target, and stores the advanced source and target pointers back into args on return. |
| * |
| * What the loop does for each code unit: |
| * - values up to ASCII_END are written unchanged and remembered in fromUnicodeStatus; |
| * - if the previously written value was LF, ATR plus the current script's language code |
| *   is written before the next character; |
| * - ZWNJ after a halant is written as a second ISCII_HALANT, ZWNJ elsewhere is dropped; |
| * - ZWJ after a halant is written as ISCII_NUKTA, ZWJ elsewhere as ISCII_INV; |
| * - a code point inside the Indic blocks is reduced by its block delta and looked up in |
| *   fromUnicodeTable; when the block differs from the previous one (or this is the first |
| *   buffer) ATR plus the language code is written first; |
| * - in Gurmukhi, Tippi is replaced by Bindi, and Adhak followed by a consonant is written |
| *   as consonant + ISCII_HALANT + consonant. |
| * |
| * Errors reported through *err: |
| * - U_ILLEGAL_ARGUMENT_ERROR when the converter is NULL or a limit precedes its start pointer; |
| * - U_BUFFER_OVERFLOW_ERROR (set by WRITE_TO_TARGET_FROM_U) when the target is full; |
| * - U_INVALID_CHAR_FOUND for an unmappable code point and U_ILLEGAL_CHAR_FOUND for an |
| *   unpaired surrogate; in both cases the code point is left in converter->fromUChar32. |
| */ |
| static void U_CALLCONV |
| UConverter_fromUnicode_ISCII_OFFSETS_LOGIC( |
| UConverterFromUnicodeArgs * args, UErrorCode * err) { |
| const UChar *source = args->source; |
| const UChar *sourceLimit = args->sourceLimit; |
| unsigned char *target = (unsigned char *) args->target; |
| unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
| int32_t* offsets = args->offsets; |
| uint32_t targetByteUnit = 0x0000; |
| UChar32 sourceChar = 0x0000; |
| UChar32 tempContextFromUnicode = 0x0000; /* For special handling of the Gurmukhi script. */ |
| UConverterDataISCII *converterData; |
| uint16_t newDelta=0; |
| uint16_t range = 0; |
| UBool deltaChanged = FALSE; |
| |
| if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) { |
| *err = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| /* initialize data: resume with the script block that was current when the previous call ended */ |
| converterData=(UConverterDataISCII*)args->converter->extraInfo; |
| newDelta=converterData->currentDeltaFromUnicode; |
| range = (uint16_t)(newDelta/DELTA); |
| |
| /* a code point saved by a previous call (see the end of the loop body) is resumed at getTrail */ |
| if ((sourceChar = args->converter->fromUChar32)!=0) { |
| goto getTrail; |
| } |
| |
| /*writing the char to the output stream */ |
| while (source < sourceLimit) { |
| /* Write the language code following LF only if LF is not the last character. */ |
| if (args->converter->fromUnicodeStatus == LF) { |
| targetByteUnit = ATR<<8; |
| targetByteUnit += (uint8_t) lookupInitialData[range].isciiLang; |
| args->converter->fromUnicodeStatus = 0x0000; |
| /* now append ATR and language code */ |
| WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err); |
| if (U_FAILURE(*err)) { |
| break; |
| } |
| } |
| |
| sourceChar = *source++; |
| tempContextFromUnicode = converterData->contextCharFromUnicode; /* context as it was before this character */ |
| |
| targetByteUnit = missingCharMarker; |
| |
| /*check if input is in ASCII and C0 control codes range*/ |
| if (sourceChar <= ASCII_END) { |
| args->converter->fromUnicodeStatus = sourceChar; |
| WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,sourceChar,err); |
| if (U_FAILURE(*err)) { |
| break; |
| } |
| continue; |
| } |
| switch (sourceChar) { |
| case ZWNJ: |
| /* contextChar has HALANT: explicit halant, written as a second halant */ |
| if (converterData->contextCharFromUnicode) { |
| converterData->contextCharFromUnicode = 0x00; |
| targetByteUnit = ISCII_HALANT; |
| } else { |
| /* consume ZWNJ and continue */ |
| converterData->contextCharFromUnicode = 0x00; |
| continue; |
| } |
| break; |
| case ZWJ: |
| /* contextChar has HALANT: soft halant, written as halant + nukta */ |
| if (converterData->contextCharFromUnicode) { |
| targetByteUnit = ISCII_NUKTA; |
| } else { |
| targetByteUnit =ISCII_INV; |
| } |
| converterData->contextCharFromUnicode = 0x00; |
| break; |
| default: |
| /* is the sourceChar in the INDIC_RANGE? */ |
| if ((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE) { |
| /* Danda and Double Danda are valid in Northern scripts.. since Unicode |
| * does not include these codepoints in all Northern scripts we need to |
| * filter them out |
| */ |
| if (sourceChar!= DANDA && sourceChar != DOUBLE_DANDA) { |
| /* find out to which block the sourceChar belongs*/ |
| range =(uint16_t)((sourceChar-INDIC_BLOCK_BEGIN)/DELTA); |
| newDelta =(uint16_t)(range*DELTA); |
| |
| /* Now are we in the same block as the previous? */ |
| if (newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer) { |
| converterData->currentDeltaFromUnicode = newDelta; |
| converterData->currentMaskFromUnicode = lookupInitialData[range].maskEnum; |
| deltaChanged =TRUE; |
| converterData->isFirstBuffer=FALSE; |
| } |
| |
| if (converterData->currentDeltaFromUnicode == PNJ_DELTA) { |
| if (sourceChar == PNJ_TIPPI) { |
| /* Make sure Tippi is converted to Bindi. */ |
| sourceChar = PNJ_BINDI; |
| } else if (sourceChar == PNJ_ADHAK) { |
| /* This is for consonant cluster handling. */ |
| converterData->contextCharFromUnicode = PNJ_ADHAK; |
| } |
| |
| } |
| /* Normalize all Indic codepoints to Devanagari and map them to ISCII */ |
| /* now subtract the new delta from sourceChar*/ |
| sourceChar -= converterData->currentDeltaFromUnicode; |
| } |
| |
| /* get the target byte unit; the tables are indexed by the low byte of the code point */ |
| targetByteUnit=fromUnicodeTable[(uint8_t)sourceChar]; |
| |
| /* is the code point valid in current script? */ |
| if ((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0) { |
| /* Vocallic RR is assigned in ISCII Telugu and Unicode */ |
| if (converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) || sourceChar!=VOCALLIC_RR) { |
| targetByteUnit=missingCharMarker; |
| } |
| } |
| |
| if (deltaChanged) { |
| /* we are in a script block which is different than |
| * previous sourceChar's script block write ATR and language codes |
| */ |
| uint32_t temp=0; |
| temp =(uint16_t)(ATR<<8); |
| temp += (uint16_t)((uint8_t) lookupInitialData[range].isciiLang); |
| /* reset */ |
| deltaChanged=FALSE; |
| /* now append ATR and language code */ |
| WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,temp,err); |
| if (U_FAILURE(*err)) { |
| break; |
| } |
| } |
| |
| /* Adhak itself produces no output; it stays in contextCharFromUnicode for the next character */ |
| if (converterData->currentDeltaFromUnicode == PNJ_DELTA && (sourceChar + PNJ_DELTA) == PNJ_ADHAK) { |
| continue; |
| } |
| } |
| /* reset context char */ |
| converterData->contextCharFromUnicode = 0x00; |
| break; |
| } |
| if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) { |
| /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */ |
| /* reset context char */ |
| converterData->contextCharFromUnicode = 0x0000; |
| targetByteUnit = targetByteUnit << 16 | ISCII_HALANT << 8 | targetByteUnit; |
| /* write targetByteUnit to target */ |
| WRITE_TO_TARGET_FROM_U(args, offsets, source, target, targetLimit, targetByteUnit,err); |
| if (U_FAILURE(*err)) { |
| break; |
| } |
| } else if (targetByteUnit != missingCharMarker) { |
| /* remember a halant so that a following ZWNJ/ZWJ can be recognized */ |
| if (targetByteUnit==ISCII_HALANT) { |
| converterData->contextCharFromUnicode = (UChar)targetByteUnit; |
| } |
| /* write targetByteUnit to target*/ |
| WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err); |
| if (U_FAILURE(*err)) { |
| break; |
| } |
| } else { |
| /* oops.. the code point is unassigned */ |
| /*check if the char is a First surrogate*/ |
| if (U16_IS_SURROGATE(sourceChar)) { |
| if (U16_IS_SURROGATE_LEAD(sourceChar)) { |
| getTrail: |
| /*look ahead to find the trail surrogate*/ |
| if (source < sourceLimit) { |
| /* test the following code unit */ |
| UChar trail= (*source); |
| if (U16_IS_TRAIL(trail)) { |
| source++; |
| sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
| *err =U_INVALID_CHAR_FOUND; |
| /* convert this surrogate code point */ |
| /* exit this condition tree */ |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| *err=U_ILLEGAL_CHAR_FOUND; |
| } |
| } else { |
| /* no more input: the lead surrogate is kept in fromUChar32 below for the next call */ |
| *err = U_ZERO_ERROR; |
| } |
| } else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| *err=U_ILLEGAL_CHAR_FOUND; |
| } |
| } else { |
| /* callback(unassigned) for a BMP code point */ |
| *err = U_INVALID_CHAR_FOUND; |
| } |
| |
| args->converter->fromUChar32=sourceChar; |
| break; |
| } |
| }/* end while (source < sourceLimit) */ |
| |
| /*save the state and return */ |
| args->source = source; |
| args->target = (char*)target; |
| } |
| |
| /* |
|  * Script table used by the ISCII-to-Unicode converter when a script-selecting byte follows ATR. |
|  * The row is chosen by the low nibble of that byte. Column 0 is multiplied by DELTA to obtain |
|  * the Unicode block delta; column 1 is the validity mask (MaskEnum) of the script. |
|  */ |
| static const uint16_t lookupTable[][2] = { |
|     { ZERO,       ZERO     },   /*  0: DEFAULT */ |
|     { ZERO,       ZERO     },   /*  1: ROMAN */ |
|     { DEVANAGARI, DEV_MASK },   /*  2 */ |
|     { BENGALI,    BNG_MASK },   /*  3 */ |
|     { TAMIL,      TML_MASK },   /*  4 */ |
|     { TELUGU,     KND_MASK },   /*  5 */ |
|     { BENGALI,    BNG_MASK },   /*  6 */ |
|     { ORIYA,      ORI_MASK },   /*  7 */ |
|     { KANNADA,    KND_MASK },   /*  8 */ |
|     { MALAYALAM,  MLM_MASK },   /*  9 */ |
|     { GUJARATI,   GJR_MASK },   /* 10 */ |
|     { GURMUKHI,   PNJ_MASK }    /* 11 */ |
| }; |
| |
| /* |
|  * Writes one UTF-16 code unit to target during ISCII-to-Unicode conversion. |
|  * targetUniChar is first shifted into the current script block by adding delta, unless it is |
|  * at or below ASCII_END or is one of ZWJ, ZWNJ, DANDA, DOUBLE_DANDA. The shift is applied to |
|  * the argument itself, so targetUniChar must be a modifiable lvalue. |
|  * If target has reached args->targetLimit the code unit is queued in |
|  * args->converter->UCharErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR. |
|  * When offsets is non-NULL, offset is recorded for a code unit written to target. |
|  * The source parameter is not used by the expansion. |
|  * Every parameter is parenthesized in the expansion; arguments are evaluated more than once |
|  * and must not have side effects. |
|  */ |
| #define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err) UPRV_BLOCK_MACRO_BEGIN { \ |
|     /* add offset to current Indic Block */ \ |
|     if((targetUniChar)>ASCII_END && \ |
|        (targetUniChar) != ZWJ && \ |
|        (targetUniChar) != ZWNJ && \ |
|        (targetUniChar) != DANDA && \ |
|        (targetUniChar) != DOUBLE_DANDA){ \ |
|         \ |
|         (targetUniChar)+=(uint16_t)(delta); \ |
|     } \ |
|     /* now write the targetUniChar */ \ |
|     if((target)<(args)->targetLimit){ \ |
|         *(target)++ = (UChar)(targetUniChar); \ |
|         if(offsets){ \ |
|             *(offsets)++ = (int32_t)(offset); \ |
|         } \ |
|     }else{ \ |
|         (args)->converter->UCharErrorBuffer[(args)->converter->UCharErrorBufferLength++] = \ |
|             (UChar)(targetUniChar); \ |
|         *(err) = U_BUFFER_OVERFLOW_ERROR; \ |
|     } \ |
| } UPRV_BLOCK_MACRO_END |
| |
| /* |
|  * Looks up the Unicode mapping of the ISCII byte sourceChar in toUnicodeTable and stores it |
|  * in targetUniChar (which must be a modifiable lvalue). |
|  * For bytes above ASCII_END the result is replaced by missingCharMarker when validityTable |
|  * says the code point is not valid under data->currentMaskToUnicode, except for VOCALLIC_RR |
|  * while the current delta is TELUGU_DELTA. |
|  * Every parameter is parenthesized in the expansion; arguments are evaluated more than once |
|  * and must not have side effects. |
|  */ |
| #define GET_MAPPING(sourceChar,targetUniChar,data) UPRV_BLOCK_MACRO_BEGIN { \ |
|     (targetUniChar) = toUnicodeTable[(sourceChar)] ; \ |
|     /* is the code point valid in current script? */ \ |
|     if((sourceChar)> ASCII_END && \ |
|        (validityTable[((targetUniChar) & 0x7F)] & (data)->currentMaskToUnicode)==0){ \ |
|         /* Vocallic RR is assigned in ISCII Telugu and Unicode */ \ |
|         if((data)->currentDeltaToUnicode!=(TELUGU_DELTA) || \ |
|            (targetUniChar)!=VOCALLIC_RR){ \ |
|             (targetUniChar)=missingCharMarker; \ |
|         } \ |
|     } \ |
| } UPRV_BLOCK_MACRO_END |
| |
| /*********** |
| * Rules for ISCII to Unicode converter |
| * ISCII is a stateful encoding. To convert ISCII bytes to Unicode, |
| * which has both precomposed and decomposed forms of characters, |
| * pre-context and post-context need to be considered. |
| * |
| * Post context |
| * i) ATR : Attribute code is used to declare the font and script switching. |
| * Currently we only switch scripts; font codes are consumed without generating an error |
| * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure, |
| * obsolete characters |
| * Pre context |
| * i) Halant: if preceded by a halant then it is an explicit halant |
| * ii) Nukta : |
| * a) if preceded by a halant then it is a soft halant |
| * b) if preceded by specific consonants and the ligatures have pre-composed |
| * characters in Unicode then convert to pre-composed characters |
| * iii) Danda: If Danda is preceded by a Danda then convert to Double Danda |
| * |
| * Implementation notes |
| * - Converts the ISCII bytes in [args->source, args->sourceLimit) to UTF-16 written at |
| *   args->target, and stores the advanced source and target pointers back into args on return. |
| * - Output is delayed by one character: a mapped code point is parked in |
| *   converter->toUnicodeStatus and written only after the next byte has been examined, |
| *   or at the end of input when args->flush is set. |
| * - For Gurmukhi a consonant followed by a virama is additionally parked in |
| *   data->prevToUnicodeStatus so that C + HALANT + C can be written as ADHAK + C. |
| * - Errors reported through *err: U_ILLEGAL_ARGUMENT_ERROR for a NULL converter or a limit |
| *   that precedes its start pointer; U_BUFFER_OVERFLOW_ERROR when the target is full; |
| *   U_INVALID_CHAR_FOUND / U_ILLEGAL_CHAR_FOUND for an unassigned / illegal byte, which is |
| *   then left in converter->toUBytes[0]. |
| */ |
| |
| static void U_CALLCONV |
| UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err) { |
| const char *source = ( char *) args->source; |
| UChar *target = args->target; |
| const char *sourceLimit = args->sourceLimit; |
| const UChar* targetLimit = args->targetLimit; |
| uint32_t targetUniChar = 0x0000; |
| uint8_t sourceChar = 0x0000; |
| UConverterDataISCII* data; |
| UChar32* toUnicodeStatus=NULL; |
| UChar32 tempTargetUniChar = 0x0000; |
| UChar* contextCharToUnicode= NULL; |
| UBool found; |
| int i; |
| int offset = 0; |
| |
| /* Reject a missing converter or inverted ranges. The limits are compared against the start |
| * pointers (as the fromUnicode function does); comparing the local copies with the values |
| * they were just initialized from could never fail. */ |
| if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) { |
| *err = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| data = (UConverterDataISCII*)(args->converter->extraInfo); |
| contextCharToUnicode = &data->contextCharToUnicode; /* contains previous ISCII codepoint visited */ |
| toUnicodeStatus = (UChar32*)&args->converter->toUnicodeStatus;/* contains the mapping to Unicode of the above codepoint*/ |
| |
| while (U_SUCCESS(*err) && source<sourceLimit) { |
| |
| targetUniChar = missingCharMarker; |
| |
| if (target < targetLimit) { |
| sourceChar = (unsigned char)*(source)++; |
| |
| /* look at the post-context and perform special processing */ |
| if (*contextCharToUnicode==ATR) { |
| |
| /* If we have ATR in *contextCharToUnicode then we need to change our |
| * state to the Indic Script specified by sourceChar |
| */ |
| |
| /* check if the sourceChar is supported script range*/ |
| if ((uint8_t)(PNJ-sourceChar)<=PNJ-DEV) { |
| data->currentDeltaToUnicode = (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA); |
| data->currentMaskToUnicode = (MaskEnum)lookupTable[sourceChar & 0x0F][1]; |
| } else if (sourceChar==DEF) { |
| /* switch back to default */ |
| data->currentDeltaToUnicode = data->defDeltaToUnicode; |
| data->currentMaskToUnicode = data->defMaskToUnicode; |
| } else { |
| if ((sourceChar >= 0x21 && sourceChar <= 0x3F)) { |
| /* these are display codes consume and continue */ |
| } else { |
| *err =U_ILLEGAL_CHAR_FOUND; |
| /* reset */ |
| *contextCharToUnicode=NO_CHAR_MARKER; |
| goto CALLBACK; |
| } |
| } |
| |
| /* reset */ |
| *contextCharToUnicode=NO_CHAR_MARKER; |
| |
| continue; |
| |
| } else if (*contextCharToUnicode==EXT) { |
| /* check if sourceChar is in 0xA1-0xEE range */ |
| if ((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)) { |
| /* We currently support only Anudatta and Devanagari abbreviation sign */ |
| if (sourceChar==0xBF || sourceChar == 0xB8) { |
| targetUniChar = (sourceChar==0xBF) ? DEV_ABBR_SIGN : DEV_ANUDATTA; |
| |
| /* find out if the mapping is valid in this state */ |
| if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { |
| *contextCharToUnicode= NO_CHAR_MARKER; |
| |
| /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ |
| if (data->prevToUnicodeStatus) { |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); |
| data->prevToUnicodeStatus = 0x0000; |
| } |
| /* write to target */ |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); |
| |
| continue; |
| } |
| } |
| /* byte unit is unassigned */ |
| targetUniChar = missingCharMarker; |
| *err= U_INVALID_CHAR_FOUND; |
| } else { |
| /* only 0xA1 - 0xEE are legal after EXT char */ |
| *contextCharToUnicode= NO_CHAR_MARKER; |
| *err = U_ILLEGAL_CHAR_FOUND; |
| } |
| goto CALLBACK; |
| } else if (*contextCharToUnicode==ISCII_INV) { |
| if (sourceChar==ISCII_HALANT) { |
| targetUniChar = 0x0020; /* replace with space according to Indic FAQ */ |
| } else { |
| targetUniChar = ZWJ; |
| } |
| |
| /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ |
| if (data->prevToUnicodeStatus) { |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); |
| data->prevToUnicodeStatus = 0x0000; |
| } |
| /* write to target */ |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); |
| /* reset */ |
| *contextCharToUnicode=NO_CHAR_MARKER; |
| } |
| |
| /* look at the pre-context and perform special processing */ |
| switch (sourceChar) { |
| case ISCII_INV: |
| case EXT: |
| case ATR: |
| /* these bytes only set up context for the byte that follows; flush what is pending */ |
| *contextCharToUnicode = (UChar)sourceChar; |
| |
| if (*toUnicodeStatus != missingCharMarker) { |
| /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ |
| if (data->prevToUnicodeStatus) { |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); |
| data->prevToUnicodeStatus = 0x0000; |
| } |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err); |
| *toUnicodeStatus = missingCharMarker; |
| } |
| continue; |
| case ISCII_DANDA: |
| /* handle double danda*/ |
| if (*contextCharToUnicode== ISCII_DANDA) { |
| targetUniChar = DOUBLE_DANDA; |
| /* clear the context */ |
| *contextCharToUnicode = NO_CHAR_MARKER; |
| *toUnicodeStatus = missingCharMarker; |
| } else { |
| GET_MAPPING(sourceChar,targetUniChar,data); |
| *contextCharToUnicode = sourceChar; |
| } |
| break; |
| case ISCII_HALANT: |
| /* handle explicit halant */ |
| if (*contextCharToUnicode == ISCII_HALANT) { |
| targetUniChar = ZWNJ; |
| /* clear the context */ |
| *contextCharToUnicode = NO_CHAR_MARKER; |
| } else { |
| GET_MAPPING(sourceChar,targetUniChar,data); |
| *contextCharToUnicode = sourceChar; |
| } |
| break; |
| case 0x0A: |
| case 0x0D: |
| /* a line break switches back to the default script once it has been mapped (see below) */ |
| data->resetToDefaultToUnicode = TRUE; |
| GET_MAPPING(sourceChar,targetUniChar,data) |
| ; |
| *contextCharToUnicode = sourceChar; |
| break; |
| |
| case ISCII_VOWEL_SIGN_E: |
| i=1; |
| found=FALSE; |
| for (; i<vowelSignESpecialCases[0][0]; i++) { |
| U_ASSERT(i<UPRV_LENGTHOF(vowelSignESpecialCases)); |
| if (vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode) { |
| targetUniChar=vowelSignESpecialCases[i][1]; |
| found=TRUE; |
| break; |
| } |
| } |
| if (found) { |
| /* find out if the mapping is valid in this state */ |
| if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { |
| /*targetUniChar += data->currentDeltaToUnicode ;*/ |
| *contextCharToUnicode= NO_CHAR_MARKER; |
| *toUnicodeStatus = missingCharMarker; |
| break; |
| } |
| } |
| GET_MAPPING(sourceChar,targetUniChar,data); |
| *contextCharToUnicode = sourceChar; |
| break; |
| |
| case ISCII_NUKTA: |
| /* handle soft halant */ |
| if (*contextCharToUnicode == ISCII_HALANT) { |
| targetUniChar = ZWJ; |
| /* clear the context */ |
| *contextCharToUnicode = NO_CHAR_MARKER; |
| break; |
| } else if (data->currentDeltaToUnicode == PNJ_DELTA && data->contextCharToUnicode == 0xc0) { |
| /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ |
| if (data->prevToUnicodeStatus) { |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); |
| data->prevToUnicodeStatus = 0x0000; |
| } |
| /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi. |
| * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39). |
| */ |
| targetUniChar = PNJ_RRA; |
| WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); |
| if (U_SUCCESS(*err)) { |
| targetUniChar = PNJ_SIGN_VIRAMA; |
| WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); |
| if (U_SUCCESS(*err)) { |
| targetUniChar = PNJ_HA; |
| WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); |
| } else { |
| /* the virama already overflowed into the error buffer: queue the last unit behind it */ |
| args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA; |
| } |
| } else { |
| /* the first unit already overflowed into the error buffer: queue the other two behind it */ |
| args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_SIGN_VIRAMA; |
| args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA; |
| } |
| *toUnicodeStatus = missingCharMarker; |
| data->contextCharToUnicode = NO_CHAR_MARKER; |
| continue; |
| } else { |
| /* try to handle <CHAR> + ISCII_NUKTA special mappings */ |
| i=1; |
| found =FALSE; |
| for (; i<nuktaSpecialCases[0][0]; i++) { |
| if (nuktaSpecialCases[i][0]==(uint8_t) |
| *contextCharToUnicode) { |
| targetUniChar=nuktaSpecialCases[i][1]; |
| found =TRUE; |
| break; |
| } |
| } |
| if (found) { |
| /* find out if the mapping is valid in this state */ |
| if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { |
| /*targetUniChar += data->currentDeltaToUnicode ;*/ |
| *contextCharToUnicode= NO_CHAR_MARKER; |
| *toUnicodeStatus = missingCharMarker; |
| if (data->currentDeltaToUnicode == PNJ_DELTA) { |
| /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ |
| if (data->prevToUnicodeStatus) { |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); |
| data->prevToUnicodeStatus = 0x0000; |
| } |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); |
| continue; |
| } |
| break; |
| } |
| /* else fall through to default */ |
| } |
| /* else fall through to default */ |
| U_FALLTHROUGH; |
| } |
| default:GET_MAPPING(sourceChar,targetUniChar,data) |
| ; |
| *contextCharToUnicode = sourceChar; |
| break; |
| } |
| |
| if (*toUnicodeStatus != missingCharMarker) { |
| /* Check to make sure that consonant clusters are handled correct for Gurmukhi script. */ |
| if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) && |
| (*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && ((UChar32)(targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus)) { |
| /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */ |
| offset = (int)(source-args->source - 3); |
| tempTargetUniChar = PNJ_ADHAK; /* This is necessary to avoid some compiler warnings. */ |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,tempTargetUniChar,0,err); |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,data->prevToUnicodeStatus,0,err); |
| data->prevToUnicodeStatus = 0x0000; /* reset the previous unicode code point */ |
| *toUnicodeStatus = missingCharMarker; |
| continue; |
| } else { |
| /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ |
| if (data->prevToUnicodeStatus) { |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); |
| data->prevToUnicodeStatus = 0x0000; |
| } |
| /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script. |
| * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi. |
| */ |
| if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && isPNJBindiTippi((*toUnicodeStatus + PNJ_DELTA))) { |
| targetUniChar = PNJ_TIPPI - PNJ_DELTA; |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,PNJ_DELTA,err); |
| } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && isPNJConsonant((*toUnicodeStatus + PNJ_DELTA))) { |
| /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */ |
| data->prevToUnicodeStatus = *toUnicodeStatus + PNJ_DELTA; |
| } else { |
| /* write the previously mapped codepoint */ |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err); |
| } |
| } |
| *toUnicodeStatus = missingCharMarker; |
| } |
| |
| if (targetUniChar != missingCharMarker) { |
| /* now save the targetUniChar for delayed write */ |
| *toUnicodeStatus = (UChar) targetUniChar; |
| if (data->resetToDefaultToUnicode==TRUE) { |
| data->currentDeltaToUnicode = data->defDeltaToUnicode; |
| data->currentMaskToUnicode = data->defMaskToUnicode; |
| data->resetToDefaultToUnicode=FALSE; |
| } |
| } else { |
| |
| /* we reach here only if targetUniChar == missingCharMarker |
| * so assign codes to reason and err |
| */ |
| *err = U_INVALID_CHAR_FOUND; |
| CALLBACK: |
| /* hand the offending byte to the caller through toUBytes */ |
| args->converter->toUBytes[0] = (uint8_t) sourceChar; |
| args->converter->toULength = 1; |
| break; |
| } |
| |
| } else { |
| *err =U_BUFFER_OVERFLOW_ERROR; |
| break; |
| } |
| } |
| |
| if (U_SUCCESS(*err) && args->flush && source == sourceLimit) { |
| /* end of the input stream */ |
| UConverter *cnv = args->converter; |
| |
| if (*contextCharToUnicode==ATR || *contextCharToUnicode==EXT || *contextCharToUnicode==ISCII_INV) { |
| /* set toUBytes[] */ |
| cnv->toUBytes[0] = (uint8_t)*contextCharToUnicode; |
| cnv->toULength = 1; |
| |
| /* avoid looping on truncated sequences */ |
| *contextCharToUnicode = NO_CHAR_MARKER; |
| } else { |
| cnv->toULength = 0; |
| } |
| |
| if (*toUnicodeStatus != missingCharMarker) { |
| /* output a remaining target character */ |
| WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1),*toUnicodeStatus,data->currentDeltaToUnicode,err); |
| *toUnicodeStatus = missingCharMarker; |
| } |
| } |
| |
| args->target = target; |
| args->source = source; |
| } |
| |
| /* Structure for SafeClone calculations. |
| * _ISCII_SafeClone reports sizeof(struct cloneISCIIStruct) as the required buffer size when |
| * preflighting and views the caller's buffer through this type: it returns &cnv as the cloned |
| * converter and copies the source converter's extraInfo into mydata. */ |
| struct cloneISCIIStruct { |
| UConverter cnv; |
| UConverterDataISCII mydata; |
| }; |
| |
| /* |
|  * ISCII-specific part of cloning a converter into a caller-supplied buffer. |
|  * |
|  * cnv          converter being cloned; its extraInfo is copied |
|  * stackBuffer  buffer viewed as a struct cloneISCIIStruct |
|  * pBufferSize  in: 0 requests preflighting; out (preflighting only): required size in bytes |
|  * status       nothing is done when it already holds a failure |
|  * |
|  * Returns the clone's UConverter, or NULL on failure status or when preflighting. |
|  */ |
| static UConverter * U_CALLCONV |
| _ISCII_SafeClone(const UConverter *cnv, |
|                  void *stackBuffer, |
|                  int32_t *pBufferSize, |
|                  UErrorCode *status) |
| { |
|     struct cloneISCIIStruct *clone; |
| |
|     if (U_FAILURE(*status)) { |
|         return NULL; |
|     } |
| |
|     if (*pBufferSize == 0) { |
|         /* 'preflighting' request - only report the needed size */ |
|         *pBufferSize = (int32_t)sizeof(struct cloneISCIIStruct); |
|         return NULL; |
|     } |
| |
|     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
|     clone = (struct cloneISCIIStruct *)stackBuffer; |
| |
|     /* give the clone its own private copy of the ISCII state */ |
|     uprv_memcpy(&clone->mydata, cnv->extraInfo, sizeof(clone->mydata)); |
|     clone->cnv.extraInfo = &clone->mydata; |
|     clone->cnv.isExtraLocal = TRUE; |
| |
|     return &clone->cnv; |
| } |
| |
| /* |
|  * Adds to the set behind sa every code point this converter can round-trip: |
|  * 0..ASCII_END, every code point of the script blocks DEVANAGARI..MALAYALAM that |
|  * validityTable marks valid for that script, plus DANDA, DOUBLE_DANDA, ZWNJ and ZWJ. |
|  * cnv, which and pErrorCode are not used. |
|  */ |
| static void U_CALLCONV |
| _ISCIIGetUnicodeSet(const UConverter *cnv, |
|                     const USetAdder *sa, |
|                     UConverterUnicodeSet which, |
|                     UErrorCode *pErrorCode) |
| { |
|     int32_t script; |
| |
|     (void)cnv; |
|     (void)which; |
|     (void)pErrorCode; |
| |
|     /* Since all ISCII versions allow switching to other ISCII |
|        scripts, we add all roundtrippable characters to this set. */ |
|     sa->addRange(sa->set, 0, ASCII_END); |
| |
|     for (script = DEVANAGARI; script <= MALAYALAM; ++script) { |
|         const uint8_t scriptMask = (uint8_t)(lookupInitialData[script].maskEnum); |
|         const int32_t blockStart = (script * DELTA) + INDIC_BLOCK_BEGIN; |
|         int32_t cell; |
| |
|         for (cell = 0; cell < DELTA; ++cell) { |
|             /* position 0x31 is additionally accepted for TELUGU */ |
|             if ((validityTable[cell] & scriptMask) || (script == TELUGU && cell == 0x31)) { |
|                 sa->add(sa->set, cell + blockStart); |
|             } |
|         } |
|     } |
| |
|     /* code points that are shared by all scripts */ |
|     sa->add(sa->set, DANDA); |
|     sa->add(sa->set, DOUBLE_DANDA); |
|     sa->add(sa->set, ZWNJ); |
|     sa->add(sa->set, ZWJ); |
| } |
| U_CDECL_END |
| static const UConverterImpl _ISCIIImpl={ /* Implementation table that registers this file's ISCII callbacks. |
| * This is a positional initializer, so the order of the entries matters. |
| * NOTE(review): the slot names in the trailing comments are assumed from the UConverterImpl |
| * declaration, which is not visible in this file - confirm against the header. |
| */ |
| |
| UCNV_ISCII, /* type */ |
| |
| NULL, /* load */ |
| NULL, /* unload */ |
| |
| _ISCIIOpen, /* open */ |
| _ISCIIClose, /* close */ |
| _ISCIIReset, /* reset */ |
| |
| UConverter_toUnicode_ISCII_OFFSETS_LOGIC, /* toUnicode */ |
| UConverter_toUnicode_ISCII_OFFSETS_LOGIC, /* toUnicode with offsets: same function, it fills args->offsets only when non-NULL */ |
| UConverter_fromUnicode_ISCII_OFFSETS_LOGIC, /* fromUnicode */ |
| UConverter_fromUnicode_ISCII_OFFSETS_LOGIC, /* fromUnicode with offsets: same function, it fills args->offsets only when non-NULL */ |
| NULL, /* getNextUChar */ |
| |
| NULL, /* getStarters */ |
| _ISCIIgetName, /* getName */ |
| NULL, /* writeSub */ |
| _ISCII_SafeClone, /* safeClone */ |
| _ISCIIGetUnicodeSet, /* getUnicodeSet */ |
| NULL, /* toUTF8 */ |
| NULL /* fromUTF8 */ |
| }; |
| |
| static const UConverterStaticData _ISCIIStaticData={ /* Static description of the ISCII converter. |
| * This is a positional initializer, so the order of the entries matters. |
| * NOTE(review): the field names in the trailing comments are assumed from the |
| * UConverterStaticData declaration, which is not visible in this file - confirm against the header. |
| */ |
| sizeof(UConverterStaticData), /* structSize */ |
| "ISCII", /* name */ |
| 0, /* codepage */ |
| UCNV_IBM, /* platform */ |
| UCNV_ISCII, /* conversionType */ |
| 1, /* minBytesPerChar */ |
| 4, /* maxBytesPerChar */ |
| { 0x1a, 0, 0, 0 }, /* subChar */ |
| 0x1, /* subCharLen */ |
| FALSE, /* hasToUnicodeFallback */ |
| FALSE, /* hasFromUnicodeFallback */ |
| 0x0, /* unicodeMask */ |
| 0x0, /* subChar1 */ |
| { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */ |
| |
| }; |
| |
| const UConverterSharedData _ISCIIData= /* non-static shared-data object for ISCII, built from _ISCIIStaticData and _ISCIIImpl */ |
| UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISCIIStaticData, &_ISCIIImpl); |
| |
| #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |