| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ********************************************************************** |
| * Copyright (C) 2000-2016, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * file name: ucnv2022.cpp |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2000feb03 |
| * created by: Markus W. Scherer |
| * |
| * Change history: |
| * |
| * 06/29/2000 helena Major rewrite of the callback APIs. |
| * 08/08/2000 Ram Included support for ISO-2022-JP-2 |
| * Changed implementation of toUnicode |
| * function |
| * 08/21/2000 Ram Added support for ISO-2022-KR |
| * 08/29/2000 Ram Separated implementation of EBCDIC to |
| * ucnvebdc.c |
| * 09/20/2000 Ram Added support for ISO-2022-CN |
| * Added implementations for getNextUChar() |
| * for specific 2022 country variants. |
| * 10/31/2000 Ram Implemented offsets logic functions |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
| |
| #include "unicode/ucnv.h" |
| #include "unicode/uset.h" |
| #include "unicode/ucnv_err.h" |
| #include "unicode/ucnv_cb.h" |
| #include "unicode/utf16.h" |
| #include "ucnv_imp.h" |
| #include "ucnv_bld.h" |
| #include "ucnv_cnv.h" |
| #include "ucnvmbcs.h" |
| #include "cstring.h" |
| #include "cmemory.h" |
| #include "uassert.h" |
| |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| /* |
| * I am disabling the generic ISO-2022 converter after proposing to do so on |
| * the icu mailing list two days ago. |
| * |
| * Reasons: |
| * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of |
| * its designation sequences, single shifts with return to the previous state, |
| * switch-with-no-return to UTF-16BE or similar, etc. |
| * This is unlike the language-specific variants like ISO-2022-JP which |
| * require a much smaller repertoire of ISO-2022 features. |
| * These variants continue to be supported. |
| * 2. I believe that no one is really using the generic ISO-2022 converter |
| * but rather always one of the language-specific variants. |
| * Note that ICU's generic ISO-2022 converter has always output one escape |
| * sequence followed by UTF-8 for the whole stream. |
| * 3. Switching between subcharsets is extremely slow, because each time |
| * the previous converter is closed and a new one opened, |
| * without any kind of caching, least-recently-used list, etc. |
| * 4. The code is currently buggy, and given the above it does not seem |
| * reasonable to spend the time on maintenance. |
| * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. |
| * This means, for example, that when ISO-8859-7 is designated, the following |
| * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. |
| * The ICU ISO-2022 converter does not handle this - and has no information |
| * about which subconverter would have to be shifted vs. which is designed |
| * for 7-bit ISO-2022. |
| * |
| * Markus Scherer 2003-dec-03 |
| */ |
| #endif |
| |
#if !UCONFIG_ONLY_HTML_CONVERSION
/* The single byte 0x0F (Shift-In) as a NUL-terminated byte string. */
static const char SHIFT_IN_STR[] = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";
#endif

/* Byte values of selected C0 control characters and of SPACE. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/* First and last code point of the half-width Katakana range. */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
| |
/*
 * Native (GR) byte ranges of 94- and 96-character sets.
 *
 * A 94-character set uses native bytes A1..FE and is encoded in ISO 2022
 * as bytes 21..7E; a 96-character set uses native bytes A0..FF and is
 * encoded as bytes 20..7F. In both cases 0x80 is subtracted.
 * C1 control codes (native bytes 80..9F) must not be encoded as the
 * C0 control codes 00..1F.
 */
enum {
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_START = 0xa0,
    GR96_END   = 0xff
};
| |
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range check is performed on the value converted to uint32_t so that
 * a negative argument is rejected instead of being used as a shift count
 * (shifting by a negative amount is undefined behavior).
 * Evaluates to nonzero only for 0x0e, 0x0f and 0x1b.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(uint32_t)(c))&0x0800c000)!=0)
| |
/*
 * Charset/state identifiers for the ISO-2022-JP and ISO-2022-CN
 * implementations. The JP and CN groups deliberately reuse the same small
 * numeric values.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /* CN */
    /* these first constants must keep their values: they index myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The plane-specific values below appear in StateEnum and ISO2022State
     * variables; myConverterArray[] must still be indexed with CNS_11643.
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
| |
/* Does the StateEnum charset value cs denote a DBCS charset? */
#if !UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#else
#define IS_JP_DBCS(cs) (JISX208==(cs))
#endif

/* Charset mask: the value 1 shifted left by the charset number cs. */
#define CSM(cs) ((uint16_t)1<<(cs))
| |
| /* |
| * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence |
| * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x |
| * |
| * Note: The converter uses some leniency: |
| * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in |
| * all versions, not just JIS7 and JIS8. |
| * - ICU does not distinguish between different versions of JIS X 0208. |
| */ |
| #if UCONFIG_ONLY_HTML_CONVERSION |
| enum { MAX_JA_VERSION=0 }; |
| #else |
| enum { MAX_JA_VERSION=4 }; |
| #endif |
| static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ |
| CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), |
| CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
| CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
| CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) |
| #endif |
| }; |
| |
/* Kind of charset; used for UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
| |
/* Designation and shift state of one conversion direction. */
typedef struct ISO2022State {
    /* charset numbers designated to G0 (SI), G1 (SO), G2 (SS2), G3 (SS3) */
    int8_t cs[4];
    /* currently invoked set: 0..3 for G0..G3 */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
| |
/* Bits of the converter options that hold the requested variant version. */
#define UCNV_OPTIONS_VERSION_MASK   0xf
/* Number of slots in UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS    10
| |
| typedef struct{ |
| UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; |
| UConverter *currentConverter; |
| Cnv2022Type currentType; |
| ISO2022State toU2022State, fromU2022State; |
| uint32_t key; |
| uint32_t version; |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| UBool isFirstBuffer; |
| #endif |
| UBool isEmptySegment; |
| char name[30]; |
| char locale[3]; |
| }UConverterDataISO2022; |
| |
| /* Protos */ |
| /* ISO-2022 ----------------------------------------------------------------- */ |
| |
| /*Forward declaration */ |
| U_CFUNC void U_CALLCONV |
| ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, |
| UErrorCode * err); |
| U_CFUNC void U_CALLCONV |
| ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, |
| UErrorCode * err); |
| |
/* The escape character that starts every ISO 2022 escape sequence. */
#define ESC_2022 0x1B /*ESC*/

/* Classification of the bytes consumed so far while matching an escape sequence. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far a valid prefix of an ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* matches one escape sequence, but more characters might match another one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
| |
| /* |
| * The way these state transition arrays work is: |
| * ex : ESC$B is the sequence for JISX208 |
| * a) First Iteration: char is ESC |
| * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index |
| * int x = normalize_esq_chars_2022[27] which is equal to 1 |
| * ii) Search for this value in escSeqStateTable_Key_2022[] |
| * value of x is stored at escSeqStateTable_Key_2022[0] |
| * iii) Save this index as offset |
| * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
| * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
| * b) Switch on this state and continue to next char |
| * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index |
| * which is normalize_esq_chars_2022[36] == 4 |
| * ii) x is currently 1(from above) |
| * x<<=5 -- x is now 32 |
| * x+=normalize_esq_chars_2022[36] |
| * now x is 36 |
| * iii) Search for this value in escSeqStateTable_Key_2022[] |
| * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 |
| * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
| * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
| * c) Switch on this state and continue to next char |
| * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index |
| * ii) x is currently 36 (from above) |
| * x<<=5 -- x is now 1152 |
| * x+=normalize_esq_chars_2022[66] |
| * now x is 1161 |
| * iii) Search for this value in escSeqStateTable_Key_2022[] |
| * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 |
| * iv) Get state of this sequence from escSeqStateTable_Value_2022[24] |
| * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 |
| * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208 |
| */ |
| |
| |
/*
 * The tables below together describe the escape-sequence state machine.
 *
 * normalize_esq_chars_2022[] maps a byte value to a small nonzero code if
 * that byte can occur in a recognized escape sequence, and to 0 otherwise.
 * Each row holds ten entries; the comment gives the index of the first one.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /*   0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  20 */  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
    /*  30 */  0,  0,  0,  0,  0,  0,  4,  7, 29,  0,
    /*  40 */  2, 24, 26, 27,  0,  3, 23,  6,  0,  0,
    /*  50 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  60 */  0,  0,  0,  0,  5,  8,  9, 10, 11, 12,
    /*  70 */ 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /*  80 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0,
    /*  90 */ 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 100 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 110 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 120 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 130 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 140 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 150 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 160 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 170 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 180 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 190 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 200 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 210 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 220 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 230 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 240 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 250 */  0,  0,  0,  0,  0,  0
};
| |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| /* |
| * When the generic ISO-2022 converter is completely removed, not just disabled |
| * per #ifdef, then the following state table and the associated tables that are |
| * dimensioned with MAX_STATES_2022 should be trimmed. |
| * |
| * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of |
| * the associated escape sequences starting with ESC ( B should be removed. |
| * This includes the ones with key values 1097 and all of the ones above 1000000. |
| * |
| * For the latter, the tables can simply be truncated. |
| * For the former, since the tables must be kept parallel, it is probably best |
| * to simply duplicate an adjacent table cell, parallel in all tables. |
| * |
| * It may make sense to restructure the tables, especially by using small search |
| * tables for the variants instead of indexing them parallel to the table here. |
| */ |
| #endif |
| |
/* Number of entries in each of the parallel escape-sequence tables. */
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape-sequence prefixes, in ascending order so
 * that they can be binary-searched. A key is built by shifting the previous
 * key left by 5 bits and adding the normalized value of the next byte.
 * Each row holds ten entries; the comment gives the index of the first one.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,        34,       36,       39,       55,       57,       60,       61,       1093,     1096,
    /* 10 */ 1097,     1098,     1099,     1100,     1101,     1102,     1103,     1104,     1105,     1106,
    /* 20 */ 1109,     1154,     1157,     1160,     1161,     1176,     1178,     1179,     1254,     1257,
    /* 30 */ 1768,     1773,     1957,     35105,    36933,    36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,    36943,    36944,    36945,    36946,    36947,    36948,    37640,    37642,    37644,
    /* 50 */ 37646,    37711,    37744,    37745,    37746,    37747,    37748,    40133,    40136,    40138,
    /* 60 */ 40139,    40140,    40141,    1123363,  35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
| |
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each entry of escSeqStateTable_Key_2022[] (parallel
 * table), or NULL if there is none.
 * Each row holds ten entries; the comment gives the index of the first one.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "latin1", "latin1",
    /* 10 */ "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20 */ "latin1", NULL, "JISX-208", "ibm-5478", "JISX-208", NULL, NULL, NULL, NULL, "UTF8",
    /* 30 */ "ISO-8859-1", "ISO-8859-7", "JIS-X-208", NULL, "ibm-955", "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40 */ "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5", "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL, "latin1", "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70 */ "ibm-920", "ibm-915", "ibm-915", "latin1"
};

#endif
| |
| static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { |
| /* 0 1 2 3 4 5 6 7 8 9 */ |
| VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| }; |
| |
/* Identifies the ISO-2022 variant for which changeState_2022() is called. */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
| |
| /*********** ISO 2022 Converter Protos ***********/ |
| static void U_CALLCONV |
| _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); |
| |
| static void U_CALLCONV |
| _ISO2022Close(UConverter *converter); |
| |
| static void U_CALLCONV |
| _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); |
| |
| U_CDECL_BEGIN |
| static const char * U_CALLCONV |
| _ISO2022getName(const UConverter* cnv); |
| U_CDECL_END |
| |
| static void U_CALLCONV |
| _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); |
| |
| U_CDECL_BEGIN |
| static UConverter * U_CALLCONV |
| _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); |
| |
| U_CDECL_END |
| |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| static void U_CALLCONV |
| T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); |
| #endif |
| |
| namespace { |
| |
| /*const UConverterSharedData _ISO2022Data;*/ |
| extern const UConverterSharedData _ISO2022JPData; |
| |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| extern const UConverterSharedData _ISO2022KRData; |
| extern const UConverterSharedData _ISO2022CNData; |
| #endif |
| |
| } // namespace |
| |
| /*************** Converter implementations ******************/ |
| |
| /* The purpose of this function is to get around gcc compiler warnings. */ |
| static inline void |
| fromUWriteUInt8(UConverter *cnv, |
| const char *bytes, int32_t length, |
| uint8_t **target, const char *targetLimit, |
| int32_t **offsets, |
| int32_t sourceIndex, |
| UErrorCode *pErrorCode) |
| { |
| char *targetChars = (char *)*target; |
| ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, |
| offsets, sourceIndex, pErrorCode); |
| *target = (uint8_t*)targetChars; |
| |
| } |
| |
| static inline void |
| setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ |
| if(myConverterData->version == 1) { |
| UConverter *cnv = myConverterData->currentConverter; |
| |
| cnv->toUnicodeStatus=0; /* offset */ |
| cnv->mode=0; /* state */ |
| cnv->toULength=0; /* byteIndex */ |
| } |
| } |
| |
| static inline void |
| setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ |
| /* in ISO-2022-KR the designator sequence appears only once |
| * in a file so we append it only once |
| */ |
| if( converter->charErrorBufferLength==0){ |
| |
| converter->charErrorBufferLength = 4; |
| converter->charErrorBuffer[0] = 0x1b; |
| converter->charErrorBuffer[1] = 0x24; |
| converter->charErrorBuffer[2] = 0x29; |
| converter->charErrorBuffer[3] = 0x43; |
| } |
| if(myConverterData->version == 1) { |
| UConverter *cnv = myConverterData->currentConverter; |
| |
| cnv->fromUChar32=0; |
| cnv->fromUnicodeStatus=1; /* prevLength */ |
| } |
| } |
| |
| static void U_CALLCONV |
| _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
| |
| char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'}; |
| |
| cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); |
| if(cnv->extraInfo != NULL) { |
| UConverterNamePieces stackPieces; |
| UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; |
| UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
| uint32_t version; |
| |
| stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; |
| |
| uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); |
| myConverterData->currentType = ASCII1; |
| cnv->fromUnicodeStatus =FALSE; |
| if(pArgs->locale){ |
| uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1); |
| } |
| version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; |
| myConverterData->version = version; |
| if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && |
| (myLocale[2]=='_' || myLocale[2]=='\0')) |
| { |
| /* open the required converters and cache them */ |
| if(version>MAX_JA_VERSION) { |
| // ICU 55 fails to open a converter for an unsupported version. |
| // Previously, it fell back to version 0, but that would yield |
| // unexpected behavior. |
| *errorCode = U_MISSING_RESOURCE_ERROR; |
| return; |
| } |
| if(jpCharsetMasks[version]&CSM(ISO8859_7)) { |
| myConverterData->myConverterArray[ISO8859_7] = |
| ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); |
| } |
| myConverterData->myConverterArray[JISX208] = |
| ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); |
| if(jpCharsetMasks[version]&CSM(JISX212)) { |
| myConverterData->myConverterArray[JISX212] = |
| ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); |
| } |
| if(jpCharsetMasks[version]&CSM(GB2312)) { |
| myConverterData->myConverterArray[GB2312] = |
| ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ |
| } |
| if(jpCharsetMasks[version]&CSM(KSC5601)) { |
| myConverterData->myConverterArray[KSC5601] = |
| ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); |
| } |
| |
| /* set the function pointers to appropriate funtions */ |
| cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); |
| uprv_strcpy(myConverterData->locale,"ja"); |
| |
| (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); |
| size_t len = uprv_strlen(myConverterData->name); |
| myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); |
| myConverterData->name[len+1]='\0'; |
| } |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && |
| (myLocale[2]=='_' || myLocale[2]=='\0')) |
| { |
| if(version>1) { |
| // ICU 55 fails to open a converter for an unsupported version. |
| // Previously, it fell back to version 0, but that would yield |
| // unexpected behavior. |
| *errorCode = U_MISSING_RESOURCE_ERROR; |
| return; |
| } |
| const char *cnvName; |
| if(version==1) { |
| cnvName="icu-internal-25546"; |
| } else { |
| cnvName="ibm-949"; |
| myConverterData->version=version=0; |
| } |
| if(pArgs->onlyTestIsLoadable) { |
| ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ |
| uprv_free(cnv->extraInfo); |
| cnv->extraInfo=NULL; |
| return; |
| } else { |
| myConverterData->currentConverter=ucnv_open(cnvName, errorCode); |
| if (U_FAILURE(*errorCode)) { |
| _ISO2022Close(cnv); |
| return; |
| } |
| |
| if(version==1) { |
| (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); |
| uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); |
| cnv->subCharLen = myConverterData->currentConverter->subCharLen; |
| }else{ |
| (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); |
| } |
| |
| /* initialize the state variables */ |
| setInitialStateToUnicodeKR(cnv, myConverterData); |
| setInitialStateFromUnicodeKR(cnv, myConverterData); |
| |
| /* set the function pointers to appropriate funtions */ |
| cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; |
| uprv_strcpy(myConverterData->locale,"ko"); |
| } |
| } |
| else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& |
| (myLocale[2]=='_' || myLocale[2]=='\0')) |
| { |
| if(version>2) { |
| // ICU 55 fails to open a converter for an unsupported version. |
| // Previously, it fell back to version 0, but that would yield |
| // unexpected behavior. |
| *errorCode = U_MISSING_RESOURCE_ERROR; |
| return; |
| } |
| |
| /* open the required converters and cache them */ |
| myConverterData->myConverterArray[GB2312_1] = |
| ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); |
| if(version==1) { |
| myConverterData->myConverterArray[ISO_IR_165] = |
| ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); |
| } |
| myConverterData->myConverterArray[CNS_11643] = |
| ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); |
| |
| |
| /* set the function pointers to appropriate funtions */ |
| cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; |
| uprv_strcpy(myConverterData->locale,"cn"); |
| |
| if (version==0){ |
| myConverterData->version = 0; |
| (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); |
| }else if (version==1){ |
| myConverterData->version = 1; |
| (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); |
| }else { |
| myConverterData->version = 2; |
| (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); |
| } |
| } |
| #endif // !UCONFIG_ONLY_HTML_CONVERSION |
| else{ |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| myConverterData->isFirstBuffer = TRUE; |
| |
| /* append the UTF-8 escape sequence */ |
| cnv->charErrorBufferLength = 3; |
| cnv->charErrorBuffer[0] = 0x1b; |
| cnv->charErrorBuffer[1] = 0x25; |
| cnv->charErrorBuffer[2] = 0x42; |
| |
| cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; |
| /* initialize the state variables */ |
| uprv_strcpy(myConverterData->name,"ISO_2022"); |
| #else |
| *errorCode = U_MISSING_RESOURCE_ERROR; |
| // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard |
| // data loading error code. |
| return; |
| #endif |
| } |
| |
| cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; |
| |
| if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { |
| _ISO2022Close(cnv); |
| } |
| } else { |
| *errorCode = U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| |
| |
| static void U_CALLCONV |
| _ISO2022Close(UConverter *converter) { |
| UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); |
| UConverterSharedData **array = myData->myConverterArray; |
| int32_t i; |
| |
| if (converter->extraInfo != NULL) { |
| /*close the array of converter pointers and free the memory*/ |
| for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
| if(array[i]!=NULL) { |
| ucnv_unloadSharedDataIfReady(array[i]); |
| } |
| } |
| |
| ucnv_close(myData->currentConverter); |
| |
| if(!converter->isExtraLocal){ |
| uprv_free (converter->extraInfo); |
| converter->extraInfo = NULL; |
| } |
| } |
| } |
| |
| static void U_CALLCONV |
| _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { |
| UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); |
| if(choice<=UCNV_RESET_TO_UNICODE) { |
| uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); |
| myConverterData->key = 0; |
| myConverterData->isEmptySegment = FALSE; |
| } |
| if(choice!=UCNV_RESET_TO_UNICODE) { |
| uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); |
| } |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| if(myConverterData->locale[0] == 0){ |
| if(choice<=UCNV_RESET_TO_UNICODE) { |
| myConverterData->isFirstBuffer = TRUE; |
| myConverterData->key = 0; |
| if (converter->mode == UCNV_SO){ |
| ucnv_close (myConverterData->currentConverter); |
| myConverterData->currentConverter=NULL; |
| } |
| converter->mode = UCNV_SI; |
| } |
| if(choice!=UCNV_RESET_TO_UNICODE) { |
| /* re-append UTF-8 escape sequence */ |
| converter->charErrorBufferLength = 3; |
| converter->charErrorBuffer[0] = 0x1b; |
| converter->charErrorBuffer[1] = 0x28; |
| converter->charErrorBuffer[2] = 0x42; |
| } |
| } |
| else |
| #endif |
| { |
| /* reset the state variables */ |
| if(myConverterData->locale[0] == 'k'){ |
| if(choice<=UCNV_RESET_TO_UNICODE) { |
| setInitialStateToUnicodeKR(converter, myConverterData); |
| } |
| if(choice!=UCNV_RESET_TO_UNICODE) { |
| setInitialStateFromUnicodeKR(converter, myConverterData); |
| } |
| } |
| } |
| } |
| |
| U_CDECL_BEGIN |
| |
| static const char * U_CALLCONV |
| _ISO2022getName(const UConverter* cnv){ |
| if(cnv->extraInfo){ |
| UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; |
| return myData->name; |
| } |
| return NULL; |
| } |
| |
| U_CDECL_END |
| |
| |
| /*************** to unicode *******************/ |
| /**************************************************************************** |
| * Recognized escape sequences are |
| * <ESC>(B ASCII |
| * <ESC>.A ISO-8859-1 |
| * <ESC>.F ISO-8859-7 |
| * <ESC>(J JISX-201 |
| * <ESC>(I JISX-201 |
| * <ESC>$B JISX-208 |
| * <ESC>$@ JISX-208 |
| * <ESC>$(D JISX-212 |
| * <ESC>$A GB2312 |
| * <ESC>$(C KSC5601 |
| */ |
| static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { |
| /* 0 1 2 3 4 5 6 7 8 9 */ |
| INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| }; |
| |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| /*************** to unicode *******************/ |
| static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { |
| /* 0 1 2 3 4 5 6 7 8 9 */ |
| INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 |
| ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| }; |
| #endif |
| |
| |
| static UCNV_TableStates_2022 |
| getKey_2022(char c,int32_t* key,int32_t* offset){ |
| int32_t togo; |
| int32_t low = 0; |
| int32_t hi = MAX_STATES_2022; |
| int32_t oldmid=0; |
| |
| togo = normalize_esq_chars_2022[(uint8_t)c]; |
| if(togo == 0) { |
| /* not a valid character anywhere in an escape sequence */ |
| *key = 0; |
| *offset = 0; |
| return INVALID_2022; |
| } |
| togo = (*key << 5) + togo; |
| |
| while (hi != low) /*binary search*/{ |
| |
| int32_t mid = (hi+low) >> 1; /*Finds median*/ |
| |
| if (mid == oldmid) |
| break; |
| |
| if (escSeqStateTable_Key_2022[mid] > togo){ |
| hi = mid; |
| } |
| else if (escSeqStateTable_Key_2022[mid] < togo){ |
| low = mid; |
| } |
| else /*we found it*/{ |
| *key = togo; |
| *offset = mid; |
| return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; |
| } |
| oldmid = mid; |
| |
| } |
| |
| *key = 0; |
| *offset = 0; |
| return INVALID_2022; |
| } |
| |
| /*runs through a state machine to determine the escape sequence - codepage correspondence |
| */ |
| static void |
| changeState_2022(UConverter* _this, |
| const char** source, |
| const char* sourceLimit, |
| Variant2022 var, |
| UErrorCode* err){ |
| UCNV_TableStates_2022 value; |
| UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); |
| uint32_t key = myData2022->key; |
| int32_t offset = 0; |
| int8_t initialToULength = _this->toULength; |
| char c; |
| |
| value = VALID_NON_TERMINAL_2022; |
| while (*source < sourceLimit) { |
| c = *(*source)++; |
| _this->toUBytes[_this->toULength++]=(uint8_t)c; |
| value = getKey_2022(c,(int32_t *) &key, &offset); |
| |
| switch (value){ |
| |
| case VALID_NON_TERMINAL_2022 : |
| /* continue with the loop */ |
| break; |
| |
| case VALID_TERMINAL_2022: |
| key = 0; |
| goto DONE; |
| |
| case INVALID_2022: |
| goto DONE; |
| |
| case VALID_MAYBE_TERMINAL_2022: |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| /* ESC ( B is ambiguous only for ISO_2022 itself */ |
| if(var == ISO_2022) { |
| /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ |
| _this->toULength = 0; |
| |
| /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ |
| |
| /* continue with the loop */ |
| value = VALID_NON_TERMINAL_2022; |
| break; |
| } else |
| #endif |
| { |
| /* not ISO_2022 itself, finish here */ |
| value = VALID_TERMINAL_2022; |
| key = 0; |
| goto DONE; |
| } |
| } |
| } |
| |
| DONE: |
| myData2022->key = key; |
| |
| if (value == VALID_NON_TERMINAL_2022) { |
| /* indicate that the escape sequence is incomplete: key!=0 */ |
| return; |
| } else if (value == INVALID_2022 ) { |
| *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| } else /* value == VALID_TERMINAL_2022 */ { |
| switch(var){ |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| case ISO_2022: |
| { |
| const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; |
| if(chosenConverterName == NULL) { |
| /* SS2 or SS3 */ |
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| _this->toUCallbackReason = UCNV_UNASSIGNED; |
| return; |
| } |
| |
| _this->mode = UCNV_SI; |
| ucnv_close(myData2022->currentConverter); |
| myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); |
| if(U_SUCCESS(*err)) { |
| myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
| _this->mode = UCNV_SO; |
| } |
| break; |
| } |
| #endif |
| case ISO_2022_JP: |
| { |
| StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; |
| switch(tempState) { |
| case INVALID_STATE: |
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| break; |
| case SS2_STATE: |
| if(myData2022->toU2022State.cs[2]!=0) { |
| if(myData2022->toU2022State.g<2) { |
| myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
| } |
| myData2022->toU2022State.g=2; |
| } else { |
| /* illegal to have SS2 before a matching designator */ |
| *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| } |
| break; |
| /* case SS3_STATE: not used in ISO-2022-JP-x */ |
| case ISO8859_1: |
| case ISO8859_7: |
| if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| } else { |
| /* G2 charset for SS2 */ |
| myData2022->toU2022State.cs[2]=(int8_t)tempState; |
| } |
| break; |
| default: |
| if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| } else { |
| /* G0 charset */ |
| myData2022->toU2022State.cs[0]=(int8_t)tempState; |
| } |
| break; |
| } |
| } |
| break; |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| case ISO_2022_CN: |
| { |
| StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; |
| switch(tempState) { |
| case INVALID_STATE: |
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| break; |
| case SS2_STATE: |
| if(myData2022->toU2022State.cs[2]!=0) { |
| if(myData2022->toU2022State.g<2) { |
| myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
| } |
| myData2022->toU2022State.g=2; |
| } else { |
| /* illegal to have SS2 before a matching designator */ |
| *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| } |
| break; |
| case SS3_STATE: |
| if(myData2022->toU2022State.cs[3]!=0) { |
| if(myData2022->toU2022State.g<2) { |
| myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
| } |
| myData2022->toU2022State.g=3; |
| } else { |
| /* illegal to have SS3 before a matching designator */ |
| *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| } |
| break; |
| case ISO_IR_165: |
| if(myData2022->version==0) { |
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| break; |
| } |
| U_FALLTHROUGH; |
| case GB2312_1: |
| U_FALLTHROUGH; |
| case CNS_11643_1: |
| myData2022->toU2022State.cs[1]=(int8_t)tempState; |
| break; |
| case CNS_11643_2: |
| myData2022->toU2022State.cs[2]=(int8_t)tempState; |
| break; |
| default: |
| /* other CNS 11643 planes */ |
| if(myData2022->version==0) { |
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| } else { |
| myData2022->toU2022State.cs[3]=(int8_t)tempState; |
| } |
| break; |
| } |
| } |
| break; |
| case ISO_2022_KR: |
| if(offset==0x30){ |
| /* nothing to be done, just accept this one escape sequence */ |
| } else { |
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| } |
| break; |
| #endif // !UCONFIG_ONLY_HTML_CONVERSION |
| |
| default: |
| *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| break; |
| } |
| } |
| if(U_SUCCESS(*err)) { |
| _this->toULength = 0; |
| } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { |
| if(_this->toULength>1) { |
| /* |
| * Ticket 5691: consistent illegal sequences: |
| * - We include at least the first byte (ESC) in the illegal sequence. |
| * - If any of the non-initial bytes could be the start of a character, |
| * we stop the illegal sequence before the first one of those. |
| * In escape sequences, all following bytes are "printable", that is, |
| * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), |
| * they are valid single/lead bytes. |
| * For simplicity, we always only report the initial ESC byte as the |
| * illegal sequence and back out all other bytes we looked at. |
| */ |
| /* Back out some bytes. */ |
| int8_t backOutDistance=_this->toULength-1; |
| int8_t bytesFromThisBuffer=_this->toULength-initialToULength; |
| if(backOutDistance<=bytesFromThisBuffer) { |
| /* same as initialToULength<=1 */ |
| *source-=backOutDistance; |
| } else { |
| /* Back out bytes from the previous buffer: Need to replay them. */ |
| _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); |
| /* same as -(initialToULength-1) */ |
| /* preToULength is negative! */ |
| uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); |
| *source-=bytesFromThisBuffer; |
| } |
| _this->toULength=1; |
| } |
| } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { |
| _this->toUCallbackReason = UCNV_UNASSIGNED; |
| } |
| } |
| |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| /*Checks the characters of the buffer against valid 2022 escape sequences |
| *if the match we return a pointer to the initial start of the sequence otherwise |
| *we return sourceLimit |
| */ |
| /*for 2022 looks ahead in the stream |
| *to determine the longest possible convertible |
| *data stream |
| */ |
| static inline const char* |
| getEndOfBuffer_2022(const char** source, |
| const char* sourceLimit, |
| UBool /*flush*/){ |
| |
| const char* mySource = *source; |
| |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| if (*source >= sourceLimit) |
| return sourceLimit; |
| |
| do{ |
| |
| if (*mySource == ESC_2022){ |
| int8_t i; |
| int32_t key = 0; |
| int32_t offset; |
| UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; |
| |
| /* Kludge: I could not |
| * figure out the reason for validating an escape sequence |
| * twice - once here and once in changeState_2022(). |
| * is it possible to have an ESC character in a ISO2022 |
| * byte stream which is valid in a code page? Is it legal? |
| */ |
| for (i=0; |
| (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); |
| i++) { |
| value = getKey_2022(*(mySource+i), &key, &offset); |
| } |
| if (value > 0 || *mySource==ESC_2022) |
| return mySource; |
| |
| if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) |
| return sourceLimit; |
| } |
| }while (++mySource < sourceLimit); |
| |
| return sourceLimit; |
| #else |
| while(mySource < sourceLimit && *mySource != ESC_2022) { |
| ++mySource; |
| } |
| return mySource; |
| #endif |
| } |
| #endif |
| |
| /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c |
| * any future change in _MBCSFromUChar32() function should be reflected here. |
| * @return number of bytes in *value; negative number if fallback; 0 if no mapping |
| */ |
| static inline int32_t |
| MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, |
| UChar32 c, |
| uint32_t* value, |
| UBool useFallback, |
| int outputType) |
| { |
| const int32_t *cx; |
| const uint16_t *table; |
| uint32_t stage2Entry; |
| uint32_t myValue; |
| int32_t length; |
| const uint8_t *p; |
| /* |
| * TODO(markus): Use and require new, faster MBCS conversion table structures. |
| * Use internal version of ucnv_open() that verifies that the new structures are available, |
| * else U_INTERNAL_PROGRAM_ERROR. |
| */ |
| /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
| table=sharedData->mbcs.fromUnicodeTable; |
| stage2Entry=MBCS_STAGE_2_FROM_U(table, c); |
| /* get the bytes and the length for the output */ |
| if(outputType==MBCS_OUTPUT_2){ |
| myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
| if(myValue<=0xff) { |
| length=1; |
| } else { |
| length=2; |
| } |
| } else /* outputType==MBCS_OUTPUT_3 */ { |
| p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
| myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; |
| if(myValue<=0xff) { |
| length=1; |
| } else if(myValue<=0xffff) { |
| length=2; |
| } else { |
| length=3; |
| } |
| } |
| /* is this code point assigned, or do we use fallbacks? */ |
| if((stage2Entry&(1<<(16+(c&0xf))))!=0) { |
| /* assigned */ |
| *value=myValue; |
| return length; |
| } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { |
| /* |
| * We allow a 0 byte output if the "assigned" bit is set for this entry. |
| * There is no way with this data structure for fallback output |
| * to be a zero byte. |
| */ |
| *value=myValue; |
| return -length; |
| } |
| } |
| |
| cx=sharedData->mbcs.extIndexes; |
| if(cx!=NULL) { |
| return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); |
| } |
| |
| /* unassigned */ |
| return 0; |
| } |
| |
| /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c |
| * any future change in _MBCSSingleFromUChar32() function should be reflected here. |
| * @param retval pointer to output byte |
| * @return 1 roundtrip byte 0 no mapping -1 fallback byte |
| */ |
| static inline int32_t |
| MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, |
| UChar32 c, |
| uint32_t* retval, |
| UBool useFallback) |
| { |
| const uint16_t *table; |
| int32_t value; |
| /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
| return 0; |
| } |
| /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ |
| table=sharedData->mbcs.fromUnicodeTable; |
| /* get the byte for the output */ |
| value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); |
| /* is this code point assigned, or do we use fallbacks? */ |
| *retval=(uint32_t)(value&0xff); |
| if(value>=0xf00) { |
| return 1; /* roundtrip */ |
| } else if(useFallback ? value>=0x800 : value>=0xc00) { |
| return -1; /* fallback taken */ |
| } else { |
| return 0; /* no mapping */ |
| } |
| } |
| |
/*
 * Accepts a double-byte value only if both of its bytes are in the range
 * A1..FE (strict EUC DBCS) and then moves each byte down by 0x80 into the
 * ISO 2022 range 21..7E.
 * Returns 0 for anything out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* distance of the low 16 bits from A1A1, and of the trail byte from A1 */
    const uint16_t pairDistance = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDistance = (uint8_t)(value - 0xa1);

    if(pairDistance > (0xfefe - 0xa1a1) || trailDistance > (0xfe - 0xa1)) {
        return 0;               /* not valid for ISO 2022 */
    }
    return value - 0x8080;      /* shift down to 21..7e byte range */
}
| |
| #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ |
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the 2022 code point
 * and returns the sum if both resulting bytes are in the range A1..FE.
 * Otherwise the 2022 code point is returned unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairDistance = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailDistance = (uint8_t)(shifted - 0xa1);

    if(pairDistance > (0xfefe - 0xa1a1) || trailDistance > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
| #endif |
| |
| #ifdef U_ENABLE_GENERIC_ISO_2022 |
| |
| /********************************************************************************** |
| * ISO-2022 Converter |
| * |
| * |
| */ |
| |
| static void U_CALLCONV |
| T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, |
| UErrorCode* err){ |
| const char* mySourceLimit, *realSourceLimit; |
| const char* sourceStart; |
| const UChar* myTargetStart; |
| UConverter* saveThis; |
| UConverterDataISO2022* myData; |
| int8_t length; |
| |
| saveThis = args->converter; |
| myData=((UConverterDataISO2022*)(saveThis->extraInfo)); |
| |
| realSourceLimit = args->sourceLimit; |
| while (args->source < realSourceLimit) { |
| if(myData->key == 0) { /* are we in the middle of an escape sequence? */ |
| /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
| mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); |
| |
| if(args->source < mySourceLimit) { |
| if(myData->currentConverter==NULL) { |
| myData->currentConverter = ucnv_open("ASCII",err); |
| if(U_FAILURE(*err)){ |
| return; |
| } |
| |
| myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
| saveThis->mode = UCNV_SO; |
| } |
| |
| /* convert to before the ESC or until the end of the buffer */ |
| myData->isFirstBuffer=FALSE; |
| sourceStart = args->source; |
| myTargetStart = args->target; |
| args->converter = myData->currentConverter; |
| ucnv_toUnicode(args->converter, |
| &args->target, |
| args->targetLimit, |
| &args->source, |
| mySourceLimit, |
| args->offsets, |
| (UBool)(args->flush && mySourceLimit == realSourceLimit), |
| err); |
| args->converter = saveThis; |
| |
| if (*err == U_BUFFER_OVERFLOW_ERROR) { |
| /* move the overflow buffer */ |
| length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; |
| myData->currentConverter->UCharErrorBufferLength = 0; |
| if(length > 0) { |
| uprv_memcpy(saveThis->UCharErrorBuffer, |
| myData->currentConverter->UCharErrorBuffer, |
| length*U_SIZEOF_UCHAR); |
| } |
| return; |
| } |
| |
| /* |
| * At least one of: |
| * -Error while converting |
| * -Done with entire buffer |
| * -Need to write offsets or update the current offset |
| * (leave that up to the code in ucnv.c) |
| * |
| * or else we just stopped at an ESC byte and continue with changeState_2022() |
| */ |
| if (U_FAILURE(*err) || |
| (args->source == realSourceLimit) || |
| (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || |
| (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) |
| ) { |
| /* copy partial or error input for truncated detection and error handling */ |
| if(U_FAILURE(*err)) { |
| length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; |
| if(length > 0) { |
| uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); |
| } |
| } else { |
| length = saveThis->toULength = myData->currentConverter->toULength; |
| if(length > 0) { |
| uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); |
| if(args->source < mySourceLimit) { |
| *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ |
| } |
| } |
| } |
| return; |
| } |
| } |
| } |
| |
| sourceStart = args->source; |
| changeState_2022(args->converter, |
| &(args->source), |
| realSourceLimit, |
| ISO_2022, |
| err); |
| if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { |
| /* let the ucnv.c code update its current offset */ |
| return; |
| } |
| } |
| } |
| |
| #endif |
| |
| /* |
| * To Unicode Callback helper function |
| */ |
| static void |
| toUnicodeCallback(UConverter *cnv, |
| const uint32_t sourceChar, const uint32_t targetUniChar, |
| UErrorCode* err){ |
| if(sourceChar>0xff){ |
| cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); |
| cnv->toUBytes[1] = (uint8_t)sourceChar; |
| cnv->toULength = 2; |
| } |
| else{ |
| cnv->toUBytes[0] =(char) sourceChar; |
| cnv->toULength = 1; |
| } |
| |
| if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ |
| *err = U_INVALID_CHAR_FOUND; |
| } |
| else{ |
| *err = U_ILLEGAL_CHAR_FOUND; |
| } |
| } |
| |
| /**************************************ISO-2022-JP*************************************************/ |
| |
| /************************************** IMPORTANT ************************************************** |
| * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and |
| * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). |
| * The converter iterates over each Unicode codepoint |
| * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is |
| * processed one char at a time it would make sense to reduce the extra processing a canned converter |
| * would do as far as possible. |
| * |
| * If the implementation of these macros or structure of sharedData struct change in the future, make |
| * sure that ISO-2022 is also changed. |
| *************************************************************************************************** |
| */ |
| |
| /*************************************************************************************************** |
| * Rules for ISO-2022-jp encoding |
| * (i) Escape sequences must be fully contained within a line they should not |
| * span new lines or CRs |
| * (ii) If the last character on a line is represented by two bytes then an ASCII or |
| * JIS-Roman character escape sequence should follow before the line terminates |
| * (iii) If the first character on the line is represented by two bytes then a two |
| * byte character escape sequence should precede it |
| * (iv) If no escape sequence is encountered then the characters are ASCII |
| * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, |
| * and invoked with SS2 (ESC N). |
| * (vi) If there is any G0 designation in text, there must be a switch to |
| * ASCII or to JIS X 0201-Roman before a space character (but not |
| * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control |
| * characters such as tab or CRLF. |
 *  (vii) Supported encodings:
| * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 |
| * |
| * source : RFC-1554 |
| * |
| * JISX201, JISX208,JISX212 : new .cnv data files created |
| * KSC5601 : alias to ibm-949 mapping table |
| * GB2312 : alias to ibm-1386 mapping table |
| * ISO-8859-1 : Algorithmic implemented as LATIN1 case |
| * ISO-8859-7 : alias to ibm-9409 mapping table |
| */ |
| |
| /* preference order of JP charsets */ |
| static const StateEnum jpCharsetPref[]={ |
| ASCII, |
| JISX201, |
| ISO8859_1, |
| JISX208, |
| ISO8859_7, |
| JISX212, |
| GB2312, |
| KSC5601, |
| HWKANA_7BIT |
| }; |
| |
/*
 * Designation escape sequences, indexed by charset enum constant.
 * The entries must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B" "(B",    /* ASCII        <ESC> 28 42    */
    "\x1B" ".A",    /* ISO-8859-1   <ESC> 2E 41    */
    "\x1B" ".F",    /* ISO-8859-7   <ESC> 2E 46    */
    "\x1B" "(J",    /* JISX-201     <ESC> 28 4A    */
    "\x1B" "$B",    /* JISX-208     <ESC> 24 42    */
    "\x1B" "$(D",   /* JISX-212     <ESC> 24 28 44 */
    "\x1B" "$A",    /* GB2312       <ESC> 24 41    */
    "\x1B" "$(C",   /* KSC5601      <ESC> 24 28 43 */
    "\x1B" "(I"     /* HWKANA_7BIT  <ESC> 28 49    */
};
/* Byte lengths of the designation escape sequences, in the same charset order. */
static const int8_t escSeqCharsLen[] ={
    3,  /* ASCII:       <ESC>(B  */
    3,  /* ISO-8859-1:  <ESC>.A  */
    3,  /* ISO-8859-7:  <ESC>.F  */
    3,  /* JISX-201:    <ESC>(J  */
    3,  /* JISX-208:    <ESC>$B  */
    4,  /* JISX-212:    <ESC>$(D */
    3,  /* GB2312:      <ESC>$A  */
    4,  /* KSC5601:     <ESC>$(C */
    3   /* HWKANA_7BIT: <ESC>(I  */
};
| |
| /* |
| * The iteration over various code pages works this way: |
| * i) Get the currentState from myConverterData->currentState |
| * ii) Check if the character is mapped to a valid character in the currentState |
| * Yes -> a) set the initIterState to currentState |
| * b) remain in this state until an invalid character is found |
| * No -> a) go to the next code page and find the character |
 *   iii) Before changing the state (incrementing the current state), check whether
 *        the current state is equal to the initIterState
| * Yes -> A character that cannot be represented in any of the supported encodings |
| * break and return a U_INVALID_CHARACTER error |
| * No -> Continue and find the character in next code page |
| * |
| * |
| * TODO: Implement a priority technique where the users are allowed to set the priority of code pages |
| */ |
| |
/*
 * Map 00..7F to Unicode according to JIS X 0201:
 * 5C maps to U+00A5 and 7E maps to U+203E; every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
| |
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte values are used
 * for U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
| |
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range (above EFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead;
    uint32_t result;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* lead bytes up to 9F and lead bytes E0..EF use different offsets */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    result = lead << 1;

    if(trail > 0x9e) {
        /* trail <= 0xfc */
        result |= trail - 0x7e;
    } else {
        result -= 0x100;
        result |= trail - ((trail <= 0x7e) ? 0x1f : 0x20);
    }
    return result;
}
| |
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * The result is written to bytes[0] (lead) and bytes[1] (trail); a byte that
 * is detected as invalid is written as 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1&1) == 0) {
        /* even lead byte */
        lead = c1;
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0; /* invalid */
        }
    } else {
        /* odd lead byte */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0; /* invalid */
        }
    }

    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0; /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
| |
| /* |
| * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) |
| * Katakana. |
| * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks |
| * because Shift-JIS roundtrips half-width Katakana to single bytes. |
| * These were the only fallbacks in ICU's jisx-208.ucm file. |
| */ |
| static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { |
| 0x2123, /* U+FF61 */ |
| 0x2156, |
| 0x2157, |
| 0x2122, |
| 0x2126, |
| 0x2572, |
| 0x2521, |
| 0x2523, |
| 0x2525, |
| 0x2527, |
| 0x2529, |
| 0x2563, |
| 0x2565, |
| 0x2567, |
| 0x2543, |
| 0x213C, /* U+FF70 */ |
| 0x2522, |
| 0x2524, |
| 0x2526, |
| 0x2528, |
| 0x252A, |
| 0x252B, |
| 0x252D, |
| 0x252F, |
| 0x2531, |
| 0x2533, |
| 0x2535, |
| 0x2537, |
| 0x2539, |
| 0x253B, |
| 0x253D, |
| 0x253F, /* U+FF80 */ |
| 0x2541, |
| 0x2544, |
| 0x2546, |
| 0x2548, |
| 0x254A, |
| 0x254B, |
| 0x254C, |
| 0x254D, |
| 0x254E, |
| 0x254F, |
| 0x2552, |
| 0x2555, |
| 0x2558, |
| 0x255B, |
| 0x255E, |
| 0x255F, /* U+FF90 */ |
| 0x2560, |
| 0x2561, |
| 0x2562, |
| 0x2564, |
| 0x2566, |
| 0x2568, |
| 0x2569, |
| 0x256A, |
| 0x256B, |
| 0x256C, |
| 0x256D, |
| 0x256F, |
| 0x2573, |
| 0x212B, |
| 0x212C /* U+FF9F */ |
| }; |
| |
| static void U_CALLCONV |
| UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { |
| UConverter *cnv = args->converter; |
| UConverterDataISO2022 *converterData; |
| ISO2022State *pFromU2022State; |
| uint8_t *target = (uint8_t *) args->target; |
| const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
| const UChar* source = args->source; |
| const UChar* sourceLimit = args->sourceLimit; |
| int32_t* offsets = args->offsets; |
| UChar32 sourceChar; |
| char buffer[8]; |
| int32_t len, outLen; |
| int8_t choices[10]; |
| int32_t choiceCount; |
| uint32_t targetValue = 0; |
| UBool useFallback; |
| |
| int32_t i; |
| int8_t cs, g; |
| |
| /* set up the state */ |
| converterData = (UConverterDataISO2022*)cnv->extraInfo; |
| pFromU2022State = &converterData->fromU2022State; |
| |
| choiceCount = 0; |
| |
| /* check if the last codepoint of previous buffer was a lead surrogate*/ |
| if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
| goto getTrail; |
| } |
| |
| while(source < sourceLimit) { |
| if(target < targetLimit) { |
| |
| sourceChar = *(source++); |
| /*check if the char is a First surrogate*/ |
| if(U16_IS_SURROGATE(sourceChar)) { |
| if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
| getTrail: |
| /*look ahead to find the trail surrogate*/ |
| if(source < sourceLimit) { |
| /* test the following code unit */ |
| UChar trail=(UChar) *source; |
| if(U16_IS_TRAIL(trail)) { |
| source++; |
| sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
| cnv->fromUChar32=0x00; |
| /* convert this supplementary code point */ |
| /* exit this condition tree */ |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| *err=U_ILLEGAL_CHAR_FOUND; |
| cnv->fromUChar32=sourceChar; |
| break; |
| } |
| } else { |
| /* no more input */ |
| cnv->fromUChar32=sourceChar; |
| break; |
| } |
| } else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| *err=U_ILLEGAL_CHAR_FOUND; |
| cnv->fromUChar32=sourceChar; |
| break; |
| } |
| } |
| |
| /* do not convert SO/SI/ESC */ |
| if(IS_2022_CONTROL(sourceChar)) { |
| /* callback(illegal) */ |
| *err=U_ILLEGAL_CHAR_FOUND; |
| cnv->fromUChar32=sourceChar; |
| break; |
| } |
| |
| /* do the conversion */ |
| |
| if(choiceCount == 0) { |
| uint16_t csm; |
| |
| /* |
| * The csm variable keeps track of which charsets are allowed |
| * and not used yet while building the choices[]. |
| */ |
| csm = jpCharsetMasks[converterData->version]; |
| choiceCount = 0; |
| |
| /* JIS7/8: try single-byte half-width Katakana before JISX208 */ |
| if(converterData->version == 3 || converterData->version == 4) { |
| choices[choiceCount++] = (int8_t)HWKANA_7BIT; |
| } |
| /* Do not try single-byte half-width Katakana for other versions. */ |
| csm &= ~CSM(HWKANA_7BIT); |
| |
| /* try the current G0 charset */ |
| choices[choiceCount++] = cs = pFromU2022State->cs[0]; |
| csm &= ~CSM(cs); |
| |
| /* try the current G2 charset */ |
| if((cs = pFromU2022State->cs[2]) != 0) { |
| choices[choiceCount++] = cs; |
| csm &= ~CSM(cs); |
| } |
| |
| /* try all the other possible charsets */ |
| for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) { |
| cs = (int8_t)jpCharsetPref[i]; |
| if(CSM(cs) & csm) { |
| choices[choiceCount++] = cs; |
| csm &= ~CSM(cs); |
| } |
| } |
| } |
| |
| cs = g = 0; |
| /* |
| * len==0: no mapping found yet |
| * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks |
| * len>0: found a roundtrip result, done |
| */ |
| len = 0; |
| /* |
| * We will turn off useFallback after finding a fallback, |
| * but we still get fallbacks from PUA code points as usual. |
| * Therefore, we will also need to check that we don't overwrite |
| * an early fallback with a later one. |
| */ |
| useFallback = cnv->useFallback; |
| |
| for(i = 0; i < choiceCount && len <= 0; ++i) { |
| uint32_t value; |
| int32_t len2; |
| int8_t cs0 = choices[i]; |
| switch(cs0) { |
| case ASCII: |
| if(sourceChar <= 0x7f) { |
| targetValue = (uint32_t)sourceChar; |
| len = 1; |
| cs = cs0; |
| g = 0; |
| } |
| break; |
| case ISO8859_1: |
| if(GR96_START <= sourceChar && sourceChar <= GR96_END) { |
| targetValue = (uint32_t)sourceChar - 0x80; |
| len = 1; |
| cs = cs0; |
| g = 2; |
| } |
| break; |
| case HWKANA_7BIT: |
| if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
| if(converterData->version==3) { |
| /* JIS7: use G1 (SO) */ |
| /* Shift U+FF61..U+FF9F to bytes 21..5F. */ |
| targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); |
| len = 1; |
| pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ |
| g = 1; |
| } else if(converterData->version==4) { |
| /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ |
| /* Shift U+FF61..U+FF9F to bytes A1..DF. */ |
| targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); |
| len = 1; |
| |
| cs = pFromU2022State->cs[0]; |
| if(IS_JP_DBCS(cs)) { |
| /* switch from a DBCS charset to JISX201 */ |
| cs = (int8_t)JISX201; |
| } |
| /* else stay in the current G0 charset */ |
| g = 0; |
| } |
| /* else do not use HWKANA_7BIT with other versions */ |
| } |
| break; |
| case JISX201: |
| /* G0 SBCS */ |
| value = jisx201FromU(sourceChar); |
| if(value <= 0x7f) { |
| targetValue = value; |
| len = 1; |
| cs = cs0; |
| g = 0; |
| useFallback = FALSE; |
| } |
| break; |
| case JISX208: |
| /* G0 DBCS from Shift-JIS table */ |
| len2 = MBCS_FROM_UCHAR32_ISO2022( |
| converterData->myConverterArray[cs0], |
| sourceChar, &value, |
| useFallback, MBCS_OUTPUT_2); |
| if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ |
| value = _2022FromSJIS(value); |
| if(value != 0) { |
| targetValue = value; |
| len = len2; |
| cs = cs0; |
| g = 0; |
| useFallback = FALSE; |
| } |
| } else if(len == 0 && useFallback && |
| (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
| targetValue = hwkana_fb[sourceChar - HWKANA_START]; |
| len = -2; |
| cs = cs0; |
| g = 0; |
| useFallback = FALSE; |
| } |
| break; |
| case ISO8859_7: |
| /* G0 SBCS forced to 7-bit output */ |
| len2 = MBCS_SINGLE_FROM_UCHAR32( |
| converterData->myConverterArray[cs0], |
| sourceChar, &value, |
| useFallback); |
| if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { |
| targetValue = value - 0x80; |
| len = len2; |
| cs = cs0; |
| g = 2; |
| useFallback = FALSE; |
| } |
| break; |
| default: |
| /* G0 DBCS */ |
| len2 = MBCS_FROM_UCHAR32_ISO2022( |
| converterData->myConverterArray[cs0], |
| sourceChar, &value, |
| useFallback, MBCS_OUTPUT_2); |
| if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ |
| if(cs0 == KSC5601) { |
| /* |
| * Check for valid bytes for the encoding scheme. |
| * This is necessary because the sub-converter (windows-949) |
| * has a broader encoding scheme than is valid for 2022. |
| */ |
| value = _2022FromGR94DBCS(value); |
| if(value == 0) { |
| break; |
| } |
| } |
| targetValue = value; |
| len = len2; |
| cs = cs0; |
| g = 0; |
| useFallback = FALSE; |
| } |
| break; |
| } |
| } |
| |
| if(len != 0) { |
| if(len < 0) { |
| len = -len; /* fallback */ |
| } |
| outLen = 0; /* count output bytes */ |
| |
| /* write SI if necessary (only for JIS7) */ |
| if(pFromU2022State->g == 1 && g == 0) { |
| buffer[outLen++] = UCNV_SI; |
| pFromU2022State->g = 0; |
| } |
| |
| /* write the designation sequence if necessary */ |
| if(cs != pFromU2022State->cs[g]) { |
| int32_t escLen = escSeqCharsLen[cs]; |
| uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); |
| outLen += escLen; |
| pFromU2022State->cs[g] = cs; |
| |
| /* invalidate the choices[] */ |
| choiceCount = 0; |
| } |
| |
| /* write the shift sequence if necessary */ |
| if(g != pFromU2022State->g) { |
| switch(g) { |
| /* case 0 handled before writing escapes */ |
| case 1: |
| buffer[outLen++] = UCNV_SO; |
| pFromU2022State->g = 1; |
| break; |
| default: /* case 2 */ |
| buffer[outLen++] = 0x1b; |
| buffer[outLen++] = 0x4e; |
| break; |
| /* no case 3: no SS3 in ISO-2022-JP-x */ |
| } |
| } |
| |
| /* write the output bytes */ |
| if(len == 1) { |
| buffer[outLen++] = (char)targetValue; |
| } else /* len == 2 */ { |
| buffer[outLen++] = (char)(targetValue >> 8); |
| buffer[outLen++] = (char)targetValue; |
| } |
| } else { |
| /* |
| * if we cannot find the character after checking all codepages |
| * then this is an error |
| */ |
| *err = U_INVALID_CHAR_FOUND; |
| cnv->fromUChar32=sourceChar; |
| break; |
| } |
| |
| if(sourceChar == CR || sourceChar == LF) { |
| /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ |
| pFromU2022State->cs[2] = 0; |
| choiceCount = 0; |
| } |
| |
| /* output outLen>0 bytes in buffer[] */ |
| if(outLen == 1) { |
| *target++ = buffer[0]; |
| if(offsets) { |
| *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
| } |
| } else if(outLen == 2 && (target + 2) <= targetLimit) { |
| *target++ = buffer[0]; |
| *target++ = buffer[1]; |
| if(offsets) { |
| int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); |
| *offsets++ = sourceIndex; |
| *offsets++ = sourceIndex; |
| } |
| } else { |
| fromUWriteUInt8( |
| cnv, |
| buffer, outLen, |
| &target, (const char *)targetLimit, |
| &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
| err); |
| if(U_FAILURE(*err)) { |
| break; |
| } |
| } |
| } /* end if(myTargetIndex<myTargetLength) */ |
| else{ |
| *err =U_BUFFER_OVERFLOW_ERROR; |
| break; |
| } |
| |
| }/* end while(mySourceIndex<mySourceLength) */ |
| |
| /* |
| * the end of the input stream and detection of truncated input |
| * are handled by the framework, but for ISO-2022-JP conversion |
| * we need to be in ASCII mode at the very end |
| * |
| * conditions: |
| * successful |
| * in SO mode or not in ASCII mode |
| * end of input and no truncated input |
| */ |
| if( U_SUCCESS(*err) && |
| (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && |
| args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
| ) { |
| int32_t sourceIndex; |
| |
| outLen = 0; |
| |
| if(pFromU2022State->g != 0) { |
| buffer[outLen++] = UCNV_SI; |
| pFromU2022State->g = 0; |
| } |
| |
| if(pFromU2022State->cs[0] != ASCII) { |
| int32_t escLen = escSeqCharsLen[ASCII]; |
| uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); |
| outLen += escLen; |
| pFromU2022State->cs[0] = (int8_t)ASCII; |
| } |
| |
| /* get the source index of the last input character */ |
| /* |
| * TODO this would be simpler and more reliable if we used a pair |
| * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
| * so that we could simply use the prevSourceIndex here; |
| * this code gives an incorrect result for the rare case of an unmatched |
| * trail surrogate that is alone in the last buffer of the text stream |
| */ |
| sourceIndex=(int32_t)(source-args->source); |
| if(sourceIndex>0) { |
| --sourceIndex; |
| if( U16_IS_TRAIL(args->source[sourceIndex]) && |
| (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
| ) { |
| --sourceIndex; |
| } |
| } else { |
| sourceIndex=-1; |
| } |
| |
| fromUWriteUInt8( |
| cnv, |
| buffer, outLen, |
| &target, (const char *)targetLimit, |
| &offsets, sourceIndex, |
| err); |
| } |
| |
| /*save the state and return */ |
| args->source = source; |
| args->target = (char*)target; |
| } |
| |
| /*************** to unicode *******************/ |
| |
| static void U_CALLCONV |
| UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
| UErrorCode* err){ |
| char tempBuf[2]; |
| const char *mySource = (char *) args->source; |
| UChar *myTarget = args->target; |
| const char *mySourceLimit = args->sourceLimit; |
| uint32_t targetUniChar = 0x0000; |
| uint32_t mySourceChar = 0x0000; |
| uint32_t tmpSourceChar = 0x0000; |
| UConverterDataISO2022* myData; |
| ISO2022State *pToU2022State; |
| StateEnum cs; |
| |
| myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| pToU2022State = &myData->toU2022State; |
| |
| if(myData->key != 0) { |
| /* continue with a partial escape sequence */ |
| goto escape; |
| } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
| /* continue with a partial double-byte character */ |
| mySourceChar = args->converter->toUBytes[0]; |
| args->converter->toULength = 0; |
| cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| targetUniChar = missingCharMarker; |
| goto getTrailByte; |
| } |
| |
| while(mySource < mySourceLimit){ |
| |
| targetUniChar =missingCharMarker; |
| |
| if(myTarget < args->targetLimit){ |
| |
| mySourceChar= (unsigned char) *mySource++; |
| |
| switch(mySourceChar) { |
| case UCNV_SI: |
| if(myData->version==3) { |
| pToU2022State->g=0; |
| continue; |
| } else { |
| /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
| myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
| break; |
| } |
| |
| case UCNV_SO: |
| if(myData->version==3) { |
| /* JIS7: switch to G1 half-width Katakana */ |
| pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; |
| pToU2022State->g=1; |
| continue; |
| } else { |
| /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
| myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
| break; |
| } |
| |
| case ESC_2022: |
| mySource--; |
| escape: |
| { |
| const char * mySourceBefore = mySource; |
| int8_t toULengthBefore = args->converter->toULength; |
| |
| changeState_2022(args->converter,&(mySource), |
| mySourceLimit, ISO_2022_JP,err); |
| |
| /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ |
| if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
| *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); |
| } |
| } |
| |
| /* invalid or illegal escape sequence */ |
| if(U_FAILURE(*err)){ |
| args->target = myTarget; |
| args->source = mySource; |
| myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
| return; |
| } |
| /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ |
| if(myData->key==0) { |
| myData->isEmptySegment = TRUE; |
| } |
| continue; |
| |
| /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
| |
| case CR: |
| case LF: |
| /* automatically reset to single-byte mode */ |
| if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { |
| pToU2022State->cs[0] = (int8_t)ASCII; |
| } |
| pToU2022State->cs[2] = 0; |
| pToU2022State->g = 0; |
| U_FALLTHROUGH; |
| default: |
| /* convert one or two bytes */ |
| myData->isEmptySegment = FALSE; |
| cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && |
| !IS_JP_DBCS(cs) |
| ) { |
| /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ |
| targetUniChar = mySourceChar + (HWKANA_START - 0xa1); |
| |
| /* return from a single-shift state to the previous one */ |
| if(pToU2022State->g >= 2) { |
| pToU2022State->g=pToU2022State->prevG; |
| } |
| } else switch(cs) { |
| case ASCII: |
| if(mySourceChar <= 0x7f) { |
| targetUniChar = mySourceChar; |
| } |
| break; |
| case ISO8859_1: |
| if(mySourceChar <= 0x7f) { |
| targetUniChar = mySourceChar + 0x80; |
| } |
| /* return from a single-shift state to the previous one */ |
| pToU2022State->g=pToU2022State->prevG; |
| break; |
| case ISO8859_7: |
| if(mySourceChar <= 0x7f) { |
| /* convert mySourceChar+0x80 to use a normal 8-bit table */ |
| targetUniChar = |
| _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( |
| myData->myConverterArray[cs], |
| mySourceChar + 0x80); |
| } |
| /* return from a single-shift state to the previous one */ |
| pToU2022State->g=pToU2022State->prevG; |
| break; |
| case JISX201: |
| if(mySourceChar <= 0x7f) { |
| targetUniChar = jisx201ToU(mySourceChar); |
| } |
| break; |
| case HWKANA_7BIT: |
| if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { |
| /* 7-bit halfwidth Katakana */ |
| targetUniChar = mySourceChar + (HWKANA_START - 0x21); |
| } |
| break; |
| default: |
| /* G0 DBCS */ |
| if(mySource < mySourceLimit) { |
| int leadIsOk, trailIsOk; |
| uint8_t trailByte; |
| getTrailByte: |
| trailByte = (uint8_t)*mySource; |
| /* |
| * Ticket 5691: consistent illegal sequences: |
| * - We include at least the first byte in the illegal sequence. |
| * - If any of the non-initial bytes could be the start of a character, |
| * we stop the illegal sequence before the first one of those. |
| * |
| * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
| * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
| * Otherwise we convert or report the pair of bytes. |
| */ |
| leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
| trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
| if (leadIsOk && trailIsOk) { |
| ++mySource; |
| tmpSourceChar = (mySourceChar << 8) | trailByte; |
| if(cs == JISX208) { |
| _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); |
| mySourceChar = tmpSourceChar; |
| } else { |
| /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ |
| mySourceChar = tmpSourceChar; |
| if (cs == KSC5601) { |
| tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ |
| } |
| tempBuf[0] = (char)(tmpSourceChar >> 8); |
| tempBuf[1] = (char)(tmpSourceChar); |
| } |
| targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); |
| } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
| ++mySource; |
| /* add another bit so that the code below writes 2 bytes in case of error */ |
| mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
| } |
| } else { |
| args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| args->converter->toULength = 1; |
| goto endloop; |
| } |
| } /* End of inner switch */ |
| break; |
| } /* End of outer switch */ |
| if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
| if(args->offsets){ |
| args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| } |
| *(myTarget++)=(UChar)targetUniChar; |
| } |
| else if(targetUniChar > missingCharMarker){ |
| /* disassemble the surrogate pair and write to output*/ |
| targetUniChar-=0x0010000; |
| *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
| if(args->offsets){ |
| args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| } |
| ++myTarget; |
| if(myTarget< args->targetLimit){ |
| *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
| if(args->offsets){ |
| args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| } |
| ++myTarget; |
| }else{ |
| args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= |
| (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
| } |
| |
| } |
| else{ |
| /* Call the callback function*/ |
| toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
| break; |
| } |
| } |
| else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ |
| *err =U_BUFFER_OVERFLOW_ERROR; |
| break; |
| } |
| } |
| endloop: |
| args->target = myTarget; |
| args->source = mySource; |
| } |
| |
| |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| /*************************************************************** |
| * Rules for ISO-2022-KR encoding |
| * i) The KSC5601 designator sequence should appear only once in a file, |
| * at the beginning of a line before any KSC5601 characters. This usually |
| * means that it appears by itself on the first line of the file |
| * ii) There are only 2 shifting sequences SO to shift into double byte mode |
| * and SI to shift into single byte mode |
| */ |
| static void U_CALLCONV |
| UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
| |
| UConverter* saveConv = args->converter; |
| UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; |
| args->converter=myConverterData->currentConverter; |
| |
| myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; |
| ucnv_MBCSFromUnicodeWithOffsets(args,err); |
| saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
| |
| if(*err == U_BUFFER_OVERFLOW_ERROR) { |
| if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
| uprv_memcpy( |
| saveConv->charErrorBuffer, |
| myConverterData->currentConverter->charErrorBuffer, |
| myConverterData->currentConverter->charErrorBufferLength); |
| } |
| saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; |
| myConverterData->currentConverter->charErrorBufferLength = 0; |
| } |
| args->converter=saveConv; |
| } |
| |
| static void U_CALLCONV |
| UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
| |
| const UChar *source = args->source; |
| const UChar *sourceLimit = args->sourceLimit; |
| unsigned char *target = (unsigned char *) args->target; |
| unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
| int32_t* offsets = args->offsets; |
| uint32_t targetByteUnit = 0x0000; |
| UChar32 sourceChar = 0x0000; |
| UBool isTargetByteDBCS; |
| UBool oldIsTargetByteDBCS; |
| UConverterDataISO2022 *converterData; |
| UConverterSharedData* sharedData; |
| UBool useFallback; |
| int32_t length =0; |
| |
| converterData=(UConverterDataISO2022*)args->converter->extraInfo; |
| /* if the version is 1 then the user is requesting |
| * conversion with ibm-25546 pass the arguments to |
| * MBCS converter and return |
| */ |
| if(converterData->version==1){ |
| UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
| return; |
| } |
| |
| /* initialize data */ |
| sharedData = converterData->currentConverter->sharedData; |
| useFallback = args->converter->useFallback; |
| isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; |
| oldIsTargetByteDBCS = isTargetByteDBCS; |
| |
| isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; |
| if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { |
| goto getTrail; |
| } |
| while(source < sourceLimit){ |
| |
| targetByteUnit = missingCharMarker; |
| |
| if(target < (unsigned char*) args->targetLimit){ |
| sourceChar = *source++; |
| |
| /* do not convert SO/SI/ESC */ |
| if(IS_2022_CONTROL(sourceChar)) { |
| /* callback(illegal) */ |
| *err=U_ILLEGAL_CHAR_FOUND; |
| args->converter->fromUChar32=sourceChar; |
| break; |
| } |
| |
| length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); |
| if(length < 0) { |
| length = -length; /* fallback */ |
| } |
| /* only DBCS or SBCS characters are expected*/ |
| /* DB characters with high bit set to 1 are expected */ |
| if( length > 2 || length==0 || |
| (length == 1 && targetByteUnit > 0x7f) || |
| (length == 2 && |
| ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || |
| (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) |
| ) { |
| targetByteUnit=missingCharMarker; |
| } |
| if (targetByteUnit != missingCharMarker){ |
| |
| oldIsTargetByteDBCS = isTargetByteDBCS; |
| isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); |
| /* append the shift sequence */ |
| if (oldIsTargetByteDBCS != isTargetByteDBCS ){ |
| |
| if (isTargetByteDBCS) |
| *target++ = UCNV_SO; |
| else |
| *target++ = UCNV_SI; |
| if(offsets) |
| *(offsets++) = (int32_t)(source - args->source-1); |
| } |
| /* write the targetUniChar to target */ |
| if(targetByteUnit <= 0x00FF){ |
| if( target < targetLimit){ |
| *(target++) = (unsigned char) targetByteUnit; |
| if(offsets){ |
| *(offsets++) = (int32_t)(source - args->source-1); |
| } |
| |
| }else{ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); |
| *err = U_BUFFER_OVERFLOW_ERROR; |
| } |
| }else{ |
| if(target < targetLimit){ |
| *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); |
| if(offsets){ |
| *(offsets++) = (int32_t)(source - args->source-1); |
| } |
| if(target < targetLimit){ |
| *(target++) =(unsigned char) (targetByteUnit -0x80); |
| if(offsets){ |
| *(offsets++) = (int32_t)(source - args->source-1); |
| } |
| }else{ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); |
| *err = U_BUFFER_OVERFLOW_ERROR; |
| } |
| }else{ |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); |
| args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); |
| *err = U_BUFFER_OVERFLOW_ERROR; |
| } |
| } |
| |
| } |
| else{ |
| /* oops.. the code point is unassingned |
| * set the error and reason |
| */ |
| |
| /*check if the char is a First surrogate*/ |
| if(U16_IS_SURROGATE(sourceChar)) { |
| if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
| getTrail: |
| /*look ahead to find the trail surrogate*/ |
| if(source < sourceLimit) { |
| /* test the following code unit */ |
| UChar trail=(UChar) *source; |
| if(U16_IS_TRAIL(trail)) { |
| source++; |
| sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
| *err = U_INVALID_CHAR_FOUND; |
| /* convert this surrogate code point */ |
| /* exit this condition tree */ |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| *err=U_ILLEGAL_CHAR_FOUND; |
| } |
| } else { |
| /* no more input */ |
| *err = U_ZERO_ERROR; |
| } |
| } else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| *err=U_ILLEGAL_CHAR_FOUND; |
| } |
| } else { |
| /* callback(unassigned) for a BMP code point */ |
| *err = U_INVALID_CHAR_FOUND; |
| } |
| |
| args->converter->fromUChar32=sourceChar; |
| break; |
| } |
| } /* end if(myTargetIndex<myTargetLength) */ |
| else{ |
| *err =U_BUFFER_OVERFLOW_ERROR; |
| break; |
| } |
| |
| }/* end while(mySourceIndex<mySourceLength) */ |
| |
| /* |
| * the end of the input stream and detection of truncated input |
| * are handled by the framework, but for ISO-2022-KR conversion |
| * we need to be in ASCII mode at the very end |
| * |
| * conditions: |
| * successful |
| * not in ASCII mode |
| * end of input and no truncated input |
| */ |
| if( U_SUCCESS(*err) && |
| isTargetByteDBCS && |
| args->flush && source>=sourceLimit && args->converter->fromUChar32==0 |
| ) { |
| int32_t sourceIndex; |
| |
| /* we are switching to ASCII */ |
| isTargetByteDBCS=FALSE; |
| |
| /* get the source index of the last input character */ |
| /* |
| * TODO this would be simpler and more reliable if we used a pair |
| * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
| * so that we could simply use the prevSourceIndex here; |
| * this code gives an incorrect result for the rare case of an unmatched |
| * trail surrogate that is alone in the last buffer of the text stream |
| */ |
| sourceIndex=(int32_t)(source-args->source); |
| if(sourceIndex>0) { |
| --sourceIndex; |
| if( U16_IS_TRAIL(args->source[sourceIndex]) && |
| (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
| ) { |
| --sourceIndex; |
| } |
| } else { |
| sourceIndex=-1; |
| } |
| |
| fromUWriteUInt8( |
| args->converter, |
| SHIFT_IN_STR, 1, |
| &target, (const char *)targetLimit, |
| &offsets, sourceIndex, |
| err); |
| } |
| |
| /*save the state and return */ |
| args->source = source; |
| args->target = (char*)target; |
| args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; |
| } |
| |
| /************************ To Unicode ***************************************/ |
| |
| static void U_CALLCONV |
| UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, |
| UErrorCode* err){ |
| char const* sourceStart; |
| UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| |
| UConverterToUnicodeArgs subArgs; |
| int32_t minArgsSize; |
| |
| /* set up the subconverter arguments */ |
| if(args->size<sizeof(UConverterToUnicodeArgs)) { |
| minArgsSize = args->size; |
| } else { |
| minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); |
| } |
| |
| uprv_memcpy(&subArgs, args, minArgsSize); |
| subArgs.size = (uint16_t)minArgsSize; |
| subArgs.converter = myData->currentConverter; |
| |
| /* remember the original start of the input for offsets */ |
| sourceStart = args->source; |
| |
| if(myData->key != 0) { |
| /* continue with a partial escape sequence */ |
| goto escape; |
| } |
| |
| while(U_SUCCESS(*err) && args->source < args->sourceLimit) { |
| /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
| subArgs.source = args->source; |
| subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); |
| if(subArgs.source != subArgs.sourceLimit) { |
| /* |
| * get the current partial byte sequence |
| * |
| * it needs to be moved between the public and the subconverter |
| * so that the conversion framework, which only sees the public |
| * converter, can handle truncated and illegal input etc. |
| */ |
| if(args->converter->toULength > 0) { |
| uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); |
| } |
| subArgs.converter->toULength = args->converter->toULength; |
| |
| /* |
| * Convert up to the end of the input, or to before the next escape character. |
| * Does not handle conversion extensions because the preToU[] state etc. |
| * is not copied. |
| */ |
| ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); |
| |
| if(args->offsets != NULL && sourceStart != args->source) { |
| /* update offsets to base them on the actual start of the input */ |
| int32_t *offsets = args->offsets; |
| UChar *target = args->target; |
| int32_t delta = (int32_t)(args->source - sourceStart); |
| while(target < subArgs.target) { |
| if(*offsets >= 0) { |
| *offsets += delta; |
| } |
| ++offsets; |
| ++target; |
| } |
| } |
| args->source = subArgs.source; |
| args->target = subArgs.target; |
| args->offsets = subArgs.offsets; |
| |
| /* copy input/error/overflow buffers */ |
| if(subArgs.converter->toULength > 0) { |
| uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); |
| } |
| args->converter->toULength = subArgs.converter->toULength; |
| |
| if(*err == U_BUFFER_OVERFLOW_ERROR) { |
| if(subArgs.converter->UCharErrorBufferLength > 0) { |
| uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, |
| subArgs.converter->UCharErrorBufferLength); |
| } |
| args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; |
| subArgs.converter->UCharErrorBufferLength = 0; |
| } |
| } |
| |
| if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { |
| return; |
| } |
| |
| escape: |
| changeState_2022(args->converter, |
| &(args->source), |
| args->sourceLimit, |
| ISO_2022_KR, |
| err); |
| } |
| } |
| |
| static void U_CALLCONV |
| UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
| UErrorCode* err){ |
| char tempBuf[2]; |
| const char *mySource = ( char *) args->source; |
| UChar *myTarget = args->target; |
| const char *mySourceLimit = args->sourceLimit; |
| UChar32 targetUniChar = 0x0000; |
| UChar mySourceChar = 0x0000; |
| UConverterDataISO2022* myData; |
| UConverterSharedData* sharedData ; |
| UBool useFallback; |
| |
| myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| if(myData->version==1){ |
| UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
| return; |
| } |
| |
| /* initialize state */ |
| sharedData = myData->currentConverter->sharedData; |
| useFallback = args->converter->useFallback; |
| |
| if(myData->key != 0) { |
| /* continue with a partial escape sequence */ |
| goto escape; |
| } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
| /* continue with a partial double-byte character */ |
| mySourceChar = args->converter->toUBytes[0]; |
| args->converter->toULength = 0; |
| goto getTrailByte; |
| } |
| |
| while(mySource< mySourceLimit){ |
| |
| if(myTarget < args->targetLimit){ |
| |
| mySourceChar= (unsigned char) *mySource++; |
| |
| if(mySourceChar==UCNV_SI){ |
| myData->toU2022State.g = 0; |
| if (myData->isEmptySegment) { |
| myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ |
| *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| args->converter->toULength = 1; |
| args->target = myTarget; |
| args->source = mySource; |
| return; |
| } |
| /*consume the source */ |
| continue; |
| }else if(mySourceChar==UCNV_SO){ |
| myData->toU2022State.g = 1; |
| myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ |
| /*consume the source */ |
| continue; |
| }else if(mySourceChar==ESC_2022){ |
| mySource--; |
| escape: |
| myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ |
| changeState_2022(args->converter,&(mySource), |
| mySourceLimit, ISO_2022_KR, err); |
| if(U_FAILURE(*err)){ |
| args->target = myTarget; |
| args->source = mySource; |
| return; |
| } |
| continue; |
| } |
| |
| myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ |
| if(myData->toU2022State.g == 1) { |
| if(mySource < mySourceLimit) { |
| int leadIsOk, trailIsOk; |
| uint8_t trailByte; |
| getTrailByte: |
| targetUniChar = missingCharMarker; |
| trailByte = (uint8_t)*mySource; |
| /* |
| * Ticket 5691: consistent illegal sequences: |
| * - We include at least the first byte in the illegal sequence. |
| * - If any of the non-initial bytes could be the start of a character, |
| * we stop the illegal sequence before the first one of those. |
| * |
| * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
| * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
| * Otherwise we convert or report the pair of bytes. |
| */ |
| leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
| trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
| if (leadIsOk && trailIsOk) { |
| ++mySource; |
| tempBuf[0] = (char)(mySourceChar + 0x80); |
| tempBuf[1] = (char)(trailByte + 0x80); |
| targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); |
| mySourceChar = (mySourceChar << 8) | trailByte; |
| } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
| ++mySource; |
| /* add another bit so that the code below writes 2 bytes in case of error */ |
| mySourceChar = static_cast<UChar>(0x10000 | (mySourceChar << 8) | trailByte); |
| } |
| } else { |
| args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| args->converter->toULength = 1; |
| break; |
| } |
| } |
| else if(mySourceChar <= 0x7f) { |
| targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); |
| } else { |
| targetUniChar = 0xffff; |
| } |
| if(targetUniChar < 0xfffe){ |
| if(args->offsets) { |
| args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| } |
| *(myTarget++)=(UChar)targetUniChar; |
| } |
| else { |
| /* Call the callback function*/ |
| toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
| break; |
| } |
| } |
| else{ |
| *err =U_BUFFER_OVERFLOW_ERROR; |
| break; |
| } |
| } |
| args->target = myTarget; |
| args->source = mySource; |
| } |
| |
| /*************************** END ISO2022-KR *********************************/ |
| |
| /*************************** ISO-2022-CN ********************************* |
| * |
| * Rules for ISO-2022-CN Encoding: |
| * i) The designator sequence must appear once on a line before any instance |
| * of character set it designates. |
| * ii) If two lines contain characters from the same character set, both lines |
| * must include the designator sequence. |
| * iii) Once the designator sequence is known, a shifting sequence has to be found |
| * to invoke the shifting |
| * iv) All lines start in ASCII and end in ASCII. |
| * v) Four shifting sequences are employed for this purpose: |
| * |
| * Sequence ASCII Eq Charsets |
| * ---------- ------- --------- |
| * SI <SI> US-ASCII |
| * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 |
| * SS2 <ESC>N CNS-11643-1992 Plane 2 |
| * SS3 <ESC>O CNS-11643-1992 Planes 3-7 |
| * |
| * vi) |
| * SOdesignator : ESC "$" ")" finalchar_for_SO |
| * SS2designator : ESC "$" "*" finalchar_for_SS2 |
| * SS3designator : ESC "$" "+" finalchar_for_SS3 |
| * |
| * ESC $ ) A Indicates the bytes following SO are Chinese |
| * characters as defined in GB 2312-80, until |
| * another SOdesignation appears |
| * |
| * |
| * ESC $ ) E Indicates the bytes following SO are as defined |
| * in ISO-IR-165 (for details, see section 2.1), |
| * until another SOdesignation appears |
| * |
| * ESC $ ) G Indicates the bytes following SO are as defined |
| * in CNS 11643-plane-1, until another |
| * SOdesignation appears |
| * |
| * ESC $ * H Indicates the two bytes immediately following |
| * SS2 is a Chinese character as defined in CNS |
| * 11643-plane-2, until another SS2designation |
| * appears |
| * (Meaning <ESC>N must precede every 2 byte |
| * sequence.) |
| * |
| * ESC $ + I Indicates the immediate two bytes following SS3 |
| * is a Chinese character as defined in CNS |
| * 11643-plane-3, until another SS3designation |
| * appears |
| * (Meaning <ESC>O must precede every 2 byte |
| * sequence.) |
| * |
| * ESC $ + J Indicates the immediate two bytes following SS3 |
| * is a Chinese character as defined in CNS |
| * 11643-plane-4, until another SS3designation |
| * appears |
| * (In English: <ESC>O must precede every 2 byte |
| * sequence.) |
| * |
| * ESC $ + K Indicates the immediate two bytes following SS3 |
| * is a Chinese character as defined in CNS |
| * 11643-plane-5, until another SS3designation |
| * appears |
| * |
| * ESC $ + L Indicates the immediate two bytes following SS3 |
| * is a Chinese character as defined in CNS |
| * 11643-plane-6, until another SS3designation |
| * appears |
| * |
| * ESC $ + M Indicates the immediate two bytes following SS3 |
| * is a Chinese character as defined in CNS |
| * 11643-plane-7, until another SS3designation |
| * appears |
| * |
| * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and |
| * has its own designation information before any Chinese characters |
| * appear |
| * |
| */ |
| |
| /* The following are defined this way to make the strings truly readonly */ |
| static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; |
| static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; |
| static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; |
| static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; |
| static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; |
| static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; |
| static const char CNS_1
|