| /* |
| ******************************************************************************* |
| * Copyright (C) 2010, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.charset; |
| |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.IntBuffer; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| |
| import com.ibm.icu.charset.CharsetMBCS.CharsetDecoderMBCS; |
| import com.ibm.icu.charset.CharsetMBCS.CharsetEncoderMBCS; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| class CharsetCompoundText extends CharsetICU { |
| /** Replacement byte sequence ('?', 0x3F) that the encoder hands to its superclass. */ |
| private static final byte[] fromUSubstitution = { (byte) 0x3F }; |
| /** Delegate charsets indexed by state number; populated by LoadConverters(). */ |
| private CharsetMBCS[] gbCharset; |
| |
| /* States 0-3: the single-byte charsets. Every state value doubles as an array index. */ |
| private static final byte COMPOUND_TEXT_SINGLE_0 = 0; |
| private static final byte COMPOUND_TEXT_SINGLE_1 = 1; |
| private static final byte COMPOUND_TEXT_SINGLE_2 = 2; |
| private static final byte COMPOUND_TEXT_SINGLE_3 = 3; |
| |
| /* |
|  * States 4-11 are only ever addressed by index, so their names stay disabled. |
|  * They are listed here as a record of the numbering: |
|  * |
|  *   COMPOUND_TEXT_DOUBLE_1 = 4 |
|  *   COMPOUND_TEXT_DOUBLE_2 = 5 |
|  *   COMPOUND_TEXT_DOUBLE_3 = 6 |
|  *   COMPOUND_TEXT_DOUBLE_4 = 7 |
|  *   COMPOUND_TEXT_DOUBLE_5 = 8 |
|  *   COMPOUND_TEXT_DOUBLE_6 = 9 |
|  *   COMPOUND_TEXT_DOUBLE_7 = 10 |
|  *   COMPOUND_TEXT_TRIPLE_DOUBLE = 11 |
|  */ |
| |
| /* States 12-19: charsets that LoadConverters() looks up by explicit converter name. */ |
| private static final byte IBM_915 = 12; |
| private static final byte IBM_916 = 13; |
| private static final byte IBM_914 = 14; |
| private static final byte IBM_874 = 15; |
| private static final byte IBM_912 = 16; |
| private static final byte IBM_913 = 17; |
| private static final byte ISO_8859_14 = 18; |
| private static final byte IBM_923 = 19; |
| |
| /** Total number of states; the size of every per-state converter array. */ |
| private static final byte NUM_OF_CONVERTERS = 20; |
| |
| /** |
|  * Exclusive upper bound of the states (1..11) whose converters are loaded by generated |
|  * name and probed by the encoder when no range predicate claims a code point. |
|  */ |
| private static final byte SEARCH_LENGTH = 12; |
| |
| /** |
|  * Escape sequence that selects each state. The row index is the state number, so the |
|  * row order has to stay aligned with the state constants of this class. |
|  */ |
| private static final byte[][] escSeqCcompoundText = { |
|     /* Single-byte charsets */ |
|     { 0x1B, 0x2D, 0x41 },       /* state 0 */ |
|     { 0x1B, 0x2D, 0x4D },       /* state 1 */ |
|     { 0x1B, 0x2D, 0x46 },       /* state 2 */ |
|     { 0x1B, 0x2D, 0x47 },       /* state 3 */ |
| |
|     /* Double-byte charsets */ |
|     { 0x1B, 0x24, 0x29, 0x41 }, /* state 4 */ |
|     { 0x1B, 0x24, 0x29, 0x42 }, /* state 5 */ |
|     { 0x1B, 0x24, 0x29, 0x43 }, /* state 6 */ |
|     { 0x1B, 0x24, 0x29, 0x44 }, /* state 7 */ |
|     { 0x1B, 0x24, 0x29, 0x47 }, /* state 8 */ |
|     { 0x1B, 0x24, 0x29, 0x48 }, /* state 9 */ |
|     { 0x1B, 0x24, 0x29, 0x49 }, /* state 10 */ |
| |
|     /* Triple/Double */ |
|     { 0x1B, 0x25, 0x47 },       /* state 11 */ |
| |
|     { 0x1B, 0x2D, 0x4C },       /* state 12: IBM-915 */ |
|     { 0x1B, 0x2D, 0x48 },       /* state 13: IBM-916 */ |
|     { 0x1B, 0x2D, 0x44 },       /* state 14: IBM-914 */ |
|     { 0x1B, 0x2D, 0x54 },       /* state 15: IBM-874 */ |
|     { 0x1B, 0x2D, 0x42 },       /* state 16: IBM-912 */ |
|     { 0x1B, 0x2D, 0x43 },       /* state 17: IBM-913 */ |
|     { 0x1B, 0x2D, 0x5F },       /* state 18: ISO-8859_14 */ |
|     { 0x1B, 0x2D, 0x62 },       /* state 19: IBM-923 */ |
| }; |
| |
| /** The ESC byte that opens every escape sequence in the table. */ |
| private static final byte ESC_START = 0x1B; |
| |
| private static boolean isASCIIRange(int codepoint) { |
| if ((codepoint == 0x0000) || (codepoint == 0x0009) || (codepoint == 0x000A) || |
| (codepoint >= 0x0020 && codepoint <= 0x007f) || (codepoint >= 0x00A0 && codepoint <= 0x00FF)) { |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} is U+2116 or lies in U+0401..U+045F. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in that set |
|  */ |
| private static boolean isIBM915(int codepoint) { |
|     if (codepoint == 0x2116) { |
|         return true; |
|     } |
|     return 0x0401 <= codepoint && codepoint <= 0x045F; |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} is U+2017, U+203E or lies in U+05D0..U+05EA. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in that set |
|  */ |
| private static boolean isIBM916(int codepoint) { |
|     switch (codepoint) { |
|     case 0x2017: |
|     case 0x203E: |
|         return true; |
|     default: |
|         return 0x05D0 <= codepoint && codepoint <= 0x05EA; |
|     } |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} belongs to the set routed to state 3: |
|  * U+060C, U+061B, U+061F, U+0621..U+063A, U+0640..U+0652, U+0660..U+066D, U+200B, |
|  * U+FE70..U+FE72, U+FE74 and U+FE76..U+FEBE. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in that set |
|  */ |
| private static boolean isCompoundS3(int codepoint) { |
|     switch (codepoint) { |
|     case 0x060C: |
|     case 0x061B: |
|     case 0x061F: |
|     case 0x200B: |
|     case 0xFE74: |
|         return true; |
|     default: |
|         break; |
|     } |
|     if (0x0621 <= codepoint && codepoint <= 0x063A) { |
|         return true; |
|     } |
|     if (0x0640 <= codepoint && codepoint <= 0x0652) { |
|         return true; |
|     } |
|     if (0x0660 <= codepoint && codepoint <= 0x066D) { |
|         return true; |
|     } |
|     if (0xFE70 <= codepoint && codepoint <= 0xFE72) { |
|         return true; |
|     } |
|     return 0xFE76 <= codepoint && codepoint <= 0xFEBE; |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} belongs to the set routed to state 2: |
|  * U+02BC, U+02BD, U+0384..U+03CE and U+2015. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in that set |
|  */ |
| private static boolean isCompoundS2(int codepoint) { |
|     switch (codepoint) { |
|     case 0x02BC: |
|     case 0x02BD: |
|     case 0x2015: |
|         return true; |
|     default: |
|         return 0x0384 <= codepoint && codepoint <= 0x03CE; |
|     } |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} belongs to the set routed to the IBM-914 state. |
|  * The isolated values are listed in the switch; the contiguous runs are |
|  * U+0128..U+012B, U+0136..U+0138, U+014A..U+014D and U+0166..U+016B. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in that set |
|  */ |
| private static boolean isIBM914(int codepoint) { |
|     switch (codepoint) { |
|     case 0x0100: |
|     case 0x0101: |
|     case 0x0112: |
|     case 0x0113: |
|     case 0x0116: |
|     case 0x0117: |
|     case 0x0122: |
|     case 0x0123: |
|     case 0x012E: |
|     case 0x012F: |
|     case 0x013B: |
|     case 0x013C: |
|     case 0x0145: |
|     case 0x0146: |
|     case 0x0156: |
|     case 0x0157: |
|     case 0x0172: |
|     case 0x0173: |
|         return true; |
|     default: |
|         break; |
|     } |
|     return (0x0128 <= codepoint && codepoint <= 0x012B) |
|             || (0x0136 <= codepoint && codepoint <= 0x0138) |
|             || (0x014A <= codepoint && codepoint <= 0x014D) |
|             || (0x0166 <= codepoint && codepoint <= 0x016B); |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} lies in U+0E01..U+0E3A or U+0E3F..U+0E5B. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in one of the two ranges |
|  */ |
| private static boolean isIBM874(int codepoint) { |
|     /* Outside the overall span U+0E01..U+0E5B nothing matches. */ |
|     if (codepoint < 0x0E01 || codepoint > 0x0E5B) { |
|         return false; |
|     } |
|     /* Inside the span only the gap U+0E3B..U+0E3E is excluded. */ |
|     return codepoint <= 0x0E3A || codepoint >= 0x0E3F; |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} belongs to the set routed to the IBM-912 state. |
|  * <p> |
|  * The previous version tested U+0147 twice and never tested U+0148, so the lowercase |
|  * partner of U+0147 was not recognised here even though it is part of the same |
|  * Latin-2 repertoire. U+0148 is now accepted alongside U+0147. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in the IBM-912 set |
|  */ |
| private static boolean isIBM912(int codepoint) { |
|     switch (codepoint) { |
|     case 0x0139: |
|     case 0x013A: |
|     case 0x013D: |
|     case 0x013E: |
|     case 0x0147: |
|     case 0x0148: /* was missing: the original repeated 0x0147 instead */ |
|     case 0x0150: |
|     case 0x0151: |
|     case 0x0154: |
|     case 0x0155: |
|     case 0x015E: |
|     case 0x015F: |
|     case 0x016E: |
|     case 0x016F: |
|     case 0x0170: |
|     case 0x0171: |
|     case 0x02C7: |
|     case 0x02D8: |
|     case 0x02D9: |
|     case 0x02DB: |
|     case 0x02DD: |
|         return true; |
|     default: |
|         break; |
|     } |
|     return (codepoint >= 0x0102 && codepoint <= 0x0107) |
|             || (codepoint >= 0x010C && codepoint <= 0x0111) |
|             || (codepoint >= 0x0118 && codepoint <= 0x011B) |
|             || (codepoint >= 0x0141 && codepoint <= 0x0144) |
|             || (codepoint >= 0x0158 && codepoint <= 0x015B) |
|             || (codepoint >= 0x0160 && codepoint <= 0x0165) |
|             || (codepoint >= 0x0179 && codepoint <= 0x017E); |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} belongs to the set routed to the IBM-913 state. |
|  * The isolated values are listed in the switch; the contiguous runs are |
|  * U+0108..U+010B and U+0124..U+0127. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in that set |
|  */ |
| private static boolean isIBM913(int codepoint) { |
|     switch (codepoint) { |
|     case 0x011C: |
|     case 0x011D: |
|     case 0x0120: |
|     case 0x0121: |
|     case 0x0134: |
|     case 0x0135: |
|     case 0x015C: |
|     case 0x015D: |
|     case 0x016C: |
|     case 0x016D: |
|         return true; |
|     default: |
|         break; |
|     } |
|     return (0x0108 <= codepoint && codepoint <= 0x010B) |
|             || (0x0124 <= codepoint && codepoint <= 0x0127); |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} belongs to the set routed to state 1: |
|  * U+011E, U+011F, U+0130, U+0131 and U+0218..U+021B. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in that set |
|  */ |
| private static boolean isCompoundS1(int codepoint) { |
|     switch (codepoint) { |
|     case 0x011E: |
|     case 0x011F: |
|     case 0x0130: |
|     case 0x0131: |
|         return true; |
|     default: |
|         return 0x0218 <= codepoint && codepoint <= 0x021B; |
|     } |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} belongs to the set routed to the ISO-8859-14 state. |
|  * The isolated values are listed in the switch; the contiguous runs are |
|  * U+0174..U+0177 and U+1E80..U+1E85. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is in that set |
|  */ |
| private static boolean isISO8859_14(int codepoint) { |
|     switch (codepoint) { |
|     case 0x1E0A: |
|     case 0x1E0B: |
|     case 0x1E1E: |
|     case 0x1E1F: |
|     case 0x1E40: |
|     case 0x1E41: |
|     case 0x1E56: |
|     case 0x1E57: |
|     case 0x1E60: |
|     case 0x1E61: |
|     case 0x1E6A: |
|     case 0x1E6B: |
|     case 0x1EF2: |
|     case 0x1EF3: |
|         return true; |
|     default: |
|         break; |
|     } |
|     if (0x0174 <= codepoint && codepoint <= 0x0177) { |
|         return true; |
|     } |
|     return 0x1E80 <= codepoint && codepoint <= 0x1E85; |
| } |
| |
| /** |
|  * Reports whether {@code codepoint} is one of U+0152, U+0153, U+0178 or U+20AC. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return {@code true} if the code point is one of those four values |
|  */ |
| private static boolean isIBM923(int codepoint) { |
|     switch (codepoint) { |
|     case 0x0152: |
|     case 0x0153: |
|     case 0x0178: |
|     case 0x20AC: |
|         return true; |
|     default: |
|         return false; |
|     } |
| } |
| |
| private static int findNextEsc(ByteBuffer source) { |
| for (int i = source.position(); i < source.limit(); i++) { |
| if (source.get(i) == 0x1B) { |
| return i; |
| } |
| } |
| return source.limit(); |
| } |
| |
| /** |
|  * Maps a code point to the state whose range predicate claims it. |
|  * The predicates are consulted in a fixed order and the first match wins. |
|  * |
|  * @param codepoint the code point to classify |
|  * @return the matching state constant, or -1 when no predicate accepts the code point |
|  */ |
| private static byte getState(int codepoint) { |
|     if (isASCIIRange(codepoint)) { |
|         return COMPOUND_TEXT_SINGLE_0; |
|     } |
|     if (isIBM912(codepoint)) { |
|         return IBM_912; |
|     } |
|     if (isIBM913(codepoint)) { |
|         return IBM_913; |
|     } |
|     if (isISO8859_14(codepoint)) { |
|         return ISO_8859_14; |
|     } |
|     if (isIBM923(codepoint)) { |
|         return IBM_923; |
|     } |
|     if (isIBM874(codepoint)) { |
|         return IBM_874; |
|     } |
|     if (isIBM914(codepoint)) { |
|         return IBM_914; |
|     } |
|     if (isCompoundS2(codepoint)) { |
|         return COMPOUND_TEXT_SINGLE_2; |
|     } |
|     if (isCompoundS3(codepoint)) { |
|         return COMPOUND_TEXT_SINGLE_3; |
|     } |
|     if (isIBM916(codepoint)) { |
|         return IBM_916; |
|     } |
|     if (isIBM915(codepoint)) { |
|         return IBM_915; |
|     } |
|     if (isCompoundS1(codepoint)) { |
|         return COMPOUND_TEXT_SINGLE_1; |
|     } |
|     return -1; |
| } |
| |
| private static int findStateFromEscSeq(ByteBuffer source) { |
| int state = -1; |
| int sourceIndex = source.position(); |
| boolean matchFound = false; |
| int i, n; |
| |
| for (i = 0; i < escSeqCcompoundText.length; i++) { |
| for (n = 0; n < escSeqCcompoundText[i].length; n++) { |
| matchFound = true; |
| if (source.get(sourceIndex + n) != escSeqCcompoundText[i][n]) { |
| matchFound = false; |
| break; |
| } |
| } |
| if (matchFound) { |
| break; |
| } |
| } |
| |
| if (matchFound) { |
| state = i; |
| source.position(sourceIndex + escSeqCcompoundText[i].length); |
| } |
| |
| return state; |
| } |
| |
| /** |
|  * Creates the charset, loads all delegate converters and sets the size limits |
|  * (1 to 6 bytes per char, at most 1 char per byte). |
|  * |
|  * @param icuCanonicalName  forwarded to the superclass constructor |
|  * @param javaCanonicalName forwarded to the superclass constructor |
|  * @param aliases           forwarded to the superclass constructor |
|  */ |
| public CharsetCompoundText(String icuCanonicalName, String javaCanonicalName, String[] aliases) { |
|     super(icuCanonicalName, javaCanonicalName, aliases); |
| |
|     LoadConverters(); |
| |
|     this.minBytesPerChar = 1; |
|     this.maxBytesPerChar = 6; |
|     this.maxCharsPerByte = 1; |
| } |
| |
| private void LoadConverters() { |
| gbCharset = new CharsetMBCS[NUM_OF_CONVERTERS]; |
| |
| gbCharset[COMPOUND_TEXT_SINGLE_0] = null; |
| |
| for (int i = 1; i < SEARCH_LENGTH; i++) { |
| String name = "icu-internal-compound-"; |
| if (i <= 3) { |
| name = name + "s" + i; |
| } else if (i <= 10) { |
| name = name + "d" + (i - 3); |
| } else { |
| name = name + "t"; |
| } |
| |
| gbCharset[i] = (CharsetMBCS)CharsetICU.forNameICU(name); |
| } |
| |
| gbCharset[IBM_915] = (CharsetMBCS)CharsetICU.forNameICU("ibm-915_P100-1995"); |
| gbCharset[IBM_916] = (CharsetMBCS)CharsetICU.forNameICU("ibm-916_P100-1995"); |
| gbCharset[IBM_914] = (CharsetMBCS)CharsetICU.forNameICU("ibm-914_P100-1995"); |
| gbCharset[IBM_874] = (CharsetMBCS)CharsetICU.forNameICU("ibm-874_P100-1995"); |
| gbCharset[IBM_912] = (CharsetMBCS)CharsetICU.forNameICU("ibm-912_P100-1995"); |
| gbCharset[IBM_913] = (CharsetMBCS)CharsetICU.forNameICU("ibm-913_P100-2000"); |
| gbCharset[ISO_8859_14] = (CharsetMBCS)CharsetICU.forNameICU("iso-8859_14-1998"); |
| gbCharset[IBM_923] = (CharsetMBCS)CharsetICU.forNameICU("ibm-923_P100-1998"); |
| } |
| |
| class CharsetEncoderCompoundText extends CharsetEncoderICU { |
| /** Delegate encoders indexed by state number; slot 0 is {@code null}. */ |
| CharsetEncoderMBCS[] gbEncoder; |
| |
| /** |
|  * Creates the encoder and one delegate encoder for every state except state 0. |
|  * |
|  * @param cs the charset passed on to the superclass constructor |
|  */ |
| public CharsetEncoderCompoundText(CharsetICU cs) { |
|     super(cs, fromUSubstitution); |
| |
|     gbEncoder = new CharsetEncoderMBCS[NUM_OF_CONVERTERS]; |
|     gbEncoder[0] = null; |
| |
|     for (int state = 1; state < NUM_OF_CONVERTERS; state++) { |
|         gbEncoder[state] = (CharsetEncoderMBCS)gbCharset[state].newEncoder(); |
|     } |
| } |
| |
| protected void implReset() { |
| super.implReset(); |
| for (int i = 0; i < NUM_OF_CONVERTERS; i++) { |
| if (gbEncoder[i] != null) { |
| gbEncoder[i].implReset(); |
| } |
| } |
| } |
| |
| /** |
|  * Encodes UTF-16 input as compound text. |
|  * |
|  * For each code point the required state is determined (via getState(), or by probing |
|  * the delegate encoders of states 1..SEARCH_LENGTH-1 when getState() returns a negative |
|  * value). When that state differs from the current one, the state's escape sequence is |
|  * written before the encoded bytes. |
|  * |
|  * State carried between calls: fromUnicodeStatus holds the current state, fromUChar32 |
|  * holds a pending surrogate, and bytes that did not fit into the target are stored in |
|  * errorBuffer / errorBufferLength. |
|  * |
|  * @param source  chars to encode |
|  * @param target  receives the encoded bytes |
|  * @param offsets passed through to the delegate encoders |
|  * @param flush   not read by this method; delegates are always called with true |
|  * @return UNDERFLOW when the loop ends without another result, OVERFLOW when the target |
|  *         is full, a malformed result of length 1 for an unpaired surrogate, or the |
|  *         error result reported by a delegate encoder |
|  */ protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { |
| CoderResult err = CoderResult.UNDERFLOW; |
| int sourceChar; |
| char []sourceCharArray = { 0x0000 }; |
| ByteBuffer tmpTargetBuffer = ByteBuffer.allocate(3); /* bytes produced for one code point */ |
| byte[] targetBytes = new byte[10]; /* escape sequence (if any) followed by the encoded bytes */ |
| int targetLength = 0; |
| int currentState = this.fromUnicodeStatus; /* state left behind by the previous call */ |
| byte tmpState = 0; |
| int i = 0; /* shared loop index; also read by the overflow handling after the main loop */ |
| boolean gotoGetTrail = false; /* stands in for a jump to the getTrail label below */ |
| |
| if (!source.hasRemaining()) |
| return CoderResult.UNDERFLOW; |
| else if (!target.hasRemaining()) |
| return CoderResult.OVERFLOW; |
| |
| /* check if the last codepoint of previous buffer was a lead surrogate |
|  * (fromUChar32 is non-zero when a surrogate was stored by an earlier call) */ |
| if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) { |
| // goto getTrail label |
| gotoGetTrail = true; |
| } |
| |
| while (source.hasRemaining()) { |
| if (target.hasRemaining()) { |
| if (!gotoGetTrail) { |
| sourceChar = source.get(); |
| } |
| |
| targetLength = 0; |
| tmpTargetBuffer.position(0); |
| tmpTargetBuffer.limit(3); |
| |
| /* check if the char is a First surrogate */ |
| if (UTF16.isSurrogate((char)sourceChar) || gotoGetTrail) { |
| if (UTF16.isLeadSurrogate((char)sourceChar) || gotoGetTrail) { |
| // getTrail label |
| /* reset gotoGetTrail flag*/ |
| gotoGetTrail = false; |
| |
| /* look ahead to find the trail surrogate */ |
| if (source.hasRemaining()) { |
| /* test the following code unit (read it, then step back so it is only peeked) */ |
| char trail = source.get(); |
| source.position(source.position()-1); |
| if (UTF16.isTrailSurrogate(trail)) { |
| source.get(); |
| sourceChar = UCharacter.getCodePoint((char)sourceChar, trail); |
| fromUChar32 = 0x00; |
| /* convert this supplementary code point */ |
| /* exit this condition tree */ |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| err = CoderResult.malformedForLength(1); |
| fromUChar32 = sourceChar; |
| break; |
| } |
| } else { |
| /* no more input: remember the lead surrogate for the next call */ |
| fromUChar32 = sourceChar; |
| break; |
| } |
| } else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| err = CoderResult.malformedForLength(1); |
| fromUChar32 = sourceChar; |
| break; |
| } |
| } |
| |
| tmpState = getState(sourceChar); |
| |
| sourceCharArray[0] = (char)sourceChar; /* NOTE(review): the cast keeps only the low 16 bits, so a supplementary code point built above is not passed on intact - confirm intent */ |
| |
| if (tmpState < 0) { |
| /* Test all available converters (states 1..SEARCH_LENGTH-1); the first one that reports no error wins */ |
| for (i = 1; i < SEARCH_LENGTH; i++) { |
| err = gbEncoder[i].cnvMBCSFromUnicodeWithOffsets(CharBuffer.wrap(sourceCharArray), tmpTargetBuffer, offsets, true); |
| if (!err.isError()) { |
| tmpState = (byte)i; |
| tmpTargetBuffer.limit(tmpTargetBuffer.position()); |
| implReset(); /* resets this encoder and all delegates after the probing */ |
| break; |
| } |
| } |
| } else if (tmpState == COMPOUND_TEXT_SINGLE_0) { |
| tmpTargetBuffer.put(0, (byte)sourceChar); /* state 0: the code point value is written as a single byte */ |
| tmpTargetBuffer.limit(1); |
| } else { |
| err = gbEncoder[tmpState].cnvMBCSFromUnicodeWithOffsets(CharBuffer.wrap(sourceCharArray), tmpTargetBuffer, offsets, true); |
| if (!err.isError()) { |
| tmpTargetBuffer.limit(tmpTargetBuffer.position()); |
| } |
| } |
| if (err.isError()) { |
| break; |
| } |
| |
| if (currentState != tmpState) { |
| currentState = tmpState; |
| |
| /* Write escape sequence if necessary */ |
| for (i = 0; i < escSeqCcompoundText[currentState].length; i++) { |
| targetBytes[i] = escSeqCcompoundText[currentState][i]; |
| } |
| targetLength = i; |
| } |
| |
| for (i = 0; i < tmpTargetBuffer.limit(); i++) { /* append the encoded bytes after the escape sequence */ |
| targetBytes[i+targetLength] = tmpTargetBuffer.get(i); |
| } |
| targetLength += i; |
| |
| for (i = 0; i < targetLength; i++) { /* copy to the target until it is full */ |
| if (target.hasRemaining()) { |
| target.put(targetBytes[i]); |
| } else { |
| err = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| } else { |
| err = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| |
| if (err.isOverflow()) { /* keep the bytes that did not fit; i is the index of the first unwritten byte */ |
| int m = 0; |
| for (int n = i; n < targetLength; n++) { |
| this.errorBuffer[m++] = targetBytes[n]; |
| } |
| this.errorBufferLength = m; |
| } |
| this.fromUnicodeStatus = currentState; /* remember the state for the next call */ |
| |
| return err; |
| } |
| } |
| |
| class CharsetDecoderCompoundText extends CharsetDecoderICU { |
| /** Delegate decoders indexed by state number; slot 0 is {@code null}. */ |
| CharsetDecoderMBCS[] gbDecoder; |
| |
| /** |
|  * Creates the decoder and one delegate decoder for every state except state 0. |
|  * |
|  * @param cs the charset passed on to the superclass constructor |
|  */ |
| public CharsetDecoderCompoundText(CharsetICU cs) { |
|     super(cs); |
| |
|     gbDecoder = new CharsetDecoderMBCS[NUM_OF_CONVERTERS]; |
|     gbDecoder[0] = null; |
| |
|     for (int state = 1; state < NUM_OF_CONVERTERS; state++) { |
|         gbDecoder[state] = (CharsetDecoderMBCS)gbCharset[state].newDecoder(); |
|     } |
| } |
| |
| protected void implReset() { |
| super.implReset(); |
| for (int i = 0; i < NUM_OF_CONVERTERS; i++) { |
| if (gbDecoder[i] != null) { |
| gbDecoder[i].implReset(); |
| } |
| } |
| } |
| |
| protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { |
| CoderResult err = CoderResult.UNDERFLOW; |
| byte[] sourceChar = { 0x00 }; |
| int currentState = this.toUnicodeStatus; |
| int tmpState = currentState; |
| CharsetDecoderMBCS decoder; |
| int sourceLimit = source.limit();; |
| |
| if (!source.hasRemaining()) |
| return CoderResult.UNDERFLOW; |
| else if (!target.hasRemaining()) |
| return CoderResult.OVERFLOW; |
| |
| while (source.hasRemaining()) { |
| if (target.hasRemaining()) { |
| sourceChar[0] = source.get(source.position()); |
| |
| if (sourceChar[0] == ESC_START) { |
| tmpState = findStateFromEscSeq(source); |
| if (tmpState < 0) { |
| err = CoderResult.malformedForLength(1); |
| break; |
| } |
| } |
| |
| if (tmpState != currentState) { |
| currentState = tmpState; |
| } |
| |
| if (currentState == COMPOUND_TEXT_SINGLE_0) { |
| while (source.hasRemaining()) { |
| if (!target.hasRemaining()) { |
| err = CoderResult.OVERFLOW; |
| break; |
| } |
| if (source.get(source.position()) == ESC_START) { |
| break; |
| } |
| if (target.hasRemaining()) { |
| target.put((char)(UConverterConstants.UNSIGNED_BYTE_MASK&source.get())); |
| } |
| } |
| } else { |
| source.limit(findNextEsc(source)); |
| |
| decoder = gbDecoder[currentState]; |
| |
| err = decoder.decodeLoop(source, target, offsets, true); |
| |
| source.limit(sourceLimit); |
| } |
| |
| if (err.isError()) { |
| break; |
| } |
| } else { |
| err = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| return err; |
| } |
| } |
| |
| /** |
|  * Creates a new decoder bound to this charset. |
|  * |
|  * @return a fresh {@code CharsetDecoderCompoundText} |
|  */ |
| public CharsetDecoder newDecoder() { |
|     final CharsetDecoder decoder = new CharsetDecoderCompoundText(this); |
|     return decoder; |
| } |
| |
| /** |
|  * Creates a new encoder bound to this charset. |
|  * |
|  * @return a fresh {@code CharsetEncoderCompoundText} |
|  */ |
| public CharsetEncoder newEncoder() { |
|     final CharsetEncoder encoder = new CharsetEncoderCompoundText(this); |
|     return encoder; |
| } |
| |
| void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ |
| for (int i = 1; i < NUM_OF_CONVERTERS; i++) { |
| gbCharset[i].MBCSGetFilteredUnicodeSetForUnicode(gbCharset[i].sharedData, setFillIn, which, CharsetMBCS.UCNV_SET_FILTER_NONE); |
| } |
| setFillIn.add(0x0000); |
| setFillIn.add(0x0009); |
| setFillIn.add(0x000A); |
| setFillIn.add(0x0020, 0x007F); |
| setFillIn.add(0x00A0, 0x00FF); |
| } |
| } |