| /** |
| ******************************************************************************* |
| * Copyright (C) 2006-2012, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.charset; |
| |
| import java.io.BufferedInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.nio.Buffer; |
| import java.nio.BufferOverflowException; |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.IntBuffer; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| import java.util.Locale; |
| |
| import com.ibm.icu.charset.UConverterSharedData.UConverterType; |
| import com.ibm.icu.impl.ICUData; |
| import com.ibm.icu.impl.ICUResourceBundle; |
| import com.ibm.icu.impl.InvalidFormatException; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| class CharsetMBCS extends CharsetICU { |
| |
| private byte[] fromUSubstitution = null; |
| UConverterSharedData sharedData = null; |
| private static final int MAX_VERSION_LENGTH = 4; |
| |
| // these variables are used in getUnicodeSet() and may be changed in future |
| // typedef enum UConverterSetFilter { |
| static final int UCNV_SET_FILTER_NONE = 1; |
| static final int UCNV_SET_FILTER_DBCS_ONLY = 2; |
| static final int UCNV_SET_FILTER_2022_CN = 3; |
| static final int UCNV_SET_FILTER_SJIS= 4 ; |
| static final int UCNV_SET_FILTER_GR94DBCS = 5; |
| static final int UCNV_SET_FILTER_HZ = 6; |
| static final int UCNV_SET_FILTER_COUNT = 7; |
| // } UConverterSetFilter; |
| |
| /** |
| * Fallbacks to Unicode are stored outside the normal state table and code point structures in a vector of items of |
| * this type. They are sorted by offset. |
| */ |
| final static class MBCSToUFallback { |
| int offset; |
| int codePoint; |
| } |
| |
| /** |
| * This is the MBCS part of the UConverterTable union (a runtime data structure). It keeps all the per-converter |
| * data and points into the loaded mapping tables. |
| */ |
| static final class UConverterMBCSTable { |
| /* toUnicode */ |
| short countStates; |
| byte dbcsOnlyState; |
| boolean stateTableOwned; |
| int countToUFallbacks; |
| |
| int stateTable[/* countStates */][/* 256 */]; |
| int swapLFNLStateTable[/* countStates */][/* 256 */]; /* for swaplfnl */ |
| char unicodeCodeUnits[/* countUnicodeResults */]; |
| MBCSToUFallback toUFallbacks[/* countToUFallbacks */]; |
| |
| /* fromUnicode */ |
| char fromUnicodeTable[]; |
| byte fromUnicodeBytes[]; |
| byte swapLFNLFromUnicodeBytes[]; /* for swaplfnl */ |
| int fromUBytesLength; |
| short outputType, unicodeMask; |
| |
| /* converter name for swaplfnl */ |
| String swapLFNLName; |
| |
| /* extension data */ |
| UConverterSharedData baseSharedData; |
| // int extIndexes[]; |
| ByteBuffer extIndexes; // create int[] view etc. as needed |
| |
| CharBuffer mbcsIndex; /* for fast conversion from most of BMP to MBCS (utf8Friendly data) */ |
| char sbcsIndex[/* SBCS_FAST_LIMIT>>6 */]; /* for fast conversion from low BMP to SBCS (utf8Friendly data) */ |
| boolean utf8Friendly; /* for utf8Friendly data */ |
| char maxFastUChar; /* for utf8Friendly data */ |
| |
| /* roundtrips */ |
| long asciiRoundtrips; |
| |
| UConverterMBCSTable() { |
| utf8Friendly = false; |
| mbcsIndex = null; |
| sbcsIndex = new char[SBCS_FAST_LIMIT>>6]; |
| } |
| |
| /* |
| * UConverterMBCSTable(UConverterMBCSTable t) { countStates = t.countStates; dbcsOnlyState = t.dbcsOnlyState; |
| * stateTableOwned = t.stateTableOwned; countToUFallbacks = t.countToUFallbacks; stateTable = t.stateTable; |
| * swapLFNLStateTable = t.swapLFNLStateTable; unicodeCodeUnits = t.unicodeCodeUnits; toUFallbacks = |
| * t.toUFallbacks; fromUnicodeTable = t.fromUnicodeTable; fromUnicodeBytes = t.fromUnicodeBytes; |
| * swapLFNLFromUnicodeBytes = t.swapLFNLFromUnicodeBytes; fromUBytesLength = t.fromUBytesLength; outputType = |
| * t.outputType; unicodeMask = t.unicodeMask; swapLFNLName = t.swapLFNLName; baseSharedData = t.baseSharedData; |
| * extIndexes = t.extIndexes; } |
| */ |
| } |
| |
| /* Constants used in MBCS data header */ |
| // enum { |
| static final int MBCS_OPT_LENGTH_MASK=0x3f; |
| static final int MBCS_OPT_NO_FROM_U=0x40; |
| /* |
| * If any of the following options bits are set, |
| * then the file must be rejected. |
| */ |
| static final int MBCS_OPT_INCOMPATIBLE_MASK=0xffc0; |
| /* |
| * Remove bits from this mask as more options are recognized |
| * by all implementations that use this constant. |
| */ |
| static final int MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK=0xff80; |
| // }; |
| /* Constants for fast and UTF-8-friendly conversion. */ |
| // enum { |
| static final int SBCS_FAST_MAX=0x0fff; /* maximum code point with UTF-8-friendly SBCS runtime code, see makeconv SBCS_UTF8_MAX */ |
| static final int SBCS_FAST_LIMIT=SBCS_FAST_MAX+1; /* =0x1000 */ |
| static final int MBCS_FAST_MAX=0xd7ff; /* maximum code point with UTF-8-friendly MBCS runtime code, see makeconv MBCS_UTF8_MAX */ |
| static final int MBCS_FAST_LIMIT=MBCS_FAST_MAX+1; /* =0xd800 */ |
| // }; |
| /** |
| * MBCS data header. See data format description above. |
| */ |
| final static class MBCSHeader { |
| byte version[/* U_MAX_VERSION_LENGTH */]; |
| int countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable, offsetFromUBytes; |
| int flags; |
| int fromUBytesLength; |
| |
| /* new and required in version 5 */ |
| int options; |
| |
| /* new and optional in version 5; used if options&MBCS_OPT_NO_FROM_U */ |
| int fullStage2Length; /* number of 32-bit units */ |
| |
| MBCSHeader() { |
| version = new byte[MAX_VERSION_LENGTH]; |
| } |
| } |
| |
| public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases, String classPath, |
| ClassLoader loader) throws InvalidFormatException { |
| super(icuCanonicalName, javaCanonicalName, aliases); |
| |
| /* See if the icuCanonicalName contains certain option information. */ |
| if (icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING) > -1) { |
| options = UConverterConstants.OPTION_SWAP_LFNL; |
| icuCanonicalName = icuCanonicalName.substring(0, icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING)); |
| super.icuCanonicalName = icuCanonicalName; |
| } |
| |
| // now try to load the data |
| sharedData = loadConverter(1, icuCanonicalName, classPath, loader); |
| |
| maxBytesPerChar = sharedData.staticData.maxBytesPerChar; |
| minBytesPerChar = sharedData.staticData.minBytesPerChar; |
| maxCharsPerByte = 1; |
| fromUSubstitution = sharedData.staticData.subChar; |
| subChar = sharedData.staticData.subChar; |
| subCharLen = sharedData.staticData.subCharLen; |
| subChar1 = sharedData.staticData.subChar1; |
| fromUSubstitution = new byte[sharedData.staticData.subCharLen]; |
| System.arraycopy(sharedData.staticData.subChar, 0, fromUSubstitution, 0, sharedData.staticData.subCharLen); |
| |
| initializeConverter(options); |
| } |
| |
| public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases) |
| throws InvalidFormatException { |
| this(icuCanonicalName, javaCanonicalName, aliases, ICUResourceBundle.ICU_BUNDLE, null); |
| } |
| |
| private UConverterSharedData loadConverter(int nestedLoads, String myName, String classPath, ClassLoader loader) |
| throws InvalidFormatException { |
| boolean noFromU = false; |
| // Read converter data from file |
| UConverterStaticData staticData = new UConverterStaticData(); |
| UConverterDataReader reader = null; |
| try { |
| String resourceName = classPath + "/" + myName + "." + UConverterSharedData.DATA_TYPE; |
| InputStream i; |
| |
| if (loader != null) { |
| i = ICUData.getRequiredStream(loader, resourceName); |
| } else { |
| i = ICUData.getRequiredStream(resourceName); |
| } |
| BufferedInputStream b = new BufferedInputStream(i, UConverterConstants.CNV_DATA_BUFFER_SIZE); |
| reader = new UConverterDataReader(b); |
| reader.readStaticData(staticData); |
| } catch (IOException e) { |
| throw new InvalidFormatException(); |
| } catch (Exception e) { |
| throw new InvalidFormatException(); |
| } |
| |
| UConverterSharedData data = null; |
| int type = staticData.conversionType; |
| |
| if (type != UConverterSharedData.UConverterType.MBCS |
| || staticData.structSize != UConverterStaticData.SIZE_OF_UCONVERTER_STATIC_DATA) { |
| throw new InvalidFormatException(); |
| } |
| |
| data = new UConverterSharedData(1, null, false, 0); |
| data.dataReader = reader; |
| data.staticData = staticData; |
| data.sharedDataCached = false; |
| |
| // Load data |
| UConverterMBCSTable mbcsTable = data.mbcs; |
| MBCSHeader header = new MBCSHeader(); |
| try { |
| reader.readMBCSHeader(header); |
| } catch (IOException e) { |
| throw new InvalidFormatException(); |
| } |
| |
| int offset; |
| // int[] extIndexesArray = null; |
| String baseNameString = null; |
| int[][] stateTableArray = null; |
| MBCSToUFallback[] toUFallbacksArray = null; |
| char[] unicodeCodeUnitsArray = null; |
| char[] fromUnicodeTableArray = null; |
| byte[] fromUnicodeBytesArray = null; |
| |
| if (header.version[0] == 5 && header.version[1] >= 3 && (header.options & MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK) == 0) { |
| noFromU = ((header.options & MBCS_OPT_NO_FROM_U) != 0); |
| } else if (header.version[0] != 4) { |
| throw new InvalidFormatException(); |
| } |
| |
| mbcsTable.outputType = (byte) header.flags; |
| |
| /* extension data, header version 4.2 and higher */ |
| offset = header.flags >>> 8; |
| // if(offset!=0 && mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { |
| if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { |
| try { |
| baseNameString = reader.readBaseTableName(); |
| if (offset != 0) { |
| // agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null |
| // terminator byte all already read; |
| mbcsTable.extIndexes = reader.readExtIndexes(offset |
| - (reader.bytesRead - reader.staticDataBytesRead)); |
| } |
| } catch (IOException e) { |
| throw new InvalidFormatException(); |
| } |
| } |
| |
| // agljport:add this would be unnecessary if extIndexes were memory mapped |
| /* |
| * if(mbcsTable.extIndexes != null) { |
| * |
| * try { //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_LENGTH]*4 + |
| * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_UCHARS_LENGTH]*2 + |
| * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_LENGTH]*6 + |
| * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_BYTES_LENGTH] + |
| * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_12_LENGTH]*2 + |
| * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3_LENGTH]*2 + |
| * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3B_LENGTH]*4; //int nbytes = |
| * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_SIZE] //byte[] extTables = dataReader.readExtTables(nbytes); |
| * //mbcsTable.extTables = ByteBuffer.wrap(extTables); } catch(IOException e) { System.err.println("Caught |
| * IOException: " + e.getMessage()); pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; return; } } |
| */ |
| if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { |
| UConverterSharedData baseSharedData = null; |
| ByteBuffer extIndexes; |
| String baseName; |
| |
| /* extension-only file, load the base table and set values appropriately */ |
| extIndexes = mbcsTable.extIndexes; |
| if (extIndexes == null) { |
| /* extension-only file without extension */ |
| throw new InvalidFormatException(); |
| } |
| |
| if (nestedLoads != 1) { |
| /* an extension table must not be loaded as a base table */ |
| throw new InvalidFormatException(); |
| } |
| |
| /* load the base table */ |
| baseName = baseNameString; |
| if (baseName.equals(staticData.name)) { |
| /* forbid loading this same extension-only file */ |
| throw new InvalidFormatException(); |
| } |
| |
| // agljport:fix args.size=sizeof(UConverterLoadArgs); |
| baseSharedData = loadConverter(2, baseName, classPath, loader); |
| |
| if (baseSharedData.staticData.conversionType != UConverterType.MBCS |
| || baseSharedData.mbcs.baseSharedData != null) { |
| // agljport:fix ucnv_unload(baseSharedData); |
| throw new InvalidFormatException(); |
| } |
| |
| /* copy the base table data */ |
| // agljport:comment deep copy in C changes mbcs through local reference mbcsTable; in java we probably don't |
| // need the deep copy so can just make sure mbcs and its local reference both refer to the same new object |
| mbcsTable = data.mbcs = baseSharedData.mbcs; |
| |
| /* overwrite values with relevant ones for the extension converter */ |
| mbcsTable.baseSharedData = baseSharedData; |
| mbcsTable.extIndexes = extIndexes; |
| |
| /* |
| * It would be possible to share the swapLFNL data with a base converter, but the generated name would have |
| * to be different, and the memory would have to be free'd only once. It is easier to just create the data |
| * for the extension converter separately when it is requested. |
| */ |
| mbcsTable.swapLFNLStateTable = null; |
| mbcsTable.swapLFNLFromUnicodeBytes = null; |
| mbcsTable.swapLFNLName = null; |
| |
| /* |
| * Set a special, runtime-only outputType if the extension converter is a DBCS version of a base converter |
| * that also maps single bytes. |
| */ |
| if (staticData.conversionType == UConverterType.DBCS |
| || (staticData.conversionType == UConverterType.MBCS && staticData.minBytesPerChar >= 2)) { |
| |
| if (baseSharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) { |
| /* the base converter is SI/SO-stateful */ |
| int entry; |
| |
| /* get the dbcs state from the state table entry for SO=0x0e */ |
| entry = mbcsTable.stateTable[0][0xe]; |
| if (MBCS_ENTRY_IS_FINAL(entry) && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_CHANGE_ONLY |
| && MBCS_ENTRY_FINAL_STATE(entry) != 0) { |
| mbcsTable.dbcsOnlyState = (byte) MBCS_ENTRY_FINAL_STATE(entry); |
| |
| mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY; |
| } |
| } else if (baseSharedData.staticData.conversionType == UConverterType.MBCS |
| && baseSharedData.staticData.minBytesPerChar == 1 |
| && baseSharedData.staticData.maxBytesPerChar == 2 && mbcsTable.countStates <= 127) { |
| |
| /* non-stateful base converter, need to modify the state table */ |
| int newStateTable[][/* 256 */]; |
| int state[]; // this works because java 2-D array is array of references and we can have state = |
| // newStateTable[i]; |
| int i, count; |
| |
| /* allocate a new state table and copy the base state table contents */ |
| count = mbcsTable.countStates; |
| newStateTable = new int[(count + 1) * 1024][256]; |
| |
| for (i = 0; i < mbcsTable.stateTable.length; ++i) |
| System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, |
| mbcsTable.stateTable[i].length); |
| |
| /* change all final single-byte entries to go to a new all-illegal state */ |
| state = newStateTable[0]; |
| for (i = 0; i < 256; ++i) { |
| if (MBCS_ENTRY_IS_FINAL(state[i])) { |
| state[i] = MBCS_ENTRY_TRANSITION(count, 0); |
| } |
| } |
| |
| /* build the new all-illegal state */ |
| state = newStateTable[count]; |
| for (i = 0; i < 256; ++i) { |
| state[i] = MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); |
| } |
| mbcsTable.stateTable = newStateTable; |
| mbcsTable.countStates = (byte) (count + 1); |
| mbcsTable.stateTableOwned = true; |
| |
| mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY; |
| } |
| } |
| |
| /* |
| * unlike below for files with base tables, do not get the unicodeMask from the sharedData; instead, use the |
| * base table's unicodeMask, which we copied in the memcpy above; this is necessary because the static data |
| * unicodeMask, especially the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data |
| */ |
| } else { |
| /* conversion file with a base table; an additional extension table is optional */ |
| /* make sure that the output type is known */ |
| switch (mbcsTable.outputType) { |
| case MBCS_OUTPUT_1: |
| case MBCS_OUTPUT_2: |
| case MBCS_OUTPUT_3: |
| case MBCS_OUTPUT_4: |
| case MBCS_OUTPUT_3_EUC: |
| case MBCS_OUTPUT_4_EUC: |
| case MBCS_OUTPUT_2_SISO: |
| /* OK */ |
| break; |
| default: |
| throw new InvalidFormatException(); |
| } |
| |
| stateTableArray = new int[header.countStates][256]; |
| toUFallbacksArray = new MBCSToUFallback[header.countToUFallbacks]; |
| for (int i = 0; i < toUFallbacksArray.length; ++i) |
| toUFallbacksArray[i] = new MBCSToUFallback(); |
| unicodeCodeUnitsArray = new char[(header.offsetFromUTable - header.offsetToUCodeUnits) / 2]; |
| fromUnicodeTableArray = new char[(header.offsetFromUBytes - header.offsetFromUTable) / 2]; |
| fromUnicodeBytesArray = new byte[header.fromUBytesLength]; |
| try { |
| reader.readMBCSTable(stateTableArray, toUFallbacksArray, unicodeCodeUnitsArray, fromUnicodeTableArray, |
| fromUnicodeBytesArray); |
| } catch (IOException e) { |
| throw new InvalidFormatException(); |
| } |
| |
| mbcsTable.countStates = (byte) header.countStates; |
| mbcsTable.countToUFallbacks = header.countToUFallbacks; |
| mbcsTable.stateTable = stateTableArray; |
| mbcsTable.toUFallbacks = toUFallbacksArray; |
| mbcsTable.unicodeCodeUnits = unicodeCodeUnitsArray; |
| |
| mbcsTable.fromUnicodeTable = fromUnicodeTableArray; |
| mbcsTable.fromUnicodeBytes = fromUnicodeBytesArray; |
| mbcsTable.fromUBytesLength = header.fromUBytesLength; |
| |
| /* |
| * converter versions 6.1 and up contain a unicodeMask that is used here to select the most efficient |
| * function implementations |
| */ |
| // agljport:fix info.size=sizeof(UDataInfo); |
| // agljport:fix udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); |
| // agljport:fix if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { |
| /* mask off possible future extensions to be safe */ |
| mbcsTable.unicodeMask = (short) (staticData.unicodeMask & 3); |
| // agljport:fix } else { |
| /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ |
| // agljport:fix mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; |
| // agljport:fix } |
| if (offset != 0) { |
| try { |
| // agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null |
| // terminator byte all already read; |
| // int namelen = baseNameString != null? baseNameString.length() + 1: 0; |
| mbcsTable.extIndexes = reader.readExtIndexes(offset |
| - (reader.bytesRead - reader.staticDataBytesRead)); |
| } catch (IOException e) { |
| throw new InvalidFormatException(); |
| } |
| } |
| |
| if (header.version[1] >= 3 && (mbcsTable.unicodeMask & UConverterConstants.HAS_SURROGATES) == 0 && |
| (mbcsTable.countStates == 1 ? ((char)header.version[2] >= (SBCS_FAST_MAX>>8)) : ((char)header.version[2] >= (MBCS_FAST_MAX>>8)))) { |
| mbcsTable.utf8Friendly = true; |
| |
| if (mbcsTable.countStates == 1) { |
| /* |
| * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. |
| * Build a table with indexes to each block, to be used instaed of |
| * the regular stage 1/2 table. |
| */ |
| for (int i = 0; i < (SBCS_FAST_LIMIT>>6); ++i) { |
| mbcsTable.sbcsIndex[i] = mbcsTable.fromUnicodeTable[mbcsTable.fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; |
| } |
| /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header.version[2]>(SBCS_FAST_MAX>>8) */ |
| mbcsTable.maxFastUChar = SBCS_FAST_MAX; |
| } else { |
| /* |
| * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. |
| * The .cnv file is prebuilt with an additional stage table with indexes to each block. |
| */ |
| if (noFromU) { |
| mbcsTable.mbcsIndex = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer(); |
| } |
| mbcsTable.maxFastUChar = (char)((header.version[2]<<8) | 0xff); |
| } |
| } |
| /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ |
| { |
| long asciiRoundtrips = 0xffffffff; |
| for (int i = 0; i < 0x80; ++i) { |
| if (mbcsTable.stateTable[0][i] != MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { |
| asciiRoundtrips&=~((long)1<<(i>>2))&UConverterConstants.UNSIGNED_INT_MASK; |
| } |
| } |
| mbcsTable.asciiRoundtrips = asciiRoundtrips&UConverterConstants.UNSIGNED_INT_MASK; |
| } |
| |
| if (noFromU) { |
| int stage1Length = (mbcsTable.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) != 0 ? 0x440 : 0x40; |
| int stage2Length = (header.offsetFromUBytes - header.offsetFromUTable)/4 - stage1Length/2; |
| reconstituteData(mbcsTable, stage1Length, stage2Length, header.fullStage2Length); |
| } |
| if (mbcsTable.outputType == MBCS_OUTPUT_DBCS_ONLY || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) { |
| /* |
| * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. |
| * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. |
| */ |
| mbcsTable.asciiRoundtrips = 0; |
| } |
| } |
| return data; |
| } |
| |
| private static boolean writeStage3Roundtrip(UConverterMBCSTable mbcsTable, long value, int codePoints[]) { |
| char[] table; |
| byte[] bytes; |
| int stage2; |
| int p; |
| int c; |
| int i, st3; |
| long temp; |
| |
| table = mbcsTable.fromUnicodeTable; |
| bytes = mbcsTable.fromUnicodeBytes; |
| |
| /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ |
| switch(mbcsTable.outputType) { |
| case MBCS_OUTPUT_3_EUC: |
| if(value<=0xffff) { |
| /* short sequences are stored directly */ |
| /* code set 0 or 1 */ |
| } else if(value<=0x8effff) { |
| /* code set 2 */ |
| value&=0x7fff; |
| } else /* first byte is 0x8f */ { |
| /* code set 3 */ |
| value&=0xff7f; |
| } |
| break; |
| case MBCS_OUTPUT_4_EUC: |
| if(value<=0xffffff) { |
| /* short sequences are stored directly */ |
| /* code set 0 or 1 */ |
| } else if(value<=0x8effffff) { |
| /* code set 2 */ |
| value&=0x7fffff; |
| } else /* first byte is 0x8f */ { |
| /* code set 3 */ |
| value&=0xff7fff; |
| } |
| break; |
| default: |
| break; |
| } |
| |
| for(i=0; i<=0x1f; ++value, ++i) { |
| c=codePoints[i]; |
| if(c<0) { |
| continue; |
| } |
| |
| /* locate the stage 2 & 3 data */ |
| stage2 = table[c>>10] + ((c>>4)&0x3f); |
| st3 = table[stage2*2]<<16|table[stage2*2 + 1]; |
| st3 = (int)(char)(st3 * 16 + (c&0xf)); |
| |
| /* write the codepage bytes into stage 3 */ |
| switch(mbcsTable.outputType) { |
| case MBCS_OUTPUT_3: |
| case MBCS_OUTPUT_4_EUC: |
| p = st3*3; |
| bytes[p] = (byte)(value>>16); |
| bytes[p+1] = (byte)(value>>8); |
| bytes[p+2] = (byte)value; |
| break; |
| case MBCS_OUTPUT_4: |
| bytes[st3*4] = (byte)(value >> 24); |
| bytes[st3*4 + 1] = (byte)(value >> 16); |
| bytes[st3*4 + 2] = (byte)(value >> 8); |
| bytes[st3*4 + 3] = (byte)value; |
| break; |
| default: |
| /* 2 bytes per character */ |
| bytes[st3*2] = (byte)(value >> 8); |
| bytes[st3*2 + 1] = (byte)value; |
| break; |
| } |
| |
| /* set the roundtrip flag */ |
| temp = (1L<<(16+(c&0xf))); |
| table[stage2*2] |= (char)(temp>>16); |
| table[stage2*2 + 1] |= (char)temp; |
| } |
| return true; |
| } |
| |
| private static void reconstituteData(UConverterMBCSTable mbcsTable, int stage1Length, int stage2Length, int fullStage2Length) { |
| int datalength = stage1Length*2+fullStage2Length*4+mbcsTable.fromUBytesLength; |
| int offset = 0; |
| byte[] stage = new byte[datalength]; |
| |
| for (int i = 0; i < stage1Length; ++i) { |
| stage[i*2] = (byte)(mbcsTable.fromUnicodeTable[i]>>8); |
| stage[i*2+1] = (byte)(mbcsTable.fromUnicodeTable[i]); |
| } |
| |
| offset = ((fullStage2Length - stage2Length) * 4) + (stage1Length * 2); |
| for (int i = 0; i < stage2Length; ++i) { |
| stage[offset + i*4] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2]>>8); |
| stage[offset + i*4+1] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2]); |
| stage[offset + i*4+2] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2+1]>>8); |
| stage[offset + i*4+3] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2+1]); |
| } |
| |
| /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ |
| |
| /* reconsitute the initial part of stage 2 from the mbcsIndex */ |
| { |
| int stageUTF8Length=(mbcsTable.maxFastUChar+1)>>6; |
| int stageUTF8Index=0; |
| int st1, st2, st3, i; |
| |
| for (st1 = 0; stageUTF8Index < stageUTF8Length; ++st1) { |
| st2 = ((char)stage[2*st1]<<8) | (0xff & stage[2*st1+1]); |
| if (st2 != stage1Length/2) { |
| /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */ |
| for (i = 0; i < 16; ++i) { |
| st3 = mbcsTable.mbcsIndex.get(stageUTF8Index++); |
| if (st3 != 0) { |
| /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ |
| st3>>=4; |
| /* |
| * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are |
| * allocated together as a single 64-block for access from the mbcsIndex |
| */ |
| stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++; |
| stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++; |
| stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++; |
| stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); |
| } else { |
| /* no stage 3 block, skip */ |
| st2+=4; |
| } |
| } |
| } else { |
| /* no stage 2 block, skip */ |
| stageUTF8Index+=16; |
| } |
| } |
| } |
| |
| char[] stage1 = new char[stage.length/2]; |
| for (int i = 0; i < stage1.length; ++i) { |
| stage1[i] = (char)(((stage[i*2])<<8)|(stage[i*2+1] & UConverterConstants.UNSIGNED_BYTE_MASK)); |
| } |
| byte[] stage2 = new byte[stage.length - ((stage1Length * 2) + (fullStage2Length * 4))]; |
| System.arraycopy(stage, ((stage1Length * 2) + (fullStage2Length * 4)), stage2, 0, stage2.length); |
| |
| mbcsTable.fromUnicodeTable = stage1; |
| mbcsTable.fromUnicodeBytes = stage2; |
| |
| /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ |
| MBCSEnumToUnicode(mbcsTable); |
| } |
| |
| /* |
| * Internal function enumerating the toUnicode data of an MBCS converter. |
| * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U |
| * table, but could also be used for a future getUnicodeSet() option |
| * that includes reverse fallbacks (after updating this function's implementation). |
| * Currently only handles roundtrip mappings. |
| * Does not currently handle extensions. |
| */ |
| private static void MBCSEnumToUnicode(UConverterMBCSTable mbcsTable) { |
| /* |
| * Properties for each state, to speed up the enumeration. |
| * Ignorable actions are unassigned/illegal/state-change-only: |
| * They do not lead to mappings. |
| * |
| * Bits 7..6 |
| * 1 direct/initial state (stateful converters have mulitple) |
| * 0 non-initial state with transitions or with nonignorable result actions |
| * -1 final state with only ignorable actions |
| * |
| * Bits 5..3 |
| * The lowest byte value with non-ignorable actions is |
| * value<<5 (rounded down). |
| * |
| * Bits 2..0: |
| * The highest byte value with non-ignorable actions is |
| * (value<<5)&0x1f (rounded up). |
| */ |
| byte stateProps[] = new byte[MBCS_MAX_STATE_COUNT]; |
| int state; |
| |
| /* recurse from state 0 and set all stateProps */ |
| getStateProp(mbcsTable.stateTable, stateProps, 0); |
| |
| for (state = 0; state < mbcsTable.countStates; ++state) { |
| if (stateProps[state] >= 0x40) { |
| /* start from each direct state */ |
| enumToU(mbcsTable, stateProps, state, 0, 0); |
| } |
| } |
| |
| |
| } |
| |
| private static boolean enumToU(UConverterMBCSTable mbcsTable, byte stateProps[], int state, int offset, int value) { |
| int[] codePoints = new int[32]; |
| int[] row; |
| char[] unicodeCodeUnits; |
| int anyCodePoints; |
| int b, limit; |
| |
| row = mbcsTable.stateTable[state]; |
| unicodeCodeUnits = mbcsTable.unicodeCodeUnits; |
| |
| value<<=8; |
| anyCodePoints = -1; /* becomes non-negative if there is a mapping */ |
| |
| b = (stateProps[state]&0x38)<<2; |
| if (b == 0 && stateProps[state] >= 0x40) { |
| /* skip byte sequences with leading zeros because they are note stored in the fromUnicode table */ |
| codePoints[0] = UConverterConstants.U_SENTINEL; |
| b = 1; |
| } |
| limit = ((stateProps[state]&7)+1)<<5; |
| while (b < limit) { |
| int entry = row[b]; |
| if (MBCS_ENTRY_IS_TRANSITION(entry)) { |
| int nextState = MBCS_ENTRY_TRANSITION_STATE(entry); |
| if (stateProps[nextState] >= 0) { |
| /* recurse to a state with non-ignorable actions */ |
| if (!enumToU(mbcsTable, stateProps, nextState, offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), value|b)) { |
| return false; |
| } |
| } |
| codePoints[b&0x1f] = UConverterConstants.U_SENTINEL; |
| } else { |
| int c; |
| int action; |
| |
| /* |
| * An if-else-if chain provides more reliable performance for |
| * the most common cases compared to a switch. |
| */ |
| action = MBCS_ENTRY_FINAL_ACTION(entry); |
| if (action == MBCS_STATE_VALID_DIRECT_16) { |
| /* output BMP code point */ |
| c = MBCS_ENTRY_FINAL_VALUE_16(entry); |
| } else if (action == MBCS_STATE_VALID_16) { |
| int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry); |
| c = unicodeCodeUnits[finalOffset]; |
| if (c < 0xfffe) { |
| /* output BMP code point */ |
| } else { |
| c = UConverterConstants.U_SENTINEL; |
| } |
| } else if (action == MBCS_STATE_VALID_16_PAIR) { |
| int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry); |
| c = unicodeCodeUnits[finalOffset++]; |
| if (c < 0xd800) { |
| /* output BMP code point below 0xd800 */ |
| } else if (c <= 0xdbff) { |
| /* output roundtrip or fallback supplementary code point */ |
| c = ((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); |
| } else if (c == 0xe000) { |
| /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ |
| c = unicodeCodeUnits[finalOffset]; |
| } else { |
| c = UConverterConstants.U_SENTINEL; |
| } |
| } else if (action == MBCS_STATE_VALID_DIRECT_20) { |
| /* output supplementary code point */ |
| c = MBCS_ENTRY_FINAL_VALUE(entry)+0x10000; |
| } else { |
| c = UConverterConstants.U_SENTINEL; |
| } |
| |
| codePoints[b&0x1f] = c; |
| anyCodePoints&=c; |
| } |
| if (((++b)&0x1f) == 0) { |
| if(anyCodePoints>=0) { |
| if(!writeStage3Roundtrip(mbcsTable, value|(b-0x20)&UConverterConstants.UNSIGNED_INT_MASK, codePoints)) { |
| return false; |
| } |
| anyCodePoints=-1; |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| /* |
| * Only called if stateProps[state]==-1. |
| * A recursive call may do stateProps[state]|=0x40 if this state is the target of an |
| * MBCS_STATE_CHANGE_ONLY. |
| */ |
| private static byte getStateProp(int stateTable[][], byte stateProps[], int state) { |
| int[] row; |
| int min, max, entry, nextState; |
| |
| row = stateTable[state]; |
| stateProps[state] = 0; |
| |
| /* find first non-ignorable state */ |
| for (min = 0;;++min) { |
| entry = row[min]; |
| nextState = MBCS_ENTRY_STATE(entry); |
| if (stateProps[nextState] == -1) { |
| getStateProp(stateTable, stateProps, nextState); |
| } |
| if (MBCS_ENTRY_IS_TRANSITION(entry)) { |
| if (stateProps[nextState] >- 0) { |
| break; |
| } |
| } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) { |
| break; |
| } |
| if (min == 0xff) { |
| stateProps[state] = -0x40; /* (byte)0xc0 */ |
| return stateProps[state]; |
| } |
| } |
| stateProps[state]|=(byte)((min>>5)<<3); |
| |
| /* find last non-ignorable state */ |
| for (max = 0xff; min < max; --max) { |
| entry = row[max]; |
| nextState = MBCS_ENTRY_STATE(entry); |
| if (stateProps[nextState] == -1) { |
| getStateProp(stateTable, stateProps, nextState); |
| } |
| if (MBCS_ENTRY_IS_TRANSITION(entry)) { |
| if (stateProps[nextState] >- 0) { |
| break; |
| } |
| } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) { |
| break; |
| } |
| } |
| stateProps[state]|=(byte)(max>>5); |
| |
| /* recurse further and collect direct-state information */ |
| while (min <= max) { |
| entry = row[min]; |
| nextState = MBCS_ENTRY_STATE(entry); |
| if (stateProps[nextState] == -1) { |
| getStateProp(stateTable, stateProps, nextState); |
| } |
| if (MBCS_ENTRY_IS_TRANSITION(entry)) { |
| stateProps[nextState]|=0x40; |
| if (MBCS_ENTRY_FINAL_ACTION(entry) <= MBCS_STATE_FALLBACK_DIRECT_20) { |
| stateProps[state]|=0x40; |
| } |
| } |
| ++min; |
| } |
| return stateProps[state]; |
| } |
| |
| protected void initializeConverter(int myOptions) { |
| UConverterMBCSTable mbcsTable; |
| ByteBuffer extIndexes; |
| short outputType; |
| byte maxBytesPerUChar; |
| |
| mbcsTable = sharedData.mbcs; |
| outputType = mbcsTable.outputType; |
| |
| if (outputType == MBCS_OUTPUT_DBCS_ONLY) { |
| /* the swaplfnl option does not apply, remove it */ |
| this.options = myOptions &= ~UConverterConstants.OPTION_SWAP_LFNL; |
| } |
| |
| if ((myOptions & UConverterConstants.OPTION_SWAP_LFNL) != 0) { |
| /* do this because double-checked locking is broken */ |
| boolean isCached; |
| |
| // agljport:todo umtx_lock(NULL); |
| isCached = mbcsTable.swapLFNLStateTable != null; |
| // agljport:todo umtx_unlock(NULL); |
| |
| if (!isCached) { |
| try { |
| if (!EBCDICSwapLFNL()) { |
| /* this option does not apply, remove it */ |
| this.options = myOptions & ~UConverterConstants.OPTION_SWAP_LFNL; |
| } |
| } catch (Exception e) { |
| /* something went wrong. */ |
| return; |
| } |
| } |
| } |
| |
| String lowerCaseName = icuCanonicalName.toLowerCase(Locale.ENGLISH); |
| if (lowerCaseName.indexOf("gb18030") >= 0) { |
| /* set a flag for GB 18030 mode, which changes the callback behavior */ |
| this.options |= MBCS_OPTION_GB18030; |
| } else if (lowerCaseName.indexOf("keis") >= 0) { |
| this.options |= MBCS_OPTION_KEIS; |
| } else if (lowerCaseName.indexOf("jef") >= 0) { |
| this.options |= MBCS_OPTION_JEF; |
| } else if (lowerCaseName.indexOf("jips") >= 0) { |
| this.options |= MBCS_OPTION_JIPS; |
| } |
| |
| /* fix maxBytesPerUChar depending on outputType and options etc. */ |
| if (outputType == MBCS_OUTPUT_2_SISO) { |
| /* changed from 3 to 4 in ICU4J only. #9205 */ |
| maxBytesPerChar = 4; /* SO+DBCS+SI*/ |
| } |
| |
| extIndexes = mbcsTable.extIndexes; |
| if (extIndexes != null) { |
| maxBytesPerUChar = (byte) GET_MAX_BYTES_PER_UCHAR(extIndexes); |
| if (outputType == MBCS_OUTPUT_2_SISO) { |
| ++maxBytesPerUChar; /* SO + multiple DBCS */ |
| } |
| |
| if (maxBytesPerUChar > maxBytesPerChar) { |
| maxBytesPerChar = maxBytesPerUChar; |
| } |
| } |
| } |
| /* EBCDIC swap LF<->NL--------------------------------------------------------------------------------*/ |
| /* |
| * This code modifies a standard EBCDIC<->Unicode mappling table for |
| * OS/390 (z/OS) Unix System Services (Open Edition). |
| * The difference is in the mapping of Line Feed and New Line control codes: |
| * Standard EBDIC maps |
| * |
| * <U000A> \x25 |0 |
| * <U0085> \x15 |0 |
| * |
| * but OS/390 USS EBCDIC swaps the control codes for LF and NL, |
| * mapping |
| * |
| * <U000A> \x15 |0 |
| * <U0085> \x25 |0 |
| * |
| * This code modifies a loaded standard EBCDIC<->Unicode mapping table |
| * by copying it into allocated memory and swapping the LF and NL values. |
| * It allows to support the same EBCDIC charset in both version without |
| * duplicating the entire installed table. |
| */ |
| /* standard EBCDIC codes */ |
| private static final short EBCDIC_LF = 0x0025; |
| private static final short EBCDIC_NL = 0x0015; |
| |
| /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ |
| private static final short EBCDIC_RT_LF = 0x0f25; |
| private static final short EBCDIC_RT_NL = 0x0f15; |
| |
| /* Unicode code points */ |
| private static final short U_LF = 0x000A; |
| private static final short U_NL = 0x0085; |
| |
| private boolean EBCDICSwapLFNL() throws Exception { |
| UConverterMBCSTable mbcsTable; |
| |
| char[] table; |
| byte[] results; |
| byte[] bytes; |
| |
| int[][] newStateTable; |
| byte[] newResults; |
| String newName; |
| |
| int stage2Entry; |
| // int size; |
| int sizeofFromUBytes; |
| |
| mbcsTable = sharedData.mbcs; |
| |
| table = mbcsTable.fromUnicodeTable; |
| bytes = mbcsTable.fromUnicodeBytes; |
| results = bytes; |
| |
| /* |
| * Check that this is an EBCDIC table with SBCS portion - |
| * SBCS or EBCDIC with standard EBCDIC LF and NL mappings. |
| * |
| * If not, ignore the option Options are always ignored if they do not apply. |
| */ |
| if (!((mbcsTable.outputType == MBCS_OUTPUT_1 || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) && |
| mbcsTable.stateTable[0][EBCDIC_LF] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && |
| mbcsTable.stateTable[0][EBCDIC_NL] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL))) { |
| return false; |
| } |
| |
| if (mbcsTable.outputType == MBCS_OUTPUT_1) { |
| if (!(EBCDIC_RT_LF == MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && |
| EBCDIC_RT_NL == MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL))) { |
| return false; |
| } |
| } else /* MBCS_OUTPUT_2_SISO */ { |
| stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF); |
| if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF) && |
| EBCDIC_LF == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF))) { |
| return false; |
| } |
| |
| stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL); |
| if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL) && |
| EBCDIC_NL == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL))) { |
| return false; |
| } |
| } |
| |
| if (mbcsTable.fromUBytesLength > 0) { |
| /* |
| * We _know_ the number of bytes in the fromUnicodeBytes array |
| * starting with header.version 4.1. |
| */ |
| sizeofFromUBytes = mbcsTable.fromUBytesLength; |
| } else { |
| /* |
| * Otherwise: |
| * There used to be code to enumerate the fromUnicode |
| * trie and find the highest entry, but it was removed in ICU 3.2 |
| * because it was not tested and caused a low code coverage number. |
| */ |
| throw new Exception("U_INVALID_FORMAT_ERROR"); |
| } |
| |
| /* |
| * The table has an appropriate format. |
| * Allocate and build |
| * - a modified to-Unicode state table |
| * - a modified from-Unicode output array |
| * - a converter name string with the swap option appended |
| */ |
| // size = mbcsTable.countStates * 1024 + sizeofFromUBytes + UConverterConstants.MAX_CONVERTER_NAME_LENGTH + 20; |
| |
| /* copy and modify the to-Unicode state table */ |
| newStateTable = new int[mbcsTable.stateTable.length][mbcsTable.stateTable[0].length]; |
| for (int i = 0; i < newStateTable.length; i++) { |
| System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, newStateTable[i].length); |
| } |
| |
| newStateTable[0][EBCDIC_LF] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); |
| newStateTable[0][EBCDIC_NL] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); |
| |
| /* copy and modify the from-Unicode result table */ |
| newResults = new byte[sizeofFromUBytes]; |
| System.arraycopy(bytes, 0, newResults, 0, sizeofFromUBytes); |
| /* conveniently, the table access macros work on the left side of expressions */ |
| if (mbcsTable.outputType == MBCS_OUTPUT_1) { |
| MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_LF, EBCDIC_RT_NL); |
| MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_NL, EBCDIC_RT_LF); |
| } else /* MBCS_OUTPUT_2_SISO */ { |
| stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF); |
| MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_LF, EBCDIC_NL); |
| |
| stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL); |
| MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_NL, EBCDIC_LF); |
| } |
| |
| /* set the canonical converter name */ |
| newName = icuCanonicalName.concat(UConverterConstants.OPTION_SWAP_LFNL_STRING); |
| |
| if (mbcsTable.swapLFNLStateTable == null) { |
| mbcsTable.swapLFNLStateTable = newStateTable; |
| mbcsTable.swapLFNLFromUnicodeBytes = newResults; |
| mbcsTable.swapLFNLName = newName; |
| } |
| return true; |
| } |
| |
| /** |
| * MBCS output types for conversions from Unicode. These per-converter types determine the storage method in stage 3 |
| * of the lookup table, mostly how many bytes are stored per entry. |
| */ |
| static final int MBCS_OUTPUT_1 = 0; /* 0 */ |
| static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */ |
| static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */ |
| static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */ |
| static final int MBCS_OUTPUT_3_EUC = 8; /* 8 */ |
| static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */ |
| static final int MBCS_OUTPUT_2_SISO = 12; /* c */ |
| static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */ |
| static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */ |
| // static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1; |
| static final int MBCS_OUTPUT_DBCS_ONLY = 0xdb; /* runtime-only type for DBCS-only handling of SISO tables */ |
| |
| /* GB 18030 data ------------------------------------------------------------ */ |
| |
| /* helper macros for linear values for GB 18030 four-byte sequences */ |
| private static long LINEAR_18030(long a, long b, long c, long d) { |
| return ((((a & 0xff) * 10 + (b & 0xff)) * 126L + (c & 0xff)) * 10L + (d & 0xff)); |
| } |
| |
| private static long LINEAR_18030_BASE = LINEAR_18030(0x81, 0x30, 0x81, 0x30); |
| |
| private static long LINEAR(long x) { |
| return LINEAR_18030(x >>> 24, (x >>> 16) & 0xff, (x >>> 8) & 0xff, x & 0xff); |
| } |
| |
| /* |
| * Some ranges of GB 18030 where both the Unicode code points and the GB four-byte sequences are contiguous and are |
| * handled algorithmically by the special callback functions below. The values are start & end of Unicode & GB |
| * codes. |
| * |
| * Note that single surrogates are not mapped by GB 18030 as of the re-released mapping tables from 2000-nov-30. |
| */ |
| private static final long gb18030Ranges[][] = new long[/* 14 */][/* 4 */] { |
| { 0x10000L, 0x10FFFFL, LINEAR(0x90308130L), LINEAR(0xE3329A35L) }, |
| { 0x9FA6L, 0xD7FFL, LINEAR(0x82358F33L), LINEAR(0x8336C738L) }, |
| { 0x0452L, 0x1E3EL, LINEAR(0x8130D330L), LINEAR(0x8135F436L) }, |
| { 0x1E40L, 0x200FL, LINEAR(0x8135F438L), LINEAR(0x8136A531L) }, |
| { 0xE865L, 0xF92BL, LINEAR(0x8336D030L), LINEAR(0x84308534L) }, |
| { 0x2643L, 0x2E80L, LINEAR(0x8137A839L), LINEAR(0x8138FD38L) }, |
| { 0xFA2AL, 0xFE2FL, LINEAR(0x84309C38L), LINEAR(0x84318537L) }, |
| { 0x3CE1L, 0x4055L, LINEAR(0x8231D438L), LINEAR(0x8232AF32L) }, |
| { 0x361BL, 0x3917L, LINEAR(0x8230A633L), LINEAR(0x8230F237L) }, |
| { 0x49B8L, 0x4C76L, LINEAR(0x8234A131L), LINEAR(0x8234E733L) }, |
| { 0x4160L, 0x4336L, LINEAR(0x8232C937L), LINEAR(0x8232F837L) }, |
| { 0x478EL, 0x4946L, LINEAR(0x8233E838L), LINEAR(0x82349638L) }, |
| { 0x44D7L, 0x464BL, LINEAR(0x8233A339L), LINEAR(0x8233C931L) }, |
| { 0xFFE6L, 0xFFFFL, LINEAR(0x8431A234L), LINEAR(0x8431A439L) } }; |
| |
| /* bit flag for UConverter.options indicating GB 18030 special handling */ |
| private static final int MBCS_OPTION_GB18030 = 0x8000; |
| |
| /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ |
| private static final int MBCS_OPTION_KEIS = 0x01000; |
| private static final int MBCS_OPTION_JEF = 0x02000; |
| private static final int MBCS_OPTION_JIPS = 0x04000; |
| |
| private static enum SISO_Option { |
| SI, |
| SO |
| } |
| |
| private static final byte[] KEIS_SO_CHAR = { 0x0A, 0x42 }; |
| private static final byte[] KEIS_SI_CHAR = { 0x0A, 0x41 }; |
| private static final byte JEF_SO_CHAR = 0x28; |
| private static final byte JEF_SI_CHAR = 0x29; |
| private static final byte[] JIPS_SO_CHAR = { 0x1A, 0x70 }; |
| private static final byte[] JIPS_SI_CHAR = { 0x1A, 0x71 }; |
| |
| private static int getSISOBytes(SISO_Option option, int cnvOption, byte[] value) { |
| int SISOLength = 0; |
| |
| switch (option) { |
| case SI: |
| if ((cnvOption&MBCS_OPTION_KEIS)!=0) { |
| value[0] = KEIS_SI_CHAR[0]; |
| value[1] = KEIS_SI_CHAR[1]; |
| SISOLength = 2; |
| } else if ((cnvOption&MBCS_OPTION_JEF)!=0) { |
| value[0] = JEF_SI_CHAR; |
| SISOLength = 1; |
| } else if ((cnvOption&MBCS_OPTION_JIPS)!=0) { |
| value[0] = JIPS_SI_CHAR[0]; |
| value[1] = JIPS_SI_CHAR[1]; |
| SISOLength = 2; |
| } else { |
| value[0] = UConverterConstants.SI; |
| SISOLength = 1; |
| } |
| break; |
| case SO: |
| if ((cnvOption&MBCS_OPTION_KEIS)!=0) { |
| value[0] = KEIS_SO_CHAR[0]; |
| value[1] = KEIS_SO_CHAR[1]; |
| SISOLength = 2; |
| } else if ((cnvOption&MBCS_OPTION_JEF)!=0) { |
| value[0] = JEF_SO_CHAR; |
| SISOLength = 1; |
| } else if ((cnvOption&MBCS_OPTION_JIPS)!=0) { |
| value[0] = JIPS_SO_CHAR[0]; |
| value[1] = JIPS_SO_CHAR[1]; |
| SISOLength = 2; |
| } else { |
| value[0] = UConverterConstants.SO; |
| SISOLength = 1; |
| } |
| break; |
| default: |
| /* Should never happen. */ |
| break; |
| } |
| |
| return SISOLength; |
| } |
| // enum { |
| static final int MBCS_MAX_STATE_COUNT = 128; |
| // }; |
| /** |
| * MBCS action codes for conversions to Unicode. These values are in bits 23..20 of the state table entries. |
| */ |
| static final int MBCS_STATE_VALID_DIRECT_16 = 0; |
| static final int MBCS_STATE_VALID_DIRECT_20 = MBCS_STATE_VALID_DIRECT_16 + 1; |
| static final int MBCS_STATE_FALLBACK_DIRECT_16 = MBCS_STATE_VALID_DIRECT_20 + 1; |
| static final int MBCS_STATE_FALLBACK_DIRECT_20 = MBCS_STATE_FALLBACK_DIRECT_16 + 1; |
| static final int MBCS_STATE_VALID_16 = MBCS_STATE_FALLBACK_DIRECT_20 + 1; |
| static final int MBCS_STATE_VALID_16_PAIR = MBCS_STATE_VALID_16 + 1; |
| static final int MBCS_STATE_UNASSIGNED = MBCS_STATE_VALID_16_PAIR + 1; |
| static final int MBCS_STATE_ILLEGAL = MBCS_STATE_UNASSIGNED + 1; |
| static final int MBCS_STATE_CHANGE_ONLY = MBCS_STATE_ILLEGAL + 1; |
| |
| static int MBCS_ENTRY_SET_STATE(int entry, int state) { |
| return (entry&0x80ffffff)|(state<<24L); |
| } |
| |
| static int MBCS_ENTRY_STATE(int entry) { |
| return (((entry)>>24)&0x7f); |
| } |
| |
| /* Methods for state table entries */ |
| static int MBCS_ENTRY_TRANSITION(int state, int offset) { |
| return (state << 24L) | offset; |
| } |
| |
| static int MBCS_ENTRY_FINAL(int state, int action, int value) { |
| return 0x80000000 | (state << 24L) | (action << 20L) | value; |
| } |
| |
| static boolean MBCS_ENTRY_IS_TRANSITION(int entry) { |
| return (entry) >= 0; |
| } |
| |
| static boolean MBCS_ENTRY_IS_FINAL(int entry) { |
| return (entry) < 0; |
| } |
| |
| static int MBCS_ENTRY_TRANSITION_STATE(int entry) { |
| return ((entry) >>> 24); |
| } |
| |
| static int MBCS_ENTRY_TRANSITION_OFFSET(int entry) { |
| return ((entry) & 0xffffff); |
| } |
| |
| static int MBCS_ENTRY_FINAL_STATE(int entry) { |
| return ((entry) >>> 24) & 0x7f; |
| } |
| |
| static boolean MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(int entry) { |
| return ((entry) < 0x80100000); |
| } |
| |
| static int MBCS_ENTRY_FINAL_ACTION(int entry) { |
| return ((entry) >>> 20) & 0xf; |
| } |
| |
| static int MBCS_ENTRY_FINAL_VALUE(int entry) { |
| return ((entry) & 0xfffff); |
| } |
| |
| static char MBCS_ENTRY_FINAL_VALUE_16(int entry) { |
| return (char) (entry); |
| } |
| |
| static boolean MBCS_IS_ASCII_ROUNDTRIP(int b, long asciiRoundtrips) { |
| return (((asciiRoundtrips) & (1<<((b)>>2)))!=0); |
| } |
| |
| /** |
| * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte. It works for single-byte, |
| * single-state codepages that only map to and from BMP code points, and it always returns fallback values. |
| */ |
| static char MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(UConverterMBCSTable mbcs, final int b) { |
| return MBCS_ENTRY_FINAL_VALUE_16(mbcs.stateTable[0][b & UConverterConstants.UNSIGNED_BYTE_MASK]); |
| } |
| |
| /* single-byte fromUnicode: get the 16-bit result word */ |
| static char MBCS_SINGLE_RESULT_FROM_U(char[] table, byte[] results, int c) { |
| int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f); |
| int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array |
| return (char) (((results[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (results[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK)); |
| } |
| |
| /* single-byte fromUnicode: set the 16-bit result word with newValue*/ |
| static void MBCS_SINGLE_RESULT_FROM_U_SET(char[] table, byte[] results, int c, int newValue) { |
| int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f); |
| int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array |
| results[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK); |
| results[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK); |
| } |
| |
| /* multi-byte fromUnicode: get the 32-bit stage 2 entry */ |
| static int MBCS_STAGE_2_FROM_U(char[] table, int c) { |
| int i = 2 * (table[(c) >>> 10] + ((c >>> 4) & 0x3f)); // 2x because used as index into char[] array treated as |
| // int[] array |
| return ((table[i] & UConverterConstants.UNSIGNED_SHORT_MASK) << 16) |
| | (table[i + 1] & UConverterConstants.UNSIGNED_SHORT_MASK); |
| } |
| |
| private static boolean MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry, int c) { |
| return (((stage2Entry) & (1 << (16 + ((c) & 0xf)))) != 0); |
| } |
| |
| static char MBCS_VALUE_2_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) { |
| int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf)); |
| return (char) (((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK)); |
| } |
| |
| static void MBCS_VALUE_2_FROM_STAGE_2_SET(byte[] bytes, int stage2Entry, int c, int newValue) { |
| int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf)); |
| bytes[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK); |
| bytes[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK); |
| } |
| |
| private static int MBCS_VALUE_4_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) { |
| int i = 4 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf)); |
| return ((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 24) |
| | ((bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16) |
| | ((bytes[i + 2] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) |
| | (bytes[i + 3] & UConverterConstants.UNSIGNED_BYTE_MASK); |
| } |
| |
| static int MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) { |
| return ((16 * ((char) (stage2Entry) & UConverterConstants.UNSIGNED_SHORT_MASK) + ((c) & 0xf)) * 3); |
| } |
| |
| // ------------UConverterExt------------------------------------------------------- |
| |
| static final int EXT_INDEXES_LENGTH = 0; /* 0 */ |
| |
| static final int EXT_TO_U_INDEX = EXT_INDEXES_LENGTH + 1; /* 1 */ |
| static final int EXT_TO_U_LENGTH = EXT_TO_U_INDEX + 1; |
| static final int EXT_TO_U_UCHARS_INDEX = EXT_TO_U_LENGTH + 1; |
| static final int EXT_TO_U_UCHARS_LENGTH = EXT_TO_U_UCHARS_INDEX + 1; |
| |
| static final int EXT_FROM_U_UCHARS_INDEX = EXT_TO_U_UCHARS_LENGTH + 1; /* 5 */ |
| static final int EXT_FROM_U_VALUES_INDEX = EXT_FROM_U_UCHARS_INDEX + 1; |
| static final int EXT_FROM_U_LENGTH = EXT_FROM_U_VALUES_INDEX + 1; |
| static final int EXT_FROM_U_BYTES_INDEX = EXT_FROM_U_LENGTH + 1; |
| static final int EXT_FROM_U_BYTES_LENGTH = EXT_FROM_U_BYTES_INDEX + 1; |
| |
| static final int EXT_FROM_U_STAGE_12_INDEX = EXT_FROM_U_BYTES_LENGTH + 1; /* 10 */ |
| static final int EXT_FROM_U_STAGE_1_LENGTH = EXT_FROM_U_STAGE_12_INDEX + 1; |
| static final int EXT_FROM_U_STAGE_12_LENGTH = EXT_FROM_U_STAGE_1_LENGTH + 1; |
| static final int EXT_FROM_U_STAGE_3_INDEX = EXT_FROM_U_STAGE_12_LENGTH + 1; |
| static final int EXT_FROM_U_STAGE_3_LENGTH = EXT_FROM_U_STAGE_3_INDEX + 1; |
| static final int EXT_FROM_U_STAGE_3B_INDEX = EXT_FROM_U_STAGE_3_LENGTH + 1; |
| static final int EXT_FROM_U_STAGE_3B_LENGTH = EXT_FROM_U_STAGE_3B_INDEX + 1; |
| |
| private static final int EXT_COUNT_BYTES = EXT_FROM_U_STAGE_3B_LENGTH + 1; /* 17 */ |
| // private static final int EXT_COUNT_UCHARS = EXT_COUNT_BYTES + 1; |
| // private static final int EXT_FLAGS = EXT_COUNT_UCHARS + 1; |
| // |
| // private static final int EXT_RESERVED_INDEX = EXT_FLAGS + 1; /* 20, moves with additional indexes */ |
| // |
| // private static final int EXT_SIZE=31; |
| // private static final int EXT_INDEXES_MIN_LENGTH=32; |
| |
| static final int EXT_FROM_U_MAX_DIRECT_LENGTH = 3; |
| |
| /* toUnicode helpers -------------------------------------------------------- */ |
| |
| private static final int TO_U_BYTE_SHIFT = 24; |
| private static final int TO_U_VALUE_MASK = 0xffffff; |
| private static final int TO_U_MIN_CODE_POINT = 0x1f0000; |
| private static final int TO_U_MAX_CODE_POINT = 0x2fffff; |
| private static final int TO_U_ROUNDTRIP_FLAG = (1 << 23); |
| private static final int TO_U_INDEX_MASK = 0x3ffff; |
| private static final int TO_U_LENGTH_SHIFT = 18; |
| private static final int TO_U_LENGTH_OFFSET = 12; |
| |
| /* maximum number of indexed UChars */ |
| static final int MAX_UCHARS = 19; |
| |
| static int TO_U_GET_BYTE(int word) { |
| return word >>> TO_U_BYTE_SHIFT; |
| } |
| |
| static int TO_U_GET_VALUE(int word) { |
| return word & TO_U_VALUE_MASK; |
| } |
| |
| static boolean TO_U_IS_ROUNDTRIP(int value) { |
| return (value & TO_U_ROUNDTRIP_FLAG) != 0; |
| } |
| |
| static boolean TO_U_IS_PARTIAL(int value) { |
| return (value & UConverterConstants.UNSIGNED_INT_MASK) < TO_U_MIN_CODE_POINT; |
| } |
| |
| static int TO_U_GET_PARTIAL_INDEX(int value) { |
| return value; |
| } |
| |
| static int TO_U_MASK_ROUNDTRIP(int value) { |
| return value & ~TO_U_ROUNDTRIP_FLAG; |
| } |
| |
| private static int TO_U_MAKE_WORD(byte b, int value) { |
| return ((b & UConverterConstants.UNSIGNED_BYTE_MASK) << TO_U_BYTE_SHIFT) | value; |
| } |
| |
| /* use after masking off the roundtrip flag */ |
| static boolean TO_U_IS_CODE_POINT(int value) { |
| return (value & UConverterConstants.UNSIGNED_INT_MASK) <= TO_U_MAX_CODE_POINT; |
| } |
| |
| static int TO_U_GET_CODE_POINT(int value) { |
| return (int) ((value & UConverterConstants.UNSIGNED_INT_MASK) - TO_U_MIN_CODE_POINT); |
| } |
| |
| private static int TO_U_GET_INDEX(int value) { |
| return value & TO_U_INDEX_MASK; |
| } |
| |
| private static int TO_U_GET_LENGTH(int value) { |
| return (value >>> TO_U_LENGTH_SHIFT) - TO_U_LENGTH_OFFSET; |
| } |
| |
| /* fromUnicode helpers ------------------------------------------------------ */ |
| |
| /* most trie constants are shared with ucnvmbcs.h */ |
| private static final int STAGE_2_LEFT_SHIFT = 2; |
| |
| // private static final int STAGE_3_GRANULARITY = 4; |
| |
| /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */ |
| static int FROM_U(CharBuffer stage12, CharBuffer stage3, int s1Index, int c) { |
| return stage3.get(((int) stage12.get((stage12.get(s1Index) + ((c >>> 4) & 0x3f))) << STAGE_2_LEFT_SHIFT) |
| + (c & 0xf)); |
| } |
| |
| private static final int FROM_U_LENGTH_SHIFT = 24; |
| private static final int FROM_U_ROUNDTRIP_FLAG = 1 << 31; |
| static final int FROM_U_RESERVED_MASK = 0x60000000; |
| private static final int FROM_U_DATA_MASK = 0xffffff; |
| |
| /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */ |
| static final int FROM_U_SUBCHAR1 = 0x80000001; |
| |
| /* at most 3 bytes in the lower part of the value */ |
| private static final int FROM_U_MAX_DIRECT_LENGTH = 3; |
| |
| /* maximum number of indexed bytes */ |
| static final int MAX_BYTES = 0x1f; |
| |
| static boolean FROM_U_IS_PARTIAL(int value) { |
| return (value >>> FROM_U_LENGTH_SHIFT) == 0; |
| } |
| |
| static int FROM_U_GET_PARTIAL_INDEX(int value) { |
| return value; |
| } |
| |
| static boolean FROM_U_IS_ROUNDTRIP(int value) { |
| return (value & FROM_U_ROUNDTRIP_FLAG) != 0; |
| } |
| |
| private static int FROM_U_MASK_ROUNDTRIP(int value) { |
| return value & ~FROM_U_ROUNDTRIP_FLAG; |
| } |
| |
| /* use after masking off the roundtrip flag */ |
| static int FROM_U_GET_LENGTH(int value) { |
| return (value >>> FROM_U_LENGTH_SHIFT) & MAX_BYTES; |
| } |
| |
| /* get bytes or bytes index */ |
| static int FROM_U_GET_DATA(int value) { |
| return value & FROM_U_DATA_MASK; |
| } |
| |
| /* get the pointer to an extension array from indexes[index] */ |
| static Buffer ARRAY(ByteBuffer indexes, int index, Class<?> itemType) { |
| int oldpos = indexes.position(); |
| Buffer b; |
| |
| indexes.position(indexes.getInt(index << 2)); |
| if (itemType == int.class) |
| b = indexes.asIntBuffer(); |
| else if (itemType == char.class) |
| b = indexes.asCharBuffer(); |
| else if (itemType == short.class) |
| b = indexes.asShortBuffer(); |
| else |
| // default or (itemType == byte.class) |
| b = indexes.slice(); |
| indexes.position(oldpos); |
| return b; |
| } |
| |
| private static int GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes) { |
| indexes.position(0); |
| return indexes.getInt(EXT_COUNT_BYTES) & 0xff; |
| } |
| |
| /* |
| * @return index of the UChar, if found; else <0 |
| */ |
| static int findFromU(CharBuffer fromUSection, int length, char u) { |
| int i, start, limit; |
| |
| /* binary search */ |
| start = 0; |
| limit = length; |
| for (;;) { |
| i = limit - start; |
| if (i <= 1) { |
| break; /* done */ |
| } |
| /* start<limit-1 */ |
| |
| if (i <= 4) { |
| /* linear search for the last part */ |
| if (u <= fromUSection.get(fromUSection.position() + start)) { |
| break; |
| } |
| if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) { |
| break; |
| } |
| if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) { |
| break; |
| } |
| /* always break at start==limit-1 */ |
| ++start; |
| break; |
| } |
| |
| i = (start + limit) / 2; |
| if (u < fromUSection.get(fromUSection.position() + i)) { |
| limit = i; |
| } else { |
| start = i; |
| } |
| } |
| |
| /* did we really find it? */ |
| if (start < limit && u == fromUSection.get(fromUSection.position() + start)) { |
| return start; |
| } else { |
| return -1; /* not found */ |
| } |
| } |
| |
| /* |
| * @return lookup value for the byte, if found; else 0 |
| */ |
| static int findToU(IntBuffer toUSection, int length, short byt) { |
| long word0, word; |
| int i, start, limit; |
| |
| /* check the input byte against the lowest and highest section bytes */ |
| // agljport:comment instead of receiving a start position parameter for toUSection we'll rely on its position |
| // property |
| start = TO_U_GET_BYTE(toUSection.get(toUSection.position())); |
| limit = TO_U_GET_BYTE(toUSection.get(toUSection.position() + length - 1)); |
| if (byt < start || limit < byt) { |
| return 0; /* the byte is out of range */ |
| } |
| |
| if (length == ((limit - start) + 1)) { |
| /* direct access on a linear array */ |
| return TO_U_GET_VALUE(toUSection.get(toUSection.position() + byt - start)); /* could be 0 */ |
| } |
| |
| /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ |
| word0 = TO_U_MAKE_WORD((byte) byt, 0) & UConverterConstants.UNSIGNED_INT_MASK; |
| |
| /* |
| * Shift byte once instead of each section word and add 0xffffff. We will compare the shifted/added byte |
| * (bbffffff) against section words which have byte values in the same bit position. If and only if byte bb < |
| * section byte ss then bbffffff<ssvvvvvv for all v=0..f so we need not mask off the lower 24 bits of each |
| * section word. |
| */ |
| word = word0 | TO_U_VALUE_MASK; |
| |
| /* binary search */ |
| start = 0; |
| limit = length; |
| for (;;) { |
| i = limit - start; |
| if (i <= 1) { |
| break; /* done */ |
| } |
| /* start<limit-1 */ |
| |
| if (i <= 4) { |
| /* linear search for the last part */ |
| if (word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { |
| break; |
| } |
| if (++start < limit |
| && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { |
| break; |
| } |
| if (++start < limit |
| && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { |
| break; |
| } |
| /* always break at start==limit-1 */ |
| ++start; |
| break; |
| } |
| |
| i = (start + limit) / 2; |
| if (word < (toUSection.get(toUSection.position() + i) & UConverterConstants.UNSIGNED_INT_MASK)) { |
| limit = i; |
| } else { |
| start = i; |
| } |
| } |
| |
| /* did we really find it? */ |
| if (start < limit) { |
| word = (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK); |
| if (byt == TO_U_GET_BYTE((int)word)) { |
| return TO_U_GET_VALUE((int) word); /* never 0 */ |
| } |
| } |
| return 0; /* not found */ |
| } |
| |
| /* |
| * TRUE if not an SI/SO stateful converter, or if the match length fits with the current converter state |
| */ |
| static boolean TO_U_VERIFY_SISO_MATCH(byte sisoState, int match) { |
| return sisoState < 0 || (sisoState == 0) == (match == 1); |
| } |
| |
| /* |
| * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), or 1 for DBCS-only, or -1 if the converter is not |
| * SI/SO stateful |
| * |
| * Note: For SI/SO stateful converters getting here, cnv->mode==0 is equivalent to firstLength==1. |
| */ |
| private static int SISO_STATE(UConverterSharedData sharedData, int mode) { |
| return sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO ? (byte) mode |
| : sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY ? 1 : -1; |
| } |
| |
| class CharsetDecoderMBCS extends CharsetDecoderICU { |
| |
| CharsetDecoderMBCS(CharsetICU cs) { |
| super(cs); |
| } |
| |
| protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { |
| /* Just call cnvMBCSToUnicodeWithOffsets() to remove duplicate code. */ |
| return cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush); |
| } |
| |
| /* |
| * continue partial match with new input never called for simple, single-character conversion |
| */ |
| private CoderResult continueMatchToU(ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex, |
| boolean flush) { |
| CoderResult cr = CoderResult.UNDERFLOW; |
| |
| int[] value = new int[1]; |
| int match, length; |
| |
| match = matchToU((byte) SISO_STATE(sharedData, mode), preToUArray, preToUBegin, preToULength, source, |
| value, isToUUseFallback(), flush); |
| |
| if (match > 0) { |
| if (match >= preToULength) { |
| /* advance src pointer for the consumed input */ |
| source.position(source.position() + match - preToULength); |
| preToULength = 0; |
| } else { |
| /* the match did not use all of preToU[] - keep the rest for replay */ |
| length = preToULength - match; |
| System.arraycopy(preToUArray, preToUBegin + match, preToUArray, preToUBegin, length); |
| preToULength = (byte) -length; |
| } |
| |
| /* write result */ |
| cr = writeToU(value[0], target, offsets, srcIndex); |
| } else if (match < 0) { |
| /* save state for partial match */ |
| int j, sArrayIndex; |
| |
| /* just _append_ the newly consumed input to preToU[] */ |
| sArrayIndex = source.position(); |
| match = -match; |
| for (j = preToULength; j < match; ++j) { |
| preToUArray[j] = source.get(sArrayIndex++); |
| } |
| source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ |
| preToULength = (byte) match; |
| } else /* match==0 */{ |
| /* |
| * no match |
| * |
| * We need to split the previous input into two parts: |
| * |
| * 1. The first codepage character is unmappable - that's how we got into trying the extension data in |
| * the first place. We need to move it from the preToU buffer to the error buffer, set an error code, |
| * and prepare the rest of the previous input for 2. |
| * |
| * 2. The rest of the previous input must be converted once we come back from the callback for the first |
| * character. At that time, we have to try again from scratch to convert these input characters. The |
| * replay will be handled by the ucnv.c conversion code. |
| */ |
| |
| /* move the first codepage character to the error field */ |
| System.arraycopy(preToUArray, preToUBegin, toUBytesArray, toUBytesBegin, preToUFirstLength); |
| toULength = preToUFirstLength; |
| |
| /* move the rest up inside the buffer */ |
| length = preToULength - preToUFirstLength; |
| if (length > 0) { |
| System.arraycopy(preToUArray, preToUBegin + preToUFirstLength, preToUArray, preToUBegin, length); |
| } |
| |
| /* mark preToU for replay */ |
| preToULength = (byte) -length; |
| |
| /* set the error code for unassigned */ |
| cr = CoderResult.unmappableForLength(preToUFirstLength); |
| } |
| return cr; |
| } |
| |
| /* |
| * this works like matchFromU() except - the first character is in pre - no trie is used - the returned |
| * matchLength is not offset by 2 |
| */ |
| private int matchToU(byte sisoState, byte[] preArray, int preArrayBegin, int preLength, ByteBuffer source, |
| int[] pMatchValue, boolean isUseFallback, boolean flush) { |
| ByteBuffer cx = sharedData.mbcs.extIndexes; |
| IntBuffer toUTable, toUSection; |
| |
| int value, matchValue, srcLength = 0; |
| int i, j, index, length, matchLength; |
| short b; |
| |
| if (cx == null || cx.asIntBuffer().get(EXT_TO_U_LENGTH) <= 0) { |
| return 0; /* no extension data, no match */ |
| } |
| |
| /* initialize */ |
| toUTable = (IntBuffer) ARRAY(cx, EXT_TO_U_INDEX, int.class); |
| index = 0; |
| |
| matchValue = 0; |
| i = j = matchLength = 0; |
| if (source != null) { |
| srcLength = source.remaining(); |
| } |
| |
| if (sisoState == 0) { |
| /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ |
| if (preLength > 1) { |
| return 0; /* no match of a DBCS sequence in SBCS mode */ |
| } else if (preLength == 1) { |
| srcLength = 0; |
| } else /* preLength==0 */{ |
| if (srcLength > 1) { |
| srcLength = 1; |
| } |
| } |
| flush = true; |
| } |
| |
| /* we must not remember fallback matches when not using fallbacks */ |
| |
| /* match input units until there is a full match or the input is consumed */ |
| for (;;) { |
| /* go to the next section */ |
| int oldpos = toUTable.position(); |
| toUSection = ((IntBuffer) toUTable.position(index)).slice(); |
| toUTable.position(oldpos); |
| |
| /* read first pair of the section */ |
| value = toUSection.get(); |
| length = TO_U_GET_BYTE(value); |
| value = TO_U_GET_VALUE(value); |
| if (value != 0 && (TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback)) |
| && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) { |
| /* remember longest match so far */ |
| matchValue = value; |
| matchLength = i + j; |
| } |
| |
| /* match pre[] then src[] */ |
| if (i < preLength) { |
| b = (short) (preArray[preArrayBegin + i++] & UConverterConstants.UNSIGNED_BYTE_MASK); |
| } else if (j < srcLength) { |
| b = (short) (source.get(source.position() + j++) & UConverterConstants.UNSIGNED_BYTE_MASK); |
| } else { |
| /* all input consumed, partial match */ |
| if (flush || (length = (i + j)) > MAX_BYTES) { |
| /* |
| * end of the entire input stream, stop with the longest match so far or: partial match must not |
| * be longer than UCNV_EXT_MAX_BYTES because it must fit into state buffers |
| */ |
| break; |
| } else { |
| /* continue with more input next time */ |
| return -length; |
| } |
| } |
| |
| /* search for the current UChar */ |
| value = findToU(toUSection, length, b); |
| if (value == 0) { |
| /* no match here, stop with the longest match so far */ |
| break; |
| } else { |
| if (TO_U_IS_PARTIAL(value)) { |
| /* partial match, continue */ |
| index = TO_U_GET_PARTIAL_INDEX(value); |
| } else { |
| if ((TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback)) && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) { |
| /* full match, stop with result */ |
| matchValue = value; |
| matchLength = i + j; |
| } else { |
| /* full match on fallback not taken, stop with the longest match so far */ |
| } |
| break; |
| } |
| } |
| } |
| |
| if (matchLength == 0) { |
| /* no match at all */ |
| return 0; |
| } |
| |
| /* return result */ |
| pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue); |
| return matchLength; |
| } |
| |
| private CoderResult writeToU(int value, CharBuffer target, IntBuffer offsets, int srcIndex) { |
| ByteBuffer cx = sharedData.mbcs.extIndexes; |
| /* output the result */ |
| if (TO_U_IS_CODE_POINT(value)) { |
| /* output a single code point */ |
| return toUWriteCodePoint(TO_U_GET_CODE_POINT(value), target, offsets, srcIndex); |
| } else { |
| /* output a string - with correct data we have resultLength>0 */ |
| |
| char[] a = new char[TO_U_GET_LENGTH(value)]; |
| CharBuffer cb = ((CharBuffer) ARRAY(cx, EXT_TO_U_UCHARS_INDEX, char.class)); |
| cb.position(TO_U_GET_INDEX(value)); |
| cb.get(a, 0, a.length); |
| return toUWriteUChars(this, a, 0, a.length, target, offsets, srcIndex); |
| } |
| } |
| |
| private CoderResult toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex) { |
| CoderResult cr = CoderResult.UNDERFLOW; |
| int tBeginIndex = target.position(); |
| |
| if (target.hasRemaining()) { |
| if (c <= 0xffff) { |
| target.put((char) c); |
| c = UConverterConstants.U_SENTINEL; |
| } else /* c is a supplementary code point */{ |
| target.put(UTF16.getLeadSurrogate(c)); |
| c = UTF16.getTrailSurrogate(c); |
| if (target.hasRemaining()) { |
| target.put((char) c); |
| c = UConverterConstants.U_SENTINEL; |
| } |
| } |
| |
| /* write offsets */ |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| if ((tBeginIndex + 1) < target.position()) { |
| offsets.put(sourceIndex); |
| } |
| } |
| } |
| |
| /* write overflow from c */ |
| if (c >= 0) { |
| charErrorBufferLength = UTF16.append(charErrorBufferArray, 0, c); |
| cr = CoderResult.OVERFLOW; |
| } |
| |
| return cr; |
| } |
| |
| /* |
| * Input sequence: cnv->toUBytes[0..length[ @return if(U_FAILURE) return the length (toULength, byteIndex) for |
| * the input else return 0 after output has been written to the target |
| */ |
| private int toU(int length, ByteBuffer source, CharBuffer target, IntBuffer offsets, int sourceIndex, |
| boolean flush, CoderResult[] cr) { |
| // ByteBuffer cx; |
| |
| if (sharedData.mbcs.extIndexes != null |
| && initialMatchToU(length, source, target, offsets, sourceIndex, flush, cr)) { |
| return 0; /* an extension mapping handled the input */ |
| } |
| |
| /* GB 18030 */ |
| if (length == 4 && (options & MBCS_OPTION_GB18030) != 0) { |
| long[] range; |
| long linear; |
| int i; |
| |
| linear = LINEAR_18030(toUBytesArray[0], toUBytesArray[1], toUBytesArray[2], toUBytesArray[3]); |
| for (i = 0; i < gb18030Ranges.length; ++i) { |
| range = gb18030Ranges[i]; |
| if (range[2] <= linear && linear <= range[3]) { |
| /* found the sequence, output the Unicode code point for it */ |
| cr[0] = CoderResult.UNDERFLOW; |
| |
| /* add the linear difference between the input and start sequences to the start code point */ |
| linear = range[0] + (linear - range[2]); |
| |
| /* output this code point */ |
| cr[0] = toUWriteCodePoint((int) linear, target, offsets, sourceIndex); |
| |
| return 0; |
| } |
| } |
| } |
| |
| /* no mapping */ |
| cr[0] = CoderResult.unmappableForLength(length); |
| return length; |
| } |
| |
| /* |
| * target<targetLimit; set error code for overflow |
| */ |
| private boolean initialMatchToU(int firstLength, ByteBuffer source, CharBuffer target, IntBuffer offsets, |
| int srcIndex, boolean flush, CoderResult[] cr) { |
| int[] value = new int[1]; |
| int match = 0; |
| |
| /* try to match */ |
| match = matchToU((byte) SISO_STATE(sharedData, mode), toUBytesArray, toUBytesBegin, firstLength, source, |
| value, isToUUseFallback(), flush); |
| if (match > 0) { |
| /* advance src pointer for the consumed input */ |
| source.position(source.position() + match - firstLength); |
| |
| /* write result to target */ |
| cr[0] = writeToU(value[0], target, offsets, srcIndex); |
| return true; |
| } else if (match < 0) { |
| /* save state for partial match */ |
| byte[] sArray; |
| int sArrayIndex; |
| int j; |
| |
| /* copy the first code point */ |
| sArray = toUBytesArray; |
| sArrayIndex = toUBytesBegin; |
| preToUFirstLength = (byte) firstLength; |
| for (j = 0; j < firstLength; ++j) { |
| preToUArray[j] = sArray[sArrayIndex++]; |
| } |
| |
| /* now copy the newly consumed input */ |
| sArrayIndex = source.position(); |
| match = -match; |
| for (; j < match; ++j) { |
| preToUArray[j] = source.get(sArrayIndex++); |
| } |
| source.position(sArrayIndex); |
| preToULength = (byte) match; |
| return true; |
| } else /* match==0 no match */{ |
| return false; |
| } |
| } |
| |
| private int simpleMatchToU(ByteBuffer source, boolean useFallback) { |
| int[] value = new int[1]; |
| int match; |
| |
| if (source.remaining() <= 0) { |
| return 0xffff; |
| } |
| |
| /* try to match */ |
| byte[] sourceArray; |
| int sourcePosition, sourceLimit; |
| if (source.isReadOnly()) { |
| // source.array() would throw an exception |
| sourcePosition = source.position(); // relative to source.array() |
| sourceArray = new byte[Math.min(source.remaining(), EXT_MAX_BYTES)]; |
| source.get(sourceArray).position(sourcePosition); |
| sourcePosition = 0; // relative to sourceArray |
| sourceLimit = sourceArray.length; |
| } else { |
| sourceArray = source.array(); |
| sourcePosition = source.position(); |
| sourceLimit = source.limit(); |
| } |
| match = matchToU((byte) -1, sourceArray, sourcePosition, sourceLimit, null, value, useFallback, true); |
| |
| if (match == source.remaining()) { |
| /* write result for simple, single-character conversion */ |
| if (TO_U_IS_CODE_POINT(value[0])) { |
| return TO_U_GET_CODE_POINT(value[0]); |
| } |
| } |
| |
| /* |
| * return no match because - match>0 && value points to string: simple conversion cannot handle multiple |
| * code points - match>0 && match!=length: not all input consumed, forbidden for this function - match==0: |
| * no match found in the first place - match<0: partial match, not supported for simple conversion (and |
| * flush==TRUE) |
| */ |
| return 0xfffe; |
| } |
| |
| CoderResult cnvMBCSToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { |
| CoderResult[] cr = { CoderResult.UNDERFLOW }; |
| |
| int sourceArrayIndex, sourceArrayIndexStart; |
| int stateTable[][/* 256 */]; |
| char[] unicodeCodeUnits; |
| |
| int offset; |
| byte state; |
| int byteIndex; |
| byte[] bytes; |
| |
| int sourceIndex, nextSourceIndex; |
| |
| int entry = 0; |
| char c; |
| byte action; |
| |
| if (preToULength > 0) { |
| /* |
| * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change with |
| * continuous offsets |
| */ |
| cr[0] = continueMatchToU(source, target, offsets, -1, flush); |
| |
| if (cr[0].isError() || preToULength < 0) { |
| return cr[0]; |
| } |
| } |
| |
| if (sharedData.mbcs.countStates == 1) { |
| if ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) { |
| cr[0] = cnvMBCSSingleToBMPWithOffsets(source, target, offsets, flush); |
| } else { |
| cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets, flush); |
| } |
| return cr[0]; |
| } |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = sourceArrayIndexStart = source.position(); |
| |
| if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { |
| stateTable = sharedData.mbcs.swapLFNLStateTable; |
| } else { |
| stateTable = sharedData.mbcs.stateTable; |
| } |
| unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits; |
| |
| /* get the converter state from UConverter */ |
| offset = toUnicodeStatus; |
| byteIndex = toULength; |
| bytes = toUBytesArray; |
| |
| /* |
| * if we are in the SBCS state for a DBCS-only converter, then load the DBCS state from the MBCS data |
| * (dbcsOnlyState==0 if it is not a DBCS-only converter) |
| */ |
| state = (byte)mode; |
| if (state == 0) { |
| state = sharedData.mbcs.dbcsOnlyState; |
| } |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = byteIndex == 0 ? 0 : -1; |
| nextSourceIndex = 0; |
| |
| /* conversion loop */ |
| while (sourceArrayIndex < source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. It does not catch output |
| * of more than one code unit that overflows as a result of a surrogate pair or callback output from the |
| * last source byte. Therefore, those situations also test for overflows and will then break the loop, |
| * too. |
| */ |
| if (!target.hasRemaining()) { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| |
| if (byteIndex == 0) { |
| /* optimized loop for 1/2-byte input and BMP output */ |
| // agljport:todo see ucnvmbcs.c for deleted block |
| do { |
| entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK]; |
| if (MBCS_ENTRY_IS_TRANSITION(entry)) { |
| state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); |
| offset = MBCS_ENTRY_TRANSITION_OFFSET(entry); |
| ++sourceArrayIndex; |
| if (sourceArrayIndex < source.limit() |
| && MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK]) |
| && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16 |
| && (c = unicodeCodeUnits[offset + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) { |
| ++sourceArrayIndex; |
| target.put(c); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| sourceIndex = (nextSourceIndex += 2); |
| } |
| state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ |
| offset = 0; |
| } else { |
| /* set the state and leave the optimized loop */ |
| ++nextSourceIndex; |
| bytes[0] = source.get(sourceArrayIndex - 1); |
| byteIndex = 1; |
| break; |
| } |
| } else { |
| if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { |
| /* output BMP code point */ |
| ++sourceArrayIndex; |
| target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| sourceIndex = ++nextSourceIndex; |
| } |
| state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ |
| } else { |
| /* leave the optimized loop */ |
| break; |
| } |
| } |
| } while (sourceArrayIndex < source.limit() && target.hasRemaining()); |
| /* |
| * these tests and break statements could be put inside the loop if C had "break outerLoop" like |
| * Java |
| */ |
| if (sourceArrayIndex >= source.limit()) { |
| break; |
| } |
| if (!target.hasRemaining()) { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| |
| ++nextSourceIndex; |
| bytes[byteIndex++] = source.get(sourceArrayIndex++); |
| } else /* byteIndex>0 */{ |
| ++nextSourceIndex; |
| entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++)) |
| & UConverterConstants.UNSIGNED_BYTE_MASK]; |
| } |
| |
| if (MBCS_ENTRY_IS_TRANSITION(entry)) { |
| state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); |
| offset += MBCS_ENTRY_TRANSITION_OFFSET(entry); |
| continue; |
| } |
| |
| /* save the previous state for proper extension mapping with SI/SO-stateful converters */ |
| mode = state; |
| |
| /* set the next state early so that we can reuse the entry variable */ |
| state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ |
| |
| /* |
| * An if-else-if chain provides more reliable performance for the most common cases compared to a |
| * switch. |
| */ |
| action = (byte)MBCS_ENTRY_FINAL_ACTION(entry); |
| if (action == MBCS_STATE_VALID_16) { |
| offset += MBCS_ENTRY_FINAL_VALUE_16(entry); |
| c = unicodeCodeUnits[offset]; |
| if (c < 0xfffe) { |
| /* output BMP code point */ |
| target.put(c); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| byteIndex = 0; |
| } else if (c == 0xfffe) { |
| if (isFallbackUsed() && (entry = getFallback(sharedData.mbcs, offset)) != 0xfffe) { |
| /* output fallback BMP code point */ |
| target.put((char)entry); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| byteIndex = 0; |
| } |
| } else { |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(byteIndex); |
| } |
| } else if (action == MBCS_STATE_VALID_DIRECT_16) { |
| /* output BMP code point */ |
| target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| byteIndex = 0; |
| } else if (action == MBCS_STATE_VALID_16_PAIR) { |
| offset += MBCS_ENTRY_FINAL_VALUE_16(entry); |
| c = unicodeCodeUnits[offset++]; |
| if (c < 0xd800) { |
| /* output BMP code point below 0xd800 */ |
| target.put(c); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| byteIndex = 0; |
| } else if (isFallbackUsed() ? c <= 0xdfff : c <= 0xdbff) { |
| /* output roundtrip or fallback surrogate pair */ |
| target.put((char)(c & 0xdbff)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| byteIndex = 0; |
| if (target.hasRemaining()) { |
| target.put(unicodeCodeUnits[offset]); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| } else { |
| /* target overflow */ |
| charErrorBufferArray[0] = unicodeCodeUnits[offset]; |
| charErrorBufferLength = 1; |
| cr[0] = CoderResult.OVERFLOW; |
| |
| offset = 0; |
| break; |
| } |
| } else if (isFallbackUsed() ? (c & 0xfffe) == 0xe000 : c == 0xe000) { |
| /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ |
| target.put(unicodeCodeUnits[offset]); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| byteIndex = 0; |
| } else if (c == 0xffff) { |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(byteIndex); |
| } |
| } else if (action == MBCS_STATE_VALID_DIRECT_20 |
| || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) { |
| entry = MBCS_ENTRY_FINAL_VALUE(entry); |
| /* output surrogate pair */ |
| target.put((char)(0xd800 | (char)(entry >> 10))); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| byteIndex = 0; |
| c = (char)(0xdc00 | (char)(entry & 0x3ff)); |
| if (target.hasRemaining()) { |
| target.put(c); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| } else { |
| /* target overflow */ |
| charErrorBufferArray[0] = c; |
| charErrorBufferLength = 1; |
| cr[0] = CoderResult.OVERFLOW; |
| |
| offset = 0; |
| break; |
| } |
| } else if (action == MBCS_STATE_CHANGE_ONLY) { |
| /* |
| * This serves as a state change without any output. It is useful for reading simple stateful |
| * encodings, for example using just Shift-In/Shift-Out codes. The 21 unused bits may later be used |
| * for more sophisticated state transitions. |
| */ |
| if (sharedData.mbcs.dbcsOnlyState == 0) { |
| byteIndex = 0; |
| } else { |
| /* SI/SO are illegal for DBCS-only conversion */ |
| state = (byte)(mode); /* restore the previous state */ |
| |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(byteIndex); |
| } |
| } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) { |
| if (isFallbackUsed()) { |
| /* output BMP code point */ |
| target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| byteIndex = 0; |
| } |
| } else if (action == MBCS_STATE_UNASSIGNED) { |
| /* just fall through */ |
| } else if (action == MBCS_STATE_ILLEGAL) { |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(byteIndex); |
| } else { |
| /* reserved, must never occur */ |
| byteIndex = 0; |
| } |
| |
| /* end of action codes: prepare for a new character */ |
| offset = 0; |
| |
| if (byteIndex == 0) { |
| sourceIndex = nextSourceIndex; |
| } else if (cr[0].isError()) { |
| /* callback(illegal) */ |
| if (byteIndex > 1) { |
| /* |
| * Ticket 5691: consistent illegal sequences: |
| * - We include at least the first byte in the illegal sequence. |
| * - If any of the non-initial bytes could be the start of a character, |
| * we stop the illegal sequence before the first one of those. |
| */ |
| boolean isDBCSOnly = (sharedData.mbcs.dbcsOnlyState != 0); |
| byte i; |
| for (i = 1; i < byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, (short)(bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK)); i++) {} |
| if (i < byteIndex) { |
| byte backOutDistance = (byte)(byteIndex - i); |
| int bytesFromThisBuffer = sourceArrayIndex - sourceArrayIndexStart; |
| byteIndex = i; /* length of reported illegal byte sequence */ |
| if (backOutDistance <= bytesFromThisBuffer) { |
| sourceArrayIndex -= backOutDistance; |
| } else { |
| /* Back out bytes from the previous buffer: Need to replay them. */ |
| this.preToULength = (byte)(bytesFromThisBuffer - backOutDistance); |
| /* preToULength is negative! */ |
| for (int n = 0; n < -this.preToULength; n++) { |
| this.preToUArray[n] = bytes[i+n]; |
| } |
| sourceArrayIndex = sourceArrayIndexStart; |
| } |
| } |
| } |
| break; |
| } else /* unassigned sequences indicated with byteIndex>0 */{ |
| /* try an extension mapping */ |
| int sourceBeginIndex = sourceArrayIndex; |
| source.position(sourceArrayIndex); |
| byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, flush, cr); |
| sourceArrayIndex = source.position(); |
| sourceIndex = nextSourceIndex += (sourceArrayIndex - sourceBeginIndex); |
| |
| if (cr[0].isError() || cr[0].isOverflow()) { |
| /* not mappable or buffer overflow */ |
| break; |
| } |
| } |
| } |
| |
| /* set the converter state back into UConverter */ |
| toUnicodeStatus = offset; |
| mode = state; |
| toULength = byteIndex; |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| /* |
| * This version of cnvMBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages that |
| * only map to and from the BMP. In addition to single-byte optimizations, the offset calculations become much |
| * easier. |
| */ |
| private CoderResult cnvMBCSSingleToBMPWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, |
| boolean flush) { |
| CoderResult[] cr = { CoderResult.UNDERFLOW }; |
| |
| int sourceArrayIndex, lastSource; |
| int targetCapacity, length; |
| int[][] stateTable; |
| |
| int sourceIndex; |
| |
| int entry; |
| byte action; |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = source.position(); |
| targetCapacity = target.remaining(); |
| |
| if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { |
| stateTable = sharedData.mbcs.swapLFNLStateTable; |
| } else { |
| stateTable = sharedData.mbcs.stateTable; |
| } |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = 0; |
| lastSource = sourceArrayIndex; |
| |
| /* |
| * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the |
| * sourceLength and targetCapacity |
| */ |
| length = source.remaining(); |
| if (length < targetCapacity) { |
| targetCapacity = length; |
| } |
| |
| /* conversion loop */ |
| while (targetCapacity > 0 && sourceArrayIndex < source.limit()) { |
| entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK]; |
| /* MBCS_ENTRY_IS_FINAL(entry) */ |
| |
| /* test the most common case first */ |
| if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { |
| /* output BMP code point */ |
| target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| --targetCapacity; |
| continue; |
| } |
| |
| /* |
| * An if-else-if chain provides more reliable performance for the most common cases compared to a |
| * switch. |
| */ |
| action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry)); |
| if (action == MBCS_STATE_FALLBACK_DIRECT_16) { |
| if (isFallbackUsed()) { |
| /* output BMP code point */ |
| target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| --targetCapacity; |
| continue; |
| } |
| } else if (action == MBCS_STATE_UNASSIGNED) { |
| /* just fall through */ |
| } else if (action == MBCS_STATE_ILLEGAL) { |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(sourceArrayIndex - lastSource); |
| } else { |
| /* reserved, must never occur */ |
| continue; |
| } |
| |
| /* set offsets since the start or the last extension */ |
| if (offsets != null) { |
| int count = sourceArrayIndex - lastSource; |
| |
| /* predecrement: do not set the offset for the callback-causing character */ |
| while (--count > 0) { |
| offsets.put(sourceIndex++); |
| } |
| /* offset and sourceIndex are now set for the current character */ |
| } |
| |
| if (cr[0].isError()) { |
| /* callback(illegal) */ |
| break; |
| } else /* unassigned sequences indicated with byteIndex>0 */{ |
| /* try an extension mapping */ |
| lastSource = sourceArrayIndex; |
| toUBytesArray[0] = source.get(sourceArrayIndex - 1); |
| source.position(sourceArrayIndex); |
| toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr); |
| sourceArrayIndex = source.position(); |
| sourceIndex += 1 + (sourceArrayIndex - lastSource); |
| |
| if (cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| break; |
| } |
| |
| /* recalculate the targetCapacity after an extension mapping */ |
| targetCapacity = target.remaining(); |
| length = source.remaining(); |
| if (length < targetCapacity) { |
| targetCapacity = length; |
| } |
| } |
| } |
| |
| if (!cr[0].isError() && sourceArrayIndex < source.limit() && !target.hasRemaining()) { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| |
| /* set offsets since the start or the last callback */ |
| if (offsets != null) { |
| int count = sourceArrayIndex - lastSource; |
| while (count > 0) { |
| offsets.put(sourceIndex++); |
| --count; |
| } |
| } |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| /* This version of cnvMBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ |
| private CoderResult cnvMBCSSingleToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, |
| boolean flush) { |
| CoderResult[] cr = { CoderResult.UNDERFLOW }; |
| |
| int sourceArrayIndex; |
| int[][] stateTable; |
| |
| int sourceIndex; |
| |
| int entry; |
| char c; |
| byte action; |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = source.position(); |
| |
| if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { |
| stateTable = sharedData.mbcs.swapLFNLStateTable; |
| } else { |
| stateTable = sharedData.mbcs.stateTable; |
| } |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = 0; |
| |
| /* conversion loop */ |
| while (sourceArrayIndex < source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. It does not catch output |
| * of more than one code unit that overflows as a result of a surrogate pair or callback output from the |
| * last source byte. Therefore, those situations also test for overflows and will then break the loop, |
| * too. |
| */ |
| if (!target.hasRemaining()) { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| |
| entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK]; |
| /* MBCS_ENTRY_IS_FINAL(entry) */ |
| |
| /* test the most common case first */ |
| if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { |
| /* output BMP code point */ |
| target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| |
| /* normal end of action codes: prepare for a new character */ |
| ++sourceIndex; |
| continue; |
| } |
| |
| /* |
| * An if-else-if chain provides more reliable performance for the most common cases compared to a |
| * switch. |
| */ |
| action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry)); |
| if (action == MBCS_STATE_VALID_DIRECT_20 |
| || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) { |
| |
| entry = MBCS_ENTRY_FINAL_VALUE(entry); |
| /* output surrogate pair */ |
| target.put((char) (0xd800 | (char) (entry >>> 10))); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| c = (char) (0xdc00 | (char) (entry & 0x3ff)); |
| if (target.hasRemaining()) { |
| target.put(c); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| } else { |
| /* target overflow */ |
| charErrorBufferArray[0] = c; |
| charErrorBufferLength = 1; |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| |
| ++sourceIndex; |
| continue; |
| } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) { |
| if (isFallbackUsed()) { |
| /* output BMP code point */ |
| target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| |
| ++sourceIndex; |
| continue; |
| } |
| } else if (action == MBCS_STATE_UNASSIGNED) { |
| /* just fall through */ |
| } else if (action == MBCS_STATE_ILLEGAL) { |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| } else { |
| /* reserved, must never occur */ |
| ++sourceIndex; |
| continue; |
| } |
| |
| if (cr[0].isError()) { |
| /* callback(illegal) */ |
| break; |
| } else /* unassigned sequences indicated with byteIndex>0 */{ |
| /* try an extension mapping */ |
| int sourceBeginIndex = sourceArrayIndex; |
| toUBytesArray[0] = source.get(sourceArrayIndex - 1); |
| source.position(sourceArrayIndex); |
| toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr); |
| sourceArrayIndex = source.position(); |
| sourceIndex += 1 + (sourceArrayIndex - sourceBeginIndex); |
| |
| if (cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| break; |
| } |
| } |
| } |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| private int getFallback(UConverterMBCSTable mbcsTable, int offset) { |
| MBCSToUFallback[] toUFallbacks; |
| int i, start, limit; |
| |
| limit = mbcsTable.countToUFallbacks; |
| if (limit > 0) { |
| /* do a binary search for the fallback mapping */ |
| toUFallbacks = mbcsTable.toUFallbacks; |
| start = 0; |
| while (start < limit - 1) { |
| i = (start + limit) >>> 1; |
| if (offset < toUFallbacks[i].offset) { |
| limit = i; |
| } else { |
| start = i; |
| } |
| } |
| |
| /* did we really find it? */ |
| if (offset == toUFallbacks[start].offset) { |
| return toUFallbacks[start].codePoint; |
| } |
| } |
| |
| return 0xfffe; |
| } |
| |
| /** |
| * This is a simple version of _MBCSGetNextUChar() that is used by other converter implementations. It only |
| * returns an "assigned" result if it consumes the entire input. It does not use state from the converter, nor |
| * error codes. It does not handle the EBCDIC swaplfnl option (set in UConverter). It handles conversion |
| * extensions but not GB 18030. |
| * |
| * @return U+fffe unassigned U+ffff illegal otherwise the Unicode code point |
| */ |
| int simpleGetNextUChar(ByteBuffer source, boolean useFallback) { |
| |
| // #if 0 |
| // /* |
| // * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus |
| // * TODO In future releases, verify that this function is never called for SBCS |
| // * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. |
| // * Removal improves code coverage. |
| // */ |
| // /* use optimized function if possible */ |
| // if(sharedData->mbcs.countStates==1) { |
| // if(length==1) { |
| // return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); |
| // } else { |
| // return 0xffff; /* illegal: more than a single byte for an SBCS converter */ |
| // } |
| // } |
| // #endif |
| |
| /* set up the local pointers */ |
| int[][] stateTable = sharedData.mbcs.stateTable; |
| char[] unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits; |
| |
| /* converter state */ |
| int offset = 0; |
| int state = sharedData.mbcs.dbcsOnlyState; |
| |
| int action; |
| int entry; |
| int c; |
| int i = source.position(); |
| int length = source.limit() - i; |
| |
| /* conversion loop */ |
| while (true) { |
| // entry=stateTable[state][(uint8_t)source[i++]]; |
| entry = stateTable[state][source.get(i++) & UConverterConstants.UNSIGNED_BYTE_MASK]; |
| |
| if (MBCS_ENTRY_IS_TRANSITION(entry)) { |
| state = MBCS_ENTRY_TRANSITION_STATE(entry); |
| offset += MBCS_ENTRY_TRANSITION_OFFSET(entry); |
| |
| if (i == source.limit()) { |
| return 0xffff; /* truncated character */ |
| } |
| } else { |
| /* |
| * An if-else-if chain provides more reliable performance for the most common cases compared to a |
| * switch. |
| */ |
| action = MBCS_ENTRY_FINAL_ACTION(entry); |
| if (action == MBCS_STATE_VALID_16) { |
| offset += MBCS_ENTRY_FINAL_VALUE_16(entry); |
| c = unicodeCodeUnits[offset]; |
| if (c != 0xfffe) { |
| /* done */ |
| } else if (isToUUseFallback()) { |
| c = getFallback(sharedData.mbcs, offset); |
| } |
| /* else done with 0xfffe */ |
| } else if (action == MBCS_STATE_VALID_DIRECT_16) { |
| // /* output BMP code point */ |
| c = MBCS_ENTRY_FINAL_VALUE_16(entry); |
| } else if (action == MBCS_STATE_VALID_16_PAIR) { |
| offset += MBCS_ENTRY_FINAL_VALUE_16(entry); |
| c = unicodeCodeUnits[offset++]; |
| if (c < 0xd800) { |
| /* output BMP code point below 0xd800 */ |
| } else if (isToUUseFallback() ? c <= 0xdfff : c <= 0xdbff) { |
| /* output roundtrip or fallback supplementary code point */ |
| c = (((c & 0x3ff) << 10) + unicodeCodeUnits[offset] + (0x10000 - 0xdc00)); |
| } else if (isToUUseFallback() ? (c & 0xfffe) == 0xe000 : c == 0xe000) { |
| /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ |
| c = unicodeCodeUnits[offset]; |
| } else if (c == 0xffff) { |
| return 0xffff; |
| } else { |
| c = 0xfffe; |
| } |
| } else if (action == MBCS_STATE_VALID_DIRECT_20) { |
| /* output supplementary code point */ |
| c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry); |
| } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) { |
| if (!isToUUseFallback(useFallback)) { |
| c = 0xfffe; |
| } else { |
| /* output BMP code point */ |
| c = MBCS_ENTRY_FINAL_VALUE_16(entry); |
| } |
| } else if (action == MBCS_STATE_FALLBACK_DIRECT_20) { |
| if (!isToUUseFallback(useFallback)) { |
| c = 0xfffe; |
| } else { |
| /* output supplementary code point */ |
| c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry); |
| } |
| } else if (action == MBCS_STATE_UNASSIGNED) { |
| c = 0xfffe; |
| } else { |
| /* |
| * forbid MBCS_STATE_CHANGE_ONLY for this function, and MBCS_STATE_ILLEGAL and reserved action |
| * codes |
| */ |
| return 0xffff; |
| } |
| break; |
| } |
| } |
| |
| if (i != source.limit()) { |
| /* illegal for this function: not all input consumed */ |
| return 0xffff; |
| } |
| |
| if (c == 0xfffe) { |
| /* try an extension mapping */ |
| if (sharedData.mbcs.extIndexes != null) { |
| /* Increase the limit for proper handling. Used in LMBCS. */ |
| if (source.limit() > i + length) { |
| source.limit(i + length); |
| } |
| return simpleMatchToU(source, useFallback); |
| } |
| } |
| |
| return c; |
| } |
| private boolean hasValidTrailBytes(int[][] stateTable, short state) { |
| int[] row = stateTable[state]; |
| int b, entry; |
| /* First test for final entries in this state for some commonly valid byte values. */ |
| entry = row[0xa1]; |
| if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { |
| return true; |
| } |
| entry = row[0x41]; |
| if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { |
| return true; |
| } |
| /* Then test for final entries in this state. */ |
| for (b = 0; b <= 0xff; b++) { |
| entry = row[b]; |
| if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { |
| return true; |
| } |
| } |
| /* Then recurse for transition entries. */ |
| for (b = 0; b <= 0xff; b++) { |
| entry = row[b]; |
| if (MBCS_ENTRY_IS_TRANSITION(entry) && |
| hasValidTrailBytes(stateTable, (short)(MBCS_ENTRY_TRANSITION_STATE(entry) & UConverterConstants.UNSIGNED_BYTE_MASK))) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| private boolean isSingleOrLead(int[][] stateTable, int state, boolean isDBCSOnly, int b) { |
| int[] row = stateTable[state]; |
| int entry = row[b]; |
| if (MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ |
| return hasValidTrailBytes(stateTable, (short)(MBCS_ENTRY_TRANSITION_STATE(entry) & UConverterConstants.UNSIGNED_BYTE_MASK)); |
| } else { |
| short action = (short)(MBCS_ENTRY_FINAL_ACTION(entry) & UConverterConstants.UNSIGNED_BYTE_MASK); |
| if (action == MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { |
| return false; /* SI/SO are illegal for DBCS-only conversion */ |
| } else { |
| return (action != MBCS_STATE_ILLEGAL); |
| } |
| } |
| } |
| |
| |
| } |
| |
| class CharsetEncoderMBCS extends CharsetEncoderICU { |
| private boolean allowReplacementChanges = false; |
| |
| CharsetEncoderMBCS(CharsetICU cs) { |
| super(cs, fromUSubstitution); |
| allowReplacementChanges = true; // allow changes in implReplaceWith |
| implReset(); |
| } |
| |
| protected void implReset() { |
| super.implReset(); |
| preFromUFirstCP = UConverterConstants.U_SENTINEL; |
| } |
| |
| @SuppressWarnings("fallthrough") |
| protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { |
| CoderResult[] cr = { CoderResult.UNDERFLOW }; |
| // if (!source.hasRemaining() && fromUChar32 == 0) |
| // return cr[0]; |
| |
| int sourceArrayIndex; |
| char[] table; |
| byte[] pArray, bytes; |
| int pArrayIndex, outputType, c; |
| int prevSourceIndex, sourceIndex, nextSourceIndex; |
| int stage2Entry = 0, value = 0, length = 0, prevLength; |
| short uniMask; |
| // long asciiRoundtrips; |
| |
| byte[] si_value = new byte[2]; |
| byte[] so_value = new byte[2]; |
| int si_value_length = 0, so_value_length = 0; |
| |
| boolean gotoUnassigned = false; |
| |
| try { |
| |
| if (!flush && preFromUFirstCP >= 0) { |
| /* |
| * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change |
| * with continuous offsets |
| */ |
| cr[0] = continueMatchFromU(source, target, offsets, flush, -1); |
| |
| if (cr[0].isError() || preFromULength < 0) { |
| return cr[0]; |
| } |
| } |
| |
| /* use optimized function if possible */ |
| outputType = sharedData.mbcs.outputType; |
| uniMask = sharedData.mbcs.unicodeMask; |
| if (outputType == MBCS_OUTPUT_1 && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) { |
| if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) { |
| cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush); |
| } else { |
| cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush); |
| } |
| return cr[0]; |
| } else if (outputType == MBCS_OUTPUT_2) { |
| cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush); |
| return cr[0]; |
| } |
| |
| table = sharedData.mbcs.fromUnicodeTable; |
| sourceArrayIndex = source.position(); |
| |
| if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { |
| bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes; |
| } else { |
| bytes = sharedData.mbcs.fromUnicodeBytes; |
| } |
| |
| // asciiRoundtrips = sharedData.mbcs.asciiRoundtrips; |
| |
| /* get the converter state from UConverter */ |
| c = fromUChar32; |
| |
| if (outputType == MBCS_OUTPUT_2_SISO) { |
| prevLength = fromUnicodeStatus; |
| if (prevLength == 0) { |
| /* set the real value */ |
| prevLength = 1; |
| } |
| } else { |
| /* prevent fromUnicodeStatus from being set to something non-0 */ |
| prevLength = 0; |
| } |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| prevSourceIndex = -1; |
| sourceIndex = c == 0 ? 0 : -1; |
| nextSourceIndex = 0; |
| |
| /* Get the SI/SO character for the converter */ |
| si_value_length = getSISOBytes(SISO_Option.SI, options, si_value); |
| so_value_length = getSISOBytes(SISO_Option.SO, options, so_value); |
| |
| /* conversion loop */ |
| /* |
| * This is another piece of ugly code: A goto into the loop if the converter state contains a first |
| * surrogate from the previous function call. It saves me to check in each loop iteration a check of |
| * if(c==0) and duplicating the trail-surrogate-handling code in the else branch of that check. I could |
| * not find any other way to get around this other than using a function call for the conversion and |
| * callback, which would be even more inefficient. |
| * |
| * Markus Scherer 2000-jul-19 |
| */ |
| boolean doloop = true; |
| boolean doread = true; |
| if (c != 0 && target.hasRemaining()) { |
| if (UTF16.isLeadSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) { |
| // c is a lead surrogate, read another input |
| SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex, |
| prevSourceIndex, prevLength); |
| doloop = getTrail(source, target, uniMask, x, flush, cr); |
| doread = x.doread; |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| prevSourceIndex = x.prevSourceIndex; |
| prevLength = x.prevLength; |
| } else { |
| // c is not a lead surrogate, do not read another input |
| doread = false; |
| } |
| } |
| |
| if (doloop) { |
| while (!doread || sourceArrayIndex < source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. It does not catch |
| * output of more than one byte that overflows as a result of a multi-byte character or callback |
| * output from the last source character. Therefore, those situations also test for overflows |
| * and will then break the loop, too. |
| */ |
| if (target.hasRemaining()) { |
| /* |
| * Get a correct Unicode code point: a single UChar for a BMP code point or a matched |
| * surrogate pair for a "supplementary code point". |
| */ |
| |
| if (doread) { |
| // doread might be false only on the first looping |
| |
| c = source.get(sourceArrayIndex++); |
| ++nextSourceIndex; |
| |
| /* |
| * This also tests if the codepage maps single surrogates. If it does, then surrogates |
| * are not paired but mapped separately. Note that in this case unmatched surrogates are |
| * not detected. |
| */ |
| if (UTF16.isSurrogate((char) c) |
| && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) { |
| if (UTF16.isLeadSurrogate((char) c)) { |
| // getTrail: |
| SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, |
| nextSourceIndex, prevSourceIndex, prevLength); |
| doloop = getTrail(source, target, uniMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| prevSourceIndex = x.prevSourceIndex; |
| |
| if (x.doread) { |
| if (doloop) |
| continue; |
| else |
| break; |
| } |
| } else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| break; |
| } |
| } |
| } else { |
| doread = true; |
| } |
| /* convert the Unicode code point in c into codepage bytes */ |
| |
| /* |
| * The basic lookup is a triple-stage compact array (trie) lookup. For details see the |
| * beginning of this file. |
| * |
| * Single-byte codepages are handled with a different data structure by _MBCSSingle... |
| * functions. |
| * |
| * The result consists of a 32-bit value from stage 2 and a pointer to as many bytes as are |
| * stored per character. The pointer points to the character's bytes in stage 3. Bits 15..0 |
| * of the stage 2 entry contain the stage 3 index for that pointer, while bits 31..16 are |
| * flags for which of the 16 characters in the block are roundtrip-assigned. |
| * |
| * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t respectively as |
| * uint32_t, in the platform encoding. For 3-byte codepages, the bytes are always stored in |
| * big-endian order. |
| * |
| * For EUC encodings that use only either 0x8e or 0x8f as the first byte of their longest |
| * byte sequences, the first two bytes in this third stage indicate with their 7th bits |
| * whether these bytes are to be written directly or actually need to be preceeded by one of |
| * the two Single-Shift codes. With this, the third stage stores one byte fewer per |
| * character than the actual maximum length of EUC byte sequences. |
| * |
| * Other than that, leading zero bytes are removed and the other bytes output. A single zero |
| * byte may be output if the "assigned" bit in stage 2 was on. The data structure does not |
| * support zero byte output as a fallback, and also does not allow output of leading zeros. |
| */ |
| stage2Entry = MBCS_STAGE_2_FROM_U(table, c); |
| |
| /* get the bytes and the length for the output */ |
| switch (outputType) { |
| /* This is handled above with the method cnvMBCSDoubleFromUnicodeWithOffsets() */ |
| /* case MBCS_OUTPUT_2: |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) { |
| length = 1; |
| } else { |
| length = 2; |
| } |
| break; */ |
| case MBCS_OUTPUT_2_SISO: |
| /* 1/2-byte stateful with Shift-In/Shift-Out */ |
| /* |
| * Save the old state in the converter object right here, then change the local |
| * prevLength state variable if necessary. Then, if this character turns out to be |
| * unassigned or a fallback that is not taken, the callback code must not save the new |
| * state in the converter because the new state is for a character that is not output. |
| * However, the callback must still restore the state from the converter in case the |
| * callback function changed it for its output. |
| */ |
| fromUnicodeStatus = prevLength; /* save the old state */ |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) { |
| if (value == 0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) == false) { |
| /* no mapping, leave value==0 */ |
| length = 0; |
| } else if (prevLength <= 1) { |
| length = 1; |
| } else { |
| /* change from double-byte mode to single-byte */ |
| if (si_value_length == 1) { |
| value|=si_value[0]<<8; |
| length = 2; |
| } else if (si_value_length == 2) { |
| value|=si_value[1]<<8; |
| value|=si_value[0]<<16; |
| length = 3; |
| } |
| prevLength = 1; |
| } |
| } else { |
| if (prevLength == 2) { |
| length = 2; |
| } else { |
| /* change from single-byte mode to double-byte */ |
| if (so_value_length == 1) { |
| value|=so_value[0]<<16; |
| length = 3; |
| } else if (so_value_length == 2) { |
| value|=so_value[1]<<16; |
| value|=so_value[0]<<24; |
| length = 4; |
| } |
| prevLength = 2; |
| } |
| } |
| break; |
| case MBCS_OUTPUT_DBCS_ONLY: |
| /* table with single-byte results, but only DBCS mappings used */ |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) { |
| /* no mapping or SBCS result, not taken for DBCS-only */ |
| value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */ |
| length = 0; |
| } else { |
| length = 2; |
| } |
| break; |
| case MBCS_OUTPUT_3: |
| pArray = bytes; |
| pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); |
| value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16) |
| | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) |
| | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK); |
| if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) { |
| length = 1; |
| } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) { |
| length = 2; |
| } else { |
| length = 3; |
| } |
| break; |
| case MBCS_OUTPUT_4: |
| value = MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); |
| if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) { |
| length = 1; |
| } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) { |
| length = 2; |
| } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffffff) { |
| length = 3; |
| } else { |
| length = 4; |
| } |
| break; |
| case MBCS_OUTPUT_3_EUC: |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| /* EUC 16-bit fixed-length representation */ |
| if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) { |
| length = 1; |
| } else if ((value & 0x8000) == 0) { |
| value |= 0x8e8000; |
| length = 3; |
| } else if ((value & 0x80) == 0) { |
| value |= 0x8f0080; |
| length = 3; |
| } else { |
| length = 2; |
| } |
| break; |
| case MBCS_OUTPUT_4_EUC: |
| pArray = bytes; |
| pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); |
| value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16) |
| | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) |
| | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK); |
| /* EUC 16-bit fixed-length representation applied to the first two bytes */ |
| if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) { |
| length = 1; |
| } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) { |
| length = 2; |
| } else if ((value & 0x800000) == 0) { |
| value |= 0x8e800000; |
| length = 4; |
| } else if ((value & 0x8000) == 0) { |
| value |= 0x8f008000; |
| length = 4; |
| } else { |
| length = 3; |
| } |
| break; |
| default: |
| /* must not occur */ |
| /* |
| * To avoid compiler warnings that value & length may be used without having been |
| * initialized, we set them here. In reality, this is unreachable code. Not having a |
| * default branch also causes warnings with some compilers. |
| */ |
| value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */ |
| length = 0; |
| break; |
| } |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if (gotoUnassigned || (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0)))) { |
| gotoUnassigned = false; |
| /* |
| * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way |
| * with this data structure for fallback output to be a zero byte. |
| */ |
| |
| // unassigned: |
| SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex, |
| prevSourceIndex, prevLength); |
| doloop = unassigned(source, target, offsets, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| prevSourceIndex = x.prevSourceIndex; |
| prevLength = x.prevLength; |
| if (doloop) |
| continue; |
| else |
| break; |
| } |
| |
| /* write the output character bytes from value and length */ |
| /* from the first if in the loop we know that targetCapacity>0 */ |
| if (length <= target.remaining()) { |
| switch (length) { |
| /* each branch falls through to the next one */ |
| case 4: |
| target.put((byte) (value >>> 24)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| case 3: |
| target.put((byte) (value >>> 16)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| case 2: |
| target.put((byte) (value >>> 8)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| case 1: |
| target.put((byte) value); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| default: |
| /* will never occur */ |
| break; |
| } |
| } else { |
| int errorBufferArrayIndex; |
| |
| /* |
| * We actually do this backwards here: In order to save an intermediate variable, we |
| * output first to the overflow buffer what does not fit into the regular target. |
| */ |
| /* we know that 1<=targetCapacity<length<=4 */ |
| length -= target.remaining(); |
| |
| errorBufferArrayIndex = 0; |
| switch (length) { |
| /* each branch falls through to the next one */ |
| case 3: |
| errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 16); |
| case 2: |
| errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 8); |
| case 1: |
| errorBuffer[errorBufferArrayIndex] = (byte) value; |
| default: |
| /* will never occur */ |
| break; |
| } |
| errorBufferLength = (byte) length; |
| |
| /* now output what fits into the regular target */ |
| value >>>= 8 * length; /* length was reduced by targetCapacity */ |
| switch (target.remaining()) { |
| /* each branch falls through to the next one */ |
| case 3: |
| target.put((byte) (value >>> 16)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| case 2: |
| target.put((byte) (value >>> 8)); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| case 1: |
| target.put((byte) value); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| default: |
| /* will never occur */ |
| break; |
| } |
| |
| /* target overflow */ |
| cr[0] = CoderResult.OVERFLOW; |
| c = 0; |
| break; |
| } |
| |
| /* normal end of conversion: prepare for a new character */ |
| c = 0; |
| if (offsets != null) { |
| prevSourceIndex = sourceIndex; |
| sourceIndex = nextSourceIndex; |
| } |
| continue; |
| } else { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| } |
| |
| /* |
| * the end of the input stream and detection of truncated input are handled by the framework, but for |
| * EBCDIC_STATEFUL conversion we need to emit an SI at the very end |
| * |
| * conditions: successful EBCDIC_STATEFUL in DBCS mode end of input and no truncated input |
| */ |
| if (outputType == MBCS_OUTPUT_2_SISO && prevLength == 2 && flush && sourceArrayIndex >= source.limit() |
| && c == 0) { |
| |
| /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ |
| if (target.hasRemaining()) { |
| target.put(si_value[0]); |
| if (si_value_length == 2) { |
| if (target.remaining() > 0) { |
| target.put(si_value[1]); |
| } else { |
| errorBuffer[0] = si_value[1]; |
| errorBufferLength = 1; |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| } |
| if (offsets != null) { |
| /* set the last source character's index (sourceIndex points at sourceLimit now) */ |
| offsets.put(prevSourceIndex); |
| } |
| } else { |
| /* target is full */ |
| errorBuffer[0] = si_value[0]; |
| if (si_value_length == 2) { |
| errorBuffer[1] = si_value[1]; |
| } |
| errorBufferLength = si_value_length; |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| prevLength = 1; /* we switched into SBCS */ |
| } |
| |
| /* set the converter state back into UConverter */ |
| fromUChar32 = c; |
| fromUnicodeStatus = prevLength; |
| |
| source.position(sourceArrayIndex); |
| } catch (BufferOverflowException ex) { |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| |
| return cr[0]; |
| } |
| |
| /* |
| * This is another simple conversion function for internal use by other conversion implementations. It does not |
| * use the converter state nor call callbacks. It does not handle the EBCDIC swaplfnl option (set in |
| * UConverter). It handles conversion extensions but not GB 18030. |
| * |
| * It converts one single Unicode code point into codepage bytes, encoded as one 32-bit value. The function |
| * returns the number of bytes in *pValue: 1..4 the number of bytes in *pValue 0 unassigned (*pValue undefined) |
| * -1 illegal (currently not used, *pValue undefined) |
| * |
| * *pValue will contain the resulting bytes with the last byte in bits 7..0, the second to last byte in bits |
| * 15..8, etc. Currently, the function assumes but does not check that 0<=c<=0x10ffff. |
| */ |
| int fromUChar32(int c, int[] pValue, boolean isUseFallback) { |
| // #if 0 |
| // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ |
| // const uint8_t *p; |
| // #endif |
| |
| char[] table; |
| int stage2Entry; |
| int value; |
| int length; |
| int p; |
| |
| /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| if (c <= 0xffff || ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0)) { |
| table = sharedData.mbcs.fromUnicodeTable; |
| |
| /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ |
| if (sharedData.mbcs.outputType == MBCS_OUTPUT_1) { |
| value = MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeBytes, c); |
| /* is this code point assigned, or do we use fallbacks? */ |
| if (isUseFallback ? value >= 0x800 : value >= 0xc00) { |
| pValue[0] = value & 0xff; |
| return 1; |
| } |
| } else /* outputType!=MBCS_OUTPUT_1 */{ |
| stage2Entry = MBCS_STAGE_2_FROM_U(table, c); |
| |
| /* get the bytes and the length for the output */ |
| switch (sharedData.mbcs.outputType) { |
| case MBCS_OUTPUT_2: |
| value = MBCS_VALUE_2_FROM_STAGE_2(sharedData.mbcs.fromUnicodeBytes, stage2Entry, c); |
| if (value <= 0xff) { |
| length = 1; |
| } else { |
| length = 2; |
| } |
| break; |
| // #if 0 |
| // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ |
| // case MBCS_OUTPUT_DBCS_ONLY: |
| // /* table with single-byte results, but only DBCS mappings used */ |
| // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
| // if(value<=0xff) { |
| // /* no mapping or SBCS result, not taken for DBCS-only */ |
| // value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ |
| // length=0; |
| // } else { |
| // length=2; |
| // } |
| // break; |
| case MBCS_OUTPUT_3: |
| byte[] bytes = sharedData.mbcs.fromUnicodeBytes; |
| p = CharsetMBCS.MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); |
| value = ((bytes[p] & UConverterConstants.UNSIGNED_BYTE_MASK)<<16) | |
| ((bytes[p+1] & UConverterConstants.UNSIGNED_BYTE_MASK)<<8) | |
| (bytes[p+2] & UConverterConstants.UNSIGNED_BYTE_MASK); |
| if (value <= 0xff) { |
| length = 1; |
| } else if (value <= 0xffff) { |
| length = 2; |
| } else { |
| length = 3; |
| } |
| break; |
| // case MBCS_OUTPUT_4: |
| // value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
| // if(value<=0xff) { |
| // length=1; |
| // } else if(value<=0xffff) { |
| // length=2; |
| // } else if(value<=0xffffff) { |
| // length=3; |
| // } else { |
| // length=4; |
| // } |
| // break; |
| // case MBCS_OUTPUT_3_EUC: |
| // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
| // /* EUC 16-bit fixed-length representation */ |
| // if(value<=0xff) { |
| // length=1; |
| // } else if((value&0x8000)==0) { |
| // value|=0x8e8000; |
| // length=3; |
| // } else if((value&0x80)==0) { |
| // value|=0x8f0080; |
| // length=3; |
| // } else { |
| // length=2; |
| // } |
| // break; |
| // case MBCS_OUTPUT_4_EUC: |
| // p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
| // value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; |
| // /* EUC 16-bit fixed-length representation applied to the first two bytes */ |
| // if(value<=0xff) { |
| // length=1; |
| // } else if(value<=0xffff) { |
| // length=2; |
| // } else if((value&0x800000)==0) { |
| // value|=0x8e800000; |
| // length=4; |
| // } else if((value&0x8000)==0) { |
| // value|=0x8f008000; |
| // length=4; |
| // } else { |
| // length=3; |
| // } |
| // break; |
| // #endif |
| default: |
| /* must not occur */ |
| return -1; |
| } |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if (MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) |
| || (CharsetEncoderICU.isFromUUseFallback(isUseFallback, c) && value != 0)) { |
| /* |
| * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way with |
| * this data structure for fallback output to be a zero byte. |
| */ |
| /* assigned */ |
| pValue[0] = value; |
| return length; |
| } |
| } |
| } |
| |
| if (sharedData.mbcs.extIndexes != null) { |
| length = simpleMatchFromU(c, pValue, isUseFallback); |
| return length >= 0 ? length : -length; /* return abs(length); */ |
| } |
| |
| /* unassigned */ |
| return 0; |
| } |
| |
| /* |
| * continue partial match with new input, requires cnv->preFromUFirstCP>=0 never called for simple, |
| * single-character conversion |
| */ |
| private CoderResult continueMatchFromU(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush, |
| int srcIndex) { |
| CoderResult cr = CoderResult.UNDERFLOW; |
| int[] value = new int[1]; |
| int match; |
| |
| match = matchFromU(preFromUFirstCP, preFromUArray, preFromUBegin, preFromULength, source, value, useFallback, flush); |
| if (match >= 2) { |
| match -= 2; /* remove 2 for the initial code point */ |
| |
| if (match >= preFromULength) { |
| /* advance src pointer for the consumed input */ |
| source.position(source.position() + match - preFromULength); |
| preFromULength = 0; |
| } else { |
| /* the match did not use all of preFromU[] - keep the rest for replay */ |
| int length = preFromULength - match; |
| System.arraycopy(preFromUArray, preFromUBegin + match, preFromUArray, preFromUBegin, length); |
| preFromULength = (byte) -length; |
| } |
| |
| /* finish the partial match */ |
| preFromUFirstCP = UConverterConstants.U_SENTINEL; |
| |
| /* write result */ |
| writeFromU(value[0], target, offsets, srcIndex); |
| } else if (match < 0) { |
| /* save state for partial match */ |
| int sArrayIndex; |
| int j; |
| |
| /* just _append_ the newly consumed input to preFromU[] */ |
| sArrayIndex = source.position(); |
| match = -match - 2; /* remove 2 for the initial code point */ |
| for (j = preFromULength; j < match; ++j) { |
| preFromUArray[j] = source.get(sArrayIndex++); |
| } |
| source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ |
| preFromULength = (byte) match; |
| } else { /* match==0 or 1 */ |
| /* |
| * no match |
| * |
| * We need to split the previous input into two parts: |
| * |
| * 1. The first code point is unmappable - that's how we got into trying the extension data in the first |
| * place. We need to move it from the preFromU buffer to the error buffer, set an error code, and |
| * prepare the rest of the previous input for 2. |
| * |
| * 2. The rest of the previous input must be converted once we come back from the callback for the first |
| * code point. At that time, we have to try again from scratch to convert these input characters. The |
| * replay will be handled by the ucnv.c conversion code. |
| */ |
| |
| if (match == 1) { |
| /* matched, no mapping but request for <subchar1> */ |
| useSubChar1 = true; |
| } |
| |
| /* move the first code point to the error field */ |
| fromUChar32 = preFromUFirstCP; |
| preFromUFirstCP = UConverterConstants.U_SENTINEL; |
| |
| /* mark preFromU for replay */ |
| preFromULength = (byte) -preFromULength; |
| |
| /* set the error code for unassigned */ |
| // TODO: figure out what the unmappable length really should be |
| cr = CoderResult.unmappableForLength(1); |
| } |
| return cr; |
| } |
| |
| /** |
| * @param cx |
| * pointer to extension data; if NULL, returns 0 |
| * @param firstCP |
| * the first code point before all the other UChars |
| * @param pre |
| * UChars that must match; !initialMatch: partial match with them |
| * @param preLength |
| * length of pre, >=0 |
| * @param src |
| * UChars that can be used to complete a match |
| * @param srcLength |
| * length of src, >=0 |
| * @param pMatchValue |
| * [out] output result value for the match from the data structure |
| * @param useFallback |
| * "use fallback" flag, usually from cnv->useFallback |
| * @param flush |
| * TRUE if the end of the input stream is reached |
| * @return >1: matched, return value=total match length (number of input units matched) 1: matched, no mapping |
| * but request for <subchar1> (only for the first code point) 0: no match <0: partial match, return |
| * value=negative total match length (partial matches are never returned for flush==TRUE) (partial |
| * matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) the matchLength is 2 if only |
| * firstCP matched, and >2 if firstCP and further code units matched |
| */ |
| // static int32_t ucnv_extMatchFromU(const int32_t *cx, UChar32 firstCP, const UChar *pre, int32_t preLength, |
| // const UChar *src, int32_t srcLength, uint32_t *pMatchValue, UBool useFallback, UBool flush) |
| private int matchFromU(int firstCP, char[] preArray, int preArrayBegin, int preLength, CharBuffer source, |
| int[] pMatchValue, boolean isUseFallback, boolean flush) { |
| ByteBuffer cx = sharedData.mbcs.extIndexes; |
| |
| CharBuffer stage12, stage3; |
| IntBuffer stage3b; |
| |
| CharBuffer fromUTableUChars, fromUSectionUChars; |
| IntBuffer fromUTableValues, fromUSectionValues; |
| |
| int value, matchValue; |
| int i, j, index, length, matchLength; |
| char c; |
| |
| if (cx == null) { |
| return 0; /* no extension data, no match */ |
| } |
| |
| /* trie lookup of firstCP */ |
| index = firstCP >>> 10; /* stage 1 index */ |
| if (index >= cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH)) { |
| return 0; /* the first code point is outside the trie */ |
| } |
| |
| stage12 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX, char.class); |
| stage3 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX, char.class); |
| index = FROM_U(stage12, stage3, index, firstCP); |
| |
| stage3b = (IntBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX, int.class); |
| value = stage3b.get(stage3b.position() + index); |
| if (value == 0) { |
| return 0; |
| } |
| |
| if (TO_U_IS_PARTIAL(value)) { |
| /* partial match, enter the loop below */ |
| index = FROM_U_GET_PARTIAL_INDEX(value); |
| |
| /* initialize */ |
| fromUTableUChars = (CharBuffer) ARRAY(cx, EXT_FROM_U_UCHARS_INDEX, char.class); |
| fromUTableValues = (IntBuffer) ARRAY(cx, EXT_FROM_U_VALUES_INDEX, int.class); |
| |
| matchValue = 0; |
| i = j = matchLength = 0; |
| |
| /* we must not remember fallback matches when not using fallbacks */ |
| |
| /* match input units until there is a full match or the input is consumed */ |
| for (;;) { |
| /* go to the next section */ |
| int oldpos = fromUTableUChars.position(); |
| fromUSectionUChars = ((CharBuffer) fromUTableUChars.position(index)).slice(); |
| fromUTableUChars.position(oldpos); |
| oldpos = fromUTableValues.position(); |
| fromUSectionValues = ((IntBuffer) fromUTableValues.position(index)).slice(); |
| fromUTableValues.position(oldpos); |
| |
| /* read first pair of the section */ |
| length = fromUSectionUChars.get(); |
| value = fromUSectionValues.get(); |
| if (value != 0 && (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP))) { |
| /* remember longest match so far */ |
| matchValue = value; |
| matchLength = 2 + i + j; |
| } |
| |
| /* match pre[] then src[] */ |
| if (i < preLength) { |
| c = preArray[preArrayBegin + i++]; |
| } else if (source != null && j < source.remaining()) { |
| c = source.get(source.position() + j++); |
| } else { |
| /* all input consumed, partial match */ |
| if (flush || (length = (i + j)) > MAX_UCHARS) { |
| /* |
| * end of the entire input stream, stop with the longest match so far or: partial match must |
| * not be longer than UCNV_EXT_MAX_UCHARS because it must fit into state buffers |
| */ |
| break; |
| } else { |
| /* continue with more input next time */ |
| return -(2 + length); |
| } |
| } |
| |
| /* search for the current UChar */ |
| index = findFromU(fromUSectionUChars, length, c); |
| if (index < 0) { |
| /* no match here, stop with the longest match so far */ |
| break; |
| } else { |
| value = fromUSectionValues.get(fromUSectionValues.position() + index); |
| if (FROM_U_IS_PARTIAL(value)) { |
| /* partial match, continue */ |
| index = FROM_U_GET_PARTIAL_INDEX(value); |
| } else { |
| if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) { |
| /* full match, stop with result */ |
| matchValue = value; |
| matchLength = 2 + i + j; |
| } else { |
| /* full match on fallback not taken, stop with the longest match so far */ |
| } |
| break; |
| } |
| } |
| } |
| |
| if (matchLength == 0) { |
| /* no match at all */ |
| return 0; |
| } |
| } else /* result from firstCP trie lookup */{ |
| if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) { |
| /* full match, stop with result */ |
| matchValue = value; |
| matchLength = 2; |
| } else { |
| /* fallback not taken */ |
| return 0; |
| } |
| } |
| |
| if ((matchValue & FROM_U_RESERVED_MASK) != 0) { |
| /* do not interpret values with reserved bits used, for forward compatibility */ |
| return 0; |
| } |
| |
| /* return result */ |
| if (matchValue == FROM_U_SUBCHAR1) { |
| return 1; /* assert matchLength==2 */ |
| } |
| |
| pMatchValue[0] = FROM_U_MASK_ROUNDTRIP(matchValue); |
| return matchLength; |
| } |
| |
| private int simpleMatchFromU(int cp, int[] pValue, boolean isUseFallback) { |
| int[] value = new int[1]; |
| int match; // signed |
| |
| /* try to match */ |
| match = matchFromU(cp, null, 0, 0, null, value, isUseFallback, true); |
| if (match >= 2) { |
| /* write result for simple, single-character conversion */ |
| int length; |
| boolean isRoundtrip; |
| |
| isRoundtrip = FROM_U_IS_ROUNDTRIP(value[0]); |
| length = FROM_U_GET_LENGTH(value[0]); |
| value[0] = FROM_U_GET_DATA(value[0]); |
| |
| if (length <= EXT_FROM_U_MAX_DIRECT_LENGTH) { |
| pValue[0] = value[0]; |
| return isRoundtrip ? length : -length; |
| // #if 0 /* not currently used */ |
| // } else if(length==4) { |
| // /* de-serialize a 4-byte result */ |
| // const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; |
| // *pValue= |
| // ((uint32_t)result[0]<<24)| |
| // ((uint32_t)result[1]<<16)| |
| // ((uint32_t)result[2]<<8)| |
| // result[3]; |
| // return isRoundtrip ? 4 : -4; |
| // #endif |
| } |
| } |
| |
| /* |
| * return no match because - match>1 && resultLength>4: result too long for simple conversion - match==1: no |
| * match found, <subchar1> preferred - match==0: no match found in the first place - match<0: partial |
| * match, not supported for simple conversion (and flush==TRUE) |
| */ |
| return 0; |
| } |
| |
| @SuppressWarnings("fallthrough") |
| private CoderResult writeFromU(int value, ByteBuffer target, IntBuffer offsets, int srcIndex) { |
| ByteBuffer cx = sharedData.mbcs.extIndexes; |
| |
| byte bufferArray[] = new byte[1 + MAX_BYTES]; |
| int bufferArrayIndex = 0; |
| byte[] resultArray; |
| int resultArrayIndex; |
| int length, prevLength; |
| |
| length = FROM_U_GET_LENGTH(value); |
| value = FROM_U_GET_DATA(value); |
| |
| /* output the result */ |
| if (length <= FROM_U_MAX_DIRECT_LENGTH) { |
| /* |
| * Generate a byte array and then write it below. This is not the fastest possible way, but it should be |
| * ok for extension mappings, and it is much simpler. Offset and overflow handling are only done once |
| * this way. |
| */ |
| int p = bufferArrayIndex + 1; /* reserve buffer[0] for shiftByte below */ |
| switch (length) { |
| case 3: |
| bufferArray[p++] = (byte) (value >>> 16); |
| case 2: |
| bufferArray[p++] = (byte) (value >>> 8); |
| case 1: |
| bufferArray[p++] = (byte) value; |
| default: |
| break; /* will never occur */ |
| } |
| resultArray = bufferArray; |
| resultArrayIndex = bufferArrayIndex + 1; |
| } else { |
| byte[] slice = new byte[length]; |
| |
| ByteBuffer bb = ((ByteBuffer) ARRAY(cx, EXT_FROM_U_BYTES_INDEX, byte.class)); |
| bb.position(value); |
| bb.get(slice, 0, slice.length); |
| |
| resultArray = slice; |
| resultArrayIndex = 0; |
| } |
| |
| /* with correct data we have length>0 */ |
| |
| if ((prevLength = fromUnicodeStatus) != 0) { |
| /* handle SI/SO stateful output */ |
| byte shiftByte; |
| |
| if (prevLength > 1 && length == 1) { |
| /* change from double-byte mode to single-byte */ |
| shiftByte = (byte) UConverterConstants.SI; |
| fromUnicodeStatus = 1; |
| } else if (prevLength == 1 && length > 1) { |
| /* change from single-byte mode to double-byte */ |
| shiftByte = (byte) UConverterConstants.SO; |
| fromUnicodeStatus = 2; |
| } else { |
| shiftByte = 0; |
| } |
| |
| if (shiftByte != 0) { |
| /* prepend the shift byte to the result bytes */ |
| bufferArray[0] = shiftByte; |
| if (resultArray != bufferArray || resultArrayIndex != bufferArrayIndex + 1) { |
| System.arraycopy(resultArray, resultArrayIndex, bufferArray, bufferArrayIndex + 1, length); |
| } |
| resultArray = bufferArray; |
| resultArrayIndex = bufferArrayIndex; |
| ++length; |
| } |
| } |
| |
| return fromUWriteBytes(this, resultArray, resultArrayIndex, length, target, offsets, srcIndex); |
| } |
| |
| /* |
| * @return if(U_FAILURE) return the code point for cnv->fromUChar32 else return 0 after output has been written |
| * to the target |
| */ |
| private int fromU(int cp_, CharBuffer source, ByteBuffer target, IntBuffer offsets, int sourceIndex, |
| int length, boolean flush, CoderResult[] cr) { |
| // ByteBuffer cx; |
| long cp = cp_ & UConverterConstants.UNSIGNED_INT_MASK; |
| |
| useSubChar1 = false; |
| |
| if (sharedData.mbcs.extIndexes != null |
| && initialMatchFromU((int) cp, source, target, offsets, sourceIndex, flush, cr)) { |
| return 0; /* an extension mapping handled the input */ |
| } |
| |
| /* GB 18030 */ |
| if ((options & MBCS_OPTION_GB18030) != 0) { |
| long[] range; |
| int i; |
| |
| for (i = 0; i < gb18030Ranges.length; ++i) { |
| range = gb18030Ranges[i]; |
| if (range[0] <= cp && cp <= range[1]) { |
| /* found the Unicode code point, output the four-byte sequence for it */ |
| long linear; |
| byte bytes[] = new byte[4]; |
| |
| /* get the linear value of the first GB 18030 code in this range */ |
| linear = range[2] - LINEAR_18030_BASE; |
| |
| /* add the offset from the beginning of the range */ |
| linear += (cp - range[0]); |
| |
| bytes[3] = (byte) (0x30 + linear % 10); |
| linear /= 10; |
| bytes[2] = (byte) (0x81 + linear % 126); |
| linear /= 126; |
| bytes[1] = (byte) (0x30 + linear % 10); |
| linear /= 10; |
| bytes[0] = (byte) (0x81 + linear); |
| |
| /* output this sequence */ |
| cr[0] = fromUWriteBytes(this, bytes, 0, 4, target, offsets, sourceIndex); |
| return 0; |
| } |
| } |
| } |
| |
| /* no mapping */ |
| cr[0] = CoderResult.unmappableForLength(length); |
| return (int) cp; |
| } |
| |
| /* |
| * target<targetLimit; set error code for overflow |
| */ |
| private boolean initialMatchFromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets, |
| int srcIndex, boolean flush, CoderResult[] cr) { |
| int[] value = new int[1]; |
| int match; |
| |
| /* try to match */ |
| match = matchFromU(cp, null, 0, 0, source, value, useFallback, flush); |
| |
| /* reject a match if the result is a single byte for DBCS-only */ |
| if (match >= 2 |
| && !(FROM_U_GET_LENGTH(value[0]) == 1 && sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY)) { |
| /* advance src pointer for the consumed input */ |
| source.position(source.position() + match - 2); /* remove 2 for the initial code point */ |
| |
| /* write result to target */ |
| cr[0] = writeFromU(value[0], target, offsets, srcIndex); |
| return true; |
| } else if (match < 0) { |
| /* save state for partial match */ |
| int sArrayIndex; |
| int j; |
| |
| /* copy the first code point */ |
| preFromUFirstCP = cp; |
| |
| /* now copy the newly consumed input */ |
| sArrayIndex = source.position(); |
| match = -match - 2; /* remove 2 for the initial code point */ |
| for (j = 0; j < match; ++j) { |
| preFromUArray[j] = source.get(sArrayIndex++); |
| } |
| source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ |
| preFromULength = (byte) match; |
| return true; |
| } else if (match == 1) { |
| /* matched, no mapping but request for <subchar1> */ |
| useSubChar1 = true; |
| return false; |
| } else /* match==0 no match */{ |
| return false; |
| } |
| } |
| |
| CoderResult cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { |
| // Just call encodeLoop to remove duplicate code. |
| return encodeLoop(source, target, offsets, flush); |
| } |
| |
| /* |
| * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages that map only to and from the |
| * BMP. In addition to single-byte/state optimizations, the offset calculations become much easier. |
| */ |
| private CoderResult cnvMBCSSingleFromBMPWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, |
| boolean flush) { |
| |
| CoderResult[] cr = { CoderResult.UNDERFLOW }; |
| |
| int sourceArrayIndex, lastSource; |
| int targetCapacity, length; |
| char[] table; |
| byte[] results; |
| |
| int c, sourceIndex; |
| char value, minValue; |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = source.position(); |
| targetCapacity = target.remaining(); |
| table = sharedData.mbcs.fromUnicodeTable; |
| |
| if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { |
| results = sharedData.mbcs.swapLFNLFromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes |
| // be a ByteBuffer so results can be a 16-bit view |
| // of it? |
| } else { |
| results = sharedData.mbcs.fromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes be a |
| // ByteBuffer so results can be a 16-bit view of it? |
| } |
| |
| if (useFallback) { |
| /* use all roundtrip and fallback results */ |
| minValue = 0x800; |
| } else { |
| /* use only roundtrips and fallbacks from private-use characters */ |
| minValue = 0xc00; |
| } |
| |
| /* get the converter state from UConverter */ |
| c = fromUChar32; |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = c == 0 ? 0 : -1; |
| lastSource = sourceArrayIndex; |
| |
| /* |
| * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the |
| * sourceLength and targetCapacity |
| */ |
| length = source.limit() - sourceArrayIndex; |
| if (length < targetCapacity) { |
| targetCapacity = length; |
| } |
| |
| boolean doloop = true; |
| if (c != 0 && targetCapacity > 0) { |
| SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex); |
| doloop = getTrailSingleBMP(source, x, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| } |
| |
| if (doloop) { |
| while (targetCapacity > 0) { |
| /* |
| * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate pair |
| * for a "supplementary code point". |
| */ |
| c = source.get(sourceArrayIndex++); |
| /* |
| * Do not immediately check for single surrogates: Assume that they are unassigned and check for |
| * them in that case. This speeds up the conversion of assigned characters. |
| */ |
| /* convert the Unicode code point in c into codepage bytes */ |
| value = MBCS_SINGLE_RESULT_FROM_U(table, results, c); |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if (value >= minValue) { |
| /* assigned, write the output character bytes from value and length */ |
| /* length==1 */ |
| /* this is easy because we know that there is enough space */ |
| target.put((byte) value); |
| --targetCapacity; |
| |
| /* normal end of conversion: prepare for a new character */ |
| c = 0; |
| continue; |
| } else if (!UTF16.isSurrogate((char) c)) { |
| /* normal, unassigned BMP character */ |
| } else if (UTF16.isLeadSurrogate((char) c)) { |
| // getTrail: |
| SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex); |
| doloop = getTrailSingleBMP(source, x, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| if (!doloop) |
| break; |
| } else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| break; |
| } |
| |
| /* c does not have a mapping */ |
| |
| /* get the number of code units for c to correctly advance sourceIndex */ |
| length = UTF16.getCharCount(c); |
| |
| /* set offsets since the start or the last extension */ |
| if (offsets != null) { |
| int count = sourceArrayIndex - lastSource; |
| |
| /* do not set the offset for this character */ |
| count -= length; |
| |
| while (count > 0) { |
| offsets.put(sourceIndex++); |
| --count; |
| } |
| /* offsets and sourceIndex are now set for the current character */ |
| } |
| |
| /* try an extension mapping */ |
| lastSource = sourceArrayIndex; |
| source.position(sourceArrayIndex); |
| c = fromU(c, source, target, offsets, sourceIndex, length, flush, cr); |
| sourceArrayIndex = source.position(); |
| sourceIndex += length + (sourceArrayIndex - lastSource); |
| lastSource = sourceArrayIndex; |
| |
| if (cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| break; |
| } else { |
| /* a mapping was written to the target, continue */ |
| |
| /* recalculate the targetCapacity after an extension mapping */ |
| targetCapacity = target.remaining(); |
| length = source.limit() - sourceArrayIndex; |
| if (length < targetCapacity) { |
| targetCapacity = length; |
| } |
| } |
| } |
| } |
| |
| if (sourceArrayIndex < source.limit() && !target.hasRemaining()) { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| |
| /* set offsets since the start or the last callback */ |
| if (offsets != null) { |
| int count = sourceArrayIndex - lastSource; |
| while (count > 0) { |
| offsets.put(sourceIndex++); |
| --count; |
| } |
| } |
| |
| /* set the converter state back into UConverter */ |
| fromUChar32 = c; |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ |
| private CoderResult cnvMBCSSingleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, |
| IntBuffer offsets, boolean flush) { |
| |
| CoderResult[] cr = { CoderResult.UNDERFLOW }; |
| |
| int sourceArrayIndex; |
| |
| char[] table; |
| byte[] results; // agljport:comment results is used to to get 16-bit values out of byte[] array |
| |
| int c; |
| int sourceIndex, nextSourceIndex; |
| |
| char value, minValue; |
| |
| /* set up the local pointers */ |
| short uniMask; |
| sourceArrayIndex = source.position(); |
| |
| table = sharedData.mbcs.fromUnicodeTable; |
| |
| if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { |
| results = sharedData.mbcs.swapLFNLFromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes |
| // be a ByteBuffer so results can be a 16-bit view |
| // of it? |
| } else { |
| results = sharedData.mbcs.fromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes be a |
| // ByteBuffer so results can be a 16-bit view of it? |
| } |
| |
| if (useFallback) { |
| /* use all roundtrip and fallback results */ |
| minValue = 0x800; |
| } else { |
| /* use only roundtrips and fallbacks from private-use characters */ |
| minValue = 0xc00; |
| } |
| // agljport:comment hasSupplementary only used in getTrail block which now simply repeats the mask operation |
| uniMask = sharedData.mbcs.unicodeMask; |
| |
| /* get the converter state from UConverter */ |
| c = fromUChar32; |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = c == 0 ? 0 : -1; |
| nextSourceIndex = 0; |
| |
| boolean doloop = true; |
| boolean doread = true; |
| if (c != 0 && target.hasRemaining()) { |
| if (UTF16.isLeadSurrogate((char) c)) { |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); |
| doloop = getTrailDouble(source, target, uniMask, x, flush, cr); |
| doread = x.doread; |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| } else { |
| doread = false; |
| } |
| } |
| |
| if (doloop) { |
| while (!doread || sourceArrayIndex < source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. It does not catch |
| * output of more than one byte that overflows as a result of a multi-byte character or callback |
| * output from the last source character. Therefore, those situations also test for overflows and |
| * will then break the loop, too. |
| */ |
| if (target.hasRemaining()) { |
| /* |
| * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate |
| * pair for a "supplementary code point". |
| */ |
| |
| if (doread) { |
| c = source.get(sourceArrayIndex++); |
| ++nextSourceIndex; |
| if (UTF16.isSurrogate((char) c)) { |
| if (UTF16.isLeadSurrogate((char) c)) { |
| // getTrail: |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, |
| nextSourceIndex); |
| doloop = getTrailDouble(source, target, uniMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| if (x.doread) { |
| if (doloop) |
| continue; |
| else |
| break; |
| } |
| } else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| break; |
| } |
| } |
| } else { |
| doread = true; |
| } |
| |
| /* convert the Unicode code point in c into codepage bytes */ |
| value = MBCS_SINGLE_RESULT_FROM_U(table, results, c); |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if (value >= minValue) { |
| /* assigned, write the output character bytes from value and length */ |
| /* length==1 */ |
| /* this is easy because we know that there is enough space */ |
| target.put((byte) value); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| |
| /* normal end of conversion: prepare for a new character */ |
| c = 0; |
| sourceIndex = nextSourceIndex; |
| } else { /* unassigned */ |
| /* try an extension mapping */ |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, |
| nextSourceIndex); |
| doloop = unassignedDouble(source, target, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| if (!doloop) |
| break; |
| } |
| } else { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| } |
| |
| /* set the converter state back into UConverter */ |
| fromUChar32 = c; |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ |
| private CoderResult cnvMBCSDoubleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, |
| IntBuffer offsets, boolean flush) { |
| CoderResult[] cr = { CoderResult.UNDERFLOW }; |
| |
| int sourceArrayIndex; |
| |
| char[] table; |
| byte[] bytes; |
| |
| int c, sourceIndex, nextSourceIndex; |
| |
| int stage2Entry; |
| int value; |
| int length; |
| short uniMask; |
| |
| /* use optimized function if possible */ |
| uniMask = sharedData.mbcs.unicodeMask; |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = source.position(); |
| |
| table = sharedData.mbcs.fromUnicodeTable; |
| |
| if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { |
| bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes; |
| } else { |
| bytes = sharedData.mbcs.fromUnicodeBytes; |
| } |
| |
| /* get the converter state from UConverter */ |
| c = fromUChar32; |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = c == 0 ? 0 : -1; |
| nextSourceIndex = 0; |
| |
| /* conversion loop */ |
| boolean doloop = true; |
| boolean doread = true; |
| if (c != 0 && target.hasRemaining()) { |
| if (UTF16.isLeadSurrogate((char) c)) { |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); |
| doloop = getTrailDouble(source, target, uniMask, x, flush, cr); |
| doread = x.doread; |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| } else { |
| doread = false; |
| } |
| } |
| |
| if (doloop) { |
| while (!doread || sourceArrayIndex < source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. It does not catch |
| * output of more than one byte that overflows as a result of a multi-byte character or callback |
| * output from the last source character. Therefore, those situations also test for overflows and |
| * will then break the loop, too. |
| */ |
| if (target.hasRemaining()) { |
| if (doread) { |
| /* |
| * Get a correct Unicode code point: a single UChar for a BMP code point or a matched |
| * surrogate pair for a "supplementary code point". |
| */ |
| c = source.get(sourceArrayIndex++); |
| ++nextSourceIndex; |
| /* |
| * This also tests if the codepage maps single surrogates. If it does, then surrogates are |
| * not paired but mapped separately. Note that in this case unmatched surrogates are not |
| * detected. |
| */ |
| if (UTF16.isSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) { |
| if (UTF16.isLeadSurrogate((char) c)) { |
| // getTrail: |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, |
| nextSourceIndex); |
| doloop = getTrailDouble(source, target, uniMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| |
| if (x.doread) { |
| if (doloop) |
| continue; |
| else |
| break; |
| } |
| } else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| break; |
| } |
| } |
| } else { |
| doread = true; |
| } |
| |
| /* convert the Unicode code point in c into codepage bytes */ |
| stage2Entry = MBCS_STAGE_2_FROM_U(table, c); |
| |
| /* get the bytes and the length for the output */ |
| /* MBCS_OUTPUT_2 */ |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) { |
| length = 1; |
| } else { |
| length = 2; |
| } |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0))) { |
| /* |
| * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way |
| * with this data structure for fallback output to be a zero byte. |
| */ |
| |
| // unassigned: |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, |
| nextSourceIndex); |
| |
| doloop = unassignedDouble(source, target, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| if (doloop) |
| continue; |
| else |
| break; |
| } |
| |
| /* write the output character bytes from value and length */ |
| /* from the first if in the loop we know that targetCapacity>0 */ |
| if (length == 1) { |
| /* this is easy because we know that there is enough space */ |
| target.put((byte) value); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| } else /* length==2 */{ |
| target.put((byte) (value >>> 8)); |
| if (2 <= target.remaining()) { |
| target.put((byte) value); |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| offsets.put(sourceIndex); |
| } |
| } else { |
| if (offsets != null) { |
| offsets.put(sourceIndex); |
| } |
| errorBuffer[0] = (byte) value; |
| errorBufferLength = 1; |
| |
| /* target overflow */ |
| cr[0] = CoderResult.OVERFLOW; |
| c = 0; |
| break; |
| } |
| } |
| |
| /* normal end of conversion: prepare for a new character */ |
| c = 0; |
| sourceIndex = nextSourceIndex; |
| continue; |
| } else { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| } |
| |
| /* set the converter state back into UConverter */ |
| fromUChar32 = c; |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| private final class SideEffectsSingleBMP { |
| int c, sourceArrayIndex; |
| |
| SideEffectsSingleBMP(int c_, int sourceArrayIndex_) { |
| c = c_; |
| sourceArrayIndex = sourceArrayIndex_; |
| } |
| } |
| |
| // function made out of block labeled getTrail in ucnv_MBCSSingleFromUnicodeWithOffsets |
| // assumes input c is lead surrogate |
| private final boolean getTrailSingleBMP(CharBuffer source, SideEffectsSingleBMP x, CoderResult[] cr) { |
| if (x.sourceArrayIndex < source.limit()) { |
| /* test the following code unit */ |
| char trail = source.get(x.sourceArrayIndex); |
| if (UTF16.isTrailSurrogate(trail)) { |
| ++x.sourceArrayIndex; |
| x.c = UCharacter.getCodePoint((char) x.c, trail); |
| /* this codepage does not map supplementary code points */ |
| /* callback(unassigned) */ |
| cr[0] = CoderResult.unmappableForLength(2); |
| return false; |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| return false; |
| } |
| } else { |
| /* no more input */ |
| return false; |
| } |
| // return true; |
| } |
| |
| private final class SideEffects { |
| int c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength; |
| boolean doread = true; |
| |
| SideEffects(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_, int prevSourceIndex_, |
| int prevLength_) { |
| c = c_; |
| sourceArrayIndex = sourceArrayIndex_; |
| sourceIndex = sourceIndex_; |
| nextSourceIndex = nextSourceIndex_; |
| prevSourceIndex = prevSourceIndex_; |
| prevLength = prevLength_; |
| } |
| } |
| |
| // function made out of block labeled getTrail in ucnv_MBCSFromUnicodeWithOffsets |
| // assumes input c is lead surrogate |
| private final boolean getTrail(CharBuffer source, ByteBuffer target, int uniMask, SideEffects x, |
| boolean flush, CoderResult[] cr) { |
| if (x.sourceArrayIndex < source.limit()) { |
| /* test the following code unit */ |
| char trail = source.get(x.sourceArrayIndex); |
| if (UTF16.isTrailSurrogate(trail)) { |
| ++x.sourceArrayIndex; |
| ++x.nextSourceIndex; |
| /* convert this supplementary code point */ |
| x.c = UCharacter.getCodePoint((char) x.c, trail); |
| if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) { |
| /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| fromUnicodeStatus = x.prevLength; /* save the old state */ |
| /* callback(unassigned) */ |
| x.doread = true; |
| return unassigned(source, target, null, x, flush, cr); |
| } else { |
| x.doread = false; |
| return true; |
| } |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| return false; |
| } |
| } else { |
| /* no more input */ |
| return false; |
| } |
| } |
| |
| // function made out of block labeled unassigned in ucnv_MBCSFromUnicodeWithOffsets |
| private final boolean unassigned(CharBuffer source, ByteBuffer target, IntBuffer offsets, SideEffects x, |
| boolean flush, CoderResult[] cr) { |
| /* try an extension mapping */ |
| int sourceBegin = x.sourceArrayIndex; |
| source.position(x.sourceArrayIndex); |
| x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr); |
| x.sourceArrayIndex = source.position(); |
| x.nextSourceIndex += x.sourceArrayIndex - sourceBegin; |
| x.prevLength = fromUnicodeStatus; |
| |
| if (cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| return false; |
| } else { |
| /* a mapping was written to the target, continue */ |
| |
| /* recalculate the targetCapacity after an extension mapping */ |
| // x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex; |
| /* normal end of conversion: prepare for a new character */ |
| if (offsets != null) { |
| x.prevSourceIndex = x.sourceIndex; |
| x.sourceIndex = x.nextSourceIndex; |
| } |
| return true; |
| } |
| } |
| |
| private final class SideEffectsDouble { |
| int c, sourceArrayIndex, sourceIndex, nextSourceIndex; |
| boolean doread = true; |
| |
| SideEffectsDouble(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_) { |
| c = c_; |
| sourceArrayIndex = sourceArrayIndex_; |
| sourceIndex = sourceIndex_; |
| nextSourceIndex = nextSourceIndex_; |
| } |
| } |
| |
| // function made out of block labeled getTrail in ucnv_MBCSDoubleFromUnicodeWithOffsets |
| // assumes input c is lead surrogate |
| private final boolean getTrailDouble(CharBuffer source, ByteBuffer target, int uniMask, |
| SideEffectsDouble x, boolean flush, CoderResult[] cr) { |
| if (x.sourceArrayIndex < source.limit()) { |
| /* test the following code unit */ |
| char trail = source.get(x.sourceArrayIndex); |
| if (UTF16.isTrailSurrogate(trail)) { |
| ++x.sourceArrayIndex; |
| ++x.nextSourceIndex; |
| /* convert this supplementary code point */ |
| x.c = UCharacter.getCodePoint((char) x.c, trail); |
| if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) { |
| /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| /* callback(unassigned) */ |
| x.doread = true; |
| return unassignedDouble(source, target, x, flush, cr); |
| } else { |
| x.doread = false; |
| return true; |
| } |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| return false; |
| } |
| } else { |
| /* no more input */ |
| return false; |
| } |
| } |
| |
| // function made out of block labeled unassigned in ucnv_MBCSDoubleFromUnicodeWithOffsets |
| private final boolean unassignedDouble(CharBuffer source, ByteBuffer target, SideEffectsDouble x, |
| boolean flush, CoderResult[] cr) { |
| /* try an extension mapping */ |
| int sourceBegin = x.sourceArrayIndex; |
| source.position(x.sourceArrayIndex); |
| x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr); |
| x.sourceArrayIndex = source.position(); |
| x.nextSourceIndex += x.sourceArrayIndex - sourceBegin; |
| |
| if (cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| return false; |
| } else { |
| /* a mapping was written to the target, continue */ |
| |
| /* recalculate the targetCapacity after an extension mapping */ |
| // x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex; |
| /* normal end of conversion: prepare for a new character */ |
| x.sourceIndex = x.nextSourceIndex; |
| return true; |
| } |
| } |
| |
| /** |
| * Overrides super class method |
| * |
| * @param encoder |
| * @param source |
| * @param target |
| * @param offsets |
| * @return |
| */ |
| protected CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source, ByteBuffer target, |
| IntBuffer offsets) { |
| CharsetMBCS cs = (CharsetMBCS) encoder.charset(); |
| byte[] subchar; |
| int length; |
| |
| if (cs.subChar1 != 0 |
| && (cs.sharedData.mbcs.extIndexes != null ? encoder.useSubChar1 |
| : (encoder.invalidUCharBuffer[0] <= 0xff))) { |
| /* |
| * select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS |
| * behavior) |
| */ |
| subchar = new byte[] { cs.subChar1 }; |
| length = 1; |
| } else { |
| /* select subChar in all other cases */ |
| subchar = cs.subChar; |
| length = cs.subCharLen; |
| } |
| |
| /* reset the selector for the next code point */ |
| encoder.useSubChar1 = false; |
| |
| if (cs.sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) { |
| byte[] buffer = new byte[4]; |
| int i = 0; |
| |
| /* fromUnicodeStatus contains prevLength */ |
| switch (length) { |
| case 1: |
| if (encoder.fromUnicodeStatus == 2) { |
| /* DBCS mode and SBCS sub char: change to SBCS */ |
| encoder.fromUnicodeStatus = 1; |
| buffer[i++] = UConverterConstants.SI; |
| } |
| buffer[i++] = subchar[0]; |
| break; |
| case 2: |
| if (encoder.fromUnicodeStatus <= 1) { |
| /* SBCS mode and DBCS sub char: change to DBCS */ |
| encoder.fromUnicodeStatus = 2; |
| buffer[i++] = UConverterConstants.SO; |
| } |
| buffer[i++] = subchar[0]; |
| buffer[i++] = subchar[1]; |
| break; |
| default: |
| throw new IllegalArgumentException(); |
| } |
| |
| subchar = buffer; |
| length = i; |
| } |
| return CharsetEncoderICU.fromUWriteBytes(encoder, subchar, 0, length, target, offsets, source.position()); |
| } |
| |
| /** |
| * Gets called whenever CharsetEncoder.replaceWith gets called. allowReplacementChanges only allows subChar and |
| * subChar1 to be modified outside construction (since replaceWith is called once during construction). |
| * |
| * @param replacement |
| * The replacement for subchar. |
| */ |
| protected void implReplaceWith(byte[] replacement) { |
| if (allowReplacementChanges) { |
| CharsetMBCS cs = (CharsetMBCS) this.charset(); |
| |
| System.arraycopy(replacement, 0, cs.subChar, 0, replacement.length); |
| cs.subCharLen = (byte) replacement.length; |
| cs.subChar1 = 0; |
| } |
| } |
| } |
| |
| public CharsetDecoder newDecoder() { |
| return new CharsetDecoderMBCS(this); |
| } |
| |
| public CharsetEncoder newEncoder() { |
| return new CharsetEncoderMBCS(this); |
| } |
| |
| @SuppressWarnings("fallthrough") |
| void MBCSGetFilteredUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which, int filter){ |
| UConverterMBCSTable mbcsTable; |
| char[] table; |
| char st1,maxStage1, st2; |
| int st3; |
| int c ; |
| |
| mbcsTable = data.mbcs; |
| table = mbcsTable.fromUnicodeTable; |
| if((mbcsTable.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY)!=0){ |
| maxStage1 = 0x440; |
| } |
| else{ |
| maxStage1 = 0x40; |
| } |
| c=0; /* keep track of current code point while enumerating */ |
| |
| if(mbcsTable.outputType==MBCS_OUTPUT_1){ |
| char stage2, stage3; |
| char minValue; |
| CharBuffer results; |
| results = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer(); |
| |
| if(which==ROUNDTRIP_SET) { |
| /* use only roundtrips */ |
| minValue=0xf00; |
| } else { |
| /* use all roundtrip and fallback results */ |
| minValue=0x800; |
| } |
| for(st1=0;st1<maxStage1;++st1){ |
| st2 = table[st1]; |
| if(st2>maxStage1){ |
| stage2 = st2; |
| for(st2=0; st2<64; ++st2){ |
| st3 = table[stage2 + st2]; |
| if(st3!=0){ |
| /*read the stage 3 block */ |
| stage3 = (char)st3; |
| do { |
| if(results.get(stage3++)>=minValue){ |
| setFillIn.add(c); |
| } |
| |
| }while((++c&0xf) !=0); |
| } else { |
| c+= 16; /*empty stage 2 block */ |
| } |
| } |
| } else { |
| c+=1024; /* empty stage 2 block */ |
| } |
| } |
| } else { |
| int stage2,stage3; |
| byte[] bytes; |
| int st3Multiplier; |
| int value; |
| boolean useFallBack; |
| bytes = mbcsTable.fromUnicodeBytes; |
| useFallBack = (which == ROUNDTRIP_AND_FALLBACK_SET); |
| switch(mbcsTable.outputType) { |
| case MBCS_OUTPUT_3: |
| case MBCS_OUTPUT_4_EUC: |
| st3Multiplier = 3; |
| break; |
| case MBCS_OUTPUT_4: |
| st3Multiplier =4; |
| break; |
| default: |
| st3Multiplier =2; |
| break; |
| } |
| //ByteBuffer buffer = (ByteBuffer)charTobyte(table); |
| |
| for(st1=0;st1<maxStage1;++st1){ |
| st2 = table[st1]; |
| if(st2>(maxStage1>>1)){ |
| stage2 = st2 ; |
| for(st2=0;st2<128;++st2){ |
| /*read the stage 3 block */ |
| st3 = table[stage2*2 + st2]<<16; |
| st3+=table[stage2*2 + ++st2]; |
| if(st3!=0){ |
| //if((st3=table[stage2+st2])!=0){ |
| stage3 = st3Multiplier*16*(st3&UConverterConstants.UNSIGNED_SHORT_MASK); |
| |
| /* get the roundtrip flags for the stage 3 block */ |
| st3>>=16; |
| st3 &= UConverterConstants.UNSIGNED_SHORT_MASK; |
| switch(filter) { |
| case UCNV_SET_FILTER_NONE: |
| do { |
| |
| if((st3&1)!=0){ |
| setFillIn.add(c); |
| stage3+=st3Multiplier; |
| }else if (useFallBack) { |
| |
| char b =0; |
| switch(st3Multiplier) { |
| case 4 : |
| |
| b|= ByteBuffer.wrap(bytes).getChar(stage3++); |
| |
| case 3 : |
| |
| b|= ByteBuffer.wrap(bytes).getChar(stage3++); |
| |
| case 2 : |
| |
| b|= ByteBuffer.wrap(bytes).getChar(stage3) | ByteBuffer.wrap(bytes).getChar(stage3+1); |
| stage3+=2; |
| default: |
| break; |
| } |
| if(b!=0) { |
| setFillIn.add(c); |
| } |
| } |
| st3>>=1; |
| }while((++c&0xf)!=0); |
| break; |
| case UCNV_SET_FILTER_DBCS_ONLY: |
| /* Ignore single bytes results (<0x100). */ |
| do { |
| if(((st3&1) != 0 || useFallBack) && |
| (UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))) >= 0x100){ |
| setFillIn.add(c); |
| } |
| st3>>=1; |
| stage3+=2; |
| }while((++c&0xf) != 0); |
| break; |
| case UCNV_SET_FILTER_2022_CN : |
| /* only add code points that map to CNS 11643 planes 1&2 for non-EXT ISO-2202-CN. */ |
| do { |
| if(((st3&1) != 0 || useFallBack) && |
| ((value= (UConverterConstants.UNSIGNED_BYTE_MASK & (ByteBuffer.wrap(bytes).get(stage3))))==0x81 || value==0x82) ){ |
| setFillIn.add(c); |
| } |
| st3>>=1; |
| stage3+=3; |
| }while((++c&0xf)!=0); |
| break; |
| case UCNV_SET_FILTER_SJIS: |
| /* only add code points that map tp Shift-JIS codes corrosponding to JIS X 0280. */ |
| do{ |
| |
| if(((st3&1) != 0 || useFallBack) && (value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))>=0x8140 && value<=0xeffc){ |
| setFillIn.add(c); |
| } |
| st3>>=1; |
| stage3+=2; |
| }while((++c&0xf)!=0); |
| break; |
| case UCNV_SET_FILTER_GR94DBCS: |
| /* only add code points that maps to ISO 2022 GR 94 DBCS codes*/ |
| do { |
| if(((st3&1) != 0 || useFallBack) && |
| (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))- 0xa1a1))<=(0xfefe - 0xa1a1) && |
| (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){ |
| setFillIn.add(c); |
| } |
| st3>>=1; |
| stage3+=2; |
| }while((++c&0xf)!=0); |
| break; |
| case UCNV_SET_FILTER_HZ: |
| /*Only add code points that are suitable for HZ DBCS*/ |
| do { |
| if( ((st3&1) != 0 || useFallBack) && |
| (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))-0xa1a1))<=(0xfdfe - 0xa1a1) && |
| (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){ |
| setFillIn.add(c); |
| } |
| st3>>=1; |
| stage3+=2; |
| }while((++c&0xf) != 0); |
| break; |
| default: |
| return; |
| } |
| } else { |
| c+=16; /* empty stage 3 block */ |
| } |
| } |
| } else { |
| c+=1024; /*empty stage2 block */ |
| } |
| } |
| } |
| extGetUnicodeSet(setFillIn, which, filter, data); |
| } |
| |
| static void extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback, |
| int minLength, int c, char s[],int length,int sectionIndex){ |
| CharBuffer fromUSectionUChar; |
| IntBuffer fromUSectionValues; |
| fromUSectionUChar = (CharBuffer)ARRAY(cx, EXT_FROM_U_UCHARS_INDEX,char.class ); |
| fromUSectionValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX,int.class ); |
| int fromUSectionUCharIndex = fromUSectionUChar.position()+sectionIndex; |
| int fromUSectionValuesIndex = fromUSectionValues.position()+sectionIndex; |
| int value, i, count; |
| |
| /* read first pair of the section */ |
| count = fromUSectionUChar.get(fromUSectionUCharIndex++); |
| value = fromUSectionValues.get(fromUSectionValuesIndex++); |
| if(value!=0 && (FROM_U_IS_ROUNDTRIP(value) || useFallback) && FROM_U_GET_LENGTH(value)>=minLength) { |
| if(c>=0){ |
| setFillIn.add(c); |
| } else { |
| StringBuilder normalizedStringBuilder = new StringBuilder(); |
| for(int j=0; j<length;j++){ |
| normalizedStringBuilder.append(s[j]); |
| } |
| String normalizedString = normalizedStringBuilder.toString(); |
| for(int j=0;j<length;j++){ |
| setFillIn.add(normalizedString); |
| } |
| } |
| } |
| |
| for(i=0; i<count; ++i){ |
| s[length] = fromUSectionUChar.get(fromUSectionUCharIndex + i); |
| value = fromUSectionValues.get(fromUSectionValuesIndex + i); |
| |
| if(value==0) { |
| /* no mapping, do nothing */ |
| } else if (FROM_U_IS_PARTIAL(value)) { |
| extGetUnicodeSetString( cx, setFillIn, useFallback, minLength, UConverterConstants.U_SENTINEL, s, length+1, |
| FROM_U_GET_PARTIAL_INDEX(value)); |
| } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0:((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))==FROM_U_ROUNDTRIP_FLAG)) |
| && FROM_U_GET_LENGTH(value)>=minLength) { |
| StringBuilder normalizedStringBuilder = new StringBuilder(); // String for composite characters |
| for(int j=0; j<(length+1);j++){ |
| normalizedStringBuilder.append(s[j]); |
| } |
| setFillIn.add(normalizedStringBuilder.toString()); |
| } |
| } |
| |
| } |
| |
| |
| static void extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data){ |
| int st1, stage1Length, st2, st3, minLength; |
| int ps2, ps3; |
| |
| CharBuffer stage12, stage3; |
| int value, length; |
| IntBuffer stage3b; |
| boolean useFallback; |
| char s[] = new char[MAX_UCHARS]; |
| int c; |
| ByteBuffer cx = Data.mbcs.extIndexes; |
| if(cx == null){ |
| return; |
| } |
| stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX,char.class ); |
| stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX,char.class ); |
| stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX,int.class ); |
| |
| stage1Length = cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH); |
| useFallback = (which==ROUNDTRIP_AND_FALLBACK_SET); |
| |
| c = 0; |
| if(filter == UCNV_SET_FILTER_2022_CN) { |
| minLength = 3; |
| } else if (Data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY || filter != UCNV_SET_FILTER_NONE) { |
| /* DBCS-only, ignore single-byte results */ |
| minLength = 2; |
| } else { |
| minLength = 1; |
| } |
| |
| for(st1=0; st1< stage1Length; ++st1){ |
| st2 = stage12.get(st1); |
| if(st2>stage1Length) { |
| ps2 = st2; |
| for(st2=0;st2<64;++st2){ |
| st3=((int) stage12.get(ps2+st2))<<STAGE_2_LEFT_SHIFT; |
| if(st3!= 0){ |
| ps3 = st3; |
| do { |
| value = stage3b.get(UConverterConstants.UNSIGNED_SHORT_MASK&stage3.get(ps3++)); |
| if(value==0){ |
| /* no mapping do nothing */ |
| }else if (FROM_U_IS_PARTIAL(value)){ |
| length = 0; |
| length=UTF16.append(s, length, c); |
| extGetUnicodeSetString(cx,setFillIn,useFallback,minLength,c,s,length,FROM_U_GET_PARTIAL_INDEX(value)); |
| } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0 :((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))== FROM_U_ROUNDTRIP_FLAG)) && |
| FROM_U_GET_LENGTH(value)>=minLength){ |
| |
| switch(filter) { |
| case UCNV_SET_FILTER_2022_CN: |
| if(!(FROM_U_GET_LENGTH(value)==3 && FROM_U_GET_DATA(value)<=0x82ffff)){ |
| continue; |
| } |
| break; |
| case UCNV_SET_FILTER_SJIS: |
| if(!(FROM_U_GET_LENGTH(value)==2 && (value=FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)){ |
| continue; |
| } |
| break; |
| case UCNV_SET_FILTER_GR94DBCS: |
| if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfefe - 0xa1a1) |
| && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){ |
| |
| continue; |
| } |
| break; |
| case UCNV_SET_FILTER_HZ: |
| if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfdfe - 0xa1a1) |
| && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){ |
| continue; |
| } |
| break; |
| default: |
| /* |
| * UCNV_SET_FILTER_NONE, |
| * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength |
| */ |
| break; |
| } |
| setFillIn.add(c); |
| |
| } |
| }while((++c&0xf) != 0); |
| |
| } else { |
| c+=16; /* emplty stage3 block */ |
| } |
| } |
| } else { |
| c+=1024; /* empty stage 2 block*/ |
| } |
| } |
| } |
| |
| void MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which){ |
| MBCSGetFilteredUnicodeSetForUnicode(data, setFillIn, which, |
| this.sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? UCNV_SET_FILTER_DBCS_ONLY : UCNV_SET_FILTER_NONE ); |
| } |
| |
| void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ |
| if((options & MBCS_OPTION_GB18030)!=0){ |
| setFillIn.add(0, 0xd7ff); |
| setFillIn.add(0xe000, 0x10ffff); |
| } |
| else { |
| this.MBCSGetUnicodeSetForUnicode(sharedData, setFillIn, which); |
| } |
| } |
| |
| } |