| /** |
| ******************************************************************************* |
| * Copyright (C) 2006, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.charset; |
| |
| import java.io.BufferedInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.nio.Buffer; |
| import java.nio.BufferOverflowException; |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.IntBuffer; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| |
| import com.ibm.icu.charset.UConverterSharedData.UConverterType; |
| import com.ibm.icu.impl.ICUData; |
| import com.ibm.icu.impl.ICUResourceBundle; |
| import com.ibm.icu.impl.InvalidFormatException; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.text.UTF16; |
| |
| class CharsetMBCS extends CharsetICU { |
| |
| protected byte[] fromUSubstitution = null; |
| protected UConverterSharedData sharedData = null; |
| static final int MAX_VERSION_LENGTH=4; |
/**
 * A single toUnicode fallback mapping.
 * Fallbacks are kept apart from the regular state table and code point
 * structures, in an array of these items that is sorted by {@code offset}.
 */
final class MBCSToUFallback {
    /** Sort key of this fallback within the fallback array. */
    int offset;

    /** Code point stored for this fallback. */
    int codePoint;
}
| /** |
| * This is the MBCS part of the UConverterTable union (a runtime data structure). |
| * It keeps all the per-converter data and points into the loaded mapping tables. |
| */ |
| static final class UConverterMBCSTable { |
| /* toUnicode */ |
| short countStates; |
| byte dbcsOnlyState; |
| boolean stateTableOwned; |
| int countToUFallbacks; |
| |
| int stateTable[/*countStates*/][/*256*/]; |
| int swapLFNLStateTable[/*countStates*/][/*256*/]; /* for swaplfnl */ |
| char unicodeCodeUnits[/*countUnicodeResults*/]; |
| MBCSToUFallback toUFallbacks[/*countToUFallbacks*/]; |
| |
| /* fromUnicode */ |
| char fromUnicodeTable[]; |
| byte fromUnicodeBytes[]; |
| byte swapLFNLFromUnicodeBytes[]; /* for swaplfnl */ |
| int fromUBytesLength; |
| short outputType, unicodeMask; |
| |
| /* converter name for swaplfnl */ |
| String swapLFNLName; |
| |
| /* extension data */ |
| UConverterSharedData baseSharedData; |
| //int extIndexes[]; |
| ByteBuffer extIndexes; // create int[] view etc. as needed |
| |
| UConverterMBCSTable() |
| { |
| } |
| |
| /* UConverterMBCSTable(UConverterMBCSTable t) |
| { |
| countStates = t.countStates; |
| dbcsOnlyState = t.dbcsOnlyState; |
| stateTableOwned = t.stateTableOwned; |
| countToUFallbacks = t.countToUFallbacks; |
| stateTable = t.stateTable; |
| swapLFNLStateTable = t.swapLFNLStateTable; |
| unicodeCodeUnits = t.unicodeCodeUnits; |
| toUFallbacks = t.toUFallbacks; |
| fromUnicodeTable = t.fromUnicodeTable; |
| fromUnicodeBytes = t.fromUnicodeBytes; |
| swapLFNLFromUnicodeBytes = t.swapLFNLFromUnicodeBytes; |
| fromUBytesLength = t.fromUBytesLength; |
| outputType = t.outputType; |
| unicodeMask = t.unicodeMask; |
| swapLFNLName = t.swapLFNLName; |
| baseSharedData = t.baseSharedData; |
| extIndexes = t.extIndexes; |
| }*/ |
| } |
| |
| /** |
| * MBCS data header. See data format description above. |
| */ |
| final class MBCSHeader { |
| byte version[/*U_MAX_VERSION_LENGTH*/]; |
| int countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable, offsetFromUBytes; |
| int flags; |
| int fromUBytesLength; |
| |
| MBCSHeader() |
| { |
| version = new byte[MAX_VERSION_LENGTH]; |
| } |
| } |
| /** |
| * Tags for pacifying the check tags tool |
| * @draft ICU 3.6 |
| * @provisional This API might change or be removed in a future release. |
| */ |
| public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases) throws InvalidFormatException{ |
| super(icuCanonicalName, javaCanonicalName, aliases); |
| |
| // now try to load the data |
| LoadArguments args = new LoadArguments(1, icuCanonicalName); |
| sharedData = loadConverter(args); |
| |
| maxBytesPerChar = sharedData.staticData.maxBytesPerChar; |
| minBytesPerChar = sharedData.staticData.minBytesPerChar; |
| maxCharsPerByte = 1; |
| fromUSubstitution = sharedData.staticData.subChar; |
| subChar = sharedData.staticData.subChar; |
| subCharLen = sharedData.staticData.subCharLen; |
| subChar1 = sharedData.staticData.subChar1; |
| fromUSubstitution = new byte[sharedData.staticData.subCharLen]; |
| System.arraycopy(sharedData.staticData.subChar, 0, fromUSubstitution, 0, sharedData.staticData.subCharLen); |
| |
| // Todo: pass options |
| initializeConverter(0); |
| } |
| |
/**
 * Arguments for one {@code loadConverter} call.
 */
class LoadArguments {
    /** Nesting depth of loadConverter() calls: 1 for a top-level load, 2 for a base table loaded from an extension file. */
    int nestedLoads;

    /** Name of the converter data to load. */
    String name;

    LoadArguments(int nestedLoads, String name) {
        this.name = name;
        this.nestedLoads = nestedLoads;
    }
}
| |
| protected UConverterSharedData loadConverter(LoadArguments args) throws InvalidFormatException |
| { |
| // Read converter data from file |
| UConverterStaticData staticData = new UConverterStaticData(); |
| UConverterDataReader reader = null; |
| try { |
| InputStream i = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE + "/" + args.name + "." + UConverterSharedData.DATA_TYPE); |
| BufferedInputStream b = new BufferedInputStream(i, UConverterConstants.CNV_DATA_BUFFER_SIZE); |
| reader = new UConverterDataReader(b); |
| reader.readStaticData(staticData); |
| } |
| catch(IOException e) { |
| throw new InvalidFormatException(); |
| } |
| catch(Exception e) { |
| throw new InvalidFormatException(); |
| } |
| |
| UConverterSharedData data = null; |
| int type = staticData.conversionType; |
| |
| if( type != UConverterSharedData.UConverterType.MBCS || |
| staticData.structSize != UConverterSharedData.SIZE_OF_UCONVERTER_SHARED_DATA) |
| { |
| throw new InvalidFormatException(); |
| } |
| |
| data = new UConverterSharedData(UConverterSharedData.SIZE_OF_UCONVERTER_SHARED_DATA, 1, null, false, 0); |
| data.dataReader = reader; |
| data.staticData = staticData; |
| data.sharedDataCached = false; |
| |
| // Load data |
| UConverterMBCSTable mbcsTable = data.mbcs; |
| MBCSHeader header = new MBCSHeader(); |
| try { |
| reader.readMBCSHeader(header); |
| } |
| catch(IOException e) { |
| throw new InvalidFormatException(); |
| } |
| |
| int offset; |
| //int[] extIndexesArray = null; |
| String baseNameString = null; |
| int[][] stateTableArray = null; |
| MBCSToUFallback[] toUFallbacksArray = null; |
| char[] unicodeCodeUnitsArray = null; |
| char[] fromUnicodeTableArray = null; |
| byte[] fromUnicodeBytesArray = null; |
| |
| if(header.version[0]!=4) { |
| throw new InvalidFormatException(); |
| } |
| |
| mbcsTable.outputType=(byte)header.flags; |
| |
| /* extension data, header version 4.2 and higher */ |
| offset=header.flags>>>8; |
| //if(offset!=0 && mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { |
| if(mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { |
| try { |
| baseNameString = reader.readBaseTableName(); |
| if(offset != 0) { |
| //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read; |
| mbcsTable.extIndexes=reader.readExtIndexes(offset - 32 - baseNameString.length() - 1); |
| } |
| } |
| catch(IOException e) { |
| throw new InvalidFormatException(); |
| } |
| } |
| /* |
| if(offset != 0) { |
| try { |
| //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read; |
| int namelen = baseNameString != null? baseNameString.length() + 1: 0; |
| mbcsTable.extIndexes=dataReader.readExtIndexes(offset - 32 - namelen); |
| |
| } |
| catch(IOException e) { |
| if(debug) System.err.println("Caught IOException: " + e.getMessage()); |
| pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; |
| return; |
| } |
| } |
| */ |
| //agljport:add this would be unnecessary if extIndexes were memory mapped |
| if(mbcsTable.extIndexes != null) { |
| /* |
| try { |
| //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_LENGTH]*4 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_UCHARS_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_LENGTH]*6 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_BYTES_LENGTH] + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_12_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3B_LENGTH]*4; |
| //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_SIZE] |
| //byte[] extTables = dataReader.readExtTables(nbytes); |
| //mbcsTable.extTables = ByteBuffer.wrap(extTables); |
| } |
| catch(IOException e) { |
| System.err.println("Caught IOException: " + e.getMessage()); |
| pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; |
| return; |
| } |
| */ |
| } |
| |
| if(mbcsTable.outputType==MBCS_OUTPUT_EXT_ONLY) { |
| UConverterSharedData baseSharedData = null; |
| ByteBuffer extIndexes; |
| String baseName; |
| |
| /* extension-only file, load the base table and set values appropriately */ |
| if((extIndexes=mbcsTable.extIndexes)==null) { |
| /* extension-only file without extension */ |
| throw new InvalidFormatException(); |
| } |
| |
| if(args.nestedLoads!=1) { |
| /* an extension table must not be loaded as a base table */ |
| throw new InvalidFormatException(); |
| } |
| |
| /* load the base table */ |
| baseName=baseNameString; |
| if(baseName.equals(staticData.name)) { |
| /* forbid loading this same extension-only file */ |
| throw new InvalidFormatException(); |
| } |
| |
| /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */ |
| //agljport:fix args.size=sizeof(UConverterLoadArgs); |
| LoadArguments args2 = new LoadArguments(2, baseName); |
| baseSharedData=loadConverter(args2); |
| |
| if( baseSharedData.staticData.conversionType!=UConverterType.MBCS || |
| baseSharedData.mbcs.baseSharedData!=null |
| ) { |
| //agljport:fix ucnv_unload(baseSharedData); |
| throw new InvalidFormatException(); |
| } |
| |
| /* copy the base table data */ |
| //agljport:comment deep copy in C changes mbcs through local reference mbcsTable; in java we probably don't need the deep copy so can just make sure mbcs and its local reference both refer to the same new object |
| mbcsTable = data.mbcs = baseSharedData.mbcs; |
| |
| /* overwrite values with relevant ones for the extension converter */ |
| mbcsTable.baseSharedData=baseSharedData; |
| mbcsTable.extIndexes=extIndexes; |
| |
| /* |
| * It would be possible to share the swapLFNL data with a base converter, |
| * but the generated name would have to be different, and the memory |
| * would have to be free'd only once. |
| * It is easier to just create the data for the extension converter |
| * separately when it is requested. |
| */ |
| mbcsTable.swapLFNLStateTable=null; |
| mbcsTable.swapLFNLFromUnicodeBytes=null; |
| mbcsTable.swapLFNLName=null; |
| |
| /* |
| * Set a special, runtime-only outputType if the extension converter |
| * is a DBCS version of a base converter that also maps single bytes. |
| */ |
| if(staticData.conversionType==UConverterType.DBCS || |
| (staticData.conversionType==UConverterType.MBCS && staticData.minBytesPerChar>=2)){ |
| |
| if(baseSharedData.mbcs.outputType==MBCS_OUTPUT_2_SISO) { |
| /* the base converter is SI/SO-stateful */ |
| int entry; |
| |
| /* get the dbcs state from the state table entry for SO=0x0e */ |
| entry=mbcsTable.stateTable[0][0xe]; |
| if( MBCS_ENTRY_IS_FINAL(entry) && |
| MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && |
| MBCS_ENTRY_FINAL_STATE(entry)!=0 |
| ) { |
| mbcsTable.dbcsOnlyState=(byte)MBCS_ENTRY_FINAL_STATE(entry); |
| |
| mbcsTable.outputType=MBCS_OUTPUT_DBCS_ONLY; |
| } |
| } |
| else if(baseSharedData.staticData.conversionType==UConverterType.MBCS && |
| baseSharedData.staticData.minBytesPerChar==1 && |
| baseSharedData.staticData.maxBytesPerChar==2 && |
| mbcsTable.countStates<=127){ |
| |
| /* non-stateful base converter, need to modify the state table */ |
| int newStateTable[][/*256*/]; |
| int state[]; // this works because java 2-D array is array of references and we can have state = newStateTable[i]; |
| int i, count; |
| |
| /* allocate a new state table and copy the base state table contents */ |
| count=mbcsTable.countStates; |
| newStateTable=new int[(count+1)*1024][256]; |
| |
| for(i = 0; i < mbcsTable.stateTable.length; ++i) |
| System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, mbcsTable.stateTable[i].length); |
| |
| /* change all final single-byte entries to go to a new all-illegal state */ |
| state=newStateTable[0]; |
| for(i=0; i<256; ++i) { |
| if(MBCS_ENTRY_IS_FINAL(state[i])) { |
| state[i]=MBCS_ENTRY_TRANSITION(count, 0); |
| } |
| } |
| |
| /* build the new all-illegal state */ |
| state=newStateTable[count]; |
| for(i=0; i<256; ++i) { |
| state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); |
| } |
| mbcsTable.stateTable=newStateTable; |
| mbcsTable.countStates=(byte)(count+1); |
| mbcsTable.stateTableOwned=true; |
| |
| mbcsTable.outputType=MBCS_OUTPUT_DBCS_ONLY; |
| } |
| } |
| |
| /* |
| * unlike below for files with base tables, do not get the unicodeMask |
| * from the sharedData; instead, use the base table's unicodeMask, |
| * which we copied in the memcpy above; |
| * this is necessary because the static data unicodeMask, especially |
| * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data |
| */ |
| } |
| else { |
| /* conversion file with a base table; an additional extension table is optional */ |
| /* make sure that the output type is known */ |
| switch(mbcsTable.outputType) { |
| case MBCS_OUTPUT_1: |
| case MBCS_OUTPUT_2: |
| case MBCS_OUTPUT_3: |
| case MBCS_OUTPUT_4: |
| case MBCS_OUTPUT_3_EUC: |
| case MBCS_OUTPUT_4_EUC: |
| case MBCS_OUTPUT_2_SISO: |
| /* OK */ |
| break; |
| default: |
| throw new InvalidFormatException(); |
| } |
| |
| stateTableArray = new int[header.countStates][256]; |
| toUFallbacksArray = new MBCSToUFallback[header.countToUFallbacks]; |
| for(int i = 0; i < toUFallbacksArray.length; ++i) |
| toUFallbacksArray[i] = new MBCSToUFallback(); |
| unicodeCodeUnitsArray = new char[(header.offsetFromUTable - header.offsetToUCodeUnits)/2]; |
| fromUnicodeTableArray = new char[(header.offsetFromUBytes - header.offsetFromUTable)/2]; |
| fromUnicodeBytesArray = new byte[header.fromUBytesLength]; |
| try { |
| reader.readMBCSTable(stateTableArray, toUFallbacksArray, unicodeCodeUnitsArray, fromUnicodeTableArray, fromUnicodeBytesArray); |
| } |
| catch(IOException e) { |
| throw new InvalidFormatException(); |
| } |
| |
| mbcsTable.countStates=(byte)header.countStates; |
| mbcsTable.countToUFallbacks=header.countToUFallbacks; |
| mbcsTable.stateTable=stateTableArray; |
| mbcsTable.toUFallbacks=toUFallbacksArray; |
| mbcsTable.unicodeCodeUnits=unicodeCodeUnitsArray; |
| |
| mbcsTable.fromUnicodeTable=fromUnicodeTableArray; |
| mbcsTable.fromUnicodeBytes=fromUnicodeBytesArray; |
| mbcsTable.fromUBytesLength=header.fromUBytesLength; |
| |
| /* |
| * converter versions 6.1 and up contain a unicodeMask that is |
| * used here to select the most efficient function implementations |
| */ |
| //agljport:fix info.size=sizeof(UDataInfo); |
| //agljport:fix udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); |
| //agljport:fix if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { |
| /* mask off possible future extensions to be safe */ |
| mbcsTable.unicodeMask=(short)(staticData.unicodeMask&3); |
| //agljport:fix } else { |
| /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ |
| //agljport:fix mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; |
| //agljport:fix } |
| if(offset != 0) { |
| try { |
| //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read; |
| //int namelen = baseNameString != null? baseNameString.length() + 1: 0; |
| //mbcsTable.extIndexes=dataReader.readExtIndexes(offset - 32 - namelen); |
| mbcsTable.extIndexes=reader.readExtIndexes(0); |
| } |
| catch(IOException e) { |
| throw new InvalidFormatException(); |
| } |
| } |
| } |
| return data; |
| } |
| |
| protected void initializeConverter(int options) |
| { |
| UConverterMBCSTable mbcsTable; |
| ByteBuffer extIndexes; |
| short outputType; |
| byte maxBytesPerUChar; |
| |
| mbcsTable=sharedData.mbcs; |
| outputType=mbcsTable.outputType; |
| |
| if(outputType==MBCS_OUTPUT_DBCS_ONLY) { |
| /* the swaplfnl option does not apply, remove it */ |
| this.options=options&=~UConverterConstants.OPTION_SWAP_LFNL; |
| } |
| |
| if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { |
| /* do this because double-checked locking is broken */ |
| boolean isCached; |
| |
| //agljport:todo umtx_lock(NULL); |
| isCached=mbcsTable.swapLFNLStateTable!=null; |
| //agljport:todo umtx_unlock(NULL); |
| |
| if(!isCached) { |
| //agljport:fix if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { |
| //agljport:fix if(U_FAILURE(*pErrorCode)) { |
| //agljport:fix return; /* something went wrong */ |
| //agljport:fix } |
| |
| /* the option does not apply, remove it */ |
| //agljport:fix cnv->options=options&=~UCNV_OPTION_SWAP_LFNL; |
| //agljport:fix } |
| } |
| } |
| |
| if(icuCanonicalName.toLowerCase().indexOf("gb18030") >= 0) { |
| /* set a flag for GB 18030 mode, which changes the callback behavior */ |
| this.options|=MBCS_OPTION_GB18030; |
| } |
| |
| /* fix maxBytesPerUChar depending on outputType and options etc. */ |
| if(outputType==MBCS_OUTPUT_2_SISO) { |
| maxBytesPerChar=3; /* SO+DBCS */ |
| } |
| |
| extIndexes=mbcsTable.extIndexes; |
| if(extIndexes!=null) { |
| maxBytesPerUChar=(byte)GET_MAX_BYTES_PER_UCHAR(extIndexes); |
| if(outputType==MBCS_OUTPUT_2_SISO) { |
| ++maxBytesPerUChar; /* SO + multiple DBCS */ |
| } |
| |
| if(maxBytesPerUChar>maxBytesPerChar) { |
| maxBytesPerChar=maxBytesPerUChar; |
| } |
| } |
| } |
| |
| /** |
| * MBCS output types for conversions from Unicode. |
| * These per-converter types determine the storage method in stage 3 of the lookup table, |
| * mostly how many bytes are stored per entry. |
| */ |
| protected static final int MBCS_OUTPUT_1 = 0; /* 0 */ |
| protected static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */ |
| protected static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */ |
| protected static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */ |
| protected static final int MBCS_OUTPUT_3_EUC=8; /* 8 */ |
| protected static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */ |
| protected static final int MBCS_OUTPUT_2_SISO=12; /* c */ |
| protected static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */ |
| protected static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */ |
| protected static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1; |
| protected static final int MBCS_OUTPUT_DBCS_ONLY=0xdb; /* runtime-only type for DBCS-only handling of SISO tables */ |
| |
| /* GB 18030 data ------------------------------------------------------------ */ |
| |
| /* helper macros for linear values for GB 18030 four-byte sequences */ |
| protected static long LINEAR_18030(long a, long b, long c, long d) {return ((((a)*10+(b))*126L+(c))*10L+(d));} |
| |
| protected static long LINEAR_18030_BASE = LINEAR_18030(0x81, 0x30, 0x81, 0x30); |
| |
| protected static long LINEAR(long x) {return LINEAR_18030(x>>>24, (x>>>16)&0xff, (x>>>8)&0xff, x&0xff);} |
| |
| /* |
| * Some ranges of GB 18030 where both the Unicode code points and the |
| * GB four-byte sequences are contiguous and are handled algorithmically by |
| * the special callback functions below. |
| * The values are start & end of Unicode & GB codes. |
| * |
| * Note that single surrogates are not mapped by GB 18030 |
| * as of the re-released mapping tables from 2000-nov-30. |
| */ |
| protected static final long gb18030Ranges[][] = new long[/*13*/][/*4*/]{ |
| {0x10000L, 0x10FFFFL, LINEAR(0x90308130L), LINEAR(0xE3329A35L)}, |
| {0x9FA6L, 0xD7FFL, LINEAR(0x82358F33L), LINEAR(0x8336C738L)}, |
| {0x0452L, 0x200FL, LINEAR(0x8130D330L), LINEAR(0x8136A531L)}, |
| {0xE865L, 0xF92BL, LINEAR(0x8336D030L), LINEAR(0x84308534L)}, |
| {0x2643L, 0x2E80L, LINEAR(0x8137A839L), LINEAR(0x8138FD38L)}, |
| {0xFA2AL, 0xFE2FL, LINEAR(0x84309C38L), LINEAR(0x84318537L)}, |
| {0x3CE1L, 0x4055L, LINEAR(0x8231D438L), LINEAR(0x8232AF32L)}, |
| {0x361BL, 0x3917L, LINEAR(0x8230A633L), LINEAR(0x8230F237L)}, |
| {0x49B8L, 0x4C76L, LINEAR(0x8234A131L), LINEAR(0x8234E733L)}, |
| {0x4160L, 0x4336L, LINEAR(0x8232C937L), LINEAR(0x8232F837L)}, |
| {0x478EL, 0x4946L, LINEAR(0x8233E838L), LINEAR(0x82349638L)}, |
| {0x44D7L, 0x464BL, LINEAR(0x8233A339L), LINEAR(0x8233C931L)}, |
| {0xFFE6L, 0xFFFFL, LINEAR(0x8431A234L), LINEAR(0x8431A439L)} |
| }; |
| |
| /* bit flag for UConverter.options indicating GB 18030 special handling */ |
| protected static final int MBCS_OPTION_GB18030 = 0x8000; |
| |
| /** |
| * MBCS action codes for conversions to Unicode. |
| * These values are in bits 23..20 of the state table entries. |
| */ |
| protected static final int MBCS_STATE_VALID_DIRECT_16 = 0; |
| protected static final int MBCS_STATE_VALID_DIRECT_20 = MBCS_STATE_VALID_DIRECT_16 + 1; |
| protected static final int MBCS_STATE_FALLBACK_DIRECT_16 = MBCS_STATE_VALID_DIRECT_20 + 1; |
| protected static final int MBCS_STATE_FALLBACK_DIRECT_20 = MBCS_STATE_FALLBACK_DIRECT_16 + 1; |
| protected static final int MBCS_STATE_VALID_16 = MBCS_STATE_FALLBACK_DIRECT_20 + 1; |
| protected static final int MBCS_STATE_VALID_16_PAIR = MBCS_STATE_VALID_16 + 1; |
| protected static final int MBCS_STATE_UNASSIGNED = MBCS_STATE_VALID_16_PAIR + 1; |
| protected static final int MBCS_STATE_ILLEGAL = MBCS_STATE_UNASSIGNED + 1; |
| protected static final int MBCS_STATE_CHANGE_ONLY = MBCS_STATE_ILLEGAL + 1; |
| |
| /* Methods for state table entries */ |
| protected static int MBCS_ENTRY_TRANSITION(int state, int offset) {return (state<<24L)|offset; } |
| protected static int MBCS_ENTRY_FINAL(int state, int action, int value) {return (int)(0x80000000|((int)(state)<<24L)|((action)<<20L)|(value));} |
| protected static boolean MBCS_ENTRY_IS_TRANSITION(int entry) {return (entry)>=0; } |
| protected static boolean MBCS_ENTRY_IS_FINAL(int entry) {return (entry)<0;} |
| protected static int MBCS_ENTRY_TRANSITION_STATE(int entry) {return ((entry)>>>24);} |
| protected static int MBCS_ENTRY_TRANSITION_OFFSET(int entry) {return ((entry)&0xffffff);} |
| protected static int MBCS_ENTRY_FINAL_STATE(int entry) {return ((entry)>>>24)&0x7f;} |
| protected static boolean MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(int entry) {return ((entry)<0x80100000);} |
| protected static int MBCS_ENTRY_FINAL_ACTION(int entry) {return ((entry)>>>20)&0xf;} |
| protected static int MBCS_ENTRY_FINAL_VALUE(int entry) {return ((entry)&0xfffff); } |
| protected static char MBCS_ENTRY_FINAL_VALUE_16(int entry) {return (char)(entry);} |
| |
| /** |
| * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte. |
| * It works for single-byte, single-state codepages that only map |
| * to and from BMP code points, and it always |
| * returns fallback values. |
| */ |
| protected static char MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(UConverterMBCSTable mbcs, final int b) |
| { |
| return MBCS_ENTRY_FINAL_VALUE_16(mbcs.stateTable[0][b & UConverterConstants.UNSIGNED_BYTE_MASK]); |
| } |
| |
| /* single-byte fromUnicode: get the 16-bit result word */ |
| protected static char MBCS_SINGLE_RESULT_FROM_U(char[] table, byte[] results, int c) |
| { |
| int i1 = table[c>>>10] +((c>>>4)&0x3f); |
| int i = 2* (table[i1] +(c&0xf)); // used as index into byte[] array treated as char[] array |
| return (char)(((results[i] & UConverterConstants.UNSIGNED_BYTE_MASK) <<8) | (results[i+1] & UConverterConstants.UNSIGNED_BYTE_MASK)); |
| } |
| |
| /* multi-byte fromUnicode: get the 32-bit stage 2 entry */ |
| protected static int MBCS_STAGE_2_FROM_U(char[] table, int c) |
| { |
| int i = 2 * (table[(c)>>>10] +((c>>>4)&0x3f)); // 2x because used as index into char[] array treated as int[] array |
| return ((table[i] & UConverterConstants.UNSIGNED_SHORT_MASK) <<16) | (table[i+1] & UConverterConstants.UNSIGNED_SHORT_MASK); |
| } |
| |
| protected static boolean MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry, int c) {return ( ((stage2Entry) & (1<< (16+((c)&0xf)) )) !=0);} |
| |
| protected static char MBCS_VALUE_2_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) |
| { |
| int i = 2 * (16*((char)stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK)+(c&0xf)); |
| return (char)(((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) <<8) | (bytes[i+1] & UConverterConstants.UNSIGNED_BYTE_MASK)); |
| } |
| |
| protected static int MBCS_VALUE_4_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) |
| { |
| int i = 4 * (16*((char)stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK)+(c&0xf)); |
| return ((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) <<24) | |
| ((bytes[i+1] & UConverterConstants.UNSIGNED_BYTE_MASK) <<16) | |
| ((bytes[i+2] & UConverterConstants.UNSIGNED_BYTE_MASK) <<8) | |
| (bytes[i+3] & UConverterConstants.UNSIGNED_BYTE_MASK); |
| } |
| |
| protected static int MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) |
| { |
| return ((16*((char)(stage2Entry) & UConverterConstants.UNSIGNED_SHORT_MASK)+((c)&0xf))*3); |
| } |
| |
| //------------UConverterExt------------------------------------------------------- |
| |
| protected static final int EXT_INDEXES_LENGTH = 0; /* 0 */ |
| |
| protected static final int EXT_TO_U_INDEX = EXT_INDEXES_LENGTH + 1; /* 1 */ |
| protected static final int EXT_TO_U_LENGTH = EXT_TO_U_INDEX + 1; |
| protected static final int EXT_TO_U_UCHARS_INDEX = EXT_TO_U_LENGTH + 1; |
| protected static final int EXT_TO_U_UCHARS_LENGTH = EXT_TO_U_UCHARS_INDEX + 1; |
| |
| protected static final int EXT_FROM_U_UCHARS_INDEX = EXT_TO_U_UCHARS_LENGTH + 1; /* 5 */ |
| protected static final int EXT_FROM_U_VALUES_INDEX = EXT_FROM_U_UCHARS_INDEX + 1; |
| protected static final int EXT_FROM_U_LENGTH = EXT_FROM_U_VALUES_INDEX + 1; |
| protected static final int EXT_FROM_U_BYTES_INDEX = EXT_FROM_U_LENGTH + 1; |
| protected static final int EXT_FROM_U_BYTES_LENGTH = EXT_FROM_U_BYTES_INDEX + 1; |
| |
| protected static final int EXT_FROM_U_STAGE_12_INDEX = EXT_FROM_U_BYTES_LENGTH + 1; /* 10 */ |
| protected static final int EXT_FROM_U_STAGE_1_LENGTH = EXT_FROM_U_STAGE_12_INDEX + 1; |
| protected static final int EXT_FROM_U_STAGE_12_LENGTH = EXT_FROM_U_STAGE_1_LENGTH + 1; |
| protected static final int EXT_FROM_U_STAGE_3_INDEX = EXT_FROM_U_STAGE_12_LENGTH + 1; |
| protected static final int EXT_FROM_U_STAGE_3_LENGTH = EXT_FROM_U_STAGE_3_INDEX + 1; |
| protected static final int EXT_FROM_U_STAGE_3B_INDEX = EXT_FROM_U_STAGE_3_LENGTH + 1; |
| protected static final int EXT_FROM_U_STAGE_3B_LENGTH = EXT_FROM_U_STAGE_3B_INDEX + 1; |
| |
| protected static final int EXT_COUNT_BYTES = EXT_FROM_U_STAGE_3B_LENGTH + 1; /* 17 */ |
| protected static final int EXT_COUNT_UCHARS = EXT_COUNT_BYTES + 1; |
| protected static final int EXT_FLAGS = EXT_COUNT_UCHARS + 1; |
| |
| protected static final int EXT_RESERVED_INDEX = EXT_FLAGS + 1; /* 20, moves with additional indexes */ |
| |
| protected static final int EXT_SIZE=31; |
| protected static final int EXT_INDEXES_MIN_LENGTH=32; |
| |
| /* toUnicode helpers -------------------------------------------------------- */ |
| |
| protected static final int TO_U_BYTE_SHIFT = 24; |
| protected static final int TO_U_VALUE_MASK = 0xffffff; |
| protected static final int TO_U_MIN_CODE_POINT = 0x1f0000; |
| protected static final int TO_U_MAX_CODE_POINT = 0x2fffff; |
| protected static final int TO_U_ROUNDTRIP_FLAG = (1<<23); |
| protected static final int TO_U_INDEX_MASK = 0x3ffff; |
| protected static final int TO_U_LENGTH_SHIFT = 18; |
| protected static final int TO_U_LENGTH_OFFSET = 12; |
| |
| /* maximum number of indexed UChars */ |
| protected static final int MAX_UCHARS = 19; |
| |
| protected static int TO_U_GET_BYTE(int word) |
| { |
| return word>>>TO_U_BYTE_SHIFT; |
| } |
| |
| protected static int TO_U_GET_VALUE(int word) |
| { |
| return word&TO_U_VALUE_MASK; |
| } |
| |
| protected static boolean TO_U_IS_ROUNDTRIP(int value) |
| { |
| return (value&TO_U_ROUNDTRIP_FLAG)!=0; |
| } |
| |
| protected static boolean TO_U_IS_PARTIAL(int value) |
| { |
| return (value&UConverterConstants.UNSIGNED_INT_MASK)<TO_U_MIN_CODE_POINT; |
| } |
| |
| protected static int TO_U_GET_PARTIAL_INDEX(int value) |
| { |
| return value; |
| } |
| |
| protected static int TO_U_MASK_ROUNDTRIP(int value) |
| { |
| return value&~TO_U_ROUNDTRIP_FLAG; |
| } |
| |
| protected static int TO_U_MAKE_WORD(byte b, int value) |
| { |
| return ((b&UConverterConstants.UNSIGNED_BYTE_MASK)<<TO_U_BYTE_SHIFT)|value; |
| } |
| |
| /* use after masking off the roundtrip flag */ |
| protected static boolean TO_U_IS_CODE_POINT(int value) |
| { |
| return (value&UConverterConstants.UNSIGNED_INT_MASK)<=TO_U_MAX_CODE_POINT; |
| } |
| |
| protected static int TO_U_GET_CODE_POINT(int value) |
| { |
| return (int)((value&UConverterConstants.UNSIGNED_INT_MASK)-TO_U_MIN_CODE_POINT); |
| } |
| |
| protected static int TO_U_GET_INDEX(int value) |
| { |
| return value&TO_U_INDEX_MASK; |
| } |
| |
| protected static int TO_U_GET_LENGTH(int value) |
| { |
| return (value>>>TO_U_LENGTH_SHIFT)-TO_U_LENGTH_OFFSET; |
| } |
| |
| /* fromUnicode helpers ------------------------------------------------------ */ |
| |
| /* most trie constants are shared with ucnvmbcs.h */ |
| protected static final int STAGE_2_LEFT_SHIFT = 2; |
| protected static final int STAGE_3_GRANULARITY = 4; |
| |
| /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */ |
| protected static int FROM_U(CharBuffer stage12, CharBuffer stage3, int s1Index, int c) |
| { |
| return stage3.get(stage3.position() + ((int)stage12.get( stage12.position() + (stage12.get(stage12.position()+s1Index) +((c>>>4)&0x3f)) )<<STAGE_2_LEFT_SHIFT) +(c&0xf) ); |
| } |
| |
| protected static final int FROM_U_LENGTH_SHIFT = 24; |
| protected static final int FROM_U_ROUNDTRIP_FLAG = 1<<31; |
| protected static final int FROM_U_RESERVED_MASK = 0x60000000; |
| protected static final int FROM_U_DATA_MASK = 0xffffff; |
| |
| /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */ |
| protected static final int FROM_U_SUBCHAR1 = 0x80000001; |
| |
| /* at most 3 bytes in the lower part of the value */ |
| protected static final int FROM_U_MAX_DIRECT_LENGTH = 3; |
| |
| /* maximum number of indexed bytes */ |
| protected static final int MAX_BYTES = 0x1f; |
| |
| protected static boolean FROM_U_IS_PARTIAL(int value) {return (value>>>FROM_U_LENGTH_SHIFT)==0;} |
| protected static int FROM_U_GET_PARTIAL_INDEX(int value) {return value;} |
| |
| protected static boolean FROM_U_IS_ROUNDTRIP(int value) {return (value&FROM_U_ROUNDTRIP_FLAG)!=0;} |
| protected static int FROM_U_MASK_ROUNDTRIP(int value) {return value&~FROM_U_ROUNDTRIP_FLAG;} |
| |
| /* use after masking off the roundtrip flag */ |
| protected static int FROM_U_GET_LENGTH(int value) {return (value>>>FROM_U_LENGTH_SHIFT)&MAX_BYTES;} |
| |
| /* get bytes or bytes index */ |
| protected static int FROM_U_GET_DATA(int value) {return value&FROM_U_DATA_MASK;} |
| |
| /* get the pointer to an extension array from indexes[index] */ |
| protected static Buffer ARRAY(ByteBuffer indexes, int index, Class itemType) |
| { |
| int oldpos = indexes.position(); |
| Buffer b; |
| |
| indexes.position(indexes.getInt(index*4)); |
| if(itemType == int.class) |
| b = indexes.asIntBuffer(); |
| else if(itemType == short.class) |
| b = indexes.asShortBuffer(); |
| else if(itemType == byte.class) |
| b = indexes.slice(); |
| else if(itemType == char.class) |
| b = indexes.asCharBuffer(); |
| else |
| b = indexes.slice(); |
| indexes.position(oldpos); |
| return b; |
| } |
| |
| /** |
|  * Returns the low 8 bits of the extension index word number {@code EXT_COUNT_BYTES}. |
|  * Going by the method name this is the maximum number of bytes per UChar |
|  * (NOTE(review): confirm against the extension data format). |
|  * <p> |
|  * ByteBuffer.getInt(int) takes a byte offset. The other EXT_* constants in this |
|  * class are used as numbers of 32-bit words (ARRAY() reads getInt(index*4)), so |
|  * the word number is scaled to a byte offset here as well. |
|  * |
|  * @param indexes the extension data, beginning with the array of 32-bit indexes |
|  * @return indexes[EXT_COUNT_BYTES] & 0xff |
|  */ |
| protected static int GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes) |
| { |
|     /* the absolute read below does not need this; kept so the buffer is left rewound as before */ |
|     indexes.position(0); |
|     return indexes.getInt(EXT_COUNT_BYTES*4)&0xff; |
| } |
| |
| /* |
| * @return index of the UChar, if found; else <0 |
| */ |
| protected static int findFromU(CharBuffer fromUSection, int length, char u) |
| { |
| int i, start, limit; |
| |
| /* binary search */ |
| start=0; |
| limit=length; |
| for(;;) { |
| i=limit-start; |
| if(i<=1) { |
| break; /* done */ |
| } |
| /* start<limit-1 */ |
| |
| if(i<=4) { |
| /* linear search for the last part */ |
| if(u<=fromUSection.get(fromUSection.position() + start)) { |
| break; |
| } |
| if(++start<limit && u<=fromUSection.get(fromUSection.position() +start)) { |
| break; |
| } |
| if(++start<limit && u<=fromUSection.get(fromUSection.position() + start)) { |
| break; |
| } |
| /* always break at start==limit-1 */ |
| ++start; |
| break; |
| } |
| |
| i=(start+limit)/2; |
| if(u<fromUSection.get(fromUSection.position() +i)) { |
| limit=i; |
| } else { |
| start=i; |
| } |
| } |
| |
| /* did we really find it? */ |
| if(start<limit && u==fromUSection.get(fromUSection.position() +start)) { |
| return start; |
| } else { |
| return -1; /* not found */ |
| } |
| } |
| |
| /* |
| * @return lookup value for the byte, if found; else 0 |
| */ |
| protected static int findToU(IntBuffer toUSection, int length, short byt) |
| { |
| long word0, word; |
| int i, start, limit; |
| |
| /* check the input byte against the lowest and highest section bytes */ |
| //agljport:comment instead of receiving a start position parameter for toUSection we'll rely on its position property |
| start = TO_U_GET_BYTE(toUSection.get(toUSection.position())); |
| limit = TO_U_GET_BYTE(toUSection.get(toUSection.position() + length-1)); |
| if(byt<start || limit<byt) { |
| return 0; /* the byte is out of range */ |
| } |
| |
| if(length==((limit-start)+1)) { |
| /* direct access on a linear array */ |
| return TO_U_GET_VALUE(toUSection.get(toUSection.position()+byt-start)); /* could be 0 */ |
| } |
| |
| /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ |
| word0 = TO_U_MAKE_WORD((byte)byt, 0) & UConverterConstants.UNSIGNED_INT_MASK; |
| |
| /* |
| * Shift byte once instead of each section word and add 0xffffff. |
| * We will compare the shifted/added byte (bbffffff) against |
| * section words which have byte values in the same bit position. |
| * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv |
| * for all v=0..f |
| * so we need not mask off the lower 24 bits of each section word. |
| */ |
| word = word0|TO_U_VALUE_MASK; |
| |
| /* binary search */ |
| start = 0; |
| limit = length; |
| for(;;) { |
| i=limit-start; |
| if(i<=1) { |
| break; /* done */ |
| } |
| /* start<limit-1 */ |
| |
| if(i<=4) { |
| /* linear search for the last part */ |
| if(word0<=(toUSection.get(toUSection.position()+start) & UConverterConstants.UNSIGNED_INT_MASK)) { |
| break; |
| } |
| if(++start<limit && word0<=(toUSection.get(toUSection.position()+start)&UConverterConstants.UNSIGNED_INT_MASK)) { |
| break; |
| } |
| if(++start<limit && word0<=(toUSection.get(toUSection.position()+start)&UConverterConstants.UNSIGNED_INT_MASK)) { |
| break; |
| } |
| /* always break at start==limit-1 */ |
| ++start; |
| break; |
| } |
| |
| i=(start+limit)/2; |
| if(word<(toUSection.get(toUSection.position()+i)&UConverterConstants.UNSIGNED_INT_MASK)) { |
| limit=i; |
| } else { |
| start=i; |
| } |
| } |
| |
| /* did we really find it? */ |
| if(start<limit && byt==TO_U_GET_BYTE((int)(word=(toUSection.get(toUSection.position()+start)&UConverterConstants.UNSIGNED_INT_MASK)))) { |
| return TO_U_GET_VALUE((int)word); /* never 0 */ |
| } else { |
| return 0; /* not found */ |
| } |
| } |
| |
| /* |
| * TRUE if not an SI/SO stateful converter, |
| * or if the match length fits with the current converter state |
| */ |
| protected static boolean TO_U_VERIFY_SISO_MATCH(byte sisoState, int match) |
| { |
| return sisoState<0 || (sisoState==0) == (match==1); |
| } |
| |
| /* |
| * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), |
| * or 1 for DBCS-only, |
| * or -1 if the converter is not SI/SO stateful |
| * |
| * Note: For SI/SO stateful converters getting here, |
| * cnv->mode==0 is equivalent to firstLength==1. |
| */ |
| protected static int SISO_STATE(UConverterSharedData sharedData, int mode) |
| { |
|     if(sharedData.mbcs.outputType==MBCS_OUTPUT_2_SISO) { |
|         /* SI/SO stateful: the state is the low byte of the mode, sign-extended */ |
|         return (byte)mode; |
|     } |
|     if(sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) { |
|         /* DBCS-only */ |
|         return 1; |
|     } |
|     /* not SI/SO stateful */ |
|     return -1; |
| } |
| |
| class CharsetDecoderMBCS extends CharsetDecoderICU{ |
| |
| CharsetDecoderMBCS(CharsetICU cs) { /* creates a decoder for the given charset; this constructor only forwards cs to the CharsetDecoderICU constructor */ |
| super(cs); |
| } |
| |
| /** |
|  * Converts bytes from {@code source} to UTF-16 code units in {@code target} by |
|  * walking the converter's state table. |
|  * <p> |
|  * Order of work: |
|  * <ol> |
|  * <li>if a partial extension match is pending (preToULength>0) it is continued first; |
|  *     the method returns early when that fails or leaves input for replay;</li> |
|  * <li>single-state converters are delegated to the single-byte variants;</li> |
|  * <li>otherwise the general loop runs, with a fast path for 1/2-byte input |
|  *     that maps to BMP code points.</li> |
|  * </ol> |
|  * The state of an incomplete character is kept in toUnicodeStatus, mode, toULength |
|  * and toUBytesArray between calls. On return the position of {@code source} is set |
|  * behind the consumed input. |
|  * <p> |
|  * Every source byte is masked to 0..255 before it is used as a state table index; |
|  * Java bytes are signed, so an unmasked byte of 0x80 or above would be a negative index. |
|  * |
|  * @param source  input bytes, read from the current position up to the limit |
|  * @param target  output buffer for UTF-16 code units |
|  * @param offsets if not null, receives one source index per code unit written |
|  * @param flush   passed on to the extension matching code |
|  * @return UNDERFLOW when the input was consumed, OVERFLOW when the target is full, |
|  *         or a malformed/unmappable result for the current input sequence |
|  */ |
| protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){ |
|     /* one-element array so that toU() can hand back its result */ |
|     CoderResult[] cr = {CoderResult.UNDERFLOW}; |
| |
|     int sourceArrayIndex; |
|     int stateTable[][/*256*/]; |
|     char[] unicodeCodeUnits; |
| |
|     int offset; |
|     byte state; |
|     int byteIndex; |
|     byte[] bytes; |
| |
|     int sourceIndex, nextSourceIndex; |
| |
|     int entry = 0; |
|     char c; |
|     byte action; |
| |
|     if(preToULength>0) { |
|         /* |
|          * pass sourceIndex=-1 because we continue from an earlier buffer |
|          * in the future, this may change with continuous offsets |
|          */ |
|         cr[0] = continueMatchToU(source, target, offsets, -1, flush); |
| |
|         if(cr[0].isError() || preToULength<0) { |
|             return cr[0]; |
|         } |
|     } |
| |
|     if(sharedData.mbcs.countStates==1) { |
|         if((sharedData.mbcs.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) { |
|             cr[0] = cnvMBCSSingleToBMPWithOffsets(source, target, offsets, flush); |
|         } |
|         else { |
|             cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets, flush); |
|         } |
|         return cr[0]; |
|     } |
| |
|     /* set up the local pointers */ |
|     sourceArrayIndex = source.position(); |
| |
|     if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { |
|         stateTable = sharedData.mbcs.swapLFNLStateTable; |
|     } |
|     else { |
|         stateTable = sharedData.mbcs.stateTable; |
|     } |
|     unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits; |
| |
|     /* get the converter state from UConverter */ |
|     offset = (int)toUnicodeStatus; |
|     byteIndex = toULength; |
|     bytes = toUBytesArray; |
| |
|     /* |
|      * if we are in the SBCS state for a DBCS-only converter, |
|      * then load the DBCS state from the MBCS data |
|      * (dbcsOnlyState==0 if it is not a DBCS-only converter) |
|      */ |
|     if((state=(byte)(mode))==0) { |
|         state = sharedData.mbcs.dbcsOnlyState; |
|     } |
| |
|     /* sourceIndex=-1 if the current character began in the previous buffer */ |
|     sourceIndex = byteIndex==0 ? 0 : -1; |
|     nextSourceIndex = 0; |
| |
|     /* conversion loop */ |
|     while(sourceArrayIndex<source.limit()) { |
|         /* |
|          * This following test is to see if available input would overflow the output. |
|          * It does not catch output of more than one code unit that |
|          * overflows as a result of a surrogate pair or callback output |
|          * from the last source byte. |
|          * Therefore, those situations also test for overflows and will |
|          * then break the loop, too. |
|          */ |
|         if(!target.hasRemaining()) { |
|             /* target is full */ |
|             cr[0] = CoderResult.OVERFLOW; |
|             break; |
|         } |
| |
|         if(byteIndex==0) { |
|             /* optimized loop for 1/2-byte input and BMP output */ |
|             if(offsets==null) { |
|                 do { |
|                     entry = stateTable[state][source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK]; |
|                     if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|                         state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); |
|                         offset = MBCS_ENTRY_TRANSITION_OFFSET(entry); |
| |
|                         ++sourceArrayIndex; |
|                         if(sourceArrayIndex<source.limit() && |
|                                 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK]) && |
|                                 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && |
|                                 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe) { |
|                             ++sourceArrayIndex; |
|                             target.put(c); |
|                             state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ |
|                             offset = 0; |
|                         } |
|                         else { |
|                             /* set the state and leave the optimized loop */ |
|                             bytes[0] = source.get(sourceArrayIndex-1); |
|                             byteIndex = 1; |
|                             break; |
|                         } |
|                     } |
|                     else { |
|                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { |
|                             /* output BMP code point */ |
|                             ++sourceArrayIndex; |
|                             target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); |
|                             state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ |
|                         } |
|                         else { |
|                             /* leave the optimized loop */ |
|                             break; |
|                         } |
|                     } |
|                 } while(sourceArrayIndex<source.limit() && target.hasRemaining()); |
|             } |
|             else /* offsets!=NULL */ { |
|                 //agljport:todo see ucnvmbcs.c for deleted block |
|                 do { |
|                     /* the byte must be masked here exactly as in the offsets==null loop above */ |
|                     entry = stateTable[state][source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK]; |
|                     if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|                         state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); |
|                         offset = MBCS_ENTRY_TRANSITION_OFFSET(entry); |
| |
|                         ++sourceArrayIndex; |
|                         if(sourceArrayIndex<source.limit() && |
|                                 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK]) && |
|                                 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && |
|                                 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe) { |
| |
|                             ++sourceArrayIndex; |
|                             target.put(c); |
|                             /* offsets is known to be non-null in this branch */ |
|                             offsets.put(sourceIndex); |
|                             sourceIndex = (nextSourceIndex+=2); |
|                             state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ |
|                             offset = 0; |
|                         } |
|                         else { |
|                             /* set the state and leave the optimized loop */ |
|                             ++nextSourceIndex; |
|                             bytes[0] = source.get(sourceArrayIndex-1); |
|                             byteIndex = 1; |
|                             break; |
|                         } |
|                     } |
|                     else { |
|                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { |
|                             /* output BMP code point */ |
|                             ++sourceArrayIndex; |
|                             target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); |
|                             offsets.put(sourceIndex); |
|                             sourceIndex = ++nextSourceIndex; |
|                             state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ |
|                         } |
|                         else { |
|                             /* leave the optimized loop */ |
|                             break; |
|                         } |
|                     } |
|                 } while(sourceArrayIndex<source.limit() && target.hasRemaining()); |
|             } |
| |
|             /* |
|              * these tests and break statements could be put inside the loop |
|              * if C had "break outerLoop" like Java |
|              */ |
|             if(sourceArrayIndex>=source.limit()) { |
|                 break; |
|             } |
|             if(!target.hasRemaining()) { |
|                 /* target is full */ |
|                 cr[0] = CoderResult.OVERFLOW; |
|                 break; |
|             } |
| |
|             /* entry already holds the state table entry for this byte; just record the byte */ |
|             ++nextSourceIndex; |
|             bytes[byteIndex++] = source.get(sourceArrayIndex++); |
|         } |
|         else /* byteIndex>0 */ { |
|             ++nextSourceIndex; |
|             entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++)) & UConverterConstants.UNSIGNED_BYTE_MASK]; |
|         } |
| |
|         if(MBCS_ENTRY_IS_TRANSITION(entry)) { |
|             state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); |
|             offset += MBCS_ENTRY_TRANSITION_OFFSET(entry); |
|             continue; |
|         } |
| |
|         /* save the previous state for proper extension mapping with SI/SO-stateful converters */ |
|         mode = state; |
| |
|         /* set the next state early so that we can reuse the entry variable */ |
|         state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ |
| |
|         /* |
|          * An if-else-if chain provides more reliable performance for |
|          * the most common cases compared to a switch. |
|          * A branch that produces output sets byteIndex=0; a branch that leaves |
|          * byteIndex>0 without an error marks the sequence as unassigned. |
|          */ |
|         action = (byte)(MBCS_ENTRY_FINAL_ACTION(entry)); |
|         if(action==MBCS_STATE_VALID_16) { |
|             offset += MBCS_ENTRY_FINAL_VALUE_16(entry); |
|             c = unicodeCodeUnits[offset]; |
|             if(c<0xfffe) { |
|                 /* output BMP code point */ |
|                 target.put(c); |
|                 if(offsets!=null) { |
|                     offsets.put(sourceIndex); |
|                 } |
|                 byteIndex = 0; |
|             } |
|             else if(c==0xfffe) { |
|                 if(isToUUseFallback() && (entry=(int)getFallback(sharedData.mbcs, offset))!=0xfffe) { |
|                     /* output fallback BMP code point */ |
|                     target.put((char)entry); |
|                     if(offsets!=null) { |
|                         offsets.put(sourceIndex); |
|                     } |
|                     byteIndex = 0; |
|                 } |
|             } |
|             else { |
|                 /* callback(illegal) */ |
|                 cr[0] = CoderResult.malformedForLength(byteIndex); |
|             } |
|         } |
|         else if(action==MBCS_STATE_VALID_DIRECT_16) { |
|             /* output BMP code point */ |
|             target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); |
|             if(offsets!=null) { |
|                 offsets.put(sourceIndex); |
|             } |
|             byteIndex = 0; |
|         } |
|         else if(action==MBCS_STATE_VALID_16_PAIR) { |
|             offset += MBCS_ENTRY_FINAL_VALUE_16(entry); |
|             c = unicodeCodeUnits[offset++]; |
|             if(c<0xd800) { |
|                 /* output BMP code point below 0xd800 */ |
|                 target.put(c); |
|                 if(offsets!=null) { |
|                     offsets.put(sourceIndex); |
|                 } |
|                 byteIndex = 0; |
|             } |
|             else if(isToUUseFallback() ? c<=0xdfff : c<=0xdbff) { |
|                 /* output roundtrip or fallback surrogate pair */ |
|                 target.put((char)(c&0xdbff)); |
|                 if(offsets!=null) { |
|                     offsets.put(sourceIndex); |
|                 } |
|                 byteIndex = 0; |
|                 if(target.hasRemaining()) { |
|                     target.put(unicodeCodeUnits[offset]); |
|                     if(offsets!=null) { |
|                         offsets.put(sourceIndex); |
|                     } |
|                 } |
|                 else { |
|                     /* target overflow: park the second surrogate in the error buffer */ |
|                     charErrorBufferArray[0] = unicodeCodeUnits[offset]; |
|                     charErrorBufferLength = 1; |
|                     cr[0] = CoderResult.OVERFLOW; |
| |
|                     offset = 0; |
|                     break; |
|                 } |
|             } |
|             else if(isToUUseFallback() ? (c&0xfffe)==0xe000 : c==0xe000) { |
|                 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ |
|                 target.put(unicodeCodeUnits[offset]); |
|                 if(offsets!=null) { |
|                     offsets.put(sourceIndex); |
|                 } |
|                 byteIndex = 0; |
|             } |
|             else if(c==0xffff) { |
|                 /* callback(illegal) */ |
|                 cr[0] = CoderResult.malformedForLength(byteIndex); |
|             } |
|         } |
|         else if(action==MBCS_STATE_VALID_DIRECT_20 || |
|                 (action==MBCS_STATE_FALLBACK_DIRECT_20 && isToUUseFallback())) { |
|             entry = MBCS_ENTRY_FINAL_VALUE(entry); |
|             /* output surrogate pair */ |
|             target.put((char)(0xd800|(char)(entry>>10))); |
|             if(offsets!=null) { |
|                 offsets.put(sourceIndex); |
|             } |
|             byteIndex = 0; |
|             c = (char)(0xdc00|(char)(entry&0x3ff)); |
|             if(target.hasRemaining()) { |
|                 target.put(c); |
|                 if(offsets!=null) { |
|                     offsets.put(sourceIndex); |
|                 } |
|             } |
|             else { |
|                 /* target overflow: park the second surrogate in the error buffer */ |
|                 charErrorBufferArray[0]=c; |
|                 charErrorBufferLength=1; |
|                 cr[0] = CoderResult.OVERFLOW; |
| |
|                 offset = 0; |
|                 break; |
|             } |
|         } |
|         else if(action==MBCS_STATE_CHANGE_ONLY) { |
|             /* |
|              * This serves as a state change without any output. |
|              * It is useful for reading simple stateful encodings, |
|              * for example using just Shift-In/Shift-Out codes. |
|              * The 21 unused bits may later be used for more sophisticated |
|              * state transitions. |
|              */ |
|             if(sharedData.mbcs.dbcsOnlyState==0) { |
|                 byteIndex = 0; |
|             } |
|             else { |
|                 /* SI/SO are illegal for DBCS-only conversion */ |
|                 state = (byte)(mode); /* restore the previous state */ |
| |
|                 /* callback(illegal) */ |
|                 cr[0] = CoderResult.malformedForLength(byteIndex); |
|             } |
|         } |
|         else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { |
|             if(isToUUseFallback()) { |
|                 /* output BMP code point */ |
|                 target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); |
|                 if(offsets!=null) { |
|                     offsets.put(sourceIndex); |
|                 } |
|                 byteIndex = 0; |
|             } |
|         } |
|         else if(action==MBCS_STATE_UNASSIGNED) { |
|             /* just fall through */ |
|         } |
|         else if(action==MBCS_STATE_ILLEGAL) { |
|             /* callback(illegal) */ |
|             cr[0] = CoderResult.malformedForLength(byteIndex); |
|         } |
|         else { |
|             /* reserved, must never occur */ |
|             byteIndex = 0; |
|         } |
| |
|         /* end of action codes: prepare for a new character */ |
|         offset=0; |
| |
|         if(byteIndex==0) { |
|             sourceIndex = nextSourceIndex; |
|         } |
|         else if(cr[0].isError()) { |
|             /* callback(illegal) */ |
|             break; |
|         } |
|         else /* unassigned sequences indicated with byteIndex>0 */ { |
|             /* try an extension mapping; toU() reads and advances source.position() */ |
|             int sourceBeginIndex = sourceArrayIndex; |
|             source.position(sourceArrayIndex); |
|             byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, flush, cr); |
|             sourceArrayIndex = source.position(); |
|             sourceIndex = nextSourceIndex+(int)(sourceArrayIndex-sourceBeginIndex); |
| |
|             if(cr[0].isError()|| cr[0].isOverflow()) { |
|                 /* not mappable or buffer overflow */ |
|                 break; |
|             } |
|         } |
|     } |
| |
|     /* set the converter state back into UConverter */ |
|     toUnicodeStatus = offset; |
|     mode = state; |
|     toULength = byteIndex; |
| |
|     /* write back the updated pointers */ |
|     source.position(sourceArrayIndex); |
| |
|     return cr[0]; |
| } |
| |
| /* |
| * continue partial match with new input |
| * never called for simple, single-character conversion |
| */ |
| protected CoderResult continueMatchToU(ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex, boolean flush) |
| { |
| CoderResult cr = CoderResult.UNDERFLOW; |
| |
| int[] value = new int[1]; |
| int match, length; |
| |
| match = matchToU((byte)SISO_STATE(sharedData, mode), preToUArray, preToUBegin, preToULength, source, value, flush); |
| |
| if(match>0) { |
| if(match>=preToULength) { |
| /* advance src pointer for the consumed input */ |
| source.position(source.position()+match-preToULength); |
| preToULength = 0; |
| } |
| else { |
| /* the match did not use all of preToU[] - keep the rest for replay */ |
| length = preToULength - match; |
| System.arraycopy(preToUArray, preToUBegin+match, preToUArray, preToUBegin, length); |
| preToULength=(byte)-length; |
| } |
| |
| /* write result */ |
| cr = writeToU(value[0], target, offsets, srcIndex); |
| } |
| else if(match<0) { |
| /* save state for partial match */ |
| int j, sArrayIndex; |
| |
| /* just _append_ the newly consumed input to preToU[] */ |
| sArrayIndex = source.position(); |
| match =- match; |
| for(j=preToULength; j<match; ++j) { |
| preToUArray[j] = source.get(sArrayIndex++); |
| } |
| source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ |
| preToULength=(byte)match; |
| } |
| else /* match==0 */ { |
| /* |
| * no match |
| * |
| * We need to split the previous input into two parts: |
| * |
| * 1. The first codepage character is unmappable - that's how we got into |
| * trying the extension data in the first place. |
| * We need to move it from the preToU buffer |
| * to the error buffer, set an error code, |
| * and prepare the rest of the previous input for 2. |
| * |
| * 2. The rest of the previous input must be converted once we |
| * come back from the callback for the first character. |
| * At that time, we have to try again from scratch to convert |
| * these input characters. |
| * The replay will be handled by the ucnv.c conversion code. |
| */ |
| |
| /* move the first codepage character to the error field */ |
| System.arraycopy(preToUArray, preToUBegin, toUBytesArray, toUBytesBegin, preToUFirstLength); |
| toULength = preToUFirstLength; |
| |
| /* move the rest up inside the buffer */ |
| length = preToULength-preToUFirstLength; |
| if(length>0) { |
| System.arraycopy(preToUArray, preToUBegin+preToUFirstLength, preToUArray, preToUBegin, length); |
| } |
| |
| /* mark preToU for replay */ |
| preToULength = (byte)-length; |
| |
| /* set the error code for unassigned */ |
| cr = CoderResult.unmappableForLength(preToUFirstLength); |
| } |
| return cr; |
| } |
| |
| /* |
| * this works like matchFromU() except |
| * - the first character is in pre |
| * - no trie is used |
| * - the returned matchLength is not offset by 2 |
| */ |
| protected int matchToU(byte sisoState, byte[] preArray, int preArrayBegin, int preLength, ByteBuffer source, int[] pMatchValue, boolean flush) |
| { |
| ByteBuffer cx = sharedData.mbcs.extIndexes; |
| IntBuffer toUTable, toUSection; |
| |
| int value, matchValue, srcLength; |
| int i, j, index, length, matchLength; |
| short b; |
| |
| if(cx==null || cx.asIntBuffer().get(EXT_TO_U_LENGTH)<=0) { |
| return 0; /* no extension data, no match */ |
| } |
| |
| /* initialize */ |
| toUTable = (IntBuffer)ARRAY(cx, EXT_TO_U_INDEX, int.class); |
| index = 0; |
| |
| matchValue = 0; |
| i = j = matchLength=0; |
| srcLength = source.remaining(); |
| |
| if(sisoState==0) { |
| /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ |
| if(preLength>1) { |
| return 0; /* no match of a DBCS sequence in SBCS mode */ |
| } |
| else if(preLength==1) { |
| srcLength = 0; |
| } |
| else /* preLength==0 */ { |
| if(srcLength>1) { |
| srcLength = 1; |
| } |
| } |
| flush = true; |
| } |
| |
| /* we must not remember fallback matches when not using fallbacks */ |
| |
| /* match input units until there is a full match or the input is consumed */ |
| for(;;) { |
| /* go to the next section */ |
| int oldpos = toUTable.position(); |
| toUSection=((IntBuffer)toUTable.position(index)).slice(); |
| toUTable.position(oldpos); |
| |
| /* read first pair of the section */ |
| value = toUSection.get(); |
| length = TO_U_GET_BYTE(value); |
| value =TO_U_GET_VALUE(value); |
| if(value!=0 && |
| (TO_U_IS_ROUNDTRIP(value) || isToUUseFallback()) && |
| TO_U_VERIFY_SISO_MATCH(sisoState, i+j)) { |
| /* remember longest match so far */ |
| matchValue=value; |
| matchLength=i+j; |
| } |
| |
| /* match pre[] then src[] */ |
| if(i<preLength) { |
| b=(short)(preArray[preArrayBegin + i++] & UConverterConstants.UNSIGNED_BYTE_MASK); |
| } |
| else if(j<srcLength) { |
| b=(short)(source.get(source.position() + j++) & UConverterConstants.UNSIGNED_BYTE_MASK); |
| } |
| else { |
| /* all input consumed, partial match */ |
| if(flush || (length=(i+j))>MAX_BYTES) { |
| /* |
| * end of the entire input stream, stop with the longest match so far |
| * or: partial match must not be longer than UCNV_EXT_MAX_BYTES |
| * because it must fit into state buffers |
| */ |
| break; |
| } |
| else { |
| /* continue with more input next time */ |
| return -length; |
| } |
| } |
| |
| /* search for the current UChar */ |
| value = findToU(toUSection, length, b); |
| if(value==0) { |
| /* no match here, stop with the longest match so far */ |
| break; |
| } else { |
| if(TO_U_IS_PARTIAL(value)) { |
| /* partial match, continue */ |
| index = TO_U_GET_PARTIAL_INDEX(value); |
| } else { |
| if((TO_U_IS_ROUNDTRIP(value) || isToUUseFallback()) && |
| TO_U_VERIFY_SISO_MATCH(sisoState, i+j)) { |
| /* full match, stop with result */ |
| matchValue = value; |
| matchLength = i+j; |
| } |
| else { |
| /* full match on fallback not taken, stop with the longest match so far */ |
| } |
| break; |
| } |
| } |
| } |
| |
| if(matchLength==0) { |
| /* no match at all */ |
| return 0; |
| } |
| |
| /* return result */ |
| pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue); |
| return matchLength; |
| } |
| |
| protected CoderResult writeToU(int value, CharBuffer target, IntBuffer offsets, int srcIndex) |
| { |
| ByteBuffer cx = sharedData.mbcs.extIndexes; |
| /* output the result */ |
| if(TO_U_IS_CODE_POINT(value)) { |
| /* output a single code point */ |
| return toUWriteCodePoint(TO_U_GET_CODE_POINT(value), target, offsets, srcIndex); |
| } else { |
| /* output a string - with correct data we have resultLength>0 */ |
| |
| char[] a = new char[TO_U_GET_LENGTH(value)]; |
| CharBuffer cb = ((CharBuffer)ARRAY(cx, EXT_TO_U_UCHARS_INDEX, char.class)); |
| cb.position(TO_U_GET_INDEX(value)); |
| cb.get(a, 0, a.length); |
| return toUWriteUChars(this, a, 0, a.length, target, offsets, srcIndex); |
| } |
| } |
| |
| protected CoderResult toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex) |
| { |
| CoderResult cr = CoderResult.UNDERFLOW; |
| int tBeginIndex = target.position(); |
| |
| if(target.hasRemaining()) { |
| if(c<=0xffff) { |
| target.put((char)c); |
| c = UConverterConstants.U_SENTINEL; |
| } else /* c is a supplementary code point */ { |
| target.put(UTF16.getLeadSurrogate(c)); |
| c = UTF16.getTrailSurrogate(c); |
| if(target.hasRemaining()) { |
| target.put((char)c); |
| c = UConverterConstants.U_SENTINEL; |
| } |
| } |
| |
| /* write offsets */ |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| if((tBeginIndex+1)<target.position()) { |
| offsets.put(sourceIndex); |
| } |
| } |
| } |
| |
| /* write overflow from c */ |
| if(c>=0) { |
| charErrorBufferLength = UTF16.append(charErrorBufferArray, 0, c); |
| cr = CoderResult.OVERFLOW; |
| } |
| |
| return cr; |
| } |
| |
| /* |
| * Input sequence: cnv->toUBytes[0..length[ |
| * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input |
| * else return 0 after output has been written to the target |
| */ |
| protected int toU(int length, ByteBuffer source, CharBuffer target, IntBuffer offsets, int sourceIndex, boolean flush, CoderResult[] cr) |
| { |
|     /* |
|      * Tries, in this order: |
|      *   1. the extension table (only if the converter has extension data), |
|      *   2. the GB 18030 range table (only for 4-byte input with MBCS_OPTION_GB18030). |
|      * cr[0] receives the result of writing the output, or an "unmappable" |
|      * result for the input length if neither step produced a mapping. |
|      */ |
|     if(sharedData.mbcs.extIndexes!=null && |
|             initialMatchToU(length, source, target, offsets, sourceIndex, flush, cr)) { |
|         return 0; /* an extension mapping handled the input */ |
|     } |
| |
|     /* GB 18030 */ |
|     if(length==4 && (options&MBCS_OPTION_GB18030)!=0) { |
|         long linear = LINEAR_18030(toUBytesArray[0], toUBytesArray[1], toUBytesArray[2], toUBytesArray[3]); |
| |
|         /* |
|          * Visit every row of the range table. gb18030Ranges.length is already the |
|          * number of rows (it must not be divided by the row length), and the row |
|          * is fetched inside the loop so that no index beyond the last row is read. |
|          */ |
|         for(int i=0; i<gb18030Ranges.length; ++i) { |
|             long[] range = gb18030Ranges[i]; |
|             if(range[2]<=linear && linear<=range[3]) { |
|                 /* found the sequence, output the Unicode code point for it */ |
| |
|                 /* add the linear difference between the input and start sequences to the start code point */ |
|                 linear = range[0]+(linear-range[2]); |
| |
|                 /* output this code point */ |
|                 cr[0] = toUWriteCodePoint((int)linear, target, offsets, sourceIndex); |
| |
|                 return 0; |
|             } |
|         } |
|     } |
| |
|     /* no mapping */ |
|     cr[0] = CoderResult.unmappableForLength(length); |
|     return length; |
| } |
| |
| /* |
| * target<targetLimit; set error code for overflow |
| */ |
| protected boolean initialMatchToU(int firstLength, ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex, boolean flush, CoderResult[] cr) |
| { |
| int[] value = new int[1]; |
| int match = 0; |
| |
| /* try to match */ |
| match = matchToU((byte)SISO_STATE(sharedData, mode), toUBytesArray, toUBytesBegin, firstLength, source, value, flush); |
| if(match>0) { |
| /* advance src pointer for the consumed input */ |
| source.position(source.position()+match-firstLength); |
| |
| /* write result to target */ |
| cr[0] = writeToU(value[0], target, offsets, srcIndex); |
| return true; |
| } |
| else if(match<0) { |
| /* save state for partial match */ |
| byte[] sArray; |
| int sArrayIndex; |
| int j; |
| |
| /* copy the first code point */ |
| sArray = toUBytesArray; |
| sArrayIndex = toUBytesBegin; |
| preToUFirstLength = (byte)firstLength; |
| for(j=0; j<firstLength; ++j) { |
| preToUArray[j]=sArray[sArrayIndex++]; |
| } |
| |
| /* now copy the newly consumed input */ |
| sArrayIndex = source.position(); |
| match =- match; |
| for(; j<match; ++j) { |
| preToUArray[j] = source.get(sArrayIndex++); |
| } |
| source.position(sArrayIndex); |
| preToULength=(byte)match; |
| return true; |
| } |
| else /* match==0 no match */ { |
| return false; |
| } |
| } |
| |
| /* |
| * This version of cnvMBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages |
| * that only map to and from the BMP. |
| * In addition to single-byte optimizations, the offset calculations |
| * become much easier. |
| */ |
| protected CoderResult cnvMBCSSingleToBMPWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) |
| { |
| CoderResult[] cr = {CoderResult.UNDERFLOW}; |
| |
| int sourceArrayIndex, lastSource; |
| int targetCapacity, length; |
| int[][] stateTable; |
| |
| int sourceIndex; |
| |
| int entry; |
| byte action; |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = source.position(); |
| targetCapacity = target.remaining(); |
| |
| if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { |
| stateTable = sharedData.mbcs.swapLFNLStateTable; |
| } |
| else { |
| stateTable = sharedData.mbcs.stateTable; |
| } |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = 0; |
| lastSource = sourceArrayIndex; |
| |
| /* |
| * since the conversion here is 1:1 UChar:uint8_t, we need only one counter |
| * for the minimum of the sourceLength and targetCapacity |
| */ |
| length = source.remaining(); |
| if(length<targetCapacity) { |
| targetCapacity=length; |
| } |
| |
| /* conversion loop */ |
| while(targetCapacity>0) { |
| entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK]; |
| /* MBCS_ENTRY_IS_FINAL(entry) */ |
| |
| /* test the most common case first */ |
| if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { |
| /* output BMP code point */ |
| target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| --targetCapacity; |
| continue; |
| } |
| |
| /* |
| * An if-else-if chain provides more reliable performance for |
| * the most common cases compared to a switch. |
| */ |
| action = (byte)(MBCS_ENTRY_FINAL_ACTION(entry)); |
| if(action==MBCS_STATE_FALLBACK_DIRECT_16) { |
| if(isToUUseFallback()) { |
| /* output BMP code point */ |
| target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| --targetCapacity; |
| continue; |
| } |
| } |
| else if(action==MBCS_STATE_UNASSIGNED) { |
| /* just fall through */ |
| } |
| else if(action==MBCS_STATE_ILLEGAL) { |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(sourceArrayIndex-lastSource); |
| } else { |
| /* reserved, must never occur */ |
| continue; |
| } |
| |
| /* set offsets since the start or the last extension */ |
| if(offsets!=null) { |
| int count = sourceArrayIndex-lastSource; |
| |
| /* predecrement: do not set the offset for the callback-causing character */ |
| while(--count>0) { |
| offsets.put(sourceIndex++); |
| } |
| /* offset and sourceIndex are now set for the current character */ |
| } |
| |
| if(cr[0].isError()) { |
| /* callback(illegal) */ |
| break; |
| } |
| else /* unassigned sequences indicated with byteIndex>0 */ { |
| /* try an extension mapping */ |
| lastSource = sourceArrayIndex; |
| toUBytesArray[0]=source.get(sourceArrayIndex-1); |
| source.position(sourceArrayIndex); |
| toULength = toU((byte)1, source, target, offsets, sourceIndex, flush, cr); |
| sourceArrayIndex = source.position(); |
| sourceIndex += 1+(int)(sourceArrayIndex-lastSource); |
| |
| if(cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| break; |
| } |
| |
| /* recalculate the targetCapacity after an extension mapping */ |
| targetCapacity = target.remaining(); |
| length = source.remaining(); |
| if(length<targetCapacity) { |
| targetCapacity = length; |
| } |
| } |
| } |
| |
| if(!cr[0].isError() && sourceArrayIndex<source.capacity() && !target.hasRemaining()) { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| |
| /* set offsets since the start or the last callback */ |
| if(offsets!=null) { |
| int count = sourceArrayIndex-lastSource; |
| while(count>0) { |
| offsets.put(sourceIndex++); |
| --count; |
| } |
| } |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| /* This version of cnvMBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ |
| protected CoderResult cnvMBCSSingleToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) |
| { |
| CoderResult[] cr = {CoderResult.UNDERFLOW}; |
| |
| int sourceArrayIndex; |
| int[][] stateTable; |
| |
| int sourceIndex; |
| |
| int entry; |
| char c; |
| byte action; |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = source.position(); |
| |
| if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { |
| stateTable = sharedData.mbcs.swapLFNLStateTable; |
| } |
| else { |
| stateTable = sharedData.mbcs.stateTable; |
| } |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = 0; |
| |
| /* conversion loop */ |
| while(sourceArrayIndex<source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. |
| * It does not catch output of more than one code unit that |
| * overflows as a result of a surrogate pair or callback output |
| * from the last source byte. |
| * Therefore, those situations also test for overflows and will |
| * then break the loop, too. |
| */ |
| if(!target.hasRemaining()) { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| |
| entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK]; |
| /* MBCS_ENTRY_IS_FINAL(entry) */ |
| |
| /* test the most common case first */ |
| if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { |
| /* output BMP code point */ |
| target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| |
| /* normal end of action codes: prepare for a new character */ |
| ++sourceIndex; |
| continue; |
| } |
| |
| /* |
| * An if-else-if chain provides more reliable performance for |
| * the most common cases compared to a switch. |
| */ |
| action = (byte)(MBCS_ENTRY_FINAL_ACTION(entry)); |
| if(action==MBCS_STATE_VALID_DIRECT_20 || |
| (action==MBCS_STATE_FALLBACK_DIRECT_20 && isToUUseFallback())) { |
| |
| entry = MBCS_ENTRY_FINAL_VALUE(entry); |
| /* output surrogate pair */ |
| target.put((char)(0xd800|(char)(entry>>>10))); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| c = (char)(0xdc00|(char)(entry&0x3ff)); |
| if(target.hasRemaining()) { |
| target.put(c); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| } |
| else { |
| /* target overflow */ |
| charErrorBufferArray[0]=c; |
| charErrorBufferLength=1; |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| |
| ++sourceIndex; |
| continue; |
| } |
| else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { |
| if(isToUUseFallback()) { |
| /* output BMP code point */ |
| target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| |
| ++sourceIndex; |
| continue; |
| } |
| } |
| else if(action==MBCS_STATE_UNASSIGNED) { |
| /* just fall through */ |
| } |
| else if(action==MBCS_STATE_ILLEGAL) { |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| } |
| else { |
| /* reserved, must never occur */ |
| ++sourceIndex; |
| continue; |
| } |
| |
| if(cr[0].isError()) { |
| /* callback(illegal) */ |
| break; |
| } |
| else /* unassigned sequences indicated with byteIndex>0 */ { |
| /* try an extension mapping */ |
| int sourceBeginIndex = sourceArrayIndex; |
| toUBytesArray[0] = source.get(sourceArrayIndex-1); |
| source.position(sourceArrayIndex); |
| toULength = toU((byte)1, source, target, offsets, sourceIndex, flush, cr); |
| sourceArrayIndex = source.position(); |
| sourceIndex += 1+(int)(sourceArrayIndex-sourceBeginIndex); |
| |
| if(cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| break; |
| } |
| } |
| } |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| protected int getFallback(UConverterMBCSTable mbcsTable, int offset) |
| { |
| MBCSToUFallback[] toUFallbacks; |
| int i, start, limit; |
| |
| limit = mbcsTable.countToUFallbacks; |
| if(limit>0) { |
| /* do a binary search for the fallback mapping */ |
| toUFallbacks = mbcsTable.toUFallbacks; |
| start = 0; |
| while(start<limit-1) { |
| i = (start+limit)/2; |
| if(offset<toUFallbacks[i].offset) { |
| limit = i; |
| } |
| else { |
| start = i; |
| } |
| } |
| |
| /* did we really find it? */ |
| if(offset==toUFallbacks[start].offset) { |
| return toUFallbacks[start].codePoint; |
| } |
| } |
| |
| return 0xfffe; |
| } |
| |
| } |
| |
| class CharsetEncoderMBCS extends CharsetEncoderICU{ |
| |
/**
 * Creates the encoder for the given charset, passing fromUSubstitution to the
 * superclass constructor, and then puts the encoder into its reset state.
 *
 * @param cs the charset this encoder belongs to
 */
CharsetEncoderMBCS(CharsetICU cs) {
    super(cs, fromUSubstitution);
    /* start without a pending partial match */
    this.implReset();
}
| |
| protected void implReset() { |
| super.implReset(); |
| preFromUFirstCP = UConverterConstants.U_SENTINEL; |
| } |
| |
| protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ |
| |
| CoderResult[] cr = {CoderResult.UNDERFLOW}; |
| |
| int sourceArrayIndex; |
| char[] table; |
| byte[] pArray, bytes; |
| int pArrayIndex, outputType, c; |
| int prevSourceIndex, sourceIndex, nextSourceIndex; |
| int stage2Entry, value, length, prevLength; |
| short unicodeMask; |
| |
| try{ |
| |
| if(preFromUFirstCP>=0) { |
| /* |
| * pass sourceIndex=-1 because we continue from an earlier buffer |
| * in the future, this may change with continuous offsets |
| */ |
| cr[0] = continueMatchFromU(source, target, offsets, flush, -1); |
| |
| if(cr[0].isError() || preFromULength<0) { |
| return cr[0]; |
| } |
| } |
| |
| /* use optimized function if possible */ |
| outputType = sharedData.mbcs.outputType; |
| unicodeMask = sharedData.mbcs.unicodeMask; |
| if(outputType==MBCS_OUTPUT_1 && (unicodeMask&UConverterConstants.HAS_SURROGATES) == 0) { |
| if((unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) { |
| cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush); |
| } else { |
| cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush); |
| } |
| return cr[0]; |
| } else if(outputType==MBCS_OUTPUT_2) { |
| cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush); |
| return cr[0]; |
| } |
| |
| table = sharedData.mbcs.fromUnicodeTable; |
| sourceArrayIndex = source.position(); |
| |
| if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { |
| bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes; |
| } else { |
| bytes = sharedData.mbcs.fromUnicodeBytes; |
| } |
| |
| /* get the converter state from UConverter */ |
| c = fromUChar32; |
| |
| if(outputType==MBCS_OUTPUT_2_SISO) { |
| prevLength=(int)fromUnicodeStatus; |
| if(prevLength==0) { |
| /* set the real value */ |
| prevLength=1; |
| } |
| } else { |
| /* prevent fromUnicodeStatus from being set to something non-0 */ |
| prevLength=0; |
| } |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| prevSourceIndex=-1; |
| sourceIndex= c==0 ? 0 : -1; |
| nextSourceIndex=0; |
| |
| /* conversion loop */ |
| /* |
| * This is another piece of ugly code: |
| * A goto into the loop if the converter state contains a first surrogate |
| * from the previous function call. |
| * It saves me to check in each loop iteration a check of if(c==0) |
| * and duplicating the trail-surrogate-handling code in the else |
| * branch of that check. |
| * I could not find any other way to get around this other than |
| * using a function call for the conversion and callback, which would |
| * be even more inefficient. |
| * |
| * Markus Scherer 2000-jul-19 |
| */ |
| boolean doloop = true; |
| if(c!=0 && target.hasRemaining()) { |
| SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength); |
| doloop = getTrail(source, target, unicodeMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| prevSourceIndex = x.prevSourceIndex; |
| prevLength = x.prevLength; |
| } |
| |
| if(doloop) { |
| while(sourceArrayIndex<source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. |
| * It does not catch output of more than one byte that |
| * overflows as a result of a multi-byte character or callback output |
| * from the last source character. |
| * Therefore, those situations also test for overflows and will |
| * then break the loop, too. |
| */ |
| if(target.hasRemaining()) { |
| /* |
| * Get a correct Unicode code point: |
| * a single UChar for a BMP code point or |
| * a matched surrogate pair for a "supplementary code point". |
| */ |
| c = source.get(sourceArrayIndex++); |
| ++nextSourceIndex; |
| /* |
| * This also tests if the codepage maps single surrogates. |
| * If it does, then surrogates are not paired but mapped separately. |
| * Note that in this case unmatched surrogates are not detected. |
| */ |
| if(UTF16.isSurrogate((char)c) && (unicodeMask&UConverterConstants.HAS_SURROGATES) == 0) { |
| if(UTF16.isLeadSurrogate((char)c)) { |
| //getTrail: |
| SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength); |
| doloop = getTrail(source, target, unicodeMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| prevSourceIndex = x.prevSourceIndex; |
| |
| if(doloop) |
| continue; |
| else |
| break; |
| } |
| else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| break; |
| } |
| } |
| |
| /* convert the Unicode code point in c into codepage bytes */ |
| |
| /* |
| * The basic lookup is a triple-stage compact array (trie) lookup. |
| * For details see the beginning of this file. |
| * |
| * Single-byte codepages are handled with a different data structure |
| * by _MBCSSingle... functions. |
| * |
| * The result consists of a 32-bit value from stage 2 and |
| * a pointer to as many bytes as are stored per character. |
| * The pointer points to the character's bytes in stage 3. |
| * Bits 15..0 of the stage 2 entry contain the stage 3 index |
| * for that pointer, while bits 31..16 are flags for which of |
| * the 16 characters in the block are roundtrip-assigned. |
| * |
| * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t |
| * respectively as uint32_t, in the platform encoding. |
| * For 3-byte codepages, the bytes are always stored in big-endian order. |
| * |
| * For EUC encodings that use only either 0x8e or 0x8f as the first |
| * byte of their longest byte sequences, the first two bytes in |
| * this third stage indicate with their 7th bits whether these bytes |
| * are to be written directly or actually need to be preceeded by |
| * one of the two Single-Shift codes. With this, the third stage |
| * stores one byte fewer per character than the actual maximum length of |
| * EUC byte sequences. |
| * |
| * Other than that, leading zero bytes are removed and the other |
| * bytes output. A single zero byte may be output if the "assigned" |
| * bit in stage 2 was on. |
| * The data structure does not support zero byte output as a fallback, |
| * and also does not allow output of leading zeros. |
| */ |
| stage2Entry=MBCS_STAGE_2_FROM_U(table, c); |
| |
| /* get the bytes and the length for the output */ |
| switch(outputType) { |
| case MBCS_OUTPUT_2: |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xff) { |
| length=1; |
| } |
| else { |
| length=2; |
| } |
| break; |
| case MBCS_OUTPUT_2_SISO: |
| /* 1/2-byte stateful with Shift-In/Shift-Out */ |
| /* |
| * Save the old state in the converter object |
| * right here, then change the local prevLength state variable if necessary. |
| * Then, if this character turns out to be unassigned or a fallback that |
| * is not taken, the callback code must not save the new state in the converter |
| * because the new state is for a character that is not output. |
| * However, the callback must still restore the state from the converter |
| * in case the callback function changed it for its output. |
| */ |
| fromUnicodeStatus=prevLength; /* save the old state */ |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xff) { |
| if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==false) { |
| /* no mapping, leave value==0 */ |
| length = 0; |
| } |
| else if(prevLength<=1) { |
| length = 1; |
| } |
| else { |
| /* change from double-byte mode to single-byte */ |
| value |= UConverterConstants.SI<<8; |
| length = 2; |
| prevLength = 1; |
| } |
| } |
| else { |
| if(prevLength==2) { |
| length = 2; |
| } |
| else { |
| /* change from single-byte mode to double-byte */ |
| value |= UConverterConstants.SO<<16; |
| length = 3; |
| prevLength = 2; |
| } |
| } |
| break; |
| case MBCS_OUTPUT_DBCS_ONLY: |
| /* table with single-byte results, but only DBCS mappings used */ |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xff) { |
| /* no mapping or SBCS result, not taken for DBCS-only */ |
| value = stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ |
| length = 0; |
| } else { |
| length = 2; |
| } |
| break; |
| case MBCS_OUTPUT_3: |
| pArray = bytes; |
| pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); |
| value = ((pArray[pArrayIndex]&UConverterConstants.UNSIGNED_BYTE_MASK)<<16)|((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK); |
| if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xff) { |
| length = 1; |
| } |
| else if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xffff) { |
| length = 2; |
| } |
| else { |
| length = 3; |
| } |
| break; |
| case MBCS_OUTPUT_4: |
| value = MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); |
| if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xff) { |
| length = 1; |
| } |
| else if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xffff) { |
| length = 2; |
| } |
| else if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xffffff) { |
| length = 3; |
| } |
| else { |
| length = 4; |
| } |
| break; |
| case MBCS_OUTPUT_3_EUC: |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| /* EUC 16-bit fixed-length representation */ |
| if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xff) { |
| length = 1; |
| } |
| else if((value&0x8000)==0) { |
| value |= 0x8e8000; |
| length = 3; |
| } |
| else if((value&0x80)==0) { |
| value |= 0x8f0080; |
| length = 3; |
| } |
| else { |
| length = 2; |
| } |
| break; |
| case MBCS_OUTPUT_4_EUC: |
| pArray = bytes; |
| pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); |
| value = ((pArray[pArrayIndex]&UConverterConstants.UNSIGNED_BYTE_MASK)<<16)|((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK); |
| /* EUC 16-bit fixed-length representation applied to the first two bytes */ |
| if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xff) { |
| length = 1; |
| } |
| else if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xffff) { |
| length = 2; |
| } |
| else if((value&0x800000)==0) { |
| value |= 0x8e800000; |
| length = 4; |
| } |
| else if((value&0x8000)==0) { |
| value |= 0x8f008000; |
| length = 4; |
| } |
| else { |
| length = 3; |
| } |
| break; |
| default: |
| /* must not occur */ |
| /* |
| * To avoid compiler warnings that value & length may be |
| * used without having been initialized, we set them here. |
| * In reality, this is unreachable code. |
| * Not having a default branch also causes warnings with |
| * some compilers. |
| */ |
| value = stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ |
| length = 0; |
| break; |
| } |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value!=0))) { |
| /* |
| * We allow a 0 byte output if the "assigned" bit is set for this entry. |
| * There is no way with this data structure for fallback output |
| * to be a zero byte. |
| */ |
| |
| //unassigned: |
| SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength); |
| doloop = unassigned(source, target, offsets, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| prevSourceIndex = x.prevSourceIndex; |
| prevLength = x.prevLength; |
| if(doloop) |
| continue; |
| else |
| break; |
| } |
| |
| /* write the output character bytes from value and length */ |
| /* from the first if in the loop we know that targetCapacity>0 */ |
| if(length<=target.remaining()) { |
| if(offsets==null) { |
| switch(length) { |
| /* each branch falls through to the next one */ |
| case 4: |
| target.put((byte)(value>>>24)); |
| case 3: |
| target.put((byte)(value>>>16)); |
| case 2: |
| target.put((byte)(value>>>8)); |
| case 1: |
| target.put((byte)value); |
| default: |
| /* will never occur */ |
| break; |
| } |
| } |
| else { |
| switch(length) { |
| /* each branch falls through to the next one */ |
| case 4: |
| target.put((byte)(value>>>24)); |
| offsets.put(sourceIndex); |
| case 3: |
| target.put((byte)(value>>>16)); |
| offsets.put(sourceIndex); |
| case 2: |
| target.put((byte)(value>>>8)); |
| offsets.put(sourceIndex); |
| case 1: |
| target.put((byte)value); |
| offsets.put(sourceIndex); |
| default: |
| /* will never occur */ |
| break; |
| } |
| } |
| } |
| else { |
| int errorBufferArrayIndex; |
| |
| /* |
| * We actually do this backwards here: |
| * In order to save an intermediate variable, we output |
| * first to the overflow buffer what does not fit into the |
| * regular target. |
| */ |
| /* we know that 1<=targetCapacity<length<=4 */ |
| length -= target.remaining(); |
| |
| errorBufferArrayIndex = 0; |
| switch(length) { |
| /* each branch falls through to the next one */ |
| case 3: |
| errorBuffer[errorBufferArrayIndex++]=(byte)(value>>>16); |
| case 2: |
| errorBuffer[errorBufferArrayIndex++]=(byte)(value>>>8); |
| case 1: |
| errorBuffer[errorBufferArrayIndex]=(byte)value; |
| default: |
| /* will never occur */ |
| break; |
| } |
| errorBufferLength = (byte)length; |
| |
| /* now output what fits into the regular target */ |
| value>>>=8*length; /* length was reduced by targetCapacity */ |
| switch(target.remaining()) { |
| /* each branch falls through to the next one */ |
| case 3: |
| target.put((byte)(value>>>16)); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| case 2: |
| target.put((byte)(value>>>8)); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| case 1: |
| target.put((byte)value); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| default: |
| /* will never occur */ |
| break; |
| } |
| |
| /* target overflow */ |
| cr[0] = CoderResult.OVERFLOW; |
| c=0; |
| break; |
| } |
| |
| /* normal end of conversion: prepare for a new character */ |
| c=0; |
| if(offsets!=null) { |
| prevSourceIndex=sourceIndex; |
| sourceIndex=nextSourceIndex; |
| } |
| continue; |
| } |
| else { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| } |
| |
| /* |
| * the end of the input stream and detection of truncated input |
| * are handled by the framework, but for EBCDIC_STATEFUL conversion |
| * we need to emit an SI at the very end |
| * |
| * conditions: |
| * successful |
| * EBCDIC_STATEFUL in DBCS mode |
| * end of input and no truncated input |
| */ |
| if(outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && |
| flush && sourceArrayIndex>=source.limit() && c==0){ |
| |
| /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ |
| if(target.hasRemaining()) { |
| target.put((byte)UConverterConstants.SI); |
| if(offsets!=null) { |
| /* set the last source character's index (sourceIndex points at sourceLimit now) */ |
| offsets.put(prevSourceIndex); |
| } |
| } |
| else { |
| /* target is full */ |
| errorBuffer[0]=(byte)UConverterConstants.SI; |
| errorBufferLength=1; |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| prevLength=1; /* we switched into SBCS */ |
| } |
| |
| /* set the converter state back into UConverter */ |
| fromUChar32=c; |
| fromUnicodeStatus=prevLength; |
| |
| source.position(sourceArrayIndex); |
| } |
| catch(BufferOverflowException ex){ |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| |
| return cr[0]; |
| } |
| |
| /* |
| * continue partial match with new input, requires cnv->preFromUFirstCP>=0 |
| * never called for simple, single-character conversion |
| */ |
| protected CoderResult continueMatchFromU(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush, int srcIndex) |
| { |
| CoderResult cr = CoderResult.UNDERFLOW; |
| int[] value = new int[1]; |
| int match; |
| |
| match = matchFromU(preFromUFirstCP, preFromUArray, preFromUBegin, preFromULength, source, target, value, flush); |
| if(match>=2) { |
| match-=2; /* remove 2 for the initial code point */ |
| |
| if(match>=preFromULength) { |
| /* advance src pointer for the consumed input */ |
| source.position(source.position()+match-preFromULength); |
| preFromULength=0; |
| } else { |
| /* the match did not use all of preFromU[] - keep the rest for replay */ |
| int length = preFromULength-match; |
| System.arraycopy(preFromUArray, preFromUBegin+match, preFromUArray, preFromUBegin, length); |
| preFromULength=(byte)-length; |
| } |
| |
| /* finish the partial match */ |
| preFromUFirstCP = UConverterConstants.U_SENTINEL; |
| |
| /* write result */ |
| writeFromU(value[0], target, offsets, srcIndex); |
| } |
| else if(match<0) { |
| /* save state for partial match */ |
| int sArrayIndex; |
| int j; |
| |
| /* just _append_ the newly consumed input to preFromU[] */ |
| sArrayIndex = source.position(); |
| match =- match-2; /* remove 2 for the initial code point */ |
| for(j=preFromULength; j<match; ++j) { |
| preFromUArray[j]=source.get(sArrayIndex++); |
| } |
| source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ |
| preFromULength=(byte)match; |
| } |
| else /* match==0 or 1 */ { |
| /* |
| * no match |
| * |
| * We need to split the previous input into two parts: |
| * |
| * 1. The first code point is unmappable - that's how we got into |
| * trying the extension data in the first place. |
| * We need to move it from the preFromU buffer |
| * to the error buffer, set an error code, |
| * and prepare the rest of the previous input for 2. |
| * |
| * 2. The rest of the previous input must be converted once we |
| * come back from the callback for the first code point. |
| * At that time, we have to try again from scratch to convert |
| * these input characters. |
| * The replay will be handled by the ucnv.c conversion code. |
| */ |
| |
| if(match==1) { |
| /* matched, no mapping but request for <subchar1> */ |
| useSubChar1=true; |
| } |
| |
| /* move the first code point to the error field */ |
| fromUChar32 = preFromUFirstCP; |
| preFromUFirstCP = UConverterConstants.U_SENTINEL; |
| |
| /* mark preFromU for replay */ |
| preFromULength = (byte) - preFromULength; |
| |
| /* set the error code for unassigned */ |
| cr = CoderResult.unmappableForLength(source.position()); |
| } |
| return cr; |
| } |
| |
| /* |
| * @param cx pointer to extension data; if NULL, returns 0 |
| * @param firstCP the first code point before all the other UChars |
| * @param pre UChars that must match; !initialMatch: partial match with them |
| * @param preLength length of pre, >=0 |
| * @param src UChars that can be used to complete a match |
| * @param srcLength length of src, >=0 |
| * @param pMatchValue [out] output result value for the match from the data structure |
| * @param useFallback "use fallback" flag, usually from cnv->useFallback |
| * @param flush TRUE if the end of the input stream is reached |
| * @return >1: matched, return value=total match length (number of input units matched) |
| * 1: matched, no mapping but request for <subchar1> |
| * (only for the first code point) |
| * 0: no match |
| * <0: partial match, return value=negative total match length |
| * (partial matches are never returned for flush==TRUE) |
| * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) |
| * the matchLength is 2 if only firstCP matched, and >2 if firstCP and |
| * further code units matched |
| */ |
| //static int32_t ucnv_extMatchFromU(const int32_t *cx, UChar32 firstCP, const UChar *pre, int32_t preLength, const UChar *src, int32_t srcLength, uint32_t *pMatchValue, UBool useFallback, UBool flush) |
| protected int matchFromU(int firstCP, char[] preArray, int preArrayBegin, int preLength, CharBuffer source, ByteBuffer target, int[] pMatchValue, boolean flush) |
| { |
| ByteBuffer cx = sharedData.mbcs.extIndexes; |
| |
| CharBuffer stage12, stage3; |
| IntBuffer stage3b; |
| |
| CharBuffer fromUTableUChars, fromUSectionUChars; |
| IntBuffer fromUTableValues, fromUSectionValues; |
| |
| int value, matchValue; |
| int i, j, index, length, matchLength; |
| char c; |
| |
| if(cx==null) { |
| return 0; /* no extension data, no match */ |
| } |
| |
| /* trie lookup of firstCP */ |
| index=firstCP>>>10; /* stage 1 index */ |
| if(index>=cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH)) { |
| return 0; /* the first code point is outside the trie */ |
| } |
| |
| stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX, char.class); |
| stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX, char.class); |
| index = FROM_U(stage12, stage3, index, firstCP); |
| |
| stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX, int.class); |
| value = stage3b.get(stage3b.position() + index); |
| if(value==0) { |
| return 0; |
| } |
| |
| if(TO_U_IS_PARTIAL(value)) { |
| /* partial match, enter the loop below */ |
| index = FROM_U_GET_PARTIAL_INDEX(value); |
| |
| /* initialize */ |
| fromUTableUChars = (CharBuffer)ARRAY(cx, EXT_FROM_U_UCHARS_INDEX, char.class); |
| fromUTableValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX, int.class); |
| |
| matchValue=0; |
| i=j=matchLength=0; |
| |
| /* we must not remember fallback matches when not using fallbacks */ |
| |
| /* match input units until there is a full match or the input is consumed */ |
| for(;;) { |
| /* go to the next section */ |
| int oldpos = fromUTableUChars.position(); |
| fromUSectionUChars = ((CharBuffer)fromUTableUChars.position(index)).slice(); |
| fromUTableUChars.position(oldpos); |
| oldpos = fromUTableValues.position(); |
| fromUSectionValues = ((IntBuffer)fromUTableValues.position(index)).slice(); |
| fromUTableValues.position(oldpos); |
| |
| /* read first pair of the section */ |
| length = fromUSectionUChars.get(); |
| value = fromUSectionValues.get(); |
| if( value!=0 && |
| (FROM_U_IS_ROUNDTRIP(value) || |
| isFromUUseFallback(firstCP)) |
| ) { |
| /* remember longest match so far */ |
| matchValue = value; |
| matchLength = 2+i+j; |
| } |
| |
| /* match pre[] then src[] */ |
| if(i<preLength) { |
| c = preArray[preArrayBegin + i++]; |
| } else if(j<source.remaining()) { |
| c = source.get(source.position() + j++); |
| } else { |
| /* all input consumed, partial match */ |
| if(flush || (length=(i+j))>MAX_UCHARS) { |
| /* |
| * end of the entire input stream, stop with the longest match so far |
| * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS |
| * because it must fit into state buffers |
| */ |
| break; |
| } else { |
| /* continue with more input next time */ |
| return -(2+length); |
| } |
| } |
| |
| /* search for the current UChar */ |
| index = findFromU(fromUSectionUChars, length, c); |
| if(index<0) { |
| /* no match here, stop with the longest match so far */ |
| break; |
| } else { |
| value = fromUSectionValues.get(fromUSectionValues.position() + index); |
| if(FROM_U_IS_PARTIAL(value)) { |
| /* partial match, continue */ |
| index = FROM_U_GET_PARTIAL_INDEX(value); |
| } else { |
| if( FROM_U_IS_ROUNDTRIP(value) || |
| isFromUUseFallback(firstCP) |
| ) { |
| /* full match, stop with result */ |
| matchValue=value; |
| matchLength=2+i+j; |
| } else { |
| /* full match on fallback not taken, stop with the longest match so far */ |
| } |
| break; |
| } |
| } |
| } |
| |
| if(matchLength==0) { |
| /* no match at all */ |
| return 0; |
| } |
| } else /* result from firstCP trie lookup */ { |
| if( FROM_U_IS_ROUNDTRIP(value) || |
| isFromUUseFallback(firstCP) |
| ) { |
| /* full match, stop with result */ |
| matchValue=value; |
| matchLength=2; |
| } else { |
| /* fallback not taken */ |
| return 0; |
| } |
| } |
| |
| if((matchValue&FROM_U_RESERVED_MASK) != 0) { |
| /* do not interpret values with reserved bits used, for forward compatibility */ |
| return 0; |
| } |
| |
| /* return result */ |
| if(matchValue==FROM_U_SUBCHAR1) { |
| return 1; /* assert matchLength==2 */ |
| } |
| |
| pMatchValue[0]=FROM_U_MASK_ROUNDTRIP(matchValue); |
| return matchLength; |
| } |
| |
| protected CoderResult writeFromU(int value, ByteBuffer target, IntBuffer offsets, int srcIndex) |
| { |
| ByteBuffer cx = sharedData.mbcs.extIndexes; |
| |
| byte bufferArray[] = new byte[1+MAX_BYTES]; |
| int bufferArrayIndex = 0; |
| byte[] resultArray; |
| int resultArrayIndex; |
| int length, prevLength; |
| |
| length = FROM_U_GET_LENGTH(value); |
| value = FROM_U_GET_DATA(value); |
| |
| /* output the result */ |
| if(length<=FROM_U_MAX_DIRECT_LENGTH) { |
| /* |
| * Generate a byte array and then write it below. |
| * This is not the fastest possible way, but it should be ok for |
| * extension mappings, and it is much simpler. |
| * Offset and overflow handling are only done once this way. |
| */ |
| int p = bufferArrayIndex+1; /* reserve buffer[0] for shiftByte below */ |
| switch(length) { |
| case 3: |
| bufferArray[p++] = (byte)(value>>>16); |
| case 2: |
| bufferArray[p++] = (byte)(value>>>8); |
| case 1: |
| bufferArray[p++] = (byte)value; |
| default: |
| break; /* will never occur */ |
| } |
| resultArray = bufferArray; |
| resultArrayIndex = bufferArrayIndex+1; |
| } |
| else { |
| byte[] slice = new byte[length]; |
| |
| ByteBuffer bb = ((ByteBuffer)ARRAY(cx, EXT_FROM_U_BYTES_INDEX, byte.class)); |
| bb.position(value); |
| bb.get(slice, 0, slice.length); |
| |
| resultArray = slice; |
| resultArrayIndex = 0; |
| } |
| |
| /* with correct data we have length>0 */ |
| |
| if((prevLength=(int)fromUnicodeStatus)!=0) { |
| /* handle SI/SO stateful output */ |
| byte shiftByte; |
| |
| if(prevLength>1 && length==1) { |
| /* change from double-byte mode to single-byte */ |
| shiftByte = (byte)UConverterConstants.SI; |
| fromUnicodeStatus = 1; |
| } |
| else if(prevLength==1 && length>1) { |
| /* change from single-byte mode to double-byte */ |
| shiftByte = (byte)UConverterConstants.SO; |
| fromUnicodeStatus = 2; |
| } |
| else { |
| shiftByte = 0; |
| } |
| |
| if(shiftByte!=0) { |
| /* prepend the shift byte to the result bytes */ |
| bufferArray[0] = shiftByte; |
| if(resultArray!=bufferArray || resultArrayIndex!=bufferArrayIndex+1) { |
| System.arraycopy(resultArray, resultArrayIndex, bufferArray, bufferArrayIndex+1, length); |
| } |
| resultArray = bufferArray; |
| resultArrayIndex = bufferArrayIndex; |
| ++length; |
| } |
| } |
| |
| return fromUWriteBytes(this, resultArray, resultArrayIndex, length, target, offsets, srcIndex); |
| } |
| |
| /* |
| * @return if(U_FAILURE) return the code point for cnv->fromUChar32 |
| * else return 0 after output has been written to the target |
| */ |
| protected int fromU(int cp_, CharBuffer source, ByteBuffer target, IntBuffer offsets, int sourceIndex, boolean flush, CoderResult[] cr) |
| { |
| //ByteBuffer cx; |
| long cp = cp_ & UConverterConstants.UNSIGNED_INT_MASK; |
| |
| useSubChar1=false; |
| |
| if( sharedData.mbcs.extIndexes!=null && initialMatchFromU((int)cp, source, target, offsets, sourceIndex, flush, cr)) { |
| return 0; /* an extension mapping handled the input */ |
| } |
| |
| /* GB 18030 */ |
| if((options&MBCS_OPTION_GB18030)!=0) { |
| long[] range; |
| int i; |
| |
| for(i=0; i<gb18030Ranges.length; ++i) { |
| range=gb18030Ranges[i]; |
| if(range[0]<=cp && cp<=range[1]) { |
| /* found the Unicode code point, output the four-byte sequence for it */ |
| long linear; |
| byte bytes[] = new byte[4]; |
| |
| /* get the linear value of the first GB 18030 code in this range */ |
| linear=range[2]-LINEAR_18030_BASE; |
| |
| /* add the offset from the beginning of the range */ |
| linear+=(cp-range[0]); |
| |
| bytes[3]=(byte)(0x30+linear%10); linear/=10; |
| bytes[2]=(byte)(0x81+linear%126); linear/=126; |
| bytes[1]=(byte)(0x30+linear%10); linear/=10; |
| bytes[0]=(byte)(0x81+linear); |
| |
| /* output this sequence */ |
| cr[0] = fromUWriteBytes(this, bytes, 0, 4, target, offsets, sourceIndex); |
| return 0; |
| } |
| } |
| } |
| |
| /* no mapping */ |
| cr[0] = CoderResult.unmappableForLength(1); |
| return (int)cp; |
| } |
| |
| /* |
| * target<targetLimit; set error code for overflow |
| */ |
| protected boolean initialMatchFromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets, int srcIndex, boolean flush, CoderResult[] cr) |
| { |
| int[] value = new int[1]; |
| int match; |
| |
| /* try to match */ |
| match = matchFromU(cp, null, 0, 0, source, target, value, flush); |
| |
| /* reject a match if the result is a single byte for DBCS-only */ |
| if( match>=2 && |
| !(FROM_U_GET_LENGTH(value[0])==1 && |
| sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) |
| ) { |
| /* advance src pointer for the consumed input */ |
| source.position(source.position()+match-2); /* remove 2 for the initial code point */ |
| |
| /* write result to target */ |
| cr[0] = writeFromU(value[0], target, offsets, srcIndex); |
| return true; |
| } else if(match<0) { |
| /* save state for partial match */ |
| int sArrayIndex; |
| int j; |
| |
| /* copy the first code point */ |
| preFromUFirstCP=cp; |
| |
| /* now copy the newly consumed input */ |
| sArrayIndex = source.position(); |
| match =- match-2; /* remove 2 for the initial code point */ |
| for(j=0; j<match; ++j) { |
| preFromUArray[j]=source.get(sArrayIndex++); |
| } |
| source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ |
| preFromULength=(byte)match; |
| return true; |
| } else if(match==1) { |
| /* matched, no mapping but request for <subchar1> */ |
| useSubChar1=true; |
| return false; |
| } else /* match==0 no match */ { |
| return false; |
| } |
| } |
| |
| /* |
| * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages |
| * that map only to and from the BMP. |
| * In addition to single-byte/state optimizations, the offset calculations |
| * become much easier. |
| */ |
| protected CoderResult cnvMBCSSingleFromBMPWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ |
| |
| CoderResult[] cr = {CoderResult.UNDERFLOW}; |
| |
| int sourceArrayIndex, lastSource; |
| int targetCapacity, length; |
| char[] table; |
| byte[] results; |
| |
| int c, sourceIndex; |
| char value, minValue; |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = source.position(); |
| targetCapacity = target.remaining(); |
| table = sharedData.mbcs.fromUnicodeTable; |
| |
| if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { |
| results = sharedData.mbcs.swapLFNLFromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it? |
| } |
| else { |
| results = sharedData.mbcs.fromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it? |
| } |
| |
| if(useFallback) { |
| /* use all roundtrip and fallback results */ |
| minValue = 0x800; |
| } |
| else { |
| /* use only roundtrips and fallbacks from private-use characters */ |
| minValue = 0xc00; |
| } |
| |
| /* get the converter state from UConverter */ |
| c = fromUChar32; |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex = c==0 ? 0 : -1; |
| lastSource = sourceArrayIndex; |
| |
| /* |
| * since the conversion here is 1:1 UChar:uint8_t, we need only one counter |
| * for the minimum of the sourceLength and targetCapacity |
| */ |
| length = source.limit()-sourceArrayIndex; |
| if(length<targetCapacity) { |
| targetCapacity=length; |
| } |
| |
| boolean doloop = true; |
| if(c!=0 && targetCapacity>0) { |
| SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex); |
| doloop = getTrailSingleBMP(source, x, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| } |
| |
| if(doloop) { |
| while(targetCapacity>0) { |
| /* |
| * Get a correct Unicode code point: |
| * a single UChar for a BMP code point or |
| * a matched surrogate pair for a "supplementary code point". |
| */ |
| c = source.get(sourceArrayIndex++); |
| /* |
| * Do not immediately check for single surrogates: |
| * Assume that they are unassigned and check for them in that case. |
| * This speeds up the conversion of assigned characters. |
| */ |
| /* convert the Unicode code point in c into codepage bytes */ |
| value = MBCS_SINGLE_RESULT_FROM_U(table, results, c); |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if(value>=minValue) { |
| /* assigned, write the output character bytes from value and length */ |
| /* length==1 */ |
| /* this is easy because we know that there is enough space */ |
| target.put((byte)value); |
| --targetCapacity; |
| |
| /* normal end of conversion: prepare for a new character */ |
| c=0; |
| continue; |
| } |
| else if(!UTF16.isSurrogate((char)c)) { |
| /* normal, unassigned BMP character */ |
| } |
| else if(UTF16.isLeadSurrogate((char)c)) { |
| //getTrail: |
| SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex); |
| doloop = getTrailSingleBMP(source, x, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| if(!doloop) |
| break; |
| } |
| else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| break; |
| } |
| |
| /* c does not have a mapping */ |
| |
| /* get the number of code units for c to correctly advance sourceIndex */ |
| length = UTF16.getCharCount(c); |
| |
| /* set offsets since the start or the last extension */ |
| if(offsets!=null) { |
| int count = sourceArrayIndex-lastSource; |
| |
| /* do not set the offset for this character */ |
| count -= length; |
| |
| while(count>0) { |
| offsets.put(sourceIndex++); |
| --count; |
| } |
| /* offsets and sourceIndex are now set for the current character */ |
| } |
| |
| /* try an extension mapping */ |
| lastSource = sourceArrayIndex; |
| source.position(sourceArrayIndex); |
| c = fromU(c, source, target, offsets, sourceIndex, flush, cr); |
| sourceArrayIndex = source.position(); |
| sourceIndex += length+(sourceArrayIndex-lastSource); |
| lastSource = sourceArrayIndex; |
| |
| if(cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| break; |
| } else { |
| /* a mapping was written to the target, continue */ |
| |
| /* recalculate the targetCapacity after an extension mapping */ |
| targetCapacity = target.remaining(); |
| length = source.limit() - sourceArrayIndex; |
| if(length<targetCapacity) { |
| targetCapacity=length; |
| } |
| } |
| } |
| } |
| |
| if(sourceArrayIndex<source.limit() && !target.hasRemaining()) { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| } |
| |
| /* set offsets since the start or the last callback */ |
| if(offsets!=null) { |
| int count = sourceArrayIndex-lastSource; |
| while(count>0) { |
| offsets.put(sourceIndex++); |
| --count; |
| } |
| } |
| |
| /* set the converter state back into UConverter */ |
| fromUChar32=c; |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ |
| protected CoderResult cnvMBCSSingleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ |
| |
| CoderResult[] cr = {CoderResult.UNDERFLOW}; |
| |
| int sourceArrayIndex; |
| |
| char[] table; |
| byte[] results; //agljport:comment results is used to to get 16-bit values out of byte[] array |
| |
| int c; |
| int sourceIndex, nextSourceIndex; |
| |
| char value, minValue; |
| |
| /* set up the local pointers */ |
| short unicodeMask; |
| sourceArrayIndex = source.position(); |
| |
| table = sharedData.mbcs.fromUnicodeTable; |
| |
| if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { |
| results = sharedData.mbcs.swapLFNLFromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it? |
| } |
| else { |
| results = sharedData.mbcs.fromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it? |
| } |
| |
| if(useFallback) { |
| /* use all roundtrip and fallback results */ |
| minValue = 0x800; |
| } |
| else { |
| /* use only roundtrips and fallbacks from private-use characters */ |
| minValue = 0xc00; |
| } |
| //agljport:comment hasSupplementary only used in getTrail block which now simply repeats the mask operation |
| unicodeMask = sharedData.mbcs.unicodeMask; |
| |
| /* get the converter state from UConverter */ |
| c = fromUChar32; |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex= c==0 ? 0 : -1; |
| nextSourceIndex=0; |
| |
| boolean doloop = true; |
| if(c!=0 && target.hasRemaining()) { |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); |
| doloop = getTrailDouble(source, target, unicodeMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| } |
| |
| if(doloop) { |
| while(sourceArrayIndex<source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. |
| * It does not catch output of more than one byte that |
| * overflows as a result of a multi-byte character or callback output |
| * from the last source character. |
| * Therefore, those situations also test for overflows and will |
| * then break the loop, too. |
| */ |
| if(target.hasRemaining()) { |
| /* |
| * Get a correct Unicode code point: |
| * a single UChar for a BMP code point or |
| * a matched surrogate pair for a "supplementary code point". |
| */ |
| c = source.get(sourceArrayIndex++); |
| ++nextSourceIndex; |
| if(UTF16.isSurrogate((char)c)) { |
| if(UTF16.isLeadSurrogate((char)c)) { |
| //getTrail: |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); |
| doloop = getTrailDouble(source, target, unicodeMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| if(doloop) |
| continue; |
| else |
| break; |
| } |
| else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| break; |
| } |
| } |
| |
| /* convert the Unicode code point in c into codepage bytes */ |
| value = MBCS_SINGLE_RESULT_FROM_U(table, results, c); |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if(value>=minValue) { |
| /* assigned, write the output character bytes from value and length */ |
| /* length==1 */ |
| /* this is easy because we know that there is enough space */ |
| target.put((byte)value); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| |
| /* normal end of conversion: prepare for a new character */ |
| c=0; |
| sourceIndex = nextSourceIndex; |
| } |
| else { /* unassigned */ |
| /* try an extension mapping */ |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); |
| doloop = unassignedDouble(source, target, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| if(!doloop) |
| break; |
| } |
| } |
| else { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| } |
| |
| /* set the converter state back into UConverter */ |
| fromUChar32=c; |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ |
| protected CoderResult cnvMBCSDoubleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ |
| CoderResult[] cr = {CoderResult.UNDERFLOW}; |
| |
| int sourceArrayIndex; |
| |
| char[] table; |
| byte[] bytes; |
| |
| int c, sourceIndex, nextSourceIndex; |
| |
| int stage2Entry; |
| int value; |
| int length; |
| short unicodeMask; |
| |
| /* use optimized function if possible */ |
| unicodeMask = sharedData.mbcs.unicodeMask; |
| |
| /* set up the local pointers */ |
| sourceArrayIndex = source.position(); |
| |
| table = sharedData.mbcs.fromUnicodeTable; |
| |
| if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { |
| bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes; |
| } else { |
| bytes = sharedData.mbcs.fromUnicodeBytes; |
| } |
| |
| /* get the converter state from UConverter */ |
| c = fromUChar32; |
| |
| /* sourceIndex=-1 if the current character began in the previous buffer */ |
| sourceIndex= c==0 ? 0 : -1; |
| nextSourceIndex=0; |
| |
| /* conversion loop */ |
| boolean doloop = true; |
| if(c!=0 && target.hasRemaining()) { |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); |
| doloop = getTrailDouble(source, target, unicodeMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| } |
| |
| if(doloop) { |
| while(sourceArrayIndex<source.limit()) { |
| /* |
| * This following test is to see if available input would overflow the output. |
| * It does not catch output of more than one byte that |
| * overflows as a result of a multi-byte character or callback output |
| * from the last source character. |
| * Therefore, those situations also test for overflows and will |
| * then break the loop, too. |
| */ |
| if(target.hasRemaining()) { |
| /* |
| * Get a correct Unicode code point: |
| * a single UChar for a BMP code point or |
| * a matched surrogate pair for a "supplementary code point". |
| */ |
| c = source.get(sourceArrayIndex++); |
| ++nextSourceIndex; |
| /* |
| * This also tests if the codepage maps single surrogates. |
| * If it does, then surrogates are not paired but mapped separately. |
| * Note that in this case unmatched surrogates are not detected. |
| */ |
| if(UTF16.isSurrogate((char)c) && (unicodeMask&UConverterConstants.HAS_SURROGATES) == 0) { |
| if(UTF16.isLeadSurrogate((char)c)) { |
| //getTrail: |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); |
| doloop = getTrailDouble(source, target, unicodeMask, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| |
| if(doloop){ |
| continue; |
| } else { |
| break; |
| } |
| } |
| else { |
| /* this is an unmatched trail code unit (2nd surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(1); |
| break; |
| } |
| } |
| |
| /* convert the Unicode code point in c into codepage bytes */ |
| stage2Entry = MBCS_STAGE_2_FROM_U(table, c); |
| |
| /* get the bytes and the length for the output */ |
| /* MBCS_OUTPUT_2 */ |
| value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); |
| if((value & UConverterConstants.UNSIGNED_INT_MASK) <=0xff) { |
| length=1; |
| } |
| else { |
| length=2; |
| } |
| |
| /* is this code point assigned, or do we use fallbacks? */ |
| if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value!=0))) { |
| /* |
| * We allow a 0 byte output if the "assigned" bit is set for this entry. |
| * There is no way with this data structure for fallback output |
| * to be a zero byte. |
| */ |
| |
| //unassigned: |
| SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); |
| |
| doloop = unassignedDouble(source, target, x, flush, cr); |
| c = x.c; |
| sourceArrayIndex = x.sourceArrayIndex; |
| sourceIndex = x.sourceIndex; |
| nextSourceIndex = x.nextSourceIndex; |
| if(doloop) |
| continue; |
| else |
| break; |
| } |
| |
| /* write the output character bytes from value and length */ |
| /* from the first if in the loop we know that targetCapacity>0 */ |
| if(length==1) { |
| /* this is easy because we know that there is enough space */ |
| target.put((byte)value); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| } |
| else /* length==2 */ { |
| target.put((byte)(value>>>8)); |
| if(2<=target.remaining()) { |
| target.put((byte)value); |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| offsets.put(sourceIndex); |
| } |
| } |
| else { |
| if(offsets!=null) { |
| offsets.put(sourceIndex); |
| } |
| errorBuffer[0]=(byte)value; |
| errorBufferLength=1; |
| |
| /* target overflow */ |
| cr[0] = CoderResult.OVERFLOW; |
| c=0; |
| break; |
| } |
| } |
| |
| /* normal end of conversion: prepare for a new character */ |
| c=0; |
| sourceIndex=nextSourceIndex; |
| continue; |
| } |
| else { |
| /* target is full */ |
| cr[0] = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| } |
| |
| /* set the converter state back into UConverter */ |
| fromUChar32=c; |
| |
| /* write back the updated pointers */ |
| source.position(sourceArrayIndex); |
| |
| return cr[0]; |
| } |
| |
| protected final class SideEffectsSingleBMP { |
| int c, sourceArrayIndex; |
| SideEffectsSingleBMP(int c_, int sourceArrayIndex_) |
| { |
| c = c_; |
| sourceArrayIndex = sourceArrayIndex_; |
| } |
| } |
| |
| // function made out of block labeled getTrail in ucnv_MBCSSingleFromUnicodeWithOffsets |
| // assumes input c is lead surrogate |
| protected final boolean getTrailSingleBMP(CharBuffer source, SideEffectsSingleBMP x, CoderResult[] cr) |
| { |
| if(x.sourceArrayIndex<source.limit()) { |
| /* test the following code unit */ |
| char trail=source.get(x.sourceArrayIndex); |
| if(UTF16.isTrailSurrogate(trail)) { |
| ++x.sourceArrayIndex; |
| x.c = UCharacter.getCodePoint((char)x.c, trail); |
| /* this codepage does not map supplementary code points */ |
| /* callback(unassigned) */ |
| cr[0]=CoderResult.unmappableForLength(2); |
| return false; |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(2); |
| return false; |
| } |
| } else { |
| /* no more input */ |
| return false; |
| } |
| //return true; |
| } |
| |
| protected final class SideEffects { |
| int c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength; |
| SideEffects(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_, int prevSourceIndex_, int prevLength_) |
| { |
| c = c_; |
| sourceArrayIndex = sourceArrayIndex_; |
| sourceIndex = sourceIndex_; |
| nextSourceIndex = nextSourceIndex_; |
| prevSourceIndex = prevSourceIndex_; |
| prevLength = prevLength_; |
| } |
| } |
| |
| // function made out of block labeled getTrail in ucnv_MBCSFromUnicodeWithOffsets |
| // assumes input c is lead surrogate |
| protected final boolean getTrail(CharBuffer source, ByteBuffer target, int unicodeMask, SideEffects x, boolean flush, CoderResult[] cr) |
| { |
| if(x.sourceArrayIndex<source.limit()) { |
| /* test the following code unit */ |
| char trail = source.get(x.sourceArrayIndex); |
| if(UTF16.isTrailSurrogate(trail)) { |
| ++x.sourceArrayIndex; |
| ++x.nextSourceIndex; |
| x.c = UCharacter.getCodePoint((char)x.c, trail); |
| if((unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) { |
| /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| fromUnicodeStatus = x.prevLength; /* save the old state */ |
| /* callback(unassigned) */ |
| return unassigned(source, target, null, x, flush, cr); |
| } |
| /* convert this supplementary code point */ |
| /* exit this condition tree */ |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(2); |
| return false; |
| } |
| } else { |
| /* no more input */ |
| return false; |
| } |
| return true; |
| } |
| |
| // function made out of block labeled unassigned in ucnv_MBCSFromUnicodeWithOffsets |
| protected final boolean unassigned(CharBuffer source, ByteBuffer target, IntBuffer offsets, SideEffects x, boolean flush, CoderResult[] cr) |
| { |
| /* try an extension mapping */ |
| int sourceBegin = x.sourceArrayIndex; |
| source.position(x.sourceArrayIndex); |
| x.c = fromU(x.c, source, target, null, x.sourceIndex, flush, cr); |
| x.sourceArrayIndex = source.position(); |
| x.nextSourceIndex += x.sourceArrayIndex-sourceBegin; |
| x.prevLength=(int)fromUnicodeStatus; |
| |
| if(cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| return false; |
| } else { |
| /* a mapping was written to the target, continue */ |
| |
| /* recalculate the targetCapacity after an extension mapping */ |
| //x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex; |
| |
| /* normal end of conversion: prepare for a new character */ |
| if(offsets!=null) { |
| x.prevSourceIndex=x.sourceIndex; |
| x.sourceIndex=x.nextSourceIndex; |
| } |
| return true; |
| } |
| } |
| |
| protected final class SideEffectsDouble { |
| int c, sourceArrayIndex, sourceIndex, nextSourceIndex; |
| SideEffectsDouble(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_) |
| { |
| c = c_; |
| sourceArrayIndex = sourceArrayIndex_; |
| sourceIndex = sourceIndex_; |
| nextSourceIndex = nextSourceIndex_; |
| } |
| } |
| |
| // function made out of block labeled getTrail in ucnv_MBCSDoubleFromUnicodeWithOffsets |
| // assumes input c is lead surrogate |
| protected final boolean getTrailDouble(CharBuffer source, ByteBuffer target, int unicodeMask, SideEffectsDouble x, boolean flush, CoderResult[] cr) |
| { |
| if(x.sourceArrayIndex<source.limit()) { |
| /* test the following code unit */ |
| char trail=source.get(x.sourceArrayIndex); |
| if(UTF16.isTrailSurrogate(trail)) { |
| ++x.sourceArrayIndex; |
| ++x.nextSourceIndex; |
| x.c = UCharacter.getCodePoint((char)x.c, trail); |
| if((unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) { |
| /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| /* callback(unassigned) */ |
| return unassignedDouble(source, target, x, flush, cr); |
| } |
| /* convert this supplementary code point */ |
| /* exit this condition tree */ |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| cr[0] = CoderResult.malformedForLength(2); |
| return false; |
| } |
| } else { |
| /* no more input */ |
| return false; |
| } |
| return true; |
| } |
| |
| // function made out of block labeled unassigned in ucnv_MBCSDoubleFromUnicodeWithOffsets |
| protected final boolean unassignedDouble(CharBuffer source, ByteBuffer target, SideEffectsDouble x, boolean flush, CoderResult[] cr) |
| { |
| /* try an extension mapping */ |
| int sourceBegin = x.sourceArrayIndex; |
| source.position(x.sourceArrayIndex); |
| x.c = fromU(x.c, source, target, null, x.sourceIndex, flush, cr); |
| x.sourceArrayIndex = source.position(); |
| x.nextSourceIndex += x.sourceArrayIndex - sourceBegin; |
| |
| if(cr[0].isError()) { |
| /* not mappable or buffer overflow */ |
| return false; |
| } else { |
| /* a mapping was written to the target, continue */ |
| |
| /* recalculate the targetCapacity after an extension mapping */ |
| //x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex; |
| |
| /* normal end of conversion: prepare for a new character */ |
| x.sourceIndex=x.nextSourceIndex; |
| return true; |
| } |
| } |
| /** |
| * Overrides super class method |
| * @param encoder |
| * @param source |
| * @param target |
| * @param offsets |
| * @return |
| */ |
| protected CoderResult cbFromUWriteSub ( CharsetEncoderICU encoder, |
| CharBuffer source, ByteBuffer target, |
| IntBuffer offsets){ |
| CharsetMBCS cs = (CharsetMBCS) encoder.charset(); |
| byte[] subchar, p; |
| byte[] buffer = new byte[4]; |
| int length,i=0; |
| /* first, select between subChar and subChar1 */ |
| if( cs.subChar1!=0 && |
| (cs.sharedData.mbcs.extIndexes!=null ? |
| encoder.useSubChar1 : |
| (encoder.invalidUCharBuffer[0]<=0xff)) |
| ) { |
| /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ |
| subchar = new byte[1]; |
| subchar[0]=cs.subChar1; |
| length=1; |
| } else { |
| /* select subChar in all other cases */ |
| subchar=cs.subChar; |
| length=cs.subCharLen; |
| } |
| |
| /* reset the selector for the next code point */ |
| encoder.useSubChar1=false; |
| |
| switch(cs.sharedData.mbcs.outputType) { |
| case MBCS_OUTPUT_2_SISO: |
| p=buffer; |
| |
| /* fromUnicodeStatus contains prevLength */ |
| switch(length) { |
| case 1: |
| if(encoder.fromUnicodeStatus==2) { |
| /* DBCS mode and SBCS sub char: change to SBCS */ |
| encoder.fromUnicodeStatus=1; |
| p[i++]=UConverterConstants.SI; |
| } |
| p[i++]=subchar[0]; |
| break; |
| case 2: |
| if(encoder.fromUnicodeStatus<=1) { |
| /* SBCS mode and DBCS sub char: change to DBCS */ |
| encoder.fromUnicodeStatus=2; |
| p[i++]=UConverterConstants.SO; |
| } |
| p[i++]=subchar[0]; |
| p[i++]=subchar[1]; |
| break; |
| default: |
| throw new IllegalArgumentException(); |
| } |
| return super.cbFromUWriteSub(encoder, source, target, offsets); |
| default: |
| return super.cbFromUWriteSub(encoder, source, target, offsets); |
| } |
| } |
| } |
| |
| |
| public CharsetDecoder newDecoder() { |
| return new CharsetDecoderMBCS(this); |
| } |
| |
| public CharsetEncoder newEncoder() { |
| return new CharsetEncoderMBCS(this); |
| } |
| |
| } |