/**
 *******************************************************************************
 * Copyright (C) 2006-2011, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
| package com.ibm.icu.charset; |
| |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.IntBuffer; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.util.VersionInfo; |
| |
| /** |
| * @author Niti Hantaweepant |
| */ |
| class CharsetUTF16 extends CharsetICU { |
| |
| private static final int SIGNATURE_LENGTH = 2; |
| private static final byte[] fromUSubstitution_BE = { (byte) 0xff, (byte) 0xfd }; |
| private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff }; |
| private static final byte[] BOM_BE = { (byte) 0xfe, (byte) 0xff }; |
| private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe }; |
| private static final int ENDIAN_XOR_BE = 0; |
| private static final int ENDIAN_XOR_LE = 1; |
| private static final int NEED_TO_WRITE_BOM = 1; |
| |
| private boolean isEndianSpecified; |
| private boolean isBigEndian; |
| private int endianXOR; |
| private byte[] bom; |
| private byte[] fromUSubstitution; |
| |
| private int version; |
| |
| public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases) { |
| super(icuCanonicalName, javaCanonicalName, aliases); |
| |
| /* Get the version number (e.g. UTF-16LE,version=1) */ |
| int versionIndex = icuCanonicalName.indexOf("version="); |
| if (versionIndex > 0) { |
| version = Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue(); |
| } else { |
| version = 0; |
| } |
| |
| this.isEndianSpecified = (this instanceof CharsetUTF16BE || this instanceof CharsetUTF16LE); |
| this.isBigEndian = !(this instanceof CharsetUTF16LE); |
| |
| if (isBigEndian) { |
| this.bom = BOM_BE; |
| this.fromUSubstitution = fromUSubstitution_BE; |
| this.endianXOR = ENDIAN_XOR_BE; |
| } else { |
| this.bom = BOM_LE; |
| this.fromUSubstitution = fromUSubstitution_LE; |
| this.endianXOR = ENDIAN_XOR_LE; |
| } |
| |
| /* UnicodeBig and UnicodeLittle requires maxBytesPerChar set to 4 in Java 5 or less */ |
| if ((VersionInfo.javaVersion().getMajor() == 1 && VersionInfo.javaVersion().getMinor() <= 5) |
| && (isEndianSpecified && version == 1)) { |
| maxBytesPerChar = 4; |
| } else { |
| maxBytesPerChar = 2; |
| } |
| |
| minBytesPerChar = 2; |
| maxCharsPerByte = 1; |
| } |
| |
| class CharsetDecoderUTF16 extends CharsetDecoderICU { |
| |
| private boolean isBOMReadYet; |
| private int actualEndianXOR; |
| private byte[] actualBOM; |
| |
| public CharsetDecoderUTF16(CharsetICU cs) { |
| super(cs); |
| } |
| |
| protected void implReset() { |
| super.implReset(); |
| isBOMReadYet = false; |
| actualBOM = null; |
| } |
| |
| protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { |
| /* |
| * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual |
| * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that |
| * are in the current buffer. |
| */ |
| if (!isBOMReadYet) { |
| while (true) { |
| if (!source.hasRemaining()) |
| return CoderResult.UNDERFLOW; |
| |
| toUBytesArray[toULength++] = source.get(); |
| |
| if (toULength == 1) { |
| // on the first byte, we haven't decided whether or not it's bigEndian yet |
| if ((!isEndianSpecified || isBigEndian) |
| && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) { |
| actualBOM = BOM_BE; |
| actualEndianXOR = ENDIAN_XOR_BE; |
| } else if ((!isEndianSpecified || !isBigEndian) |
| && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) { |
| actualBOM = BOM_LE; |
| actualEndianXOR = ENDIAN_XOR_LE; |
| } else { |
| // we do not have a BOM (and we have toULength==1 bytes) |
| if (isEndianSpecified && version == 1) { |
| actualBOM = isBigEndian ? CharsetUTF16.BOM_BE : CharsetUTF16.BOM_LE; |
| actualEndianXOR = isBigEndian ? CharsetUTF16.ENDIAN_XOR_BE : CharsetUTF16.ENDIAN_XOR_LE; |
| } else { |
| actualBOM = null; |
| actualEndianXOR = endianXOR; |
| } |
| break; |
| } |
| } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) { |
| return CoderResult.malformedForLength(2); |
| } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) { |
| // we found a BOM! at last! |
| // too bad we have to get ignore it now (like it was unwanted or something) |
| toULength = 0; |
| break; |
| } else if (isEndianSpecified || toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) { |
| // we do not have a BOM (and we have toULength bytes) |
| actualBOM = null; |
| actualEndianXOR = endianXOR; |
| break; |
| } else if (toULength == SIGNATURE_LENGTH) { |
| // we found a BOM! at last! |
| // too bad we have to get ignore it now (like it was unwanted or something) |
| toULength = 0; |
| break; |
| } |
| } |
| |
| isBOMReadYet = true; |
| } |
| |
| // now that we no longer need to look for a BOM, let's do some work |
| |
| // if we have unfinished business |
| if (toUnicodeStatus != 0) { |
| CoderResult cr = decodeTrail(source, target, offsets, (char) toUnicodeStatus); |
| if (cr != null) |
| return cr; |
| } |
| |
| char char16; |
| |
| while (true) { |
| while (toULength < 2) { |
| if (!source.hasRemaining()) |
| return CoderResult.UNDERFLOW; |
| toUBytesArray[toULength++] = source.get(); |
| } |
| |
| if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) { |
| return CoderResult.malformedForLength(2); |
| } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) { |
| // we found a BOM! at last! |
| // too bad we have to get ignore it now (like it was unwanted or something) |
| toULength = 0; |
| continue; |
| } |
| |
| if (!target.hasRemaining()) |
| return CoderResult.OVERFLOW; |
| |
| char16 = (char) (((toUBytesArray[0 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[1 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK))); |
| |
| if (!UTF16.isSurrogate(char16)) { |
| toULength = 0; |
| target.put(char16); |
| } else { |
| CoderResult cr = decodeTrail(source, target, offsets, char16); |
| if (cr != null) |
| return cr; |
| } |
| } |
| } |
| |
| private final CoderResult decodeTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets, char lead) { |
| if (!UTF16.isLeadSurrogate(lead)) { |
| // 2 bytes, lead malformed |
| toUnicodeStatus = 0; |
| return CoderResult.malformedForLength(2); |
| } |
| |
| while (toULength < 4) { |
| if (!source.hasRemaining()) { |
| // let this be unfinished business |
| toUnicodeStatus = lead; |
| return CoderResult.UNDERFLOW; |
| } |
| toUBytesArray[toULength++] = source.get(); |
| } |
| |
| char trail = (char) (((toUBytesArray[2 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[3 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK))); |
| |
| if (!UTF16.isTrailSurrogate(trail)) { |
| // pretend like we didnt read the last 2 bytes |
| toULength = 2; |
| source.position(source.position() - 2); |
| |
| // 2 bytes, lead malformed |
| toUnicodeStatus = 0; |
| return CoderResult.malformedForLength(2); |
| } |
| |
| toUnicodeStatus = 0; |
| toULength = 0; |
| |
| target.put(lead); |
| |
| if (target.hasRemaining()) { |
| target.put(trail); |
| return null; |
| } else { |
| /* Put in overflow buffer (not handled here) */ |
| charErrorBufferArray[0] = trail; |
| charErrorBufferLength = 1; |
| return CoderResult.OVERFLOW; |
| } |
| } |
| } |
| |
| class CharsetEncoderUTF16 extends CharsetEncoderICU { |
| private final byte[] temp = new byte[4]; |
| |
| public CharsetEncoderUTF16(CharsetICU cs) { |
| super(cs, fromUSubstitution); |
| fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM; |
| } |
| |
| protected void implReset() { |
| super.implReset(); |
| fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM; |
| } |
| |
| protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { |
| CoderResult cr; |
| |
| /* write the BOM if necessary */ |
| if (fromUnicodeStatus == NEED_TO_WRITE_BOM) { |
| if (!target.hasRemaining()) |
| return CoderResult.OVERFLOW; |
| |
| fromUnicodeStatus = 0; |
| cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1); |
| if (cr.isOverflow()) |
| return cr; |
| } |
| |
| if (fromUChar32 != 0) { |
| if (!target.hasRemaining()) |
| return CoderResult.OVERFLOW; |
| |
| // a note: fromUChar32 will either be 0 or a lead surrogate |
| cr = encodeChar(source, target, offsets, (char) fromUChar32); |
| if (cr != null) |
| return cr; |
| } |
| |
| while (true) { |
| if (!source.hasRemaining()) |
| return CoderResult.UNDERFLOW; |
| if (!target.hasRemaining()) |
| return CoderResult.OVERFLOW; |
| |
| cr = encodeChar(source, target, offsets, source.get()); |
| if (cr != null) |
| return cr; |
| } |
| } |
| |
| private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) { |
| int sourceIndex = source.position() - 1; |
| CoderResult cr; |
| |
| if (UTF16.isSurrogate(ch)) { |
| cr = handleSurrogates(source, ch); |
| if (cr != null) |
| return cr; |
| |
| char trail = UTF16.getTrailSurrogate(fromUChar32); |
| fromUChar32 = 0; |
| |
| // 4 bytes |
| temp[0 ^ endianXOR] = (byte) (ch >>> 8); |
| temp[1 ^ endianXOR] = (byte) (ch); |
| temp[2 ^ endianXOR] = (byte) (trail >>> 8); |
| temp[3 ^ endianXOR] = (byte) (trail); |
| cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex); |
| } else { |
| // 2 bytes |
| temp[0 ^ endianXOR] = (byte) (ch >>> 8); |
| temp[1 ^ endianXOR] = (byte) (ch); |
| cr = fromUWriteBytes(this, temp, 0, 2, target, offsets, sourceIndex); |
| } |
| return (cr.isUnderflow() ? null : cr); |
| } |
| } |
| |
| public CharsetDecoder newDecoder() { |
| return new CharsetDecoderUTF16(this); |
| } |
| |
| public CharsetEncoder newEncoder() { |
| return new CharsetEncoderUTF16(this); |
| } |
| |
| void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ |
| getNonSurrogateUnicodeSet(setFillIn); |
| } |
| } |