| /** |
| ******************************************************************************* |
| * Copyright (C) 2006, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.icu.charset; |
| |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.IntBuffer; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.text.UTF16; |
| /** |
| * @author Niti Hantaweepant |
| */ |
| class CharsetUTF8 extends CharsetICU { |
| |
| protected byte[] fromUSubstitution = new byte[]{(byte)0xef, (byte)0xbf, (byte)0xbd}; |
| |
| public CharsetUTF8(String icuCanonicalName, String javaCanonicalName, String[] aliases){ |
| super(icuCanonicalName, javaCanonicalName, aliases); |
| maxBytesPerChar = 4; |
| minBytesPerChar = 1; |
| maxCharsPerByte = 1; |
| } |
| |
| /* UTF-8 Conversion DATA |
| * for more information see Unicode Strandard 2.0 , Transformation Formats Appendix A-9 |
| */ |
| private static final long OFFSETS_FROM_UTF8[] = {0, |
| 0x00000000L, 0x00003080L, 0x000E2080L, |
| 0x03C82080L, 0xFA082080L, 0x82082080L}; |
| |
| private static final byte BYTES_FROM_UTF8[] = |
| { |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 |
| }; |
| |
| /* |
| * Starting with Unicode 3.0.1: |
| * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N]; |
| * byte sequences with more than 4 bytes are illegal in UTF-8, |
| * which is tested with impossible values for them |
| */ |
| private static final long UTF8_MIN_CHAR32[] = { 0L, 0L, 0x80L, 0x800L, 0x10000L, 0xffffffffL, 0xffffffffL }; |
| |
| class CharsetDecoderUTF8 extends CharsetDecoderICU{ |
| |
| public CharsetDecoderUTF8(CharsetICU cs) { |
| super(cs); |
| } |
| |
| protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){ |
| CoderResult cr = CoderResult.UNDERFLOW; |
| |
| int sourceArrayIndex = source.position(); |
| |
| // Todo: CESU8 implementation |
| // boolean isCESU8 = args.converter.sharedData == _CESU8Data; |
| boolean isCESU8 = (UConverterSharedData._CESU8Data != null); |
| int ch, ch2 = 0; |
| int i, inBytes; |
| |
| donefornow: |
| { |
| if (toUnicodeStatus!=0 && target.hasRemaining()) |
| { |
| inBytes = mode; /* restore # of bytes to consume */ |
| i = toULength; /* restore # of bytes consumed */ |
| |
| ch = toUnicodeStatus; /*Stores the previously calculated ch from a previous call*/ |
| toUnicodeStatus = 0; |
| |
| while (i < inBytes) |
| { |
| if (sourceArrayIndex<source.limit()) |
| { |
| toUBytesArray[i] = (byte) (ch2 = source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK); |
| if (!isTrail((byte)ch2)) |
| { |
| break; /* i < inBytes */ |
| } |
| ch = (ch << 6) + ch2; |
| ++sourceArrayIndex; |
| i++; |
| } |
| else |
| { |
| /* stores a partially calculated target*/ |
| toUnicodeStatus = ch; |
| mode = inBytes; |
| toULength = (byte) i; |
| break donefornow; |
| } |
| } |
| |
| /* Remove the accumulated high bits */ |
| ch -= OFFSETS_FROM_UTF8[inBytes]; |
| |
| /* |
| * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
| * - use only trail bytes after a lead byte (checked above) |
| * - use the right number of trail bytes for a given lead byte |
| * - encode a code point <= U+10ffff |
| * - use the fewest possible number of bytes for their code points |
| * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
| * |
| * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. |
| * There are no irregular sequences any more. |
| * In CESU-8, only surrogates, not supplementary code points, are encoded directly. |
| */ |
| if (i == inBytes && ch <= UConverterSharedData.MAXIMUM_UTF && ch >= UTF8_MIN_CHAR32[i] && (isCESU8 ? i <= 3 : !UTF16.isSurrogate((char)ch))) |
| { |
| /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
| toULength = 0; |
| if (ch <= UConverterSharedData.MAXIMUM_UCS2) |
| { |
| /* fits in 16 bits */ |
| target.put((char)ch); |
| } |
| else |
| { |
| /* write out the surrogates */ |
| ch -= UConverterSharedData.HALF_BASE; |
| target.put((char) ((ch >> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START)); |
| ch = (ch & UConverterSharedData.HALF_MASK) + UConverterSharedData.SURROGATE_LOW_START; |
| if(target.hasRemaining()) { |
| target.put((char)ch); |
| |
| } else /* targetCapacity==1 */ { |
| charErrorBufferArray[charErrorBufferBegin+0]=(char)ch; |
| charErrorBufferLength=1; |
| cr = CoderResult.OVERFLOW; |
| |
| } |
| } |
| } |
| else |
| { |
| toULength = (byte)i; |
| cr = CoderResult.malformedForLength(sourceArrayIndex); |
| break donefornow; |
| } |
| } |
| |
| while (sourceArrayIndex < source.limit() && target.hasRemaining()) |
| { |
| ch = source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK; |
| if (ch < 0x80) /* Simple case */ |
| { |
| target.put((char)ch); |
| } |
| else |
| { |
| /* store the first char */ |
| toUBytesArray[0] = (byte)ch; |
| inBytes = BYTES_FROM_UTF8[(int)ch]; /* lookup current sequence length */ |
| i = 1; |
| |
| while (i < inBytes) |
| { |
| if (sourceArrayIndex < source.limit()) |
| { |
| toUBytesArray[i] = (byte) (ch2 = source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK); |
| if (!isTrail((byte)ch2)) |
| { |
| break; /* i < inBytes */ |
| } |
| ch = (ch << 6) + ch2; |
| ++sourceArrayIndex; |
| i++; |
| } |
| else |
| { |
| /* stores a partially calculated target*/ |
| toUnicodeStatus = ch; |
| mode = inBytes; |
| toULength = (byte) i; |
| break donefornow; |
| } |
| } |
| |
| /* Remove the accumulated high bits */ |
| ch -= OFFSETS_FROM_UTF8[inBytes]; |
| |
| /* |
| * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
| * - use only trail bytes after a lead byte (checked above) |
| * - use the right number of trail bytes for a given lead byte |
| * - encode a code point <= U+10ffff |
| * - use the fewest possible number of bytes for their code points |
| * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
| * |
| * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. |
| * There are no irregular sequences any more. |
| * In CESU-8, only surrogates, not supplementary code points, are encoded directly. |
| */ |
| if (i == inBytes && ch <= UConverterSharedData.MAXIMUM_UTF && ch >= UTF8_MIN_CHAR32[i] && (isCESU8 ? i <= 3 : !UTF16.isSurrogate((char)ch))) |
| { |
| /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
| toULength = 0; |
| if (ch <= UConverterSharedData.MAXIMUM_UCS2) |
| { |
| /* fits in 16 bits */ |
| target.put((char) ch); |
| } |
| else |
| { |
| /* write out the surrogates */ |
| ch -= UConverterSharedData.HALF_BASE; |
| target.put((char) ((ch >>> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START)); |
| ch = (ch & UConverterSharedData.HALF_MASK) + UConverterSharedData.SURROGATE_LOW_START; |
| if (target.hasRemaining()) |
| { |
| target.put((char)ch); |
| } |
| else |
| { |
| /* Put in overflow buffer (not handled here) */ |
| charErrorBufferArray[charErrorBufferBegin+0]=(char)ch; |
| charErrorBufferLength=1; |
| cr = CoderResult.OVERFLOW; |
| break; |
| } |
| } |
| } |
| else |
| { |
| toULength = (byte)i; |
| cr = CoderResult.malformedForLength(sourceArrayIndex); |
| break; |
| } |
| } |
| } |
| } |
| |
| if (sourceArrayIndex < source.limit() && !target.hasRemaining()) |
| { |
| /* End of target buffer */ |
| cr = CoderResult.OVERFLOW; |
| } |
| |
| source.position(sourceArrayIndex); |
| |
| return cr; |
| } |
| |
| } |
| class CharsetEncoderUTF8 extends CharsetEncoderICU{ |
| |
| public CharsetEncoderUTF8(CharsetICU cs) { |
| super(cs, fromUSubstitution); |
| implReset(); |
| } |
| |
| protected void implReset() { |
| super.implReset(); |
| } |
| |
| protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ |
| CoderResult cr = CoderResult.UNDERFLOW; |
| |
| int sourceArrayIndex = source.position(); |
| |
| // Todo: CESU8 implementation |
| // boolean isCESU8 = args.converter.sharedData == _CESU8Data; |
| boolean isCESU8 = (UConverterSharedData._CESU8Data != null); |
| |
| int ch; |
| short indexToWrite; |
| byte temp[] = new byte[4]; |
| boolean doloop = true; |
| |
| if (fromUChar32 != 0 && target.hasRemaining()){ |
| ch = fromUChar32; |
| fromUChar32 = 0; |
| |
| if (sourceArrayIndex < source.limit()) { |
| /* test the following code unit */ |
| char trail = source.get(sourceArrayIndex); |
| if(UTF16.isTrailSurrogate(trail)) { |
| ++sourceArrayIndex; |
| ch = UCharacter.getCodePoint((char)ch, trail); |
| /* convert this supplementary code point */ |
| /* exit this condition tree */ |
| } else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| fromUChar32 = (int)ch; |
| cr = CoderResult.malformedForLength(sourceArrayIndex); |
| doloop = false; |
| } |
| } else { |
| /* no more input */ |
| fromUChar32 = (int)ch; |
| doloop = false; |
| } |
| |
| if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE){ |
| indexToWrite = 2; |
| temp[2] = (byte) ((ch >>> 12) | 0xe0); |
| }else{ |
| indexToWrite = 3; |
| temp[3] = (byte) ((ch >>> 18) | 0xf0); |
| temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80); |
| } |
| temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80); |
| temp[0] = (byte) ((ch & 0x3f) | 0x80); |
| |
| for (; indexToWrite >= 0; indexToWrite--){ |
| if (target.hasRemaining()){ |
| target.put(temp[indexToWrite]); |
| }else{ |
| errorBuffer[errorBufferLength++] = temp[indexToWrite]; |
| cr = CoderResult.OVERFLOW; |
| } |
| } |
| } |
| |
| if(doloop) { |
| while (sourceArrayIndex < source.limit() && target.hasRemaining()){ |
| ch = source.get(sourceArrayIndex++); |
| if (ch < 0x80){ /* Single byte */ |
| target.put((byte)ch); |
| }else if (ch < 0x800) { /* Double byte */ |
| target.put((byte) ((ch >>> 6) | 0xc0)); |
| if (target.hasRemaining()){ |
| target.put((byte) ((ch & 0x3f) | 0x80)); |
| }else{ |
| errorBuffer[0] = (byte) ((ch & 0x3f) | 0x80); |
| errorBufferLength = 1; |
| cr = CoderResult.OVERFLOW; |
| break; |
| } |
| }else{ /* Check for surrogates */ |
| if(UTF16.isSurrogate((char)ch) && !isCESU8) { |
| if(UTF16.isLeadSurrogate((char)ch)) { |
| |
| if (sourceArrayIndex < source.limit()) { |
| /* test the following code unit */ |
| char trail = source.get(sourceArrayIndex); |
| if(UTF16.isTrailSurrogate(trail)) { |
| ++sourceArrayIndex; |
| ch = UCharacter.getCodePoint((char)ch, trail); |
| //ch2 = 0; |
| /* convert this supplementary code point */ |
| /* exit this condition tree */ |
| } |
| else { |
| /* this is an unmatched lead code unit (1st surrogate) */ |
| /* callback(illegal) */ |
| fromUChar32 = ch; |
| cr = CoderResult.malformedForLength(sourceArrayIndex); |
| break; |
| } |
| } |
| else { |
| /* no more input */ |
| fromUChar32 = ch; |
| break; |
| } |
| } |
| else { |
| fromUChar32 = (int)ch; |
| cr = CoderResult.malformedForLength(sourceArrayIndex); |
| break; |
| } |
| } |
| |
| if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) |
| { |
| indexToWrite = 2; |
| temp[2] = (byte) ((ch >>> 12) | 0xe0); |
| } |
| else |
| { |
| indexToWrite = 3; |
| temp[3] = (byte) ((ch >>> 18) | 0xf0); |
| temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80); |
| } |
| temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80); |
| temp[0] = (byte) ((ch & 0x3f) | 0x80); |
| |
| for (; indexToWrite >= 0; indexToWrite--) |
| { |
| if (target.hasRemaining()) |
| { |
| target.put(temp[indexToWrite]); |
| } |
| else |
| { |
| errorBuffer[errorBufferLength++] = temp[indexToWrite]; |
| cr = CoderResult.OVERFLOW; |
| } |
| } |
| } |
| } |
| } |
| |
| if (sourceArrayIndex < source.limit() && !target.hasRemaining()){ |
| cr = CoderResult.OVERFLOW; |
| } |
| |
| source.position(sourceArrayIndex); |
| |
| |
| return cr; |
| } |
| } |
| |
| /* single-code point definitions -------------------------------------------- */ |
| |
| /* |
| * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? |
| * @param c 8-bit code unit (byte) |
| * @return TRUE or FALSE |
| * @draft ICU 3.6 |
| */ |
| //static final boolean isSingle(byte c) {return (((c)&0x80)==0);} |
| |
| /* |
| * Is this code unit (byte) a UTF-8 lead byte? |
| * @param c 8-bit code unit (byte) |
| * @return TRUE or FALSE |
| * @draft ICU 3.6 |
| */ |
| //static final boolean isLead(byte c) {return ((((c)-0xc0) & UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);} |
| |
| /** |
| * Is this code unit (byte) a UTF-8 trail byte? |
| * @param c 8-bit code unit (byte) |
| * @return TRUE or FALSE |
| * @draft ICU 3.6 |
| */ |
| static final boolean isTrail(byte c) {return (((c)&0xc0)==0x80);} |
| |
| /* |
| * How many code units (bytes) are used for the UTF-8 encoding |
| * of this Unicode code point? |
| * @param c 32-bit code point |
| * @return 1..4, or 0 if c is a surrogate or not a Unicode code point |
| * @draft ICU 3.6 |
| */ |
| /*static final int length(int c) |
| { |
| long uc = c & UConverterConstants.UNSIGNED_INT_MASK; |
| return |
| (uc<=0x7f ? 1 : |
| (uc<=0x7ff ? 2 : |
| (uc<=0xd7ff ? 3 : |
| (uc<=0xdfff || uc>0x10ffff ? 0 : |
| (uc<=0xffff ? 3 : 4) |
| ) |
| ) |
| ) |
| ); |
| }*/ |
| |
| public CharsetDecoder newDecoder() { |
| return new CharsetDecoderUTF8(this); |
| } |
| |
| public CharsetEncoder newEncoder() { |
| return new CharsetEncoderUTF8(this); |
| } |
| } |