// blob: ab8b5da701a72513598da8dcf32df6c9d002b23b [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 2006-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Niti Hantaweepant
*/
class CharsetUTF8 extends CharsetICU {
/*
 * Substitution bytes handed to the encoder's superclass constructor.
 * EF BF BD is the three-byte UTF-8 form of U+FFFD.
 */
private static final byte[] fromUSubstitution = { (byte) 0xef, (byte) 0xbf, (byte) 0xbd };
/**
 * Creates the charset and records its size bounds.
 *
 * @param icuCanonicalName  forwarded unchanged to the superclass constructor
 * @param javaCanonicalName forwarded unchanged to the superclass constructor
 * @param aliases           forwarded unchanged to the superclass constructor
 */
public CharsetUTF8(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
    super(icuCanonicalName, javaCanonicalName, aliases);

    minBytesPerChar = 1;
    /*
     * One UTF-16 code unit never needs more than 3 bytes; the 4-byte form is only
     * produced for a surrogate _pair_, i.e. for two code units together.
     */
    maxBytesPerChar = 3;
    maxCharsPerByte = 1;
}
private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7, 0x3, 0x1 };
private static final byte BYTES_FROM_UTF8[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
/*
* Starting with Unicode 3.0.1: UTF-8 byte sequences of length N _must_ encode code points of or
* above utf8_minChar32[N]; byte sequences with more than 4 bytes are illegal in UTF-8, which is
* tested with impossible values for them
*/
private static final int UTF8_MIN_CHAR32[] = { 0, 0, 0x80, 0x800, 0x10000,
Integer.MAX_VALUE, Integer.MAX_VALUE };
private final boolean isCESU8 = this instanceof CharsetCESU8;
/**
 * Decoder that turns UTF-8 bytes into UTF-16 chars. When the enclosing charset is a
 * {@code CharsetCESU8} the CESU-8 rules are applied instead: only sequences of up to three
 * bytes are accepted and surrogate code points are passed through.
 */
class CharsetDecoderUTF8 extends CharsetDecoderICU {
    public CharsetDecoderUTF8(CharsetICU cs) {
        super(cs);
    }

    /**
     * Decides whether a completely read byte sequence encodes a legal code point.
     *
     * <p>Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
     * <ul>
     * <li>use the right number of trail bytes for a given lead byte</li>
     * <li>encode a code point &lt;= U+10ffff</li>
     * <li>use the fewest possible number of bytes for their code points</li>
     * <li>use at most 4 bytes (for more bytes the UTF8_MIN_CHAR32 entry is unreachable)</li>
     * </ul>
     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
     * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
     *
     * @param bytesSoFar    number of bytes of the sequence read so far
     * @param bytesExpected sequence length announced by the lead byte (0 for an illegal lead)
     * @param char32        value accumulated from the payload bits of the bytes read
     * @return true if the sequence is complete and its value may be written to the target
     */
    private boolean isLegalSequence(int bytesSoFar, int bytesExpected, int char32) {
        if (bytesSoFar != bytesExpected) {
            return false;
        }
        if (char32 < UTF8_MIN_CHAR32[bytesExpected] || char32 > 0x10ffff) {
            return false;
        }
        if (isCESU8) {
            return bytesExpected <= 3;
        }
        /*
         * The surrogate test has to look at the whole int. Narrowing char32 to a char first
         * would fold supplementary code points whose low 16 bits lie in D800..DFFF (for
         * example U+1D800) onto the surrogate range and reject them as malformed.
         */
        return char32 < 0xd800 || char32 > 0xdfff;
    }

    /**
     * Converts as many bytes from {@code source} as possible into chars in {@code target}.
     *
     * <p>An incomplete sequence at the end of the input is remembered in
     * {@code toUnicodeStatus} (accumulated value), {@code mode} (expected length) and
     * {@code toULength} (bytes read so far) and is resumed on the next call. If the trail
     * surrogate of a pair does not fit into the target it is stored in
     * {@code charErrorBufferArray}.
     *
     * @param source  bytes to decode; its position is advanced past the consumed bytes
     * @param target  receives the decoded chars; its position is advanced past the output
     * @param offsets not referenced by this method
     * @param flush   not referenced by this method
     * @return {@code UNDERFLOW} when the input is used up, {@code OVERFLOW} when the target
     *         is full, or a malformed-input result whose length is the number of bytes
     *         recorded in {@code toULength}
     */
    protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
            boolean flush) {
        if (!source.hasRemaining()) {
            /* no input, nothing to do */
            return CoderResult.UNDERFLOW;
        }
        if (!target.hasRemaining()) {
            /* no output available, can't do anything */
            return CoderResult.OVERFLOW;
        }
        if (source.hasArray() && target.hasArray()) {
            /* source and target are backed by arrays, so use the arrays for optimal performance */
            byte[] sourceArray = source.array();
            int sourceIndex = source.arrayOffset() + source.position();
            int sourceLimit = source.arrayOffset() + source.limit();
            char[] targetArray = target.array();
            int targetIndex = target.arrayOffset() + target.position();
            int targetLimit = target.arrayOffset() + target.limit();
            byte ch;
            int char32, bytesExpected, bytesSoFar;
            CoderResult cr;
            if (mode == 0) {
                /* nothing is stored in toUnicodeStatus, read a byte as input */
                char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff;
                bytesExpected = BYTES_FROM_UTF8[char32];
                char32 &= BITMASK_FROM_UTF8[bytesExpected];
                bytesSoFar = 1;
            } else {
                /* a partially built code point is stored in toUnicodeStatus */
                char32 = toUnicodeStatus;
                bytesExpected = mode;
                bytesSoFar = toULength;
                toUnicodeStatus = 0;
                mode = 0;
                toULength = 0;
            }
            outer: while (true) {
                if (bytesSoFar < bytesExpected) {
                    /* read a trail byte and insert its relevant bits into char32 */
                    if (sourceIndex >= sourceLimit) {
                        /* no source left, save the state for later and break out of the loop */
                        toUnicodeStatus = char32;
                        mode = bytesExpected;
                        toULength = bytesSoFar;
                        cr = CoderResult.UNDERFLOW;
                        break;
                    }
                    if (((ch = toUBytesArray[bytesSoFar] = sourceArray[sourceIndex++]) & 0xc0) != 0x80) {
                        /* not a trail byte (is not of the form 10xxxxxx); leave it unread */
                        sourceIndex--;
                        toULength = bytesSoFar;
                        cr = CoderResult.malformedForLength(bytesSoFar);
                        break;
                    }
                    char32 = (char32 << 6) | (ch & 0x3f);
                    bytesSoFar++;
                } else if (isLegalSequence(bytesSoFar, bytesExpected, char32)) {
                    /*
                     * char32 is a valid code point and is composed of the correct number of
                     * bytes ... we now need to output it in UTF-16
                     */
                    if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
                        /* fits in 16 bits */
                        targetArray[targetIndex++] = (char) char32;
                    } else {
                        /* fit char32 into 20 bits */
                        char32 -= UConverterConstants.HALF_BASE;
                        /* write out the surrogates */
                        targetArray[targetIndex++] = (char) ((char32 >>> UConverterConstants.HALF_SHIFT) + UConverterConstants.SURROGATE_HIGH_START);
                        if (targetIndex >= targetLimit) {
                            /* no room for the trail surrogate: park it in the overflow buffer */
                            charErrorBufferArray[charErrorBufferLength++] = (char) ((char32 & UConverterConstants.HALF_MASK) + UConverterConstants.SURROGATE_LOW_START);
                            cr = CoderResult.OVERFLOW;
                            break;
                        }
                        targetArray[targetIndex++] = (char) ((char32 & UConverterConstants.HALF_MASK) + UConverterConstants.SURROGATE_LOW_START);
                    }
                    /*
                     * we're finished outputting, so now we need to read in the first byte of
                     * the next byte sequence that could form a code point
                     */
                    if (sourceIndex >= sourceLimit) {
                        cr = CoderResult.UNDERFLOW;
                        break;
                    }
                    if (targetIndex >= targetLimit) {
                        cr = CoderResult.OVERFLOW;
                        break;
                    }
                    /* keep reading the next input (and writing it) while bytes == 1 */
                    while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff]) == 1) {
                        targetArray[targetIndex++] = (char) char32;
                        if (sourceIndex >= sourceLimit) {
                            cr = CoderResult.UNDERFLOW;
                            break outer;
                        }
                        if (targetIndex >= targetLimit) {
                            cr = CoderResult.OVERFLOW;
                            break outer;
                        }
                    }
                    /* remove the bits that indicate the number of bytes */
                    char32 &= BITMASK_FROM_UTF8[bytesExpected];
                    bytesSoFar = 1;
                } else {
                    /*
                     * either the lead byte in the code sequence is invalid (bytes == 0) or the
                     * lead byte combined with all the trail bytes does not form a valid code
                     * point
                     */
                    toULength = bytesSoFar;
                    cr = CoderResult.malformedForLength(bytesSoFar);
                    break;
                }
            }
            source.position(sourceIndex - source.arrayOffset());
            target.position(targetIndex - target.arrayOffset());
            return cr;
        } else {
            /* at least one buffer has no accessible array: use absolute get/put instead */
            int sourceIndex = source.position();
            int sourceLimit = source.limit();
            int targetIndex = target.position();
            int targetLimit = target.limit();
            byte ch;
            int char32, bytesExpected, bytesSoFar;
            CoderResult cr;
            if (mode == 0) {
                /* nothing is stored in toUnicodeStatus, read a byte as input */
                char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff;
                bytesExpected = BYTES_FROM_UTF8[char32];
                char32 &= BITMASK_FROM_UTF8[bytesExpected];
                bytesSoFar = 1;
            } else {
                /* a partially built code point is stored in toUnicodeStatus */
                char32 = toUnicodeStatus;
                bytesExpected = mode;
                bytesSoFar = toULength;
                toUnicodeStatus = 0;
                mode = 0;
                toULength = 0;
            }
            outer: while (true) {
                if (bytesSoFar < bytesExpected) {
                    /* read a trail byte and insert its relevant bits into char32 */
                    if (sourceIndex >= sourceLimit) {
                        /* no source left, save the state for later and break out of the loop */
                        toUnicodeStatus = char32;
                        mode = bytesExpected;
                        toULength = bytesSoFar;
                        cr = CoderResult.UNDERFLOW;
                        break;
                    }
                    if (((ch = toUBytesArray[bytesSoFar] = source.get(sourceIndex++)) & 0xc0) != 0x80) {
                        /* not a trail byte (is not of the form 10xxxxxx); leave it unread */
                        sourceIndex--;
                        toULength = bytesSoFar;
                        cr = CoderResult.malformedForLength(bytesSoFar);
                        break;
                    }
                    char32 = (char32 << 6) | (ch & 0x3f);
                    bytesSoFar++;
                } else if (isLegalSequence(bytesSoFar, bytesExpected, char32)) {
                    /*
                     * char32 is a valid code point and is composed of the correct number of
                     * bytes ... we now need to output it in UTF-16
                     */
                    if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
                        /* fits in 16 bits */
                        target.put(targetIndex++, (char) char32);
                    } else {
                        /* fit char32 into 20 bits */
                        char32 -= UConverterConstants.HALF_BASE;
                        /* write out the surrogates */
                        target.put(
                                targetIndex++,
                                (char) ((char32 >>> UConverterConstants.HALF_SHIFT) + UConverterConstants.SURROGATE_HIGH_START));
                        if (targetIndex >= targetLimit) {
                            /* no room for the trail surrogate: park it in the overflow buffer */
                            charErrorBufferArray[charErrorBufferLength++] = (char) ((char32 & UConverterConstants.HALF_MASK) + UConverterConstants.SURROGATE_LOW_START);
                            cr = CoderResult.OVERFLOW;
                            break;
                        }
                        target.put(
                                targetIndex++,
                                (char) ((char32 & UConverterConstants.HALF_MASK) + UConverterConstants.SURROGATE_LOW_START));
                    }
                    /*
                     * we're finished outputting, so now we need to read in the first byte of
                     * the next byte sequence that could form a code point
                     */
                    if (sourceIndex >= sourceLimit) {
                        cr = CoderResult.UNDERFLOW;
                        break;
                    }
                    if (targetIndex >= targetLimit) {
                        cr = CoderResult.OVERFLOW;
                        break;
                    }
                    /* keep reading the next input (and writing it) while bytes == 1 */
                    while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff]) == 1) {
                        target.put(targetIndex++, (char) char32);
                        if (sourceIndex >= sourceLimit) {
                            cr = CoderResult.UNDERFLOW;
                            break outer;
                        }
                        if (targetIndex >= targetLimit) {
                            cr = CoderResult.OVERFLOW;
                            break outer;
                        }
                    }
                    /* remove the bits that indicate the number of bytes */
                    char32 &= BITMASK_FROM_UTF8[bytesExpected];
                    bytesSoFar = 1;
                } else {
                    /*
                     * either the lead byte in the code sequence is invalid (bytes == 0) or the
                     * lead byte combined with all the trail bytes does not form a valid code
                     * point
                     */
                    toULength = bytesSoFar;
                    cr = CoderResult.malformedForLength(bytesSoFar);
                    break;
                }
            }
            source.position(sourceIndex);
            target.position(targetIndex);
            return cr;
        }
    }
}
class CharsetEncoderUTF8 extends CharsetEncoderICU {
public CharsetEncoderUTF8(CharsetICU cs) {
super(cs, fromUSubstitution);
implReset();
}
protected void implReset() {
super.implReset();
}
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets,
boolean flush) {
if (!source.hasRemaining()) {
/* no input, nothing to do */
return CoderResult.UNDERFLOW;
}
if (!target.hasRemaining()) {
/* no output available, can't do anything */
return CoderResult.OVERFLOW;
}
if (source.hasArray() && target.hasArray()) {
/* source and target are backed by arrays, so use the arrays for optimal performance */
char[] sourceArray = source.array();
int srcIdx = source.arrayOffset() + source.position();
int sourceLimit = source.arrayOffset() + source.limit();
byte[] targetArray = target.array();
int tgtIdx = target.arrayOffset() + target.position();
int targetLimit = target.arrayOffset() + target.limit();
int char32;
CoderResult cr;
/* take care of the special condition of fromUChar32 not being 0 (it is a surrogate) */
if (fromUChar32 != 0) {
/* 4 bytes to encode from char32 and a following char in source */
sourceIndex = srcIdx;
targetIndex = tgtIdx;
cr = encodeFourBytes(sourceArray, targetArray, sourceLimit, targetLimit,
fromUChar32);
srcIdx = sourceIndex;
tgtIdx = targetIndex;
if (cr != null) {
source.position(srcIdx - source.arrayOffset());
target.position(tgtIdx - target.arrayOffset());
return cr;
}
}
while (true) {
if (srcIdx >= sourceLimit) {
/* nothing left to read */
cr = CoderResult.UNDERFLOW;
break;
}
if (tgtIdx >= targetLimit) {
/* no space left to write */
cr = CoderResult.OVERFLOW;
break;
}
/* reach the next char into char32 */
char32 = sourceArray[srcIdx++];
if (char32 <= 0x7f) {
/* 1 byte to encode from char32 */
targetArray[tgtIdx++] = encodeHeadOf1(char32);
} else if (char32 <= 0x7ff) {
/* 2 bytes to encode from char32 */
targetArray[tgtIdx++] = encodeHeadOf2(char32);
if (tgtIdx >= targetLimit) {
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
cr = CoderResult.OVERFLOW;
break;
}
targetArray[tgtIdx++] = encodeLastTail(char32);
} else if (!UTF16.isSurrogate((char) char32) || isCESU8) {
/* 3 bytes to encode from char32 */
targetArray[tgtIdx++] = encodeHeadOf3(char32);
if (tgtIdx >= targetLimit) {
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
cr = CoderResult.OVERFLOW;
break;
}
targetArray[tgtIdx++] = encodeSecondToLastTail(char32);
if (tgtIdx >= targetLimit) {
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
cr = CoderResult.OVERFLOW;
break;
}
targetArray[tgtIdx++] = encodeLastTail(char32);
} else {
/* 4 bytes to encode from char32 and a following char in source */
sourceIndex = srcIdx;
targetIndex = tgtIdx;
cr = encodeFourBytes(sourceArray, targetArray, sourceLimit, targetLimit,
char32);
srcIdx = sourceIndex;
tgtIdx = targetIndex;
if (cr != null)
break;
}
}
/* set the new source and target positions and return the CoderResult stored in cr */
source.position(srcIdx - source.arrayOffset());
target.position(tgtIdx - target.arrayOffset());
return cr;
} else {
int char32;
CoderResult cr;
/* take care of the special condition of fromUChar32 not being 0 (it is a surrogate) */
if (fromUChar32 != 0) {
/* 4 bytes to encode from char32 and a following char in source */
cr = encodeFourBytes(source, target, fromUChar32);
if (cr != null)
return cr;
}
while (true) {
if (!source.hasRemaining()) {
/* nothing left to read */
cr = CoderResult.UNDERFLOW;
break;
}
if (!target.hasRemaining()) {
/* no space left to write */
cr = CoderResult.OVERFLOW;
break;
}
/* reach the next char into char32 */
char32 = source.get();
if (char32 <= 0x7f) {
/* 1 byte to encode from char32 */
target.put(encodeHeadOf1(char32));
} else if (char32 <= 0x7ff) {
/* 2 bytes to encode from char32 */
target.put(encodeHeadOf2(char32));
if (!target.hasRemaining()) {
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
cr = CoderResult.OVERFLOW;
break;
}
target.put(encodeLastTail(char32));
} else if (!UTF16.isSurrogate((char) char32) || isCESU8) {
/* 3 bytes to encode from char32 */
target.put(encodeHeadOf3(char32));
if (!target.hasRemaining()) {
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
cr = CoderResult.OVERFLOW;
break;
}
target.put(encodeSecondToLastTail(char32));
if (!target.hasRemaining()) {
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
cr = CoderResult.OVERFLOW;
break;
}
target.put(encodeLastTail(char32));
} else {
/* 4 bytes to encode from char32 and a following char in source */
cr = encodeFourBytes(source, target, char32);
if (cr != null)
break;
}
}
/* set the new source and target positions and return the CoderResult stored in cr */
return cr;
}
}
private final CoderResult encodeFourBytes(char[] sourceArray, byte[] targetArray,
int sourceLimit, int targetLimit, int char32) {
/* we need to read another char to match up the surrogate stored in char32 */
/* handle the surrogate stuff, returning on a non-null CoderResult */
CoderResult cr = handleSurrogates(sourceArray, sourceIndex, sourceLimit, (char)char32);
if (cr != null)
return cr;
sourceIndex++;
char32 = fromUChar32;
fromUChar32 = 0;
/* the rest is routine -- encode four bytes, stopping on overflow */
targetArray[targetIndex++] = encodeHeadOf4(char32);
if (targetIndex >= targetLimit) {
errorBuffer[errorBufferLength++] = encodeThirdToLastTail(char32);
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
return CoderResult.OVERFLOW;
}
targetArray[targetIndex++] = encodeThirdToLastTail(char32);
if (targetIndex >= targetLimit) {
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
return CoderResult.OVERFLOW;
}
targetArray[targetIndex++] = encodeSecondToLastTail(char32);
if (targetIndex >= targetLimit) {
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
return CoderResult.OVERFLOW;
}
targetArray[targetIndex++] = encodeLastTail(char32);
/* return null for success */
return null;
}
private final CoderResult encodeFourBytes(CharBuffer source, ByteBuffer target, int char32) {
/* handle the surrogate stuff, returning on a non-null CoderResult */
CoderResult cr = handleSurrogates(source, (char)char32);
if (cr != null)
return cr;
char32 = fromUChar32;
fromUChar32 = 0;
/* the rest is routine -- encode four bytes, stopping on overflow */
target.put(encodeHeadOf4(char32));
if (!target.hasRemaining()) {
errorBuffer[errorBufferLength++] = encodeThirdToLastTail(char32);
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
return CoderResult.OVERFLOW;
}
target.put(encodeThirdToLastTail(char32));
if (!target.hasRemaining()) {
errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
return CoderResult.OVERFLOW;
}
target.put(encodeSecondToLastTail(char32));
if (!target.hasRemaining()) {
errorBuffer[errorBufferLength++] = encodeLastTail(char32);
return CoderResult.OVERFLOW;
}
target.put(encodeLastTail(char32));
/* return null for success */
return null;
}
private int sourceIndex;
private int targetIndex;
}
/** Returns the only byte of a one-byte sequence: {@code char32} narrowed to a byte. */
private static final byte encodeHeadOf1(int char32) {
    final byte single = (byte) char32;
    return single;
}
/** Returns the lead byte of a two-byte sequence: 0xc0 combined with the bits above bit 5. */
private static final byte encodeHeadOf2(int char32) {
    final int upperBits = char32 >>> 6;
    return (byte) (upperBits | 0xc0);
}
/** Returns the lead byte of a three-byte sequence: 0xe0 combined with the bits above bit 11. */
private static final byte encodeHeadOf3(int char32) {
    final int upperBits = char32 >>> 12;
    return (byte) (upperBits | 0xe0);
}
/** Returns the lead byte of a four-byte sequence: 0xf0 combined with the bits above bit 17. */
private static final byte encodeHeadOf4(int char32) {
    final int upperBits = char32 >>> 18;
    return (byte) (upperBits | 0xf0);
}
/** Returns the trail byte carrying bits 12..17 of {@code char32}, tagged with 0x80. */
private static final byte encodeThirdToLastTail(int char32) {
    final int sixBits = (char32 >>> 12) & 0x3f;
    return (byte) (sixBits | 0x80);
}
/** Returns the trail byte carrying bits 6..11 of {@code char32}, tagged with 0x80. */
private static final byte encodeSecondToLastTail(int char32) {
    final int sixBits = (char32 >>> 6) & 0x3f;
    return (byte) (sixBits | 0x80);
}
/** Returns the trail byte carrying the lowest six bits of {@code char32}, tagged with 0x80. */
private static final byte encodeLastTail(int char32) {
    final int sixBits = char32 & 0x3f;
    return (byte) (sixBits | 0x80);
}
/* single-code point definitions -------------------------------------------- */
/*
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
*/
// static final boolean isSingle(byte c) {return (((c)&0x80)==0);}
/*
* Is this code unit (byte) a UTF-8 lead byte?
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
*/
// static final boolean isLead(byte c) {return ((((c)-0xc0) &
// UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);}
/*
* Is this code unit (byte) a UTF-8 trail byte?
*
* @param c
* 8-bit code unit (byte)
* @return TRUE or FALSE
*/
/*private static final boolean isTrail(byte c) {
return (((c) & 0xc0) == 0x80);
}*/
/** Creates a new {@code CharsetDecoderUTF8} bound to this charset. */
public CharsetDecoder newDecoder() {
    final CharsetDecoder decoder = new CharsetDecoderUTF8(this);
    return decoder;
}
/** Creates a new {@code CharsetEncoderUTF8} bound to this charset. */
public CharsetEncoder newEncoder() {
    final CharsetEncoder encoder = new CharsetEncoderUTF8(this);
    return encoder;
}
/**
 * Fills {@code setFillIn} by delegating to {@code getNonSurrogateUnicodeSet}.
 *
 * @param setFillIn the set handed on to {@code getNonSurrogateUnicodeSet}
 * @param which     not consulted by this implementation
 */
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
getNonSurrogateUnicodeSet(setFillIn);
}
}