blob: 27b44260e3dc15d0e58a4280e576222591d92098 [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.ByteBuffer;
import com.ibm.icu.charset.CharsetCallback;
import com.ibm.icu.impl.Assert;
/**
* An abstract class that provides framework methods of decoding operations for concrete
* subclasses.
* In the future this class will contain API that implements the converter semantics of ICU4C.
* @draft ICU 3.6
* @provisional This API might change or be removed in a future release.
*/
public abstract class CharsetDecoderICU extends CharsetDecoder{
/*
 * Converter status word. This class only clears it in implReset(); its meaning is
 * presumably defined by the concrete converter (TODO confirm against subclasses).
 */
int toUnicodeStatus;
/*
 * Bytes of the input sequence currently being examined. When an error is handled,
 * toUnicodeWithCallback() copies the first toULength of them into invalidCharBuffer
 * before invoking the error callback.
 */
byte[] toUBytesArray = new byte[128];
/* Start index for toUBytesArray; not read by the code in this class. */
int toUBytesBegin = 0;
/*
 * Number of pending bytes in toUBytesArray. A non-zero value once all input has been
 * consumed with flush set is reported as a malformed (truncated) sequence.
 */
int toULength;
/*
 * Output chars that did not fit into the target buffer: filled by toUWriteUChars()
 * and drained at the beginning of decode().
 */
char[] charErrorBufferArray = new char[128];
/* Number of chars currently stored in charErrorBufferArray. */
int charErrorBufferLength;
/* Only reset in implReset(); not otherwise used by the code in this class. */
int charErrorBufferBegin;
/* Copy of the offending input bytes (one byte per char) handed to the error callback. */
char[] invalidCharBuffer = new char[128];
/* Number of valid entries in invalidCharBuffer; also consulted by cbToUWriteSub(). */
int invalidCharLength;
/* store previous input bytes to continue partial matches (m:n conversion) */
/* NOTE(review): preToUArray is never allocated in this class - presumably subclasses do so; confirm. */
byte[] preToUArray;
int preToUBegin;
int preToULength; /* negative: replay */
int preToUFirstLength; /* length of first character */
/* Only reset in implReset(); meaning is presumably converter-specific (TODO confirm). */
int mode;
/* Opaque context object passed through to the error callback. */
Object toUContext = null;
/* Callback invoked for unmappable input; replaced by implOnUnmappableCharacter(). */
private CharsetCallback.Decoder onUnmappableInput = CharsetCallback.TO_U_CALLBACK_STOP;
/* Callback invoked for malformed input; replaced by implOnMalformedInput(). */
private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;
/*
 * Error dispatcher used by the conversion loop: routes an error result to the
 * callback registered for its category (unmappable or malformed) and falls back
 * to the stop callback for any other kind of result.
 */
CharsetCallback.Decoder toCharErrorBehaviour = new CharsetCallback.Decoder() {
    public CoderResult call(CharsetDecoderICU decoder, Object context,
            ByteBuffer source, CharBuffer target, IntBuffer offsets,
            char[] buffer, int length, CoderResult cr) {
        /* pick the handler first, then invoke it exactly once */
        final CharsetCallback.Decoder handler;
        if (cr.isUnmappable()) {
            handler = onUnmappableInput;
        } else if (cr.isMalformed()) {
            handler = onMalformedInput;
        } else {
            handler = CharsetCallback.TO_U_CALLBACK_STOP;
        }
        return handler.call(decoder, context, source, target, offsets, buffer, length, cr);
    }
};
/**
 * Constructs a CharsetDecoderICU from the information provided by a
 * CharsetICU object. The average chars-per-byte value handed to the superclass
 * is the reciprocal of {@code cs.maxCharsPerByte}; the maximum is
 * {@code cs.maxCharsPerByte} itself.
 * @param cs The CharsetICU object describing the charset to decode.
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
CharsetDecoderICU(CharsetICU cs) {
    super(cs, 1.0f / (float) cs.maxCharsPerByte, cs.maxCharsPerByte);
}
/**
 * Installs the callback used when a malformed (illegal) input sequence is
 * encountered. The action is translated by getCallback(); actions other than
 * REPLACE and IGNORE select the stop callback.
 * @param newAction action to be taken
 * @stable ICU 3.6
 */
protected final void implOnMalformedInput(CodingErrorAction newAction) {
    final CharsetCallback.Decoder callback = getCallback(newAction);
    onMalformedInput = callback;
}
/**
 * Installs the callback used when an unmappable character is encountered.
 * The action is translated by getCallback(); actions other than REPLACE and
 * IGNORE select the stop callback.
 * @param newAction action to be taken
 * @stable ICU 3.6
 */
protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
    final CharsetCallback.Decoder callback = getCallback(newAction);
    onUnmappableInput = callback;
}
/*
 * Maps a java.nio CodingErrorAction onto the matching ICU to-Unicode callback:
 * REPLACE -> substitute, IGNORE -> skip, REPORT and anything else -> stop.
 */
private static CharsetCallback.Decoder getCallback(CodingErrorAction action) {
    if (CodingErrorAction.REPLACE == action) {
        return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
    }
    if (CodingErrorAction.IGNORE == action) {
        return CharsetCallback.TO_U_CALLBACK_SKIP;
    }
    /* REPORT and every unrecognised action stop the conversion */
    return CharsetCallback.TO_U_CALLBACK_STOP;
}
/* Zero-capacity source buffer that implFlush() passes to decode() so flushing runs without new input. */
private final ByteBuffer EMPTY = ByteBuffer.allocate(0);
/**
 * Flushes any characters saved in the converter's internal buffer by running
 * one more decode pass over an empty source with the flush flag set.
 * @param out buffer that receives the flushed characters
 * @return result of the flushing decode pass; CoderResult.UNDERFLOW if the
 * action succeeds.
 * @stable ICU 3.6
 */
protected final CoderResult implFlush(CharBuffer out) {
    final IntBuffer noOffsets = null;
    final boolean endOfInput = true;
    return decode(EMPTY, out, noOffsets, endOfInput);
}
/**
 * Resets the to-Unicode state of the converter: the replay bookkeeping, the
 * pending output overflow, the pending input length, the status word and the
 * mode are all set back to zero.
 * @stable ICU 3.6
 */
protected void implReset() {
    /* partial-match / replay bookkeeping (a negative preToULength means replay) */
    preToUFirstLength = 0;
    preToULength = 0;
    preToUBegin = 0;

    /* pending output that did not fit into an earlier target buffer */
    charErrorBufferBegin = 0;
    charErrorBufferLength = 0;

    /* pending input and converter status */
    toULength = 0;
    toUnicodeStatus = 0;
    mode = 0;
}
/**
 * Decodes one or more bytes. The default behaviour of the converter
 * is to stop and report if an error in the input stream is encountered.
 * To set different behaviour use {@link CharsetDecoder#onMalformedInput}.
 * This method allows a buffer by buffer conversion of a data stream.
 * The state of the conversion is saved between calls. Among other things,
 * this means multibyte input sequences can be split between calls.
 * All conversions should be finished with a call to the flush method.
 * <p>
 * Bytes already held in the converter state (see toUCountPending()) are left
 * in the caller's buffer by the previous call; they are skipped here before
 * decoding and the position is moved back by the new pending count afterwards.
 * If the buffer holds fewer bytes than are pending, UNDERFLOW is returned
 * without touching the buffer instead of moving the position past the limit.
 * @param in buffer to decode
 * @param out buffer to populate with decoded result
 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
 * action succeeds or more input is needed for completing the decoding action.
 * @stable ICU 3.6
 */
protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
    if (!in.hasRemaining()) {
        return CoderResult.UNDERFLOW;
    }
    final int pending = toUCountPending();
    if (in.remaining() < pending) {
        /* skipping would push the position beyond the limit (IllegalArgumentException) */
        return CoderResult.UNDERFLOW;
    }
    /* skip the bytes that are already part of the converter state */
    in.position(in.position() + pending);
    /* do the conversion */
    CoderResult ret = decode(in, out, null, false);
    /* leave the bytes that are now pending visible to the caller */
    setSourcePosition(in);
    return ret;
}
/**
 * Implements the ICU semantic for decode operation.
 * This is the converter-specific step invoked by toUnicodeWithCallback(); after each
 * call that driver inspects toULength and preToULength to decide how to continue.
 * @param in The input byte buffer
 * @param out The output character buffer
 * @param offsets Buffer for source offsets of the produced chars; may be null
 * (decodeLoop(ByteBuffer, CharBuffer) and implFlush() always pass null)
 * @param flush The flush flag forwarded by toUnicodeWithCallback(); presumably true only
 * when no further input will follow (TODO confirm against implementations)
 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
 * action succeeds or more input is needed for completing the decoding action.
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets, boolean flush);
/**
 * Implements the ICU semantic for decode operation.
 * First drains any output chars left over from a previous call
 * (charErrorBufferArray) into the target, then hands over to
 * toUnicodeWithCallback() for the actual conversion.
 * @param source The input byte buffer
 * @param target The output character buffer
 * @param offsets Buffer receiving one source offset per output char; may be null.
 * Chars drained from the overflow buffer get the offset -1.
 * @param flush true if, and only if, the invoker can provide no
 * additional input bytes beyond those in the given buffer.
 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
 * action succeeds or more input is needed for completing the decoding action.
 * Returns CoderResult.OVERFLOW if the target fills up while pending output is drained.
 * @throws IllegalArgumentException if source or target is null
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
    /* check parameters */
    if (target == null || source == null) {
        throw new IllegalArgumentException();
    }

    /*
     * TODO(agljport): ICU4C additionally rejects buffers whose sizes exceed the
     * int32_t range at this point; that check has not been ported.
     */

    /* flush the target overflow buffer */
    if (charErrorBufferLength > 0) {
        final char[] overflow = charErrorBufferArray;
        final int length = charErrorBufferLength;
        int i = 0;
        do {
            if (target.remaining() <= 0) {
                /*
                 * the overflow buffer contains too much, keep the rest:
                 * shift the unwritten tail to the front of the buffer.
                 * The count is kept as an int; narrowing it to byte would
                 * turn 128 kept chars into -128 and lose the pending output.
                 */
                final int kept = length - i;
                System.arraycopy(overflow, i, overflow, 0, kept);
                charErrorBufferLength = kept;
                return CoderResult.OVERFLOW;
            }
            /* copy the overflow contents to the target */
            target.put(overflow[i++]);
            if (offsets != null) {
                offsets.put(-1); /* no source index available for old output */
            }
        } while (i < length);
        /* the overflow buffer is completely copied to the target */
        charErrorBufferLength = 0;
    }

    if (!flush && source.remaining() == 0 && preToULength >= 0) {
        /* the overflow buffer is emptied and there is no new input: we are done */
        return CoderResult.UNDERFLOW;
    }

    /*
     * Do not simply return with a buffer overflow error if the target is
     * already full and flush is not set, because it is possible that the
     * source will not generate any output.
     * For example, the skip callback may be called;
     * it does not output anything.
     */
    return toUnicodeWithCallback(source, target, offsets, flush);
}
/* maximum number of indexed bytes; also the capacity of the replay buffer allocated in toUnicodeWithCallback() */
private static final int EXT_MAX_BYTES = 0x1f;
/*
 * Adjusts `length` entries of the offsets buffer, starting at its current position.
 *
 * offsets          buffer whose entries are adjusted; its position is advanced
 * length           number of entries to process
 * sourceIndex      source index to add to each entry, or a negative value when
 *                  no source index is available
 * errorInputLength length of the input sequence that caused an error, if any;
 *                  it is subtracted from sourceIndex
 *
 * NOTE(review): when the resulting delta is 0 the buffer position is left
 * untouched, whereas the other two cases advance it by `length`; confirm that
 * callers expect this.
 */
private void updateOffsets(IntBuffer offsets, int length, int sourceIndex, int errorInputLength) {
    final int delta;
    if (sourceIndex >= 0) {
        /*
         * adjust each offset by adding the previous sourceIndex
         * minus the length of the input sequence that caused an
         * error, if any
         */
        delta = sourceIndex - errorInputLength;
    } else {
        /*
         * set each offset to -1 because this conversion function
         * does not handle offsets
         */
        delta = -1;
    }

    final int limit = offsets.position() + length;
    if (delta == 0) {
        /* most common case, nothing to do */
    } else if (delta > 0) {
        /* add the delta to each offset (but not if the offset is <0) */
        while (offsets.position() < limit) {
            final int index = offsets.position();
            final int offset = offsets.get(index);
            if (offset >= 0) {
                /* relative put: rewrites the entry and advances the position */
                offsets.put(offset + delta);
            } else {
                /* keep the negative entry but step over it, otherwise the loop never ends */
                offsets.position(index + 1);
            }
        }
    } else /* delta<0 */ {
        /*
         * set each offset to -1 because this conversion function
         * does not handle offsets
         * or the error input sequence started in a previous buffer
         */
        while (offsets.position() < limit) {
            offsets.put(-1);
        }
    }
}
/*
 * Conversion driver: repeatedly calls decodeLoop(source, target, offsets, flush)
 * and, whenever it reports a malformed or unmappable result, hands the offending
 * bytes to toCharErrorBehaviour. It also handles "replay": when preToULength is
 * negative, -preToULength bytes from preToUArray are converted from a temporary
 * buffer before the real source is continued.
 *
 * source   input bytes
 * target   output chars
 * offsets  optional buffer of source offsets (may be null)
 * flush    true when no further input will follow this buffer
 *
 * Returns UNDERFLOW when the input was consumed successfully, OVERFLOW when the
 * target is full, or the error result that the callback did not resolve.
 */
final CoderResult toUnicodeWithCallback(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
    int sourceIndex;
    int errorInputLength;
    boolean converterSawEndOfInput, calledCallback;
    int t = target.position();
    int s = source.position();

    /* variables for m:n conversion */
    ByteBuffer replayArray = ByteBuffer.allocate(EXT_MAX_BYTES);
    ByteBuffer realSource = null;
    boolean realFlush = false;
    int realSourceIndex = 0;

    CoderResult cr = CoderResult.UNDERFLOW;

    sourceIndex = 0;
    if (preToULength >= 0) {
        /* normal mode */
    } else {
        /*
         * Previous m:n conversion stored source units from a partial match
         * and failed to consume all of them.
         * We need to "replay" them from a temporary buffer and convert them first.
         */
        realSource = source;
        realFlush = flush;
        realSourceIndex = sourceIndex;

        /* load the stored bytes and make exactly them readable from index 0 */
        replayArray.clear();
        replayArray.put(preToUArray, 0, -preToULength);
        replayArray.flip();
        source = replayArray;

        flush = false;
        sourceIndex = -1;
        preToULength = 0;
    }

    /*
     * loop for conversion and error handling
     *
     * loop {
     *     convert
     *     loop {
     *         update offsets
     *         handle end of input
     *         handle errors/call callback
     *     }
     * }
     */
    for (;;) {
        if (cr.isUnderflow()) {
            /* convert */
            cr = decodeLoop(source, target, offsets, flush);

            /*
             * set a flag for whether the converter
             * successfully processed the end of the input
             *
             * need not check preToULength==0 because a replay (<0) will cause
             * remaining input before converterSawEndOfInput is checked
             */
            converterSawEndOfInput = (cr.isUnderflow() && flush && source.remaining() == 0 && toULength == 0);
        } else {
            /* handle error from a previous step */
            converterSawEndOfInput = false;
        }

        /* no callback called yet for this iteration */
        calledCallback = false;

        /* no sourceIndex adjustment for conversion, only for callback output */
        errorInputLength = 0;

        /*
         * loop for offsets and error handling
         *
         * iterates at most 3 times:
         * 1. to clean up after the conversion function
         * 2. after the callback
         * 3. after the callback again if there was truncated input
         */
        for (;;) {
            /* update offsets if we write any */
            if (offsets != null) {
                int length = target.position() - t;
                if (length > 0) {
                    updateOffsets(offsets, length, sourceIndex, errorInputLength);
                    /*
                     * TODO: ICU4C advances its offsets pointer by length here;
                     * some converters do not handle offsets at all
                     * (sourceIndex<0) or may not update the offsets position
                     */
                }

                if (sourceIndex >= 0) {
                    sourceIndex += (source.position() - s);
                }
            }

            if (preToULength < 0) {
                /*
                 * switch the source to new replay units (cannot occur while replaying)
                 * after offset handling and before end-of-input and callback handling
                 */
                if (realSource == null) {
                    realSource = source;
                    realFlush = flush;
                    realSourceIndex = sourceIndex;

                    /*
                     * The replay buffer may have been used (and fully consumed)
                     * earlier in this call, so reset it before loading, and
                     * rewind it afterwards so the loaded bytes are actually
                     * readable; without the rewind the replay would be empty.
                     */
                    replayArray.clear();
                    replayArray.put(preToUArray, 0, -preToULength);
                    replayArray.flip();
                    source = replayArray;

                    flush = false;
                    if ((sourceIndex += preToULength) < 0) {
                        sourceIndex = -1;
                    }
                    preToULength = 0;
                } else {
                    /*
                     * a converter must not request a replay while a replay is
                     * already in progress; this assertion flags that state
                     */
                    Assert.assrt(realSource == null);
                }
            }

            /* update pointers */
            s = source.position();
            t = target.position();

            if (cr.isUnderflow()) {
                if (s < source.limit()) {
                    /*
                     * continue with the conversion loop while there is still input left
                     * (continue converting by breaking out of only the inner loop)
                     */
                    break;
                } else if (realSource != null) {
                    /* switch back from replaying to the real source and continue */
                    source = realSource;
                    flush = realFlush;
                    sourceIndex = realSourceIndex;
                    realSource = null;
                    break;
                } else if (flush && toULength > 0) {
                    /*
                     * the entire input stream is consumed
                     * and there is a partial, truncated input sequence left
                     */

                    /* inject an error and continue with callback handling */
                    cr = CoderResult.malformedForLength(toULength);
                    calledCallback = false; /* new error condition */
                } else {
                    /* input consumed */
                    if (flush) {
                        /*
                         * return to the conversion loop once more if the flush
                         * flag is set and the conversion function has not
                         * successfully processed the end of the input yet
                         *
                         * (continue converting by breaking out of only the inner loop)
                         */
                        if (!converterSawEndOfInput) {
                            break;
                        }

                        /* reset the converter without calling the callback function */
                        implReset();
                    }

                    /* done successfully */
                    return cr;
                }
            }

            /* cr is an error or overflow from here on */
            if (calledCallback || cr.isOverflow() || (!cr.isMalformed() && !cr.isUnmappable())) {
                /*
                 * the callback did not or cannot resolve the error
                 * (only malformed/unmappable results are passed to it):
                 * return
                 *
                 * the check for buffer overflow is redundant but it is
                 * a high-runner case and hopefully documents the intent
                 * well
                 *
                 * if we were replaying, then the unconsumed part of the replay
                 * buffer must be copied back into the converter state
                 */
                if (realSource != null) {
                    Assert.assrt(preToULength == 0);
                    int length = source.remaining();
                    if (length > 0) {
                        source.get(preToUArray, preToUBegin, length);
                        preToULength = -length;
                    }
                }
                return cr;
            }

            /* copy toUBytes[] to invalidCharBuffer[] */
            errorInputLength = invalidCharLength = toULength;
            if (errorInputLength > 0) {
                copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength);
            }

            /* set the converter state to deal with the next character */
            toULength = 0;

            /* call the callback function */
            cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr);

            /*
             * loop back to the offset handling
             *
             * this flag will indicate after offset handling
             * that a callback was called;
             * if the callback did not resolve the error, then we return
             */
            calledCallback = true;
        }
    }
}
/**
 * Returns the number of input units held in the converter's internal state
 * because more input is needed for completing the conversion. This function is
 * useful for mapping semantics of ICU's converter interface to those of iconv,
 * and this information is not needed for normal conversion.
 * @return The magnitude of preToULength when it is non-zero, otherwise
 * toULength when that is positive, otherwise 0.
 * @draft ICU 3.6
 */
/*public*/ int toUCountPending() {
    if (preToULength != 0) {
        /* stored partial-match units; a negative count marks a pending replay */
        return preToULength > 0 ? preToULength : -preToULength;
    }
    return toULength > 0 ? toULength : 0;
}
/*
 * Moves the source position back by the number of units currently pending in
 * the converter state (toUCountPending()), so that input held from this
 * invocation of decodeLoop stays visible in the caller's buffer.
 */
private final void setSourcePosition(ByteBuffer source) {
    final int current = source.position();
    final int pending = toUCountPending();
    source.position(current - pending);
}
/*
 * Copies `length` bytes from src (starting at srcOffset) into dst (starting at
 * dstOffset), storing each byte as an unsigned value 0x00..0xFF.
 *
 * The loop counts `length` elements regardless of srcOffset, and the byte is
 * masked so that values >= 0x80 are not sign-extended to 0xFF80..0xFFFF.
 */
private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
    for (int i = 0; i < length; i++) {
        dst[dstOffset + i] = (char) (src[srcOffset + i] & 0xFF);
    }
}
/**
 * ONLY used by ToU callback functions.
 * Writes the specified characters to the target character buffer. When an
 * offsets buffer is supplied, sourceIndex is recorded once per char written.
 * Characters that do not fit into the target are stored in the converter's
 * overflow buffer (charErrorBufferArray), replacing its previous contents.
 * @param cnv converter whose overflow buffer receives chars that do not fit
 * @param ucharsArray array holding the chars to write
 * @param ucharsBegin index of the first char to write
 * @param length number of chars to write
 * @param target the output character buffer
 * @param offsets buffer of source offsets, or null
 * @param sourceIndex offset value recorded for every char written to target
 * @return CoderResult.UNDERFLOW when everything fit into the target,
 * CoderResult.OVERFLOW when some chars were stored in the overflow buffer.
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
static final CoderResult toUWriteUChars(CharsetDecoderICU cnv,
        char[] ucharsArray, int ucharsBegin, int length,
        CharBuffer target, IntBuffer offsets, int sourceIndex) {
    int next = ucharsBegin;
    int pending = length;

    /* write as many chars as the target can take */
    while (pending > 0 && target.hasRemaining()) {
        target.put(ucharsArray[next++]);
        if (offsets != null) {
            offsets.put(sourceIndex);
        }
        --pending;
    }

    if (pending <= 0) {
        return CoderResult.UNDERFLOW;
    }

    /* write the rest to the overflow buffer */
    cnv.charErrorBufferLength = 0;
    do {
        cnv.charErrorBufferArray[cnv.charErrorBufferLength++] = ucharsArray[next++];
    } while (--pending > 0);
    return CoderResult.OVERFLOW;
}
/**
 * Writes the Unicode substitution to the target character buffer.
 * When the invalid input is a single byte and the charset defines a
 * non-zero subChar1, the single char U+001A is written; otherwise the
 * decoder's replacement string is written. The number of chars written is
 * always the length of the sequence actually chosen.
 * Sub classes may override this method if required.
 * @param decoder decoder whose replacement, charset and invalidCharLength are consulted
 * @param source input buffer; its current position is used as the source offset
 * @param target the output character buffer
 * @param offsets buffer of source offsets, or null
 * @return A CoderResult object that contains the error result when an error occurs.
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
        ByteBuffer source, CharBuffer target,
        IntBuffer offsets) {
    CharsetICU cs = (CharsetICU) decoder.charset();
    final char[] subChars;
    if (decoder.invalidCharLength == 1 && cs.subChar1 != 0x00) {
        subChars = new char[] { 0x1a };
    } else {
        subChars = decoder.replacement().toCharArray();
    }
    /* use the chosen array's own length: the replacement string may be longer than one char */
    return CharsetDecoderICU.toUWriteUChars(decoder, subChars, 0, subChars.length,
            target, offsets, source.position());
}
}