icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF32.java - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /**
  *******************************************************************************
  * Copyright (C) 2006-2008, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
 package com.ibm.icu.charset;

 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.IntBuffer;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;

 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

 /**
  * @author Niti Hantaweepant
  */
 class CharsetUTF32 extends CharsetICU {

     private static final int SIGNATURE_LENGTH = 4;
     private static final byte[] fromUSubstitution_BE = { (byte) 0, (byte) 0, (byte) 0xff, (byte) 0xfd };
     private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff, (byte) 0, (byte) 0 };
     private static final byte[] BOM_BE = { 0, 0, (byte) 0xfe, (byte) 0xff };
     private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe, 0, 0 };
     private static final int ENDIAN_XOR_BE = 0;
     private static final int ENDIAN_XOR_LE = 3;
     private static final int NEED_TO_WRITE_BOM = 1;

     private boolean isEndianSpecified;
     private boolean isBigEndian;
     private int endianXOR;
     private byte[] bom;
     private byte[] fromUSubstitution;

     public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
         super(icuCanonicalName, javaCanonicalName, aliases);

         this.isEndianSpecified = (this instanceof CharsetUTF32BE || this instanceof CharsetUTF32LE);
         this.isBigEndian = !(this instanceof CharsetUTF32LE);

         if (isBigEndian) {
             this.bom = BOM_BE;
             this.fromUSubstitution = fromUSubstitution_BE;
             this.endianXOR = ENDIAN_XOR_BE;
         } else {
             this.bom = BOM_LE;
             this.fromUSubstitution = fromUSubstitution_LE;
             this.endianXOR = ENDIAN_XOR_LE;
         }

         maxBytesPerChar = 4;
         minBytesPerChar = 4;
         maxCharsPerByte = 1;
     }

     class CharsetDecoderUTF32 extends CharsetDecoderICU {

         private boolean isBOMReadYet;
         private int actualEndianXOR;
         private byte[] actualBOM;

         public CharsetDecoderUTF32(CharsetICU cs) {
             super(cs);
         }

         @Override
         protected void implReset() {
             super.implReset();
             isBOMReadYet = false;
             actualBOM = null;
         }

         @Override
         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
             /*
              * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
              * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
              * are in the current buffer.
              */
             if (!isBOMReadYet) {
                 while (true) {
                     if (!source.hasRemaining())
                         return CoderResult.UNDERFLOW;

                     toUBytesArray[toULength++] = source.get();

                     if (toULength == 1) {
                         // on the first byte, we haven't decided whether or not it's bigEndian yet
                         if ((!isEndianSpecified || isBigEndian)
                                 && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
                             actualBOM = BOM_BE;
                             actualEndianXOR = ENDIAN_XOR_BE;
                         } else if ((!isEndianSpecified || !isBigEndian)
                                 && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
                             actualBOM = BOM_LE;
                             actualEndianXOR = ENDIAN_XOR_LE;
                         } else {
                             // we do not have a BOM (and we have toULength==1 bytes)
                             actualBOM = null;
                             actualEndianXOR = endianXOR;
                             break;
                         }
                     } else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
                         // we do not have a BOM (and we have toULength bytes)
                         actualBOM = null;
                         actualEndianXOR = endianXOR;
                         break;
                     } else if (toULength == SIGNATURE_LENGTH) {
                         // we found a BOM! at last!
                         // too bad we have to get ignore it now (like it was unwanted or something)
                         toULength = 0;
                         break;
                     }
                 }

                 isBOMReadYet = true;
             }

             // now that we no longer need to look for a BOM, let's do some work
             int char32;

             while (true) {
                 while (toULength < 4) {
                     if (!source.hasRemaining())
                         return CoderResult.UNDERFLOW;
                     toUBytesArray[toULength++] = source.get();
                 }

                 if (!target.hasRemaining())
                     return CoderResult.OVERFLOW;

                 char32 = 0;
                 for (int i = 0; i < 4; i++)
                     char32 = (char32 << 8)
                             | (toUBytesArray[i ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK);

                 if (0 <= char32 && char32 <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(char32)) {
                     toULength = 0;
                     if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
                         /* fits in 16 bits */
                         target.put((char) char32);
                     } else {
                         /* write out the surrogates */
                         target.put(UTF16.getLeadSurrogate(char32));
                         char32 = UTF16.getTrailSurrogate(char32);
                         if (target.hasRemaining()) {
                             target.put((char) char32);
                         } else {
                             /* Put in overflow buffer (not handled here) */
                             charErrorBufferArray[0] = (char) char32;
                             charErrorBufferLength = 1;
                             return CoderResult.OVERFLOW;
                         }
                     }
                 } else {
                     return CoderResult.malformedForLength(toULength);
                 }
             }
         }
     }

     class CharsetEncoderUTF32 extends CharsetEncoderICU {
         private final byte[] temp = new byte[4];

         public CharsetEncoderUTF32(CharsetICU cs) {
             super(cs, fromUSubstitution);
             fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
         }

         @Override
         protected void implReset() {
             super.implReset();
             fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
         }

         @Override
         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
             CoderResult cr;

             /* write the BOM if necessary */
             if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
                 if (!target.hasRemaining())
                     return CoderResult.OVERFLOW;

                 fromUnicodeStatus = 0;
                 cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
                 if (cr.isOverflow())
                     return cr;
             }

             if (fromUChar32 != 0) {
                 if (!target.hasRemaining())
                     return CoderResult.OVERFLOW;

                 // a note: fromUChar32 will either be 0 or a lead surrogate
                 cr = encodeChar(source, target, offsets, (char) fromUChar32);
                 if (cr != null)
                     return cr;
             }

             while (true) {
                 if (!source.hasRemaining())
                     return CoderResult.UNDERFLOW;
                 if (!target.hasRemaining())
                     return CoderResult.OVERFLOW;

                 cr = encodeChar(source, target, offsets, source.get());
                 if (cr != null)
                     return cr;
             }
         }

         private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
             int sourceIndex = source.position() - 1;
             CoderResult cr;
             int char32;

             if (UTF16.isSurrogate(ch)) {
                 cr = handleSurrogates(source, ch);
                 if (cr != null)
                     return cr;

                 char32 = fromUChar32;
                 fromUChar32 = 0;
             } else {
                 char32 = ch;
             }

             /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
             // temp[0 ^ endianXOR] = (byte) (char32 >>> 24); // (always 0)
             temp[1 ^ endianXOR] = (byte) (char32 >>> 16); // same as (byte)((char32 >>> 16) & 0x1f)
             temp[2 ^ endianXOR] = (byte) (char32 >>> 8);
             temp[3 ^ endianXOR] = (byte) (char32);
             cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
             return (cr.isUnderflow() ? null : cr);
         }
     }

     @Override
     public CharsetDecoder newDecoder() {
         return new CharsetDecoderUTF32(this);
     }

     @Override
     public CharsetEncoder newEncoder() {
         return new CharsetEncoderUTF32(this);
     }


     @Override
     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
         getNonSurrogateUnicodeSet(setFillIn);
     }
 }
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	/**
	*******************************************************************************
	* Copyright (C) 2006-2008, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*/
	package com.ibm.icu.charset;

	import java.nio.ByteBuffer;
	import java.nio.CharBuffer;
	import java.nio.IntBuffer;
	import java.nio.charset.CharsetDecoder;
	import java.nio.charset.CharsetEncoder;
	import java.nio.charset.CoderResult;

	import com.ibm.icu.text.UTF16;
	import com.ibm.icu.text.UnicodeSet;

	/**
	* @author Niti Hantaweepant
	*/
	class CharsetUTF32 extends CharsetICU {

	private static final int SIGNATURE_LENGTH = 4;
	private static final byte[] fromUSubstitution_BE = { (byte) 0, (byte) 0, (byte) 0xff, (byte) 0xfd };
	private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff, (byte) 0, (byte) 0 };
	private static final byte[] BOM_BE = { 0, 0, (byte) 0xfe, (byte) 0xff };
	private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe, 0, 0 };
	private static final int ENDIAN_XOR_BE = 0;
	private static final int ENDIAN_XOR_LE = 3;
	private static final int NEED_TO_WRITE_BOM = 1;

	private boolean isEndianSpecified;
	private boolean isBigEndian;
	private int endianXOR;
	private byte[] bom;
	private byte[] fromUSubstitution;

	public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
	super(icuCanonicalName, javaCanonicalName, aliases);

	this.isEndianSpecified = (this instanceof CharsetUTF32BE \|\| this instanceof CharsetUTF32LE);
	this.isBigEndian = !(this instanceof CharsetUTF32LE);

	if (isBigEndian) {
	this.bom = BOM_BE;
	this.fromUSubstitution = fromUSubstitution_BE;
	this.endianXOR = ENDIAN_XOR_BE;
	} else {
	this.bom = BOM_LE;
	this.fromUSubstitution = fromUSubstitution_LE;
	this.endianXOR = ENDIAN_XOR_LE;
	}

	maxBytesPerChar = 4;
	minBytesPerChar = 4;
	maxCharsPerByte = 1;
	}

	class CharsetDecoderUTF32 extends CharsetDecoderICU {

	private boolean isBOMReadYet;
	private int actualEndianXOR;
	private byte[] actualBOM;

	public CharsetDecoderUTF32(CharsetICU cs) {
	super(cs);
	}

	@Override
	protected void implReset() {
	super.implReset();
	isBOMReadYet = false;
	actualBOM = null;
	}

	@Override
	protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
	/*
	* If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
	* converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
	* are in the current buffer.
	*/
	if (!isBOMReadYet) {
	while (true) {
	if (!source.hasRemaining())
	return CoderResult.UNDERFLOW;

	toUBytesArray[toULength++] = source.get();

	if (toULength == 1) {
	// on the first byte, we haven't decided whether or not it's bigEndian yet
	if ((!isEndianSpecified \|\| isBigEndian)
	&& toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
	actualBOM = BOM_BE;
	actualEndianXOR = ENDIAN_XOR_BE;
	} else if ((!isEndianSpecified \|\| !isBigEndian)
	&& toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
	actualBOM = BOM_LE;
	actualEndianXOR = ENDIAN_XOR_LE;
	} else {
	// we do not have a BOM (and we have toULength==1 bytes)
	actualBOM = null;
	actualEndianXOR = endianXOR;
	break;
	}
	} else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
	// we do not have a BOM (and we have toULength bytes)
	actualBOM = null;
	actualEndianXOR = endianXOR;
	break;
	} else if (toULength == SIGNATURE_LENGTH) {
	// we found a BOM! at last!
	// too bad we have to get ignore it now (like it was unwanted or something)
	toULength = 0;
	break;
	}
	}

	isBOMReadYet = true;
	}

	// now that we no longer need to look for a BOM, let's do some work
	int char32;

	while (true) {
	while (toULength < 4) {
	if (!source.hasRemaining())
	return CoderResult.UNDERFLOW;
	toUBytesArray[toULength++] = source.get();
	}

	if (!target.hasRemaining())
	return CoderResult.OVERFLOW;

	char32 = 0;
	for (int i = 0; i < 4; i++)
	char32 = (char32 << 8)
	\| (toUBytesArray[i ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK);

	if (0 <= char32 && char32 <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(char32)) {
	toULength = 0;
	if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
	/* fits in 16 bits */
	target.put((char) char32);
	} else {
	/* write out the surrogates */
	target.put(UTF16.getLeadSurrogate(char32));
	char32 = UTF16.getTrailSurrogate(char32);
	if (target.hasRemaining()) {
	target.put((char) char32);
	} else {
	/* Put in overflow buffer (not handled here) */
	charErrorBufferArray[0] = (char) char32;
	charErrorBufferLength = 1;
	return CoderResult.OVERFLOW;
	}
	}
	} else {
	return CoderResult.malformedForLength(toULength);
	}
	}
	}
	}

	class CharsetEncoderUTF32 extends CharsetEncoderICU {
	private final byte[] temp = new byte[4];

	public CharsetEncoderUTF32(CharsetICU cs) {
	super(cs, fromUSubstitution);
	fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
	}

	@Override
	protected void implReset() {
	super.implReset();
	fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
	}

	@Override
	protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
	CoderResult cr;

	/* write the BOM if necessary */
	if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
	if (!target.hasRemaining())
	return CoderResult.OVERFLOW;

	fromUnicodeStatus = 0;
	cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
	if (cr.isOverflow())
	return cr;
	}

	if (fromUChar32 != 0) {
	if (!target.hasRemaining())
	return CoderResult.OVERFLOW;

	// a note: fromUChar32 will either be 0 or a lead surrogate
	cr = encodeChar(source, target, offsets, (char) fromUChar32);
	if (cr != null)
	return cr;
	}

	while (true) {
	if (!source.hasRemaining())
	return CoderResult.UNDERFLOW;
	if (!target.hasRemaining())
	return CoderResult.OVERFLOW;

	cr = encodeChar(source, target, offsets, source.get());
	if (cr != null)
	return cr;
	}
	}

	private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
	int sourceIndex = source.position() - 1;
	CoderResult cr;
	int char32;

	if (UTF16.isSurrogate(ch)) {
	cr = handleSurrogates(source, ch);
	if (cr != null)
	return cr;

	char32 = fromUChar32;
	fromUChar32 = 0;
	} else {
	char32 = ch;
	}

	/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
	// temp[0 ^ endianXOR] = (byte) (char32 >>> 24); // (always 0)
	temp[1 ^ endianXOR] = (byte) (char32 >>> 16); // same as (byte)((char32 >>> 16) & 0x1f)
	temp[2 ^ endianXOR] = (byte) (char32 >>> 8);
	temp[3 ^ endianXOR] = (byte) (char32);
	cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
	return (cr.isUnderflow() ? null : cr);
	}
	}

	@Override
	public CharsetDecoder newDecoder() {
	return new CharsetDecoderUTF32(this);
	}

	@Override
	public CharsetEncoder newEncoder() {
	return new CharsetEncoderUTF32(this);
	}


	@Override
	void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
	getNonSurrogateUnicodeSet(setFillIn);
	}
	}