// unicodetools/com/ibm/text/utility/UTF8StreamWriter.java - external/github.com/unicode-org/icu - Git at Google

 /**
 *******************************************************************************
 * Copyright (C) 1996-2001, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java,v $
 * $Date: 2002/03/15 00:34:46 $
 * $Revision: 1.5 $
 *
 *******************************************************************************
 */

 package com.ibm.text.utility;
 import java.io.*;

 /**
  * Utility class that writes UTF8.<br>
  * Main purpose is to supplant OutputStreamWriter(x, "UTF8"), since that has serious errors.
  * <br>
  * Example of Usage:
  * <pre>
  * PrintWriter log = new PrintWriter(
  *   new UTF8StreamWriter(new FileOutputStream(fileName), 32*1024));
  * </pre>
  * NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple threads.
  */
  // TODO: Fix case of surrogate pair crossing input buffer boundary

 public final class UTF8StreamWriter extends Writer {

     /** Code points below this value are written as a single byte. */
     private static final int NEED_2_BYTES = 1 << 7;
     /** Code points at or above this value need at least three bytes. */
     private static final int NEED_3_BYTES = 1 << (2 * 5 + 1);
     /** Code points at or above this value need four bytes. */
     private static final int NEED_4_BYTES = 1 << (3 * 5 + 1);

     /** Mask selecting the six payload bits carried by a trailing byte. */
     private static final int TRAILING_BOTTOM_MASK = 0x3F;
     /** Marker bits OR-ed into every trailing byte. */
     private static final int TRAILING_TOP = 0x80;

     /**
      * Constant folded into the surrogate-pair computation:
      * {@code (high << 10) + low + MAGIC} yields the supplementary code point.
      */
     private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00);

     /** Largest number of bytes a single call to {@link #writeCodePoint(int)} can emit. */
     private static final int MAX_BYTES_PER_CODE_POINT = 4;

     private final OutputStream output;
     private final byte[] bBuffer; // do a bit of buffering ourselves for efficiency
     /** Highest value of {@code bIndex} that still leaves room for one full code point. */
     private final int bSafeEnd;
     /** Number of bytes currently pending in {@code bBuffer}. */
     private int bIndex = 0;
     /** High surrogate waiting for its low surrogate, or 0 when none is pending. */
     private int highSurrogate = 0;
     private final boolean removeCR;
     private final boolean latin1;

     /**
      * Creates a UTF-8 writer that drops U+000D characters.
      *
      * @param stream destination for the encoded bytes
      * @param buffersize size of the internal byte buffer; must be at least 5
      * @throws IllegalArgumentException if {@code buffersize} is less than 5
      */
     public UTF8StreamWriter(OutputStream stream, int buffersize) {
         this(stream, buffersize, true, false);
     }

     /**
      * Creates a writer.
      *
      * @param stream destination for the encoded bytes
      * @param buffersize size of the internal byte buffer; must be at least 5
      * @param removeCR if true, every U+000D character is skipped instead of written
      * @param latin1 if true, each char is written as one byte, with {@code '?'}
      *     substituted for chars above U+00FF, instead of being encoded as UTF-8
      * @throws IllegalArgumentException if {@code buffersize} is less than 5
      */
     public UTF8StreamWriter(OutputStream stream, int buffersize, boolean removeCR, boolean latin1) {
         if (buffersize < 5) {
             throw new IllegalArgumentException("UTF8StreamWriter buffersize must be >= 5");
         }
         output = stream;
         bBuffer = new byte[buffersize];
         bSafeEnd = buffersize - MAX_BYTES_PER_CODE_POINT;
         this.removeCR = removeCR;
         this.latin1 = latin1;
     }

     /**
      * Encodes {@code cLength} chars of {@code buffer}, starting at {@code cStart}.
      * A high surrogate at the end of the range is remembered and combined with a
      * low surrogate supplied by a later call; a high surrogate that is not followed
      * by a low surrogate is written on its own as a three-byte sequence.
      *
      * @param buffer chars to encode
      * @param cStart index of the first char to encode
      * @param cLength number of chars to encode
      * @throws IOException if the underlying stream fails
      */
     public final void write(char[] buffer, int cStart, int cLength) throws IOException {
         int cEnd = cStart + cLength;
         while (cStart < cEnd) {
             ensureRoomForCodePoint();

             int utf32 = buffer[cStart++];

             if (utf32 == 0x0D && removeCR) {
                 continue; // skip write
             }

             if (latin1) {
                 bBuffer[bIndex++] = (utf32 > 0xFF) ? (byte) '?' : (byte) utf32;
                 continue;
             }

             if (highSurrogate != 0) {
                 if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
                     writeCodePoint((highSurrogate << 10) + utf32 + MAGIC);
                     highSurrogate = 0;
                     continue;
                 }
                 // Unpaired high surrogate: emit it alone, then fall through to
                 // handle the current char.
                 writeCodePoint(highSurrogate);
                 highSurrogate = 0;
                 // The room check at the top of the loop only covers one code point,
                 // and this iteration is about to emit a second one.
                 ensureRoomForCodePoint();
             }

             if (0xD800 <= utf32 && utf32 <= 0xDBFF) {
                 highSurrogate = utf32;
                 continue;
             }

             writeCodePoint(utf32);
         }
     }

     /** Writes out the pending bytes when fewer than four free slots remain. */
     private void ensureRoomForCodePoint() throws IOException {
         if (bIndex > bSafeEnd) {
             output.write(bBuffer, 0, bIndex);
             bIndex = 0;
         }
     }

     /**
      * Appends the one- to four-byte encoding of {@code utf32} to the buffer.
      * The caller must have ensured that four bytes are free.
      */
     private void writeCodePoint(int utf32) {
         if (utf32 < NEED_2_BYTES) {
             bBuffer[bIndex++] = (byte) utf32;
         } else if (utf32 < NEED_3_BYTES) {
             bBuffer[bIndex++] = (byte) (0xC0 | (utf32 >> 6));
             bBuffer[bIndex++] = trailingByte(utf32);
         } else if (utf32 < NEED_4_BYTES) {
             bBuffer[bIndex++] = (byte) (0xE0 | (utf32 >> 12));
             bBuffer[bIndex++] = trailingByte(utf32 >> 6);
             bBuffer[bIndex++] = trailingByte(utf32);
         } else {
             bBuffer[bIndex++] = (byte) (0xF0 | (utf32 >> 18));
             bBuffer[bIndex++] = trailingByte(utf32 >> 12);
             bBuffer[bIndex++] = trailingByte(utf32 >> 6);
             bBuffer[bIndex++] = trailingByte(utf32);
         }
     }

     /** Builds a trailing byte from the low six bits of {@code bits}. */
     private static byte trailingByte(int bits) {
         return (byte) (TRAILING_TOP | (bits & TRAILING_BOTTOM_MASK));
     }

     /**
      * Emits a pending unpaired high surrogate, if any, and writes all buffered
      * bytes to the underlying stream.
      */
     private void internalFlush() throws IOException {
         if (highSurrogate != 0) {
             ensureRoomForCodePoint();
             writeCodePoint(highSurrogate);
             highSurrogate = 0;
         }

         if (bIndex != 0) {
             output.write(bBuffer, 0, bIndex);
             bIndex = 0;
         }
     }

     /**
      * Writes any buffered bytes and closes the underlying stream. The stream is
      * closed even when writing the buffered bytes fails.
      *
      * @throws IOException if the underlying stream fails
      */
     public void close() throws IOException {
         try {
             internalFlush();
         } finally {
             output.close();
         }
     }

     /**
      * Writes any buffered bytes, including a pending unpaired high surrogate,
      * and flushes the underlying stream.
      *
      * @throws IOException if the underlying stream fails
      */
     public void flush() throws IOException {
         internalFlush();
         output.flush();
     }
 }
	/**
	*******************************************************************************
	* Copyright (C) 1996-2001, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*
	* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java,v $
	* $Date: 2002/03/15 00:34:46 $
	* $Revision: 1.5 $
	*
	*******************************************************************************
	*/

	package com.ibm.text.utility;
	import java.io.*;

	/**
	* Utility class that writes UTF8.<br>
	* Main purpose is to supplant OutputStreamWriter(x, "UTF8"), since that has serious errors.
	* <br>
	* Example of Usage:
	* <pre>
	* PrintWriter log = new PrintWriter(
	* new UTF8StreamWriter(new FileOutputStream(fileName), 32*1024));
	* </pre>
	* NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple threads.
	*/
	// TODO: Fix case of surrogate pair crossing input buffer boundary

	public final class UTF8StreamWriter extends Writer {

	    /** Code points below this value are written as a single byte. */
	    private static final int NEED_2_BYTES = 1 << 7;
	    /** Code points at or above this value need at least three bytes. */
	    private static final int NEED_3_BYTES = 1 << (2 * 5 + 1);
	    /** Code points at or above this value need four bytes. */
	    private static final int NEED_4_BYTES = 1 << (3 * 5 + 1);

	    /** Mask selecting the six payload bits carried by a trailing byte. */
	    private static final int TRAILING_BOTTOM_MASK = 0x3F;
	    /** Marker bits OR-ed into every trailing byte. */
	    private static final int TRAILING_TOP = 0x80;

	    /**
	     * Constant folded into the surrogate-pair computation:
	     * {@code (high << 10) + low + MAGIC} yields the supplementary code point.
	     */
	    private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00);

	    private final OutputStream output;
	    private final byte[] bBuffer; // do a bit of buffering ourselves for efficiency
	    /** Highest value of {@code bIndex} that still leaves four free bytes. */
	    private final int bSafeEnd;
	    /** Number of bytes currently pending in {@code bBuffer}. */
	    private int bIndex = 0;
	    /** High surrogate waiting for its low surrogate, or 0 when none is pending. */
	    private int highSurrogate = 0;
	    private final boolean removeCR;
	    private final boolean latin1;

	    /**
	     * Creates a UTF-8 writer that drops U+000D characters.
	     *
	     * @param stream destination for the encoded bytes
	     * @param buffersize size of the internal byte buffer; must be at least 5
	     * @throws IllegalArgumentException if {@code buffersize} is less than 5
	     */
	    public UTF8StreamWriter(OutputStream stream, int buffersize) {
	        this(stream, buffersize, true, false);
	    }

	    /**
	     * Creates a writer.
	     *
	     * @param stream destination for the encoded bytes
	     * @param buffersize size of the internal byte buffer; must be at least 5
	     * @param removeCR if true, every U+000D character is skipped instead of written
	     * @param latin1 if true, each char is written as one byte, with {@code '?'}
	     *     substituted for chars above U+00FF, instead of being encoded as UTF-8
	     * @throws IllegalArgumentException if {@code buffersize} is less than 5
	     */
	    public UTF8StreamWriter(OutputStream stream, int buffersize, boolean removeCR, boolean latin1) {
	        if (buffersize < 5) {
	            throw new IllegalArgumentException("UTF8StreamWriter buffersize must be >= 5");
	        }
	        output = stream;
	        bBuffer = new byte[buffersize];
	        bSafeEnd = buffersize - 4;
	        this.removeCR = removeCR;
	        this.latin1 = latin1;
	    }

	    /**
	     * Encodes {@code cLength} chars of {@code buffer}, starting at {@code cStart}.
	     * A high surrogate at the end of the range is remembered and combined with a
	     * low surrogate supplied by a later call; a high surrogate that is not followed
	     * by a low surrogate is written on its own as a three-byte sequence.
	     *
	     * @param buffer chars to encode
	     * @param cStart index of the first char to encode
	     * @param cLength number of chars to encode
	     * @throws IOException if the underlying stream fails
	     */
	    public final void write(char[] buffer, int cStart, int cLength) throws IOException {
	        final int cEnd = cStart + cLength;
	        for (int i = cStart; i < cEnd; i++) {
	            drainIfNearlyFull();

	            final int utf32 = buffer[i];

	            if (utf32 == 0x0D && removeCR) {
	                continue; // skip write
	            }

	            if (latin1) {
	                if (utf32 > 0xFF) {
	                    bBuffer[bIndex++] = (byte) '?';
	                } else {
	                    bBuffer[bIndex++] = (byte) utf32;
	                }
	                continue;
	            }

	            if (highSurrogate != 0) {
	                final int pending = highSurrogate;
	                highSurrogate = 0;
	                if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
	                    writeCodePoint((pending << 10) + utf32 + MAGIC);
	                    continue;
	                }
	                // Unpaired high surrogate: emit it alone, then make room again,
	                // because the current char is still to be written in this pass.
	                writeCodePoint(pending);
	                drainIfNearlyFull();
	            }

	            if (0xD800 <= utf32 && utf32 <= 0xDBFF) {
	                highSurrogate = utf32;
	            } else {
	                writeCodePoint(utf32);
	            }
	        }
	    }

	    /** Writes out the pending bytes when fewer than four free slots remain. */
	    private void drainIfNearlyFull() throws IOException {
	        if (bIndex > bSafeEnd) {
	            output.write(bBuffer, 0, bIndex);
	            bIndex = 0;
	        }
	    }

	    /**
	     * Appends the one- to four-byte encoding of {@code utf32} to the buffer.
	     * The caller must have ensured that four bytes are free.
	     */
	    private void writeCodePoint(int utf32) {
	        if (utf32 < NEED_2_BYTES) {
	            bBuffer[bIndex++] = (byte) utf32;
	            return;
	        }

	        // Pick the length and lead-byte marker, then fill the sequence from its
	        // last byte back to its first, consuming six bits per trailing byte.
	        final int length;
	        final int firstByteMark;
	        if (utf32 < NEED_3_BYTES) {
	            length = 2;
	            firstByteMark = 0xC0;
	        } else if (utf32 < NEED_4_BYTES) {
	            length = 3;
	            firstByteMark = 0xE0;
	        } else {
	            length = 4;
	            firstByteMark = 0xF0;
	        }

	        int remaining = utf32;
	        for (int pos = bIndex + length - 1; pos > bIndex; pos--) {
	            bBuffer[pos] = (byte) (TRAILING_TOP | (remaining & TRAILING_BOTTOM_MASK));
	            remaining >>= 6;
	        }
	        bBuffer[bIndex] = (byte) (firstByteMark | remaining);
	        bIndex += length;
	    }

	    /**
	     * Emits a pending unpaired high surrogate, if any, and writes all buffered
	     * bytes to the underlying stream.
	     */
	    private void internalFlush() throws IOException {
	        if (highSurrogate != 0) {
	            drainIfNearlyFull();
	            writeCodePoint(highSurrogate);
	            highSurrogate = 0;
	        }

	        if (bIndex != 0) {
	            output.write(bBuffer, 0, bIndex);
	            bIndex = 0;
	        }
	    }

	    /**
	     * Writes any buffered bytes and closes the underlying stream. The stream is
	     * closed even when writing the buffered bytes fails.
	     *
	     * @throws IOException if the underlying stream fails
	     */
	    public void close() throws IOException {
	        try {
	            internalFlush();
	        } finally {
	            output.close();
	        }
	    }

	    /**
	     * Writes any buffered bytes, including a pending unpaired high surrogate,
	     * and flushes the underlying stream.
	     *
	     * @throws IOException if the underlying stream fails
	     */
	    public void flush() throws IOException {
	        internalFlush();
	        output.flush();
	    }
	}