// blob: e65456688595903daa3f16f80d336a14cd20a1fc [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 1996-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
/**
* A compression engine implementing the Standard Compression Scheme
* for Unicode (SCSU) as outlined in <A
* HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
* Report #6</A>.
*
* <P>The SCSU works by using dynamically positioned <EM>windows</EM>
* consisting of 128 consecutive characters in Unicode. During compression,
* characters within a window are encoded in the compressed stream as the bytes
* <TT>0x80 - 0xFF</TT>. The SCSU provides transparency for the characters
* (bytes) between <TT>U+0000 - U+00FF</TT>. The SCSU approximates the
* storage size of traditional character sets, for example 1 byte per
* character for ASCII or Latin-1 text, and 2 bytes per character for CJK
* ideographs.</P>
*
* <P><STRONG>USAGE</STRONG></P>
*
* <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a
* straightforward manner to compress simple strings:</P>
*
* <PRE>
* String s = ... ; // get string from somewhere
* byte [] compressed = UnicodeCompressor.compress(s);
* </PRE>
*
* <P>The static methods have a fairly large memory footprint.
* For finer-grained control over memory usage,
* <TT>UnicodeCompressor</TT> offers more powerful APIs allowing
* iterative compression:</P>
*
* <PRE>
* // Compress an array "chars" of length "len" using a buffer of 512 bytes
* // to the OutputStream "out"
*
* UnicodeCompressor myCompressor = new UnicodeCompressor();
* final static int BUFSIZE = 512;
* byte [] byteBuffer = new byte [ BUFSIZE ];
* int bytesWritten = 0;
* int [] unicharsRead = new int [1];
* int totalCharsCompressed = 0;
* int totalBytesWritten = 0;
*
* do {
* // do the compression
* bytesWritten = myCompressor.compress(chars, totalCharsCompressed,
* len, unicharsRead,
* byteBuffer, 0, BUFSIZE);
*
* // do something with the current set of bytes
* out.write(byteBuffer, 0, bytesWritten);
*
* // update the no. of characters compressed
* totalCharsCompressed += unicharsRead[0];
*
* // update the no. of bytes written
* totalBytesWritten += bytesWritten;
*
* } while(totalCharsCompressed < len);
*
* myCompressor.reset(); // reuse compressor
* </PRE>
*
* @see UnicodeDecompressor
*
* @author Stephen F. Booth
* @draft ICU 2.4
*/
/*
*
* COMPRESSION STRATEGY
*
* Single Byte Mode
*
* There are three relevant cases.
* If the character is in the current window or is Latin-1 (U+0000,
* U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed
* directly in the stream as a single byte.
*
* 1. Current character is in defined, inactive window.
* 2. Current character is in undefined window.
* 3. Current character is uncompressible Unicode (U+3400 - U+DFFF).
*
* 1. Current character is in defined, inactive window
* A. Look ahead two characters
* B. If both following characters in same window as current character,
* switch to defined window
* C. If only next character is in same window as current character,
* quote defined window
* D. If neither of following characters is in same window as current,
* quote defined window
*
* 2. Current character is in undefined window
* A. Look ahead two characters
* B. If both following characters in same window as current character,
* define new window
* C. If only next character in same window as current character,
* switch to Unicode mode
* NOTE: This costs us one extra byte. However,
* since we have a limited number of windows to work with, it is
* assumed the cost will pay off later in savings from a window with
* more characters in it.
* D. If neither of following characters in same window as current,
* switch to Unicode mode. Alternative to above: just quote
* Unicode (same byte cost)
*
* 3. Current character is uncompressible Unicode (U+3400 - U+DFFF)
* A. Look ahead one character
* B. If next character in non-compressible region, switch to
* Unicode mode
* C. If next character not in non-compressible region, quote Unicode
*
*
* The following chart illustrates the bytes required for encoding characters
* in each possible way
*
*
* SINGLE BYTE MODE
* Characters in a row with same index
* tag encountered 1 2 3 4
* ---------------------------------------------------------------
* none (in current window) 1 2 3 4
*
* quote Unicode 3 6 9 12
*
* window not switch to Unicode 3 5 7 9 byte
* defined define window 3 4 5 6 cost
*
* window switch to window 2 3 4 5
* defined quote window 2 4 6 8
*
* Unicode Mode
*
* There are two relevant cases.
* If the character is in the non-compressible region
* (U+3400 - U+DFFF), the character is simply written to the
* stream as a pair of bytes.
*
* 1. Current character is in defined, inactive window.
* 2. Current character is in undefined window.
*
* 1.Current character is in defined, inactive window
* A. Look ahead one character
* B. If next character has same index as current character,
* switch to defined window (and switch to single-byte mode)
* C. If not, just put bytes in stream
*
*
* 2. Current character is in undefined window
* A. Look ahead two characters
* B. If both in same window as current character, define window
* (and switch to single-byte mode)
* C. If only next character in same window, just put bytes in stream
* NOTE: This costs us one extra byte. However,
* since we have a limited number of windows to work with, it is
* assumed the cost will pay off later in savings from a window with
* more characters in it.
* D. If neither in same window, put bytes in stream
*
*
* The following chart illustrates the bytes required for encoding characters
* in each possible way
*
*
* UNICODE MODE
* Characters in a row with same index
* tag encountered 1 2 3 4
* ---------------------------------------------------------------
* none 2 4 6 8
*
* quote Unicode 3 6 9 12
*
* window not define window 3 4 5 6 byte
* defined cost
* window switch to window 2 3 4 5
* defined
*/
public final class UnicodeCompressor implements SCSU
{
//==========================
// Class variables
//==========================
/** For quick identification of a byte as a single-byte mode tag */
private static boolean [] sSingleTagTable = {
// table generated by CompressionTableGenerator
false, true, true, true, true, true, true, true, true, false,
false, true, true, false, true, true, true, true, true, true,
true, true, true, true, true, true, true, true, true, true,
true, true, false, false, false, false, false, false,false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false,
false
};
/**
 * For quick identification of a byte as a unicode mode tag.
 * Indexed by byte value (<TT>0x00 - 0xFF</TT>).  An entry is
 * <TT>true</TT> when the byte value is a tag in Unicode mode, in which
 * case the compressor emits a quote tag before a character whose high
 * byte has that value.
 */
private static final boolean [] sUnicodeTagTable = buildUnicodeTagTable();

/**
 * Build the Unicode mode tag table.
 * The tags are exactly the byte values <TT>0xE0 - 0xF2</TT>.
 * @return A 256-entry table, indexed by byte value.
 */
private static boolean [] buildUnicodeTagTable()
{
    boolean [] table = new boolean [256];

    for(int b = 0xE0; b <= 0xF2; b++) {
        table[b] = true;
    }

    return table;
}
//==========================
// Instance variables
//==========================
/** Index into <TT>fOffsets</TT> of the currently active dynamic window */
private int fCurrentWindow = 0;
/**
 * Start offsets of the dynamic compression windows.  The array itself
 * is never replaced; individual offsets change when a window is
 * (re)defined.
 */
private final int [] fOffsets = new int [ NUMWINDOWS ];
/** Current compression mode (single-byte mode or Unicode mode) */
private int fMode = SINGLEBYTEMODE;
/**
 * Keeps count of times a character index was encountered while no
 * window was defined for it
 */
private final int [] fIndexCount = new int [ MAXINDEX + 1 ];
/**
 * One time stamp per dynamic window.  A stamp is set from
 * <TT>fTimeStamp</TT> when the window is defined or switched to, and is
 * what the least-recently-defined search compares.
 */
private final int [] fTimeStamps = new int [ NUMWINDOWS ];
/** The current time stamp; incremented before it is handed to a window */
private int fTimeStamp = 0;
/**
 * Construct a compressor in its initial state.
 * The dynamic windows, time stamps, index counts and mode all receive
 * their default values by way of <TT>reset()</TT>.
 * @see #reset
 * @draft ICU 2.4
 */
public UnicodeCompressor()
{
    // all default state lives in reset() so that a reused compressor
    // and a fresh one start out identically
    this.reset();
}
/**
 * Compress an entire string into a newly allocated byte array.
 * @param buffer The string to compress.
 * @return A byte array containing the compressed characters.
 * @see #compress(char [], int, int)
 * @draft ICU 2.4
 */
public static byte [] compress(String buffer)
{
    // the array holds exactly the string's characters, so its length
    // is the limit of the run to compress
    final char [] chars = buffer.toCharArray();
    return compress(chars, 0, chars.length);
}
/**
 * Compress a Unicode character array into a byte array.
 * The whole run <TT>start</TT> (inclusive) to <TT>limit</TT>
 * (exclusive) is compressed; the output buffer is enlarged as needed so
 * that no input is ever left behind.
 * @param buffer The character buffer to compress.
 * @param start The start of the character run to compress.
 * @param limit The limit of the character run to compress.
 * @return A byte array containing the compressed characters.
 * @see #compress(String)
 * @draft ICU 2.4
 */
public static byte [] compress(char [] buffer,
                               int start,
                               int limit)
{
    UnicodeCompressor comp = new UnicodeCompressor();

    // First guess: three bytes per character (tag, hibyte, lobyte) plus
    // one mode-change tag.  This covers most input but is NOT a hard
    // bound, so it is only used as the initial capacity.  The arithmetic
    // is done in long so that a very large run cannot wrap around to a
    // tiny buffer.  The buffer must be at least 4 bytes in size.
    long estimate = 3L * ((long) limit - (long) start) + 1L;
    int len = (int) Math.min((long) Integer.MAX_VALUE,
                             Math.max(4L, estimate));
    byte [] temp = new byte [len];

    int [] charsRead = new int [1];
    int byteCount = 0;
    int pos = start;

    for(;;) {
        byteCount += comp.compress(buffer, pos, limit, charsRead,
                                   temp, byteCount, temp.length);
        pos += charsRead[0];

        // done once every character of the run has been consumed
        if(pos >= limit)
            break;

        // The compressor stopped early because it wanted more room than
        // was left.  No single character takes more than four bytes and
        // the compressor never asks for more than four free bytes, so
        // four bytes per remaining character always suffices and always
        // leaves the minimum of four free bytes it insists on.
        long grown = (long) byteCount + 4L * ((long) limit - (long) pos);
        byte [] bigger =
            new byte [(int) Math.min((long) Integer.MAX_VALUE, grown)];
        System.arraycopy(temp, 0, bigger, 0, byteCount);
        temp = bigger;
    }

    // trim to the number of bytes actually written
    byte [] result = new byte [byteCount];
    System.arraycopy(temp, 0, result, 0, byteCount);
    return result;
}
/**
 * Compress a Unicode character array into a byte array.
 *
 * This function will only consume input that can be completely
 * output.  When there is not enough room left in
 * <TT>byteBuffer</TT> for the character being examined, that character
 * is left unread and the compressor's bookkeeping for it is undone, so
 * that calling again with more room continues as if the character had
 * never been looked at.
 *
 * @param charBuffer The character buffer to compress.
 * @param charBufferStart The start of the character run to compress.
 * @param charBufferLimit The limit of the character run to compress.
 * @param charsRead A one-element array.  If not null, on return
 * the number of characters read from charBuffer.
 * @param byteBuffer A buffer to receive the compressed data.  This
 * buffer must be at minimum four bytes in size.
 * @param byteBufferStart The starting offset to which to write
 * compressed data.
 * @param byteBufferLimit The limiting offset for writing compressed data.
 * @return The number of bytes written to byteBuffer.
 * @throws IllegalArgumentException if <TT>byteBuffer</TT>, or the range
 * from <TT>byteBufferStart</TT> to <TT>byteBufferLimit</TT>, is smaller
 * than four bytes.
 * @draft ICU 2.4
 */
public int compress(char [] charBuffer,
                    int charBufferStart,
                    int charBufferLimit,
                    int [] charsRead,
                    byte [] byteBuffer,
                    int byteBufferStart,
                    int byteBufferLimit)
{
    // the current position in the target byte buffer
    int bytePos = byteBufferStart;

    // the current position in the source unicode character buffer
    int ucPos = charBufferStart;

    // the current unicode character from the source buffer
    int curUC = INVALIDCHAR;

    // the index for the current character
    int curIndex = -1;

    // look ahead
    int nextUC = INVALIDCHAR;
    int forwardUC = INVALIDCHAR;

    // temporary for window searching
    int whichWindow = 0;

    // high and low bytes of the current unicode character
    int hiByte = 0;
    int loByte = 0;

    // byteBuffer must be at least 4 bytes in size
    if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4)
        throw new IllegalArgumentException("byteBuffer.length < 4");

    mainLoop:
    while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
        switch(fMode) {
        // main single byte mode compression loop
        case SINGLEBYTEMODE:
            singleByteModeLoop:
            while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
                // get current char
                curUC = charBuffer[ucPos++];

                // get next char
                if(ucPos < charBufferLimit)
                    nextUC = charBuffer[ucPos];
                else
                    nextUC = INVALIDCHAR;

                // chars less than 0x0080 (excluding tags) go straight
                // in stream
                if(curUC < 0x0080) {
                    loByte = curUC & 0xFF;

                    // we need to check and make sure we don't
                    // accidentally write a single byte mode tag to
                    // the stream unless it's quoted
                    if(sSingleTagTable[loByte]) {
                        // make sure there is enough room to write both
                        // bytes; if not, rewind the source stream and
                        // break out
                        if( (bytePos + 1) >= byteBufferLimit)
                            { --ucPos; break mainLoop; }

                        // since we know the byte is less than 0x80,
                        // SQUOTE0 will use static window 0, or ASCII
                        byteBuffer[bytePos++] = (byte) SQUOTE0;
                    }

                    byteBuffer[bytePos++] = (byte) loByte;
                }

                // if the char belongs to current window, convert it
                // to a byte by adding the generic compression offset
                // and subtracting the window's offset
                else if(inDynamicWindow(curUC, fCurrentWindow) ) {
                    byteBuffer[bytePos++] = (byte)
                        (curUC - fOffsets[ fCurrentWindow ]
                         + COMPRESSIONOFFSET);
                }

                // if char is not in compressible range, either switch
                // to or quote from unicode
                else if( ! isCompressible(curUC) ) {
                    // only check next character if it is valid
                    if(nextUC != INVALIDCHAR && isCompressible(nextUC)) {
                        // make sure there is enough room to write all
                        // three bytes; if not, rewind the source
                        // stream and break out
                        if( (bytePos + 2) >= byteBufferLimit)
                            { --ucPos; break mainLoop; }

                        byteBuffer[bytePos++] = (byte) SQUOTEU;
                        byteBuffer[bytePos++] = (byte) (curUC >>> 8);
                        byteBuffer[bytePos++] = (byte) (curUC & 0xFF);
                    }
                    else {
                        // make sure there is enough room to write all
                        // four bytes; if not, rewind the source stream
                        // and break out
                        if((bytePos + 3) >= byteBufferLimit)
                            { --ucPos; break mainLoop; }

                        byteBuffer[bytePos++] = (byte) SCHANGEU;

                        hiByte = curUC >>> 8;
                        loByte = curUC & 0xFF;

                        if(sUnicodeTagTable[hiByte])
                            // add quote Unicode tag
                            byteBuffer[bytePos++] = (byte) UQUOTEU;

                        byteBuffer[bytePos++] = (byte) hiByte;
                        byteBuffer[bytePos++] = (byte) loByte;

                        fMode = UNICODEMODE;
                        break singleByteModeLoop;
                    }
                }

                // if the char is in a currently defined dynamic
                // window, figure out which one, and either switch to
                // it or quote from it
                else if((whichWindow = findDynamicWindow(curUC))
                        != INVALIDWINDOW ) {
                    // look ahead
                    if( (ucPos + 1) < charBufferLimit )
                        forwardUC = charBuffer[ucPos + 1];
                    else
                        forwardUC = INVALIDCHAR;

                    // all three chars in same window, switch to that
                    // window; inDynamicWindow will return false for
                    // INVALIDCHAR
                    if(inDynamicWindow(nextUC, whichWindow)
                       && inDynamicWindow(forwardUC, whichWindow)) {
                        // make sure there is enough room to write both
                        // bytes; if not, rewind the source stream and
                        // break out.  findDynamicWindow bumped the
                        // window's time stamp, so undo that as well:
                        // the char will be looked up again on retry.
                        if( (bytePos + 1) >= byteBufferLimit) {
                            --fTimeStamps[whichWindow];
                            --ucPos;
                            break mainLoop;
                        }

                        byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow);
                        byteBuffer[bytePos++] = (byte)
                            (curUC - fOffsets[whichWindow]
                             + COMPRESSIONOFFSET);
                        fTimeStamps [ whichWindow ] = ++fTimeStamp;
                        fCurrentWindow = whichWindow;
                    }

                    // either only next char or neither in same
                    // window, so quote
                    else {
                        // make sure there is enough room to write both
                        // bytes; if not, undo the time stamp bump,
                        // rewind the source stream and break out
                        if((bytePos + 1) >= byteBufferLimit) {
                            --fTimeStamps[whichWindow];
                            --ucPos;
                            break mainLoop;
                        }

                        byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
                        byteBuffer[bytePos++] = (byte)
                            (curUC - fOffsets[whichWindow]
                             + COMPRESSIONOFFSET);
                    }
                }

                // if a static window is defined, and the following
                // character is not in that static window, quote from
                // the static window.  Note: to quote from a static
                // window, don't add 0x80
                else if((whichWindow = findStaticWindow(curUC))
                        != INVALIDWINDOW
                        && ! inStaticWindow(nextUC, whichWindow) ) {
                    // make sure there is enough room to write both
                    // bytes; if not, rewind the source stream and
                    // break out
                    if((bytePos + 1) >= byteBufferLimit)
                        { --ucPos; break mainLoop; }

                    byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
                    byteBuffer[bytePos++] = (byte)
                        (curUC - sOffsets[whichWindow]);
                }

                // if a window is not defined, decide if we want to
                // define a new one or switch to unicode mode
                else {
                    // determine index for current char (char is
                    // compressible)
                    curIndex = makeIndex(curUC);
                    fIndexCount[curIndex]++;

                    // look ahead
                    if((ucPos + 1) < charBufferLimit)
                        forwardUC = charBuffer[ucPos + 1];
                    else
                        forwardUC = INVALIDCHAR;

                    // if we have encountered this index at least once
                    // before, define a new window
                    // OR
                    // three chars in a row with same index, define a
                    // new window (makeIndex will return RESERVEDINDEX
                    // for INVALIDCHAR)
                    if((fIndexCount[curIndex] > 1) ||
                       (curIndex == makeIndex(nextUC)
                        && curIndex == makeIndex(forwardUC))) {
                        // make sure there is enough room to write all
                        // three bytes; if not, rewind the source
                        // stream and break out.  The index count is
                        // taken back too, otherwise the retried char
                        // would be counted twice.
                        if( (bytePos + 2) >= byteBufferLimit) {
                            --fIndexCount[curIndex];
                            --ucPos;
                            break mainLoop;
                        }

                        // get least recently defined window
                        whichWindow = getLRDefinedWindow();

                        byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow);
                        byteBuffer[bytePos++] = (byte) curIndex;
                        byteBuffer[bytePos++] = (byte)
                            (curUC - sOffsetTable[curIndex]
                             + COMPRESSIONOFFSET);

                        fOffsets[whichWindow] = sOffsetTable[curIndex];
                        fCurrentWindow = whichWindow;
                        fTimeStamps [whichWindow] = ++fTimeStamp;
                    }

                    // only two chars in a row with same index, so
                    // switch to unicode mode (makeIndex will return
                    // RESERVEDINDEX for INVALIDCHAR)
                    // OR
                    // three chars have different indices, so switch
                    // to unicode mode
                    else {
                        // make sure there is enough room to write all
                        // four bytes; if not, take back the index
                        // count, rewind the source stream and break out
                        if((bytePos + 3) >= byteBufferLimit) {
                            --fIndexCount[curIndex];
                            --ucPos;
                            break mainLoop;
                        }

                        byteBuffer[bytePos++] = (byte) SCHANGEU;

                        hiByte = curUC >>> 8;
                        loByte = curUC & 0xFF;

                        if(sUnicodeTagTable[hiByte])
                            // add quote Unicode tag
                            byteBuffer[bytePos++] = (byte) UQUOTEU;

                        byteBuffer[bytePos++] = (byte) hiByte;
                        byteBuffer[bytePos++] = (byte) loByte;

                        fMode = UNICODEMODE;
                        break singleByteModeLoop;
                    }
                }
            }
            break;

        case UNICODEMODE:
            // main unicode mode compression loop
            unicodeModeLoop:
            while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
                // get current char
                curUC = charBuffer[ucPos++];

                // get next char
                if( ucPos < charBufferLimit )
                    nextUC = charBuffer[ucPos];
                else
                    nextUC = INVALIDCHAR;

                // if the current char is uncompressible, or the next
                // one is, put the current char's bytes in the stream
                if( ! isCompressible(curUC)
                    || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) {
                    // make sure there is enough room to write all
                    // three bytes; if not, rewind the source stream
                    // and break out
                    if( (bytePos + 2) >= byteBufferLimit)
                        { --ucPos; break mainLoop; }

                    hiByte = curUC >>> 8;
                    loByte = curUC & 0xFF;

                    if(sUnicodeTagTable[ hiByte ])
                        // add quote Unicode tag
                        byteBuffer[bytePos++] = (byte) UQUOTEU;

                    byteBuffer[bytePos++] = (byte) hiByte;
                    byteBuffer[bytePos++] = (byte) loByte;
                }

                // bytes less than 0x80 can go straight in the stream,
                // but in single-byte mode
                else if(curUC < 0x0080) {
                    loByte = curUC & 0xFF;

                    // if two chars in a row below 0x80 and the
                    // current char is not a single-byte mode tag,
                    // switch to single-byte mode
                    if(nextUC != INVALIDCHAR
                       && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) {
                        // make sure there is enough room to write both
                        // bytes; if not, rewind the source stream and
                        // break out
                        if( (bytePos + 1) >= byteBufferLimit)
                            { --ucPos; break mainLoop; }

                        // use the last-active window
                        whichWindow = fCurrentWindow;
                        byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
                        byteBuffer[bytePos++] = (byte) loByte;

                        fTimeStamps [whichWindow] = ++fTimeStamp;
                        fMode = SINGLEBYTEMODE;
                        break unicodeModeLoop;
                    }

                    // otherwise, just write the bytes to the stream
                    // (this will cover the case of only 1 char less
                    // than 0x80 and single-byte mode tags)
                    else {
                        // make sure there is enough room to write both
                        // bytes; if not, rewind the source stream and
                        // break out
                        if((bytePos + 1) >= byteBufferLimit)
                            { --ucPos; break mainLoop; }

                        // since the character is less than 0x80, the
                        // high byte is always 0x00 - no need for
                        // (curUC >>> 8)
                        byteBuffer[bytePos++] = (byte) 0x00;
                        byteBuffer[bytePos++] = (byte) loByte;
                    }
                }

                // figure out if the current char is in a defined window
                else if((whichWindow = findDynamicWindow(curUC))
                        != INVALIDWINDOW ) {
                    // if two chars in a row in the same window,
                    // switch to that window and go to single-byte mode;
                    // inDynamicWindow will return false for INVALIDCHAR
                    if(inDynamicWindow(nextUC, whichWindow)) {
                        // make sure there is enough room to write both
                        // bytes; if not, undo the time stamp bump made
                        // by findDynamicWindow, rewind the source
                        // stream and break out
                        if((bytePos + 1) >= byteBufferLimit) {
                            --fTimeStamps[whichWindow];
                            --ucPos;
                            break mainLoop;
                        }

                        byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
                        byteBuffer[bytePos++] = (byte)
                            (curUC - fOffsets[whichWindow]
                             + COMPRESSIONOFFSET);

                        fTimeStamps [ whichWindow ] = ++fTimeStamp;
                        fCurrentWindow = whichWindow;
                        fMode = SINGLEBYTEMODE;
                        break unicodeModeLoop;
                    }

                    // otherwise, just quote the unicode for the char
                    else {
                        // make sure there is enough room to write all
                        // three bytes; if not, undo the time stamp
                        // bump, rewind the source stream and break out
                        if((bytePos + 2) >= byteBufferLimit) {
                            --fTimeStamps[whichWindow];
                            --ucPos;
                            break mainLoop;
                        }

                        hiByte = curUC >>> 8;
                        loByte = curUC & 0xFF;

                        if(sUnicodeTagTable[ hiByte ])
                            // add quote Unicode tag
                            byteBuffer[bytePos++] = (byte) UQUOTEU;

                        byteBuffer[bytePos++] = (byte) hiByte;
                        byteBuffer[bytePos++] = (byte) loByte;
                    }
                }

                // char is not in a defined window
                else {
                    // determine index for current char (char is
                    // compressible)
                    curIndex = makeIndex(curUC);
                    fIndexCount[curIndex]++;

                    // look ahead
                    if( (ucPos + 1) < charBufferLimit )
                        forwardUC = charBuffer[ucPos + 1];
                    else
                        forwardUC = INVALIDCHAR;

                    // if we have encountered this index at least once
                    // before, define a new window for it
                    // OR
                    // if three chars in a row with the same index,
                    // define a new window (makeIndex will return
                    // RESERVEDINDEX for INVALIDCHAR)
                    if((fIndexCount[curIndex] > 1) ||
                       (curIndex == makeIndex(nextUC)
                        && curIndex == makeIndex(forwardUC))) {
                        // make sure there is enough room to write all
                        // three bytes; if not, take back the index
                        // count, rewind the source stream and break out
                        if((bytePos + 2) >= byteBufferLimit) {
                            --fIndexCount[curIndex];
                            --ucPos;
                            break mainLoop;
                        }

                        // get least recently defined window
                        whichWindow = getLRDefinedWindow();

                        byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow);
                        byteBuffer[bytePos++] = (byte) curIndex;
                        byteBuffer[bytePos++] = (byte)
                            (curUC - sOffsetTable[curIndex]
                             + COMPRESSIONOFFSET);

                        fOffsets[whichWindow] = sOffsetTable[curIndex];
                        fCurrentWindow = whichWindow;
                        fTimeStamps [whichWindow] = ++fTimeStamp;
                        fMode = SINGLEBYTEMODE;
                        break unicodeModeLoop;
                    }

                    // otherwise just quote the unicode, and save our
                    // windows for longer runs
                    else {
                        // make sure there is enough room to write all
                        // three bytes; if not, take back the index
                        // count, rewind the source stream and break out
                        if((bytePos + 2) >= byteBufferLimit) {
                            --fIndexCount[curIndex];
                            --ucPos;
                            break mainLoop;
                        }

                        hiByte = curUC >>> 8;
                        loByte = curUC & 0xFF;

                        if(sUnicodeTagTable[ hiByte ])
                            // add quote Unicode tag
                            byteBuffer[bytePos++] = (byte) UQUOTEU;

                        byteBuffer[bytePos++] = (byte) hiByte;
                        byteBuffer[bytePos++] = (byte) loByte;
                    }
                }
            }
        } // end switch
    }

    // fill in output parameter
    if(charsRead != null)
        charsRead [0] = (ucPos - charBufferStart);

    // return # of bytes written
    return (bytePos - byteBufferStart);
}
/**
 * Return the compressor to the state it had when it was constructed:
 * default dynamic window offsets, all time stamps and index counts at
 * zero, window 0 current, single-byte mode.
 * @draft ICU 2.4
 */
public void reset()
{
    // default dynamic window offsets, in window order
    final int [] defaultOffsets = {
        0x0080,   // Latin-1
        0x00C0,   // Latin-1 Supplement + Latin Extended-A
        0x0400,   // Cyrillic
        0x0600,   // Arabic
        0x0900,   // Devanagari
        0x3040,   // Hiragana
        0x30A0,   // Katakana
        0xFF00    // Fullwidth ASCII
    };
    System.arraycopy(defaultOffsets, 0, fOffsets, 0, defaultOffsets.length);

    // forget when windows were defined and which indices were seen
    java.util.Arrays.fill(fTimeStamps, 0);
    java.util.Arrays.fill(fIndexCount, 0);

    fMode = SINGLEBYTEMODE;   // Always start in single-byte mode
    fCurrentWindow = 0;       // Make current window Latin-1
    fTimeStamp = 0;           // Reset current time stamp
}
//==========================
// Determine the index for a character
//==========================
/**
 * Create the index value for a character.
 * For more information on this function, refer to table X-3
 * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
 * @param c The character in question.
 * @return An index for c, or RESERVEDINDEX when <TT>c</TT> falls in
 * none of the ranges handled here (below <TT>0x0080</TT>, in
 * <TT>0x3400 - 0xDFFF</TT>, or outside <TT>0x0000 - 0xFFFF</TT>).
 */
private static int makeIndex(int c)
{
    // check the predefined indices
    if(c >= 0x00C0 && c < 0x0140)
        return LATININDEX;
    else if(c >= 0x0250 && c < 0x02D0)
        return IPAEXTENSIONINDEX;
    else if(c >= 0x0370 && c < 0x03F0)
        return GREEKINDEX;
    else if(c >= 0x0530 && c < 0x0590)
        return ARMENIANINDEX;
    else if(c >= 0x3040 && c < 0x30A0)
        return HIRAGANAINDEX;
    else if(c >= 0x30A0 && c < 0x3120)
        return KATAKANAINDEX;
    // the upper bound is exclusive, like the ones above, so that
    // U+FF9F (the last half-width katakana character) is included
    else if(c >= 0xFF60 && c < 0xFFA0)
        return HALFWIDTHKATAKANAINDEX;

    // calculate index
    else if(c >= 0x0080 && c < 0x3400)
        return (c / 0x80) & 0xFF;
    else if(c >= 0xE000 && c <= 0xFFFF)
        return ((c - 0xAC00) / 0x80) & 0xFF;

    // no index exists for this value
    else {
        return RESERVEDINDEX;
    }
}
//==========================
// Check if a given character fits in a window
//==========================
/**
 * Test whether a character lies inside one of the dynamic windows,
 * that is, within the 0x80 values starting at the window's offset.
 * @param c The character to test
 * @param whichWindow The dynamic window to test
 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
 * false otherwise.
 */
private boolean inDynamicWindow(int c,
                                int whichWindow)
{
    final int base = fOffsets[whichWindow];
    return base <= c && c < (base + 0x80);
}
/**
 * Test whether a character lies inside one of the static windows,
 * that is, within the 0x80 values starting at the window's offset.
 * @param c The character to test
 * @param whichWindow The static window to test
 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
 * false otherwise.
 */
private static boolean inStaticWindow(int c,
                                      int whichWindow)
{
    final int base = sOffsets[whichWindow];
    return base <= c && c < (base + 0x80);
}
//==========================
// Check if a given character is compressible
//==========================
/**
 * Test whether a character is compressible.  Everything outside the
 * range <TT>0x3400</TT> (inclusive) to <TT>0xE000</TT> (exclusive)
 * counts as compressible.
 * @param c The character to test.
 * @return true if the <TT>c</TT> is compressible, false otherwise.
 */
private static boolean isCompressible(int c)
{
    final boolean inUncompressibleRange = (c >= 0x3400 && c < 0xE000);
    return ! inUncompressibleRange;
}
//==========================
// Check if a window is defined for a given character
//==========================
/**
 * Look for a dynamic window that contains a certain character.
 * The windows are searched from the highest index down, and the first
 * match wins.  Note that this is not a pure query: the time stamp of
 * the window that is found is incremented by one.
 * @param c The character in question
 * @return The dynamic window containing <TT>c</TT>, or
 * INVALIDWINDOW if not defined.
 */
private int findDynamicWindow(int c)
{
    int window = NUMWINDOWS;

    while(--window >= 0) {
        if( ! inDynamicWindow(c, window))
            continue;

        // side effect: nudge the time stamp of the matching window
        ++fTimeStamps[window];
        return window;
    }

    return INVALIDWINDOW;
}
/**
 * Look for a static window that contains a certain character.
 * The windows are searched from the highest index down, and the first
 * match wins.
 * @param c The character in question
 * @return The static window containing <TT>c</TT>, or
 * INVALIDWINDOW if not defined.
 */
private static int findStaticWindow(int c)
{
    int found = INVALIDWINDOW;

    for(int window = NUMSTATICWINDOWS; window-- > 0; ) {
        if(inStaticWindow(c, window)) {
            found = window;
            break;
        }
    }

    return found;
}
//==========================
// Find the least-recently used window
//==========================
/**
 * Find the dynamic window with the smallest time stamp.
 * The windows are scanned from the highest index down and only a
 * strictly smaller stamp replaces the candidate, so among windows with
 * equal stamps the one with the highest index is chosen.
 * @return The index of that window.
 */
private int getLRDefinedWindow()
{
    int oldestWindow = INVALIDWINDOW;
    int oldestStamp = Integer.MAX_VALUE;

    for(int window = NUMWINDOWS; window-- > 0; ) {
        final int stamp = fTimeStamps[window];
        if(stamp < oldestStamp) {
            oldestStamp = stamp;
            oldestWindow = window;
        }
    }

    return oldestWindow;
}
};