| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-2016, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.text; |
| |
| /** |
| * A compression engine implementing the Standard Compression Scheme |
| * for Unicode (SCSU) as outlined in <A |
| * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical |
| * Report #6</A>. |
| * |
| * <P>The SCSU works by using dynamically positioned <EM>windows</EM> |
| * consisting of 128 consecutive characters in Unicode. During compression, |
| * characters within a window are encoded in the compressed stream as the bytes |
| * <TT>0x80 - 0xFF</TT>. The SCSU provides transparency for the characters |
| * (bytes) between <TT>U+0000 - U+00FF</TT>. The SCSU approximates the |
| * storage size of traditional character sets, for example 1 byte per |
| * character for ASCII or Latin-1 text, and 2 bytes per character for CJK |
| * ideographs.</P> |
| * |
| * <P><STRONG>USAGE</STRONG></P> |
| * |
| * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a |
| * straightforward manner to compress simple strings:</P> |
| * |
| * <PRE> |
| * String s = ... ; // get string from somewhere |
| * byte [] compressed = UnicodeCompressor.compress(s); |
| * </PRE> |
| * |
| * <P>The static methods have a fairly large memory footprint. |
| * For finer-grained control over memory usage, |
| * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing |
| * iterative compression:</P> |
| * |
| * <PRE> |
| * // Compress an array "chars" of length "len" using a buffer of 512 bytes |
| * // to the OutputStream "out" |
| * |
| * UnicodeCompressor myCompressor = new UnicodeCompressor(); |
| * final static int BUFSIZE = 512; |
| * byte [] byteBuffer = new byte [ BUFSIZE ]; |
| * int bytesWritten = 0; |
| * int [] unicharsRead = new int [1]; |
| * int totalCharsCompressed = 0; |
| * int totalBytesWritten = 0; |
| * |
| * do { |
| * // do the compression |
| * bytesWritten = myCompressor.compress(chars, totalCharsCompressed, |
| * len, unicharsRead, |
| * byteBuffer, 0, BUFSIZE); |
| * |
| * // do something with the current set of bytes |
| * out.write(byteBuffer, 0, bytesWritten); |
| * |
| * // update the no. of characters compressed |
| * totalCharsCompressed += unicharsRead[0]; |
| * |
| * // update the no. of bytes written |
| * totalBytesWritten += bytesWritten; |
| * |
| * } while(totalCharsCompressed < len); |
| * |
| * myCompressor.reset(); // reuse compressor |
| * </PRE> |
| * |
| * @see UnicodeDecompressor |
| * |
| * @author Stephen F. Booth |
| * @stable ICU 2.4 |
| */ |
| |
| /* |
| * |
| * COMPRESSION STRATEGY |
| * |
| * Single Byte Mode |
| * |
| * There are three relevant cases. |
| * If the character is in the current window or is a pass-through ASCII |
| * character (U+0000, U+0009, U+000A, U+000D, U+0020 - U+007F), the |
| * character is placed directly in the stream as a single byte. |
| * |
| * 1. Current character is in defined, inactive window. |
| * 2. Current character is in undefined window. |
| * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF). |
| * |
| * 1. Current character is in defined, inactive window |
| * A. Look ahead two characters |
| * B. If both following characters in same window as current character, |
| * switch to defined window |
| * C. If only next character is in same window as current character, |
| * quote defined window |
| * D. If neither of following characters is in same window as current, |
| * quote defined window |
| * |
| * 2. Current character is in undefined window |
| * A. Look ahead two characters |
| * B. If both following characters in same window as current character, |
| * define new window |
| * C. If only next character in same window as current character, |
| * switch to Unicode mode |
| * NOTE: This costs us one extra byte. However, |
| * since we have a limited number of windows to work with, it is |
| * assumed the cost will pay off later in savings from a window with |
| * more characters in it. |
| * D. If neither of following characters in same window as current, |
| * switch to Unicode mode. Alternative to above: just quote |
| * Unicode (same byte cost) |
| * |
| * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF) |
| * A. Look ahead one character |
| * B. If next character in non-compressible region, switch to |
| * Unicode mode |
| * C. If next character not in non-compressible region, quote Unicode |
| * |
| * |
| * The following chart illustrates the bytes required for encoding characters |
| * in each possible way |
| * |
| * |
| * SINGLE BYTE MODE |
| * Characters in a row with same index |
| * tag encountered 1 2 3 4 |
| * --------------------------------------------------------------- |
| * none (in current window) 1 2 3 4 |
| * |
| * quote Unicode 3 6 9 12 |
| * |
| * window not switch to Unicode 3 5 7 9 byte |
| * defined define window 3 4 5 6 cost |
| * |
| * window switch to window 2 3 4 5 |
| * defined quote window 2 4 6 8 |
| * |
| * Unicode Mode |
| * |
| * There are two relevant cases. |
| * If the character is in the non-compressible region |
| * (U+3400 - U+DFFF), the character is simply written to the |
| * stream as a pair of bytes. |
| * |
| * 1. Current character is in defined, inactive window. |
| * 2. Current character is in undefined window. |
| * |
| * 1.Current character is in defined, inactive window |
| * A. Look ahead one character |
| * B. If next character has same index as current character, |
| * switch to defined window (and switch to single-byte mode) |
| * C. If not, just put bytes in stream |
| * |
| * |
| * 2. Current character is in undefined window |
| * A. Look ahead two characters |
| * B. If both in same window as current character, define window |
| * (and switch to single-byte mode) |
| * C. If only next character in same window, just put bytes in stream |
| * NOTE: This costs us one extra byte. However, |
| * since we have a limited number of windows to work with, it is |
| * assumed the cost will pay off later in savings from a window with |
| * more characters in it. |
| * D. If neither in same window, put bytes in stream |
| * |
| * |
| * The following chart illustrates the bytes required for encoding characters |
| * in each possible way |
| * |
| * |
| * UNICODE MODE |
| * Characters in a row with same index |
| * tag encountered 1 2 3 4 |
| * --------------------------------------------------------------- |
| * none 2 4 6 8 |
| * |
| * quote Unicode 3 6 9 12 |
| * |
| * window not define window 3 4 5 6 byte |
| * defined cost |
| * window switch to window 2 3 4 5 |
| * defined |
| */ |
| public final class UnicodeCompressor implements SCSU |
| { |
| //========================== |
| // Class variables |
| //========================== |
| |
| /** For quick identification of a byte as a single-byte mode tag */ |
| private static boolean [] sSingleTagTable = { |
| // table generated by CompressionTableGenerator |
| false, true, true, true, true, true, true, true, true, false, |
| false, true, true, false, true, true, true, true, true, true, |
| true, true, true, true, true, true, true, true, true, true, |
| true, true, false, false, false, false, false, false,false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false |
| }; |
| |
| /** For quick identification of a byte as a unicode mode tag */ |
| private static boolean [] sUnicodeTagTable = { |
| // table generated by CompressionTableGenerator |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false, false, false, false, false, false, false, true, |
| true, true, true, true, true, true, true, true, true, true, |
| true, true, true, true, true, true, true, true, false, false, |
| false, false, false, false, false, false, false, false, false, |
| false, false |
| }; |
| |
| //========================== |
| // Instance variables |
| //========================== |
| |
| /** Alias to current dynamic window */ |
| private int fCurrentWindow = 0; |
| |
| /** Dynamic compression window offsets */ |
| private int [] fOffsets = new int [ NUMWINDOWS ]; |
| |
| /** Current compression mode */ |
| private int fMode = SINGLEBYTEMODE; |
| |
| /** Keeps count of times character indices are encountered */ |
| private int [] fIndexCount = new int [ MAXINDEX + 1 ]; |
| |
| /** The time stamps indicate when a window was last defined */ |
| private int [] fTimeStamps = new int [ NUMWINDOWS ]; |
| |
| /** The current time stamp */ |
| private int fTimeStamp = 0; |
| |
| |
| /** |
| * Create a UnicodeCompressor. |
| * Sets all windows to their default values. |
| * @see #reset |
| * @stable ICU 2.4 |
| */ |
| public UnicodeCompressor() |
| { |
| reset(); // initialize to defaults |
| } |
| |
| /** |
| * Compress a string into a byte array. |
| * @param buffer The string to compress. |
| * @return A byte array containing the compressed characters. |
| * @see #compress(char [], int, int) |
| * @stable ICU 2.4 |
| */ |
| public static byte [] compress(String buffer) |
| { |
| return compress(buffer.toCharArray(), 0, buffer.length()); |
| } |
| |
| /** |
| * Compress a Unicode character array into a byte array. |
| * @param buffer The character buffer to compress. |
| * @param start The start of the character run to compress. |
| * @param limit The limit of the character run to compress. |
| * @return A byte array containing the compressed characters. |
| * @see #compress(String) |
| * @stable ICU 2.4 |
| */ |
| public static byte [] compress(char [] buffer, |
| int start, |
| int limit) |
| { |
| UnicodeCompressor comp = new UnicodeCompressor(); |
| |
| // use a buffer that we know will never overflow |
| // in the worst case, each character will take 3 bytes |
| // to encode: UQU, hibyte, lobyte. In this case, the |
| // compressed data will look like: SCU, UQU, hibyte, lobyte, ... |
| // buffer must be at least 4 bytes in size |
| int len = Math.max(4, 3 * (limit - start) + 1); |
| byte [] temp = new byte [len]; |
| |
| int byteCount = comp.compress(buffer, start, limit, null, |
| temp, 0, len); |
| |
| byte [] result = new byte [byteCount]; |
| System.arraycopy(temp, 0, result, 0, byteCount); |
| return result; |
| } |
| |
| /** |
| * Compress a Unicode character array into a byte array. |
| * |
| * This function will only consume input that can be completely |
| * output. |
| * |
| * @param charBuffer The character buffer to compress. |
| * @param charBufferStart The start of the character run to compress. |
| * @param charBufferLimit The limit of the character run to compress. |
| * @param charsRead A one-element array. If not null, on return |
| * the number of characters read from charBuffer. |
| * @param byteBuffer A buffer to receive the compressed data. This |
| * buffer must be at minimum four bytes in size. |
| * @param byteBufferStart The starting offset to which to write |
| * compressed data. |
| * @param byteBufferLimit The limiting offset for writing compressed data. |
| * @return The number of bytes written to byteBuffer. |
| * @stable ICU 2.4 |
| */ |
| public int compress(char [] charBuffer, |
| int charBufferStart, |
| int charBufferLimit, |
| int [] charsRead, |
| byte [] byteBuffer, |
| int byteBufferStart, |
| int byteBufferLimit) |
| { |
| // the current position in the target byte buffer |
| int bytePos = byteBufferStart; |
| |
| // the current position in the source unicode character buffer |
| int ucPos = charBufferStart; |
| |
| // the current unicode character from the source buffer |
| int curUC = INVALIDCHAR; |
| |
| // the index for the current character |
| int curIndex = -1; |
| |
| // look ahead |
| int nextUC = INVALIDCHAR; |
| int forwardUC = INVALIDCHAR; |
| |
| // temporary for window searching |
| int whichWindow = 0; |
| |
| // high and low bytes of the current unicode character |
| int hiByte = 0; |
| int loByte = 0; |
| |
| |
| // byteBuffer must be at least 4 bytes in size |
| if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4) |
| throw new IllegalArgumentException("byteBuffer.length < 4"); |
| |
| mainLoop: |
| while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { |
| switch(fMode) { |
| // main single byte mode compression loop |
| case SINGLEBYTEMODE: |
| singleByteModeLoop: |
| while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { |
| // get current char |
| curUC = charBuffer[ucPos++]; |
| |
| // get next char |
| if(ucPos < charBufferLimit) |
| nextUC = charBuffer[ucPos]; |
| else |
| nextUC = INVALIDCHAR; |
| |
| // chars less than 0x0080 (excluding tags) go straight |
| // in stream |
| if(curUC < 0x0080) { |
| loByte = curUC & 0xFF; |
| |
| // we need to check and make sure we don't |
| // accidentally write a single byte mode tag to |
| // the stream unless it's quoted |
| if(sSingleTagTable[loByte]) { |
| // make sure there is enough room to |
| // write both bytes if not, rewind the |
| // source stream and break out |
| if( (bytePos + 1) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| // since we know the byte is less than 0x80, SQUOTE0 |
| // will use static window 0, or ASCII |
| byteBuffer[bytePos++] = (byte) SQUOTE0; |
| } |
| |
| byteBuffer[bytePos++] = (byte) loByte; |
| } |
| |
| // if the char belongs to current window, convert it |
| // to a byte by adding the generic compression offset |
| // and subtracting the window's offset |
| else if(inDynamicWindow(curUC, fCurrentWindow) ) { |
| byteBuffer[bytePos++] = (byte) |
| (curUC - fOffsets[ fCurrentWindow ] |
| + COMPRESSIONOFFSET); |
| } |
| |
| // if char is not in compressible range, either switch to or |
| // quote from unicode |
| else if( ! isCompressible(curUC) ) { |
| // only check next character if it is valid |
| if(nextUC != INVALIDCHAR && isCompressible(nextUC)) { |
| // make sure there is enough room to |
| // write all three bytes if not, |
| // rewind the source stream and break |
| // out |
| if( (bytePos + 2) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| byteBuffer[bytePos++] = (byte) SQUOTEU; |
| byteBuffer[bytePos++] = (byte) (curUC >>> 8); |
| byteBuffer[bytePos++] = (byte) (curUC & 0xFF); |
| } |
| else { |
| // make sure there is enough room to |
| // write all four bytes if not, rewind |
| // the source stream and break out |
| if((bytePos + 3) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| byteBuffer[bytePos++] = (byte) SCHANGEU; |
| |
| hiByte = curUC >>> 8; |
| loByte = curUC & 0xFF; |
| |
| if(sUnicodeTagTable[hiByte]) |
| // add quote Unicode tag |
| byteBuffer[bytePos++] = (byte) UQUOTEU; |
| |
| byteBuffer[bytePos++] = (byte) hiByte; |
| byteBuffer[bytePos++] = (byte) loByte; |
| |
| fMode = UNICODEMODE; |
| break singleByteModeLoop; |
| } |
| } |
| |
| // if the char is in a currently defined dynamic |
| // window, figure out which one, and either switch to |
| // it or quote from it |
| else if((whichWindow = findDynamicWindow(curUC)) |
| != INVALIDWINDOW ) { |
| // look ahead |
| if( (ucPos + 1) < charBufferLimit ) |
| forwardUC = charBuffer[ucPos + 1]; |
| else |
| forwardUC = INVALIDCHAR; |
| |
| // all three chars in same window, switch to that |
| // window inDynamicWindow will return false for |
| // INVALIDCHAR |
| if(inDynamicWindow(nextUC, whichWindow) |
| && inDynamicWindow(forwardUC, whichWindow)) { |
| // make sure there is enough room to |
| // write both bytes if not, rewind the |
| // source stream and break out |
| if( (bytePos + 1) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow); |
| byteBuffer[bytePos++] = (byte) |
| (curUC - fOffsets[whichWindow] |
| + COMPRESSIONOFFSET); |
| fTimeStamps [ whichWindow ] = ++fTimeStamp; |
| fCurrentWindow = whichWindow; |
| } |
| |
| // either only next char or neither in same |
| // window, so quote |
| else { |
| // make sure there is enough room to |
| // write both bytes if not, rewind the |
| // source stream and break out |
| if((bytePos + 1) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow); |
| byteBuffer[bytePos++] = (byte) |
| (curUC - fOffsets[whichWindow] |
| + COMPRESSIONOFFSET); |
| } |
| } |
| |
| // if a static window is defined, and the following |
| // character is not in that static window, quote from |
| // the static window Note: to quote from a static |
| // window, don't add 0x80 |
| else if((whichWindow = findStaticWindow(curUC)) |
| != INVALIDWINDOW |
| && ! inStaticWindow(nextUC, whichWindow) ) { |
| // make sure there is enough room to write both |
| // bytes if not, rewind the source stream and |
| // break out |
| if((bytePos + 1) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow); |
| byteBuffer[bytePos++] = (byte) |
| (curUC - sOffsets[whichWindow]); |
| } |
| |
| // if a window is not defined, decide if we want to |
| // define a new one or switch to unicode mode |
| else { |
| // determine index for current char (char is compressible) |
| curIndex = makeIndex(curUC); |
| fIndexCount[curIndex]++; |
| |
| // look ahead |
| if((ucPos + 1) < charBufferLimit) |
| forwardUC = charBuffer[ucPos + 1]; |
| else |
| forwardUC = INVALIDCHAR; |
| |
| // if we have encountered this index at least once |
| // before, define a new window |
| // OR |
| // three chars in a row with same index, define a |
| // new window (makeIndex will return RESERVEDINDEX |
| // for INVALIDCHAR) |
| if((fIndexCount[curIndex] > 1) || |
| (curIndex == makeIndex(nextUC) |
| && curIndex == makeIndex(forwardUC))) { |
| // make sure there is enough room to write all |
| // three bytes if not, rewind the source |
| // stream and break out |
| if( (bytePos + 2) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| // get least recently defined window |
| whichWindow = getLRDefinedWindow(); |
| |
| byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow); |
| byteBuffer[bytePos++] = (byte) curIndex; |
| byteBuffer[bytePos++] = (byte) |
| (curUC - sOffsetTable[curIndex] |
| + COMPRESSIONOFFSET); |
| |
| fOffsets[whichWindow] = sOffsetTable[curIndex]; |
| fCurrentWindow = whichWindow; |
| fTimeStamps [whichWindow] = ++fTimeStamp; |
| } |
| |
| // only two chars in a row with same index, so |
| // switch to unicode mode (makeIndex will return |
| // RESERVEDINDEX for INVALIDCHAR) |
| // OR |
| // three chars have different indices, so switch |
| // to unicode mode |
| else { |
| // make sure there is enough room to write all |
| // four bytes if not, rewind the source stream |
| // and break out |
| if((bytePos + 3) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| byteBuffer[bytePos++] = (byte) SCHANGEU; |
| |
| hiByte = curUC >>> 8; |
| loByte = curUC & 0xFF; |
| |
| if(sUnicodeTagTable[hiByte]) |
| // add quote Unicode tag |
| byteBuffer[bytePos++] = (byte) UQUOTEU; |
| |
| byteBuffer[bytePos++] = (byte) hiByte; |
| byteBuffer[bytePos++] = (byte) loByte; |
| |
| fMode = UNICODEMODE; |
| break singleByteModeLoop; |
| } |
| } |
| } |
| break; |
| |
| case UNICODEMODE: |
| // main unicode mode compression loop |
| unicodeModeLoop: |
| while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { |
| // get current char |
| curUC = charBuffer[ucPos++]; |
| |
| // get next char |
| if( ucPos < charBufferLimit ) |
| nextUC = charBuffer[ucPos]; |
| else |
| nextUC = INVALIDCHAR; |
| |
| // if we have two uncompressible chars in a row, |
| // put the current char's bytes in the stream |
| if( ! isCompressible(curUC) |
| || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) { |
| // make sure there is enough room to write all three bytes |
| // if not, rewind the source stream and break out |
| if( (bytePos + 2) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| hiByte = curUC >>> 8; |
| loByte = curUC & 0xFF; |
| |
| if(sUnicodeTagTable[ hiByte ]) |
| // add quote Unicode tag |
| byteBuffer[bytePos++] = (byte) UQUOTEU; |
| |
| byteBuffer[bytePos++] = (byte) hiByte; |
| byteBuffer[bytePos++] = (byte) loByte; |
| } |
| |
| // bytes less than 0x80 can go straight in the stream, |
| // but in single-byte mode |
| else if(curUC < 0x0080) { |
| loByte = curUC & 0xFF; |
| |
| // if two chars in a row below 0x80 and the |
| // current char is not a single-byte mode tag, |
| // switch to single-byte mode |
| if(nextUC != INVALIDCHAR |
| && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) { |
| // make sure there is enough room to |
| // write both bytes if not, rewind the |
| // source stream and break out |
| if( (bytePos + 1) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| // use the last-active window |
| whichWindow = fCurrentWindow; |
| byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow); |
| byteBuffer[bytePos++] = (byte) loByte; |
| |
| //fCurrentWindow = 0; |
| fTimeStamps [whichWindow] = ++fTimeStamp; |
| fMode = SINGLEBYTEMODE; |
| break unicodeModeLoop; |
| } |
| |
| // otherwise, just write the bytes to the stream |
| // (this will cover the case of only 1 char less than 0x80 |
| // and single-byte mode tags) |
| else { |
| // make sure there is enough room to |
| // write both bytes if not, rewind the |
| // source stream and break out |
| if((bytePos + 1) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| // since the character is less than 0x80, the |
| // high byte is always 0x00 - no need for |
| // (curUC >>> 8) |
| byteBuffer[bytePos++] = (byte) 0x00; |
| byteBuffer[bytePos++] = (byte) loByte; |
| } |
| } |
| |
| // figure out if the current char is in a defined window |
| else if((whichWindow = findDynamicWindow(curUC)) |
| != INVALIDWINDOW ) { |
| // if two chars in a row in the same window, |
| // switch to that window and go to single-byte mode |
| // inDynamicWindow will return false for INVALIDCHAR |
| if(inDynamicWindow(nextUC, whichWindow)) { |
| // make sure there is enough room to |
| // write both bytes if not, rewind the |
| // source stream and break out |
| if((bytePos + 1) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow); |
| byteBuffer[bytePos++] = (byte) |
| (curUC - fOffsets[whichWindow] |
| + COMPRESSIONOFFSET); |
| |
| fTimeStamps [ whichWindow ] = ++fTimeStamp; |
| fCurrentWindow = whichWindow; |
| fMode = SINGLEBYTEMODE; |
| break unicodeModeLoop; |
| } |
| |
| // otherwise, just quote the unicode for the char |
| else { |
| // make sure there is enough room to |
| // write all three bytes if not, |
| // rewind the source stream and break |
| // out |
| if((bytePos + 2) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| hiByte = curUC >>> 8; |
| loByte = curUC & 0xFF; |
| |
| if(sUnicodeTagTable[ hiByte ]) |
| // add quote Unicode tag |
| byteBuffer[bytePos++] = (byte) UQUOTEU; |
| |
| byteBuffer[bytePos++] = (byte) hiByte; |
| byteBuffer[bytePos++] = (byte) loByte; |
| } |
| } |
| |
| // char is not in a defined window |
| else { |
| // determine index for current char (char is compressible) |
| curIndex = makeIndex(curUC); |
| fIndexCount[curIndex]++; |
| |
| // look ahead |
| if( (ucPos + 1) < charBufferLimit ) |
| forwardUC = charBuffer[ucPos + 1]; |
| else |
| forwardUC = INVALIDCHAR; |
| |
| // if we have encountered this index at least once |
| // before, define a new window for it that hasn't |
| // previously been redefined |
| // OR |
| // if three chars in a row with the same index, |
| // define a new window (makeIndex will return |
| // RESERVEDINDEX for INVALIDCHAR) |
| if((fIndexCount[curIndex] > 1) || |
| (curIndex == makeIndex(nextUC) |
| && curIndex == makeIndex(forwardUC))) { |
| // make sure there is enough room to |
| // write all three bytes if not, |
| // rewind the source stream and break |
| // out |
| if((bytePos + 2) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| // get least recently defined window |
| whichWindow = getLRDefinedWindow(); |
| |
| byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow); |
| byteBuffer[bytePos++] = (byte) curIndex; |
| byteBuffer[bytePos++] = (byte) |
| (curUC - sOffsetTable[curIndex] |
| + COMPRESSIONOFFSET); |
| |
| fOffsets[whichWindow] = sOffsetTable[curIndex]; |
| fCurrentWindow = whichWindow; |
| fTimeStamps [whichWindow] = ++fTimeStamp; |
| fMode = SINGLEBYTEMODE; |
| break unicodeModeLoop; |
| } |
| |
| // otherwise just quote the unicode, and save our |
| // windows for longer runs |
| else { |
| // make sure there is enough room to |
| // write all three bytes if not, |
| // rewind the source stream and break |
| // out |
| if((bytePos + 2) >= byteBufferLimit) |
| { --ucPos; break mainLoop; } |
| |
| hiByte = curUC >>> 8; |
| loByte = curUC & 0xFF; |
| |
| if(sUnicodeTagTable[ hiByte ]) |
| // add quote Unicode tag |
| byteBuffer[bytePos++] = (byte) UQUOTEU; |
| |
| byteBuffer[bytePos++] = (byte) hiByte; |
| byteBuffer[bytePos++] = (byte) loByte; |
| } |
| } |
| } |
| } // end switch |
| } |
| |
| // fill in output parameter |
| if(charsRead != null) |
| charsRead [0] = (ucPos - charBufferStart); |
| |
| // return # of bytes written |
| return (bytePos - byteBufferStart); |
| } |
| |
| /** |
| * Reset the compressor to its initial state. |
| * @stable ICU 2.4 |
| */ |
| public void reset() |
| { |
| int i; |
| |
| // reset dynamic windows |
| fOffsets[0] = 0x0080; // Latin-1 |
| fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A |
| fOffsets[2] = 0x0400; // Cyrillic |
| fOffsets[3] = 0x0600; // Arabic |
| fOffsets[4] = 0x0900; // Devanagari |
| fOffsets[5] = 0x3040; // Hiragana |
| fOffsets[6] = 0x30A0; // Katakana |
| fOffsets[7] = 0xFF00; // Fullwidth ASCII |
| |
| |
| // reset time stamps |
| for(i = 0; i < NUMWINDOWS; i++) { |
| fTimeStamps[i] = 0; |
| } |
| |
| // reset count of seen indices |
| for(i = 0; i <= MAXINDEX; i++ ) { |
| fIndexCount[i] = 0; |
| } |
| |
| fTimeStamp = 0; // Reset current time stamp |
| fCurrentWindow = 0; // Make current window Latin-1 |
| fMode = SINGLEBYTEMODE; // Always start in single-byte mode |
| } |
| |
| //========================== |
| // Determine the index for a character |
| //========================== |
| |
| /** |
| * Create the index value for a character. |
| * For more information on this function, refer to table X-3 |
| * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>. |
| * @param c The character in question. |
| * @return An index for c |
| */ |
| private static int makeIndex(int c) |
| { |
| // check the predefined indices |
| if(c >= 0x00C0 && c < 0x0140) |
| return LATININDEX; |
| else if(c >= 0x0250 && c < 0x02D0) |
| return IPAEXTENSIONINDEX; |
| else if(c >= 0x0370 && c < 0x03F0) |
| return GREEKINDEX; |
| else if(c >= 0x0530 && c < 0x0590) |
| return ARMENIANINDEX; |
| else if(c >= 0x3040 && c < 0x30A0) |
| return HIRAGANAINDEX; |
| else if(c >= 0x30A0 && c < 0x3120) |
| return KATAKANAINDEX; |
| else if(c >= 0xFF60 && c < 0xFF9F) |
| return HALFWIDTHKATAKANAINDEX; |
| |
| // calculate index |
| else if(c >= 0x0080 && c < 0x3400) |
| return (c / 0x80) & 0xFF; |
| else if(c >= 0xE000 && c <= 0xFFFF) |
| return ((c - 0xAC00) / 0x80) & 0xFF; |
| |
| // should never happen |
| else { |
| return RESERVEDINDEX; |
| } |
| } |
| |
| //========================== |
| // Check if a given character fits in a window |
| //========================== |
| |
| /** |
| * Determine if a character is in a dynamic window. |
| * @param c The character to test |
| * @param whichWindow The dynamic window the test |
| * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>, |
| * false otherwise. |
| */ |
| private boolean inDynamicWindow(int c, |
| int whichWindow) |
| { |
| return (c >= fOffsets[whichWindow] |
| && c < (fOffsets[whichWindow] + 0x80)); |
| } |
| |
| /** |
| * Determine if a character is in a static window. |
| * @param c The character to test |
| * @param whichWindow The static window the test |
| * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>, |
| * false otherwise. |
| */ |
| private static boolean inStaticWindow(int c, |
| int whichWindow) |
| { |
| return (c >= sOffsets[whichWindow] |
| && c < (sOffsets[whichWindow] + 0x80)); |
| } |
| |
| //========================== |
| // Check if a given character is compressible |
| //========================== |
| |
| /** |
| * Determine if a character is compressible. |
| * @param c The character to test. |
| * @return true if the <TT>c</TT> is compressible, false otherwise. |
| */ |
| private static boolean isCompressible(int c) |
| { |
| return (c < 0x3400 || c >= 0xE000); |
| } |
| |
| //========================== |
| // Check if a window is defined for a given character |
| //========================== |
| |
| /** |
| * Determine if a dynamic window for a certain character is defined |
| * @param c The character in question |
| * @return The dynamic window containing <TT>c</TT>, or |
| * INVALIDWINDOW if not defined. |
| */ |
| private int findDynamicWindow(int c) |
| { |
| // supposedly faster to count down |
| //for(int i = 0; i < NUMWINDOWS; i++) { |
| for(int i = NUMWINDOWS - 1; i >= 0; --i) { |
| if(inDynamicWindow(c, i)) { |
| ++fTimeStamps[i]; |
| return i; |
| } |
| } |
| |
| return INVALIDWINDOW; |
| } |
| |
| /** |
| * Determine if a static window for a certain character is defined |
| * @param c The character in question |
| * @return The static window containing <TT>c</TT>, or |
| * INVALIDWINDOW if not defined. |
| */ |
| private static int findStaticWindow(int c) |
| { |
| // supposedly faster to count down |
| //for(int i = 0; i < NUMSTATICWINDOWS; i++) { |
| for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) { |
| if(inStaticWindow(c, i)) { |
| return i; |
| } |
| } |
| |
| return INVALIDWINDOW; |
| } |
| |
| //========================== |
| // Find the least-recently used window |
| //========================== |
| |
| /** Find the least-recently defined window */ |
| private int getLRDefinedWindow() |
| { |
| int leastRU = Integer.MAX_VALUE; |
| int whichWindow = INVALIDWINDOW; |
| |
| // find least recently used window |
| // supposedly faster to count down |
| //for( int i = 0; i < NUMWINDOWS; i++ ) { |
| for(int i = NUMWINDOWS - 1; i >= 0; --i ) { |
| if( fTimeStamps[i] < leastRU ) { |
| leastRU = fTimeStamps[i]; |
| whichWindow = i; |
| } |
| } |
| |
| return whichWindow; |
| } |
| |
| } |