| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-2016, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.icu.text; |
| |
| /** |
| * A decompression engine implementing the Standard Compression Scheme |
| * for Unicode (SCSU) as outlined in <A |
| * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical |
| * Report #6</A>. |
| * |
| * <P><STRONG>USAGE</STRONG></P> |
| * |
| * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a |
| * straightforward manner to decompress simple strings:</P> |
| * |
| * <PRE> |
| * byte [] compressed = ... ; // get compressed bytes from somewhere |
| * String result = UnicodeDecompressor.decompress(compressed); |
| * </PRE> |
| * |
| * <P>The static methods have a fairly large memory footprint. |
| * For finer-grained control over memory usage, |
| * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing |
| * iterative decompression:</P> |
| * |
| * <PRE> |
| * // Decompress an array "bytes" of length "len" using a buffer of 512 chars |
| * // to the Writer "out" |
| * |
| * UnicodeDecompressor myDecompressor = new UnicodeDecompressor(); |
| * final static int BUFSIZE = 512; |
| * char [] charBuffer = new char [ BUFSIZE ]; |
| * int charsWritten = 0; |
| * int [] bytesRead = new int [1]; |
| * int totalBytesDecompressed = 0; |
| * int totalCharsWritten = 0; |
| * |
| * do { |
| * // do the decompression |
| * charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed, |
| * len, bytesRead, |
| * charBuffer, 0, BUFSIZE); |
| * |
| * // do something with the current set of chars |
| * out.write(charBuffer, 0, charsWritten); |
| * |
| * // update the no. of bytes decompressed |
| * totalBytesDecompressed += bytesRead[0]; |
| * |
| * // update the no. of chars written |
| * totalCharsWritten += charsWritten; |
| * |
| * } while(totalBytesDecompressed < len); |
| * |
| * myDecompressor.reset(); // reuse decompressor |
| * </PRE> |
| * |
| * <P>Decompression is performed according to the standard set forth in |
| * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical |
| * Report #6</A></P> |
| * |
| * @see UnicodeCompressor |
| * |
| * @author Stephen F. Booth |
| * @stable ICU 2.4 |
| */ |
| public final class UnicodeDecompressor implements SCSU |
| { |
| //========================== |
| // Instance variables |
| //========================== |
| |
| /** Alias to current dynamic window */ |
| private int fCurrentWindow = 0; |
| |
| /** Dynamic compression window offsets */ |
| private int [] fOffsets = new int [ NUMWINDOWS ]; |
| |
| /** Current compression mode */ |
| private int fMode = SINGLEBYTEMODE; |
| |
| /** Size of our internal buffer */ |
| private final static int BUFSIZE = 3; |
| |
| /** Internal buffer for saving state */ |
| private byte [] fBuffer = new byte [BUFSIZE]; |
| |
| /** Number of characters in our internal buffer */ |
| private int fBufferLength = 0; |
| |
| |
| /** |
| * Create a UnicodeDecompressor. |
| * Sets all windows to their default values. |
| * @see #reset |
| * @stable ICU 2.4 |
| */ |
| public UnicodeDecompressor(){ |
| reset(); // initialize to defaults |
| } |
| |
| /** |
| * Decompress a byte array into a String. |
| * @param buffer The byte array to decompress. |
| * @return A String containing the decompressed characters. |
| * @see #decompress(byte [], int, int) |
| * @stable ICU 2.4 |
| */ |
| public static String decompress(byte [] buffer){ |
| char [] buf = decompress(buffer, 0, buffer.length); |
| return new String(buf); |
| } |
| |
| /** |
| * Decompress a byte array into a Unicode character array. |
| * @param buffer The byte array to decompress. |
| * @param start The start of the byte run to decompress. |
| * @param limit The limit of the byte run to decompress. |
| * @return A character array containing the decompressed bytes. |
| * @see #decompress(byte []) |
| * @stable ICU 2.4 |
| */ |
| public static char [] decompress(byte [] buffer, int start, int limit) { |
| UnicodeDecompressor comp = new UnicodeDecompressor(); |
| |
| // use a buffer we know will never overflow |
| // in the worst case, each byte will decompress |
| // to a surrogate pair (buffer must be at least 2 chars) |
| int len = Math.max(2, 2 * (limit - start)); |
| char [] temp = new char [len]; |
| |
| int charCount = comp.decompress(buffer, start, limit, null, |
| temp, 0, len); |
| |
| char [] result = new char [charCount]; |
| System.arraycopy(temp, 0, result, 0, charCount); |
| return result; |
| } |
| |
| /** |
| * Decompress a byte array into a Unicode character array. |
| * |
| * This function will either completely fill the output buffer, |
| * or consume the entire input. |
| * |
| * @param byteBuffer The byte buffer to decompress. |
| * @param byteBufferStart The start of the byte run to decompress. |
| * @param byteBufferLimit The limit of the byte run to decompress. |
| * @param bytesRead A one-element array. If not null, on return |
| * the number of bytes read from byteBuffer. |
| * @param charBuffer A buffer to receive the decompressed data. |
| * This buffer must be at minimum two characters in size. |
| * @param charBufferStart The starting offset to which to write |
| * decompressed data. |
| * @param charBufferLimit The limiting offset for writing |
| * decompressed data. |
| * @return The number of Unicode characters written to charBuffer. |
| * @stable ICU 2.4 |
| */ |
| public int decompress(byte [] byteBuffer, |
| int byteBufferStart, |
| int byteBufferLimit, |
| int [] bytesRead, |
| char [] charBuffer, |
| int charBufferStart, |
| int charBufferLimit) |
| { |
| // the current position in the source byte buffer |
| int bytePos = byteBufferStart; |
| |
| // the current position in the target char buffer |
| int ucPos = charBufferStart; |
| |
| // the current byte from the source buffer |
| int aByte = 0x00; |
| |
| |
| // charBuffer must be at least 2 chars in size |
| if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2) |
| throw new IllegalArgumentException("charBuffer.length < 2"); |
| |
| // if our internal buffer isn't empty, flush its contents |
| // to the output buffer before doing any more decompression |
| if(fBufferLength > 0) { |
| |
| int newBytes = 0; |
| |
| // fill the buffer completely, to guarantee one full character |
| if(fBufferLength != BUFSIZE) { |
| newBytes = fBuffer.length - fBufferLength; |
| |
| // verify there are newBytes bytes in byteBuffer |
| if(byteBufferLimit - byteBufferStart < newBytes) |
| newBytes = byteBufferLimit - byteBufferStart; |
| |
| System.arraycopy(byteBuffer, byteBufferStart, |
| fBuffer, fBufferLength, newBytes); |
| } |
| |
| // reset buffer length to 0 before recursive call |
| fBufferLength = 0; |
| |
| // call self recursively to decompress the buffer |
| int count = decompress(fBuffer, 0, fBuffer.length, null, |
| charBuffer, charBufferStart, |
| charBufferLimit); |
| |
| // update the positions into the arrays |
| ucPos += count; |
| bytePos += newBytes; |
| } |
| |
| // the main decompression loop |
| mainLoop: |
| while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { |
| switch(fMode) { |
| case SINGLEBYTEMODE: |
| // single-byte mode decompression loop |
| singleByteModeLoop: |
| while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { |
| aByte = byteBuffer[bytePos++] & 0xFF; |
| switch(aByte) { |
| // All bytes from 0x80 through 0xFF are remapped |
| // to chars or surrogate pairs according to the |
| // currently active window |
| case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: |
| case 0x85: case 0x86: case 0x87: case 0x88: case 0x89: |
| case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: |
| case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: |
| case 0x94: case 0x95: case 0x96: case 0x97: case 0x98: |
| case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: |
| case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: |
| case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: |
| case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: |
| case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: |
| case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: |
| case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: |
| case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0: |
| case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5: |
| case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA: |
| case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: |
| case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: |
| case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9: |
| case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE: |
| case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3: |
| case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8: |
| case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED: |
| case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2: |
| case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7: |
| case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC: |
| case 0xFD: case 0xFE: case 0xFF: |
| // For offsets <= 0xFFFF, convert to a single char |
| // by adding the window's offset and subtracting |
| // the generic compression offset |
| if(fOffsets[ fCurrentWindow ] <= 0xFFFF) { |
| charBuffer[ucPos++] = (char) |
| (aByte + fOffsets[ fCurrentWindow ] |
| - COMPRESSIONOFFSET); |
| } |
| // For offsets > 0x10000, convert to a surrogate pair by |
| // normBase = window's offset - 0x10000 |
| // high surr. = 0xD800 + (normBase >> 10) |
| // low surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F) |
| else { |
| // make sure there is enough room to write |
| // both characters |
| // if not, save state and break out |
| if((ucPos + 1) >= charBufferLimit) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| int normalizedBase = fOffsets[ fCurrentWindow ] |
| - 0x10000; |
| charBuffer[ucPos++] = (char) |
| (0xD800 + (normalizedBase >> 10)); |
| charBuffer[ucPos++] = (char) |
| (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F)); |
| } |
| break; |
| |
| // bytes from 0x20 through 0x7F are treated as ASCII and |
| // are remapped to chars by padding the high byte |
| // (this is the same as quoting from static window 0) |
| // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D) |
| // are treated as ASCII as well |
| case 0x00: case 0x09: case 0x0A: case 0x0D: |
| case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: |
| case 0x25: case 0x26: case 0x27: case 0x28: case 0x29: |
| case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: |
| case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33: |
| case 0x34: case 0x35: case 0x36: case 0x37: case 0x38: |
| case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: |
| case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42: |
| case 0x43: case 0x44: case 0x45: case 0x46: case 0x47: |
| case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: |
| case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51: |
| case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: |
| case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B: |
| case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60: |
| case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: |
| case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A: |
| case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F: |
| case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: |
| case 0x75: case 0x76: case 0x77: case 0x78: case 0x79: |
| case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E: |
| case 0x7F: |
| charBuffer[ucPos++] = (char) aByte; |
| break; |
| |
| // quote unicode |
| case SQUOTEU: |
| // verify we have two bytes following tag |
| // if not, save state and break out |
| if( (bytePos + 1) >= byteBufferLimit ) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| aByte = byteBuffer[bytePos++]; |
| charBuffer[ucPos++] = (char) |
| (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); |
| break; |
| |
| // switch to Unicode mode |
| case SCHANGEU: |
| fMode = UNICODEMODE; |
| break singleByteModeLoop; |
| //break; |
| |
| // handle all quote tags |
| case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3: |
| case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7: |
| // verify there is a byte following the tag |
| // if not, save state and break out |
| if(bytePos >= byteBufferLimit) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| // if the byte is in the range 0x00 - 0x7F, use |
| // static window n otherwise, use dynamic window n |
| int dByte = byteBuffer[bytePos++] & 0xFF; |
| charBuffer[ucPos++] = (char) |
| (dByte+ (dByte >= 0x00 && dByte < 0x80 |
| ? sOffsets[aByte - SQUOTE0] |
| : (fOffsets[aByte - SQUOTE0] |
| - COMPRESSIONOFFSET))); |
| break; |
| |
| // handle all change tags |
| case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3: |
| case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7: |
| fCurrentWindow = aByte - SCHANGE0; |
| break; |
| |
| // handle all define tags |
| case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3: |
| case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7: |
| // verify there is a byte following the tag |
| // if not, save state and break out |
| if(bytePos >= byteBufferLimit) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| fCurrentWindow = aByte - SDEFINE0; |
| fOffsets[fCurrentWindow] = |
| sOffsetTable[byteBuffer[bytePos++] & 0xFF]; |
| break; |
| |
| // handle define extended tag |
| case SDEFINEX: |
| // verify we have two bytes following tag |
| // if not, save state and break out |
| if((bytePos + 1) >= byteBufferLimit ) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| aByte = byteBuffer[bytePos++] & 0xFF; |
| fCurrentWindow = (aByte & 0xE0) >> 5; |
| fOffsets[fCurrentWindow] = 0x10000 + |
| (0x80 * (((aByte & 0x1F) << 8) |
| | (byteBuffer[bytePos++] & 0xFF))); |
| break; |
| |
| // reserved, shouldn't happen |
| case SRESERVED: |
| break; |
| |
| } // end switch |
| } // end while |
| break; |
| |
| case UNICODEMODE: |
| // unicode mode decompression loop |
| unicodeModeLoop: |
| while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { |
| aByte = byteBuffer[bytePos++] & 0xFF; |
| switch(aByte) { |
| // handle all define tags |
| case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3: |
| case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7: |
| // verify there is a byte following tag |
| // if not, save state and break out |
| if(bytePos >= byteBufferLimit ) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| fCurrentWindow = aByte - UDEFINE0; |
| fOffsets[fCurrentWindow] = |
| sOffsetTable[byteBuffer[bytePos++] & 0xFF]; |
| fMode = SINGLEBYTEMODE; |
| break unicodeModeLoop; |
| //break; |
| |
| // handle define extended tag |
| case UDEFINEX: |
| // verify we have two bytes following tag |
| // if not, save state and break out |
| if((bytePos + 1) >= byteBufferLimit ) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| aByte = byteBuffer[bytePos++] & 0xFF; |
| fCurrentWindow = (aByte & 0xE0) >> 5; |
| fOffsets[fCurrentWindow] = 0x10000 + |
| (0x80 * (((aByte & 0x1F) << 8) |
| | (byteBuffer[bytePos++] & 0xFF))); |
| fMode = SINGLEBYTEMODE; |
| break unicodeModeLoop; |
| //break; |
| |
| // handle all change tags |
| case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3: |
| case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7: |
| fCurrentWindow = aByte - UCHANGE0; |
| fMode = SINGLEBYTEMODE; |
| break unicodeModeLoop; |
| //break; |
| |
| // quote unicode |
| case UQUOTEU: |
| // verify we have two bytes following tag |
| // if not, save state and break out |
| if(bytePos >= byteBufferLimit - 1) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| aByte = byteBuffer[bytePos++]; |
| charBuffer[ucPos++] = (char) |
| (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); |
| break; |
| |
| default: |
| // verify there is a byte following tag |
| // if not, save state and break out |
| if(bytePos >= byteBufferLimit ) { |
| --bytePos; |
| System.arraycopy(byteBuffer, bytePos, |
| fBuffer, 0, |
| byteBufferLimit - bytePos); |
| fBufferLength = byteBufferLimit - bytePos; |
| bytePos += fBufferLength; |
| break mainLoop; |
| } |
| |
| charBuffer[ucPos++] = (char) |
| (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); |
| break; |
| |
| } // end switch |
| } // end while |
| break; |
| |
| } // end switch( fMode ) |
| } // end while |
| |
| // fill in output parameter |
| if(bytesRead != null) |
| bytesRead [0] = (bytePos - byteBufferStart); |
| |
| // return # of chars written |
| return (ucPos - charBufferStart); |
| } |
| |
| /** |
| * Reset the decompressor to its initial state. |
| * @stable ICU 2.4 |
| */ |
| public void reset() |
| { |
| // reset dynamic windows |
| fOffsets[0] = 0x0080; // Latin-1 |
| fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A |
| fOffsets[2] = 0x0400; // Cyrillic |
| fOffsets[3] = 0x0600; // Arabic |
| fOffsets[4] = 0x0900; // Devanagari |
| fOffsets[5] = 0x3040; // Hiragana |
| fOffsets[6] = 0x30A0; // Katakana |
| fOffsets[7] = 0xFF00; // Fullwidth ASCII |
| |
| |
| fCurrentWindow = 0; // Make current window Latin-1 |
| fMode = SINGLEBYTEMODE; // Always start in single-byte mode |
| fBufferLength = 0; // Empty buffer |
| } |
| } |