| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 1999, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * |
| * File scsu.c |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 05/17/99 stephen Creation (ported from java UnicodeCompressor.java) |
| * 09/21/99 stephen Updated to handle data splits on decompression. |
| ******************************************************************************* |
| */ |
| |
| #include <limits.h> |
| |
| #include "unicode/scsu.h" |
| |
| #include "cmemory.h" |
| |
/* Generic window shift: added to (char - window offset) to produce a
   single-byte-mode data byte, so such bytes are always >= 0x80 */
#define COMPRESSIONOFFSET 0x80

/* Indicates a window index is invalid.  Parenthesized so the negative
   literal stays a single operand wherever the macro is expanded. */
#define INVALIDWINDOW (-1)

/* Indicates a character doesn't exist in input (used for look-ahead past
   the end of the source).  Parenthesized for the same reason as above. */
#define INVALIDCHAR (-1)

/* Compression modes */
#define SINGLEBYTEMODE 0
#define UNICODEMODE    1

/* Reserved index value */
#define RESERVEDINDEX 0x00

/* Indices for scripts which cross a half-block boundary */
#define LATININDEX             0xF9
#define IPAEXTENSIONINDEX      0xFA
#define GREEKINDEX             0xFB
#define ARMENIANINDEX          0xFC
#define HIRAGANAINDEX          0xFD
#define KATAKANAINDEX          0xFE
#define HALFWIDTHKATAKANAINDEX 0xFF

/* Single-byte mode tags */
#define SDEFINEX  0x0B
/* 0x0C is a reserved value */
#define SRESERVED 0x0C
#define SQUOTEU   0x0E
#define SCHANGEU  0x0F

/* SQUOTEn: quote one character from window n */
#define SQUOTE0 0x01
#define SQUOTE1 0x02
#define SQUOTE2 0x03
#define SQUOTE3 0x04
#define SQUOTE4 0x05
#define SQUOTE5 0x06
#define SQUOTE6 0x07
#define SQUOTE7 0x08

/* SCHANGEn: make dynamic window n the current window */
#define SCHANGE0 0x10
#define SCHANGE1 0x11
#define SCHANGE2 0x12
#define SCHANGE3 0x13
#define SCHANGE4 0x14
#define SCHANGE5 0x15
#define SCHANGE6 0x16
#define SCHANGE7 0x17

/* SDEFINEn: redefine dynamic window n and make it current */
#define SDEFINE0 0x18
#define SDEFINE1 0x19
#define SDEFINE2 0x1A
#define SDEFINE3 0x1B
#define SDEFINE4 0x1C
#define SDEFINE5 0x1D
#define SDEFINE6 0x1E
#define SDEFINE7 0x1F

/* Unicode mode tags */

/* UCHANGEn: switch to single-byte mode with dynamic window n current */
#define UCHANGE0 0xE0
#define UCHANGE1 0xE1
#define UCHANGE2 0xE2
#define UCHANGE3 0xE3
#define UCHANGE4 0xE4
#define UCHANGE5 0xE5
#define UCHANGE6 0xE6
#define UCHANGE7 0xE7

/* UDEFINEn: redefine dynamic window n and switch to single-byte mode */
#define UDEFINE0 0xE8
#define UDEFINE1 0xE9
#define UDEFINE2 0xEA
#define UDEFINE3 0xEB
#define UDEFINE4 0xEC
#define UDEFINE5 0xED
#define UDEFINE6 0xEE
#define UDEFINE7 0xEF

#define UQUOTEU   0xF0
#define UDEFINEX  0xF1
/* 0xF2 is a reserved value */
#define URESERVED 0xF2
| |
| /* Local function prototypes */ |
| static int32_t scsu_makeIndex(int32_t c); |
| static bool_t scsu_inDynamicWindow(const UnicodeCompressor *comp, |
| int32_t c, |
| int32_t whichWindow); |
| static bool_t scsu_inStaticWindow(int32_t c, |
| int32_t whichWindow); |
| static bool_t scsu_isCompressible(int32_t c); |
| static int32_t scsu_findDynamicWindow(const UnicodeCompressor *comp, |
| int32_t c); |
| static int32_t scsu_findStaticWindow(int32_t c); |
| static int32_t scsu_getLRDefinedWindow(const UnicodeCompressor *comp); |
| |
| /* Static tables generated by CompressionTableGenerator */ |
| |
/**
 * For window offset mapping: sOffsetTable[i] is the window start offset
 * selected by index byte i (the byte written after a define-window tag).
 *   0x00 - 0x67 : i * 0x80
 *   0x68 - 0xA7 : i * 0x80 + 0xAC00
 *   0xA8 - 0xF8 : 0 (no offset assigned)
 *   0xF9 - 0xFF : fixed offsets for the scripts named by the *INDEX macros
 * The table is read-only, hence const.
 */
static const int32_t sOffsetTable [] = {
    /* 0x00 */ 0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
    /* 0x08 */ 0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
    /* 0x10 */ 0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
    /* 0x18 */ 0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
    /* 0x20 */ 0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
    /* 0x28 */ 0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
    /* 0x30 */ 0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
    /* 0x38 */ 0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
    /* 0x40 */ 0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
    /* 0x48 */ 0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
    /* 0x50 */ 0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
    /* 0x58 */ 0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
    /* 0x60 */ 0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3380,
    /* 0x68 */ 0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
    /* 0x70 */ 0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
    /* 0x78 */ 0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
    /* 0x80 */ 0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
    /* 0x88 */ 0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
    /* 0x90 */ 0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
    /* 0x98 */ 0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
    /* 0xA0 */ 0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
    /* 0xA8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xB0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xB8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xC0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xC8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xD0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xD8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xE0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xE8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xF0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xF8 */ 0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
};
| |
| /** For quick identification of a byte as a single-byte mode tag */ |
| static bool_t sSingleTagTable [] = { |
| FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, |
| FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, |
| TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, |
| TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE |
| }; |
| |
| /** For quick identification of a byte as a unicode mode tag */ |
| static bool_t sUnicodeTagTable [] = { |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, |
| TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, |
| TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, |
| FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, |
| FALSE |
| }; |
| |
/**
 * Static compression window offsets, indexed by static window number (0-7).
 * The table is read-only, hence const.
 */
static const int32_t sOffsets [] = {
    0x0000,   /* window 0: for quoting single-byte mode tags */
    0x0080,   /* window 1: Latin-1 Supplement */
    0x0100,   /* window 2: Latin Extended-A */
    0x0300,   /* window 3: Combining Diacritical Marks */
    0x2000,   /* window 4: General Punctuation */
    0x2080,   /* window 5: Currency Symbols */
    0x2100,   /* window 6: Letterlike Symbols and Number Forms */
    0x3000    /* window 7: CJK Symbols and Punctuation */
};
| |
| |
| void |
| scsu_init(UnicodeCompressor *comp) |
| { |
| /* initialize to defaults*/ |
| scsu_reset(comp); |
| } |
| |
| void |
| scsu_compress(UnicodeCompressor *comp, |
| uint8_t **target, |
| const uint8_t *targetLimit, |
| const UChar **source, |
| const UChar *sourceLimit, |
| UErrorCode *status) |
| { |
| /* the current position in the source unichar buffer*/ |
| const UChar *unicharBuffer = *source; |
| |
| /* the current position in the target byte buffer*/ |
| uint8_t *byteBuffer = *target; |
| |
| /* the current unicode character from the source buffer*/ |
| int32_t curUC = INVALIDCHAR; |
| |
| /* the index for the current character*/ |
| int32_t curIndex = -1; |
| |
| /* look ahead*/ |
| int32_t nextUC = INVALIDCHAR; |
| int32_t forwardUC = INVALIDCHAR; |
| |
| /* temporary for window searching*/ |
| int32_t whichWindow = 0; |
| |
| /* high and low bytes of the current unicode character*/ |
| int32_t hiByte = 0; |
| int32_t loByte = 0; |
| |
| |
| /* verify we weren't passed a failing error code */ |
| if(U_FAILURE(*status)) { |
| return; |
| } |
| /* verify the target buffer can hold at least 4 bytes */ |
| else if(targetLimit - byteBuffer < 4) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| mainLoop: |
| while( unicharBuffer < sourceLimit && byteBuffer < targetLimit) { |
| switch( comp->fMode ) { |
| |
| /* main single byte mode compression loop*/ |
| case SINGLEBYTEMODE: |
| while( unicharBuffer < sourceLimit && byteBuffer < targetLimit ) { |
| |
| /* get current char*/ |
| curUC = *unicharBuffer++; |
| |
| /* get next char*/ |
| if( unicharBuffer < sourceLimit ) |
| nextUC = *unicharBuffer; |
| else |
| nextUC = INVALIDCHAR; |
| |
| /* chars less than 0x0080 (excluding tags) go straight in |
| stream */ |
| if( curUC < 0x0080 ) { |
| loByte = curUC; |
| |
| /* we need to check and make sure we don't |
| accidentally write a single byte mode tag to |
| the stream unless it's quoted */ |
| if(sSingleTagTable[loByte]) { |
| /* make sure there is enough room to write |
| both bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 1) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| /* since we know the byte is less than 0x80, SQUOTE0 |
| will use static window 0, or Latin-1*/ |
| *byteBuffer++ = (uint8_t) SQUOTE0; |
| } |
| |
| *byteBuffer++ = (uint8_t) loByte; |
| } |
| |
| /* if the char belongs to current window, convert it |
| to a byte by adding the generic compression offset |
| and subtracting the window's offset*/ |
| else if(scsu_inDynamicWindow(comp, |
| curUC, comp->fCurrentWindow) ) { |
| *byteBuffer++ = (uint8_t) |
| (curUC - comp->fOffsets[ comp->fCurrentWindow ] |
| + COMPRESSIONOFFSET); |
| } |
| |
| /* if char is not in compressible range, either switch |
| to or quote from unicode*/ |
| else if( ! scsu_isCompressible(curUC) ) { |
| /* only check next character if it is valid*/ |
| if(nextUC != INVALIDCHAR && scsu_isCompressible(nextUC)) { |
| /* make sure there is enough room to write all |
| three bytes if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 2) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| *byteBuffer++ = (uint8_t) SQUOTEU; |
| *byteBuffer++ = (uint8_t) (curUC >> 8); |
| *byteBuffer++ = (uint8_t) curUC; |
| } |
| else { |
| /* make sure there is enough room to write all |
| four bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 3) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| *byteBuffer++ = (uint8_t) SCHANGEU; |
| |
| hiByte = curUC >> 8; |
| loByte = curUC; |
| |
| /* add quote Unicode tag */ |
| if( sUnicodeTagTable[hiByte] ) |
| *byteBuffer++ = (uint8_t) UQUOTEU; |
| |
| *byteBuffer++ = (uint8_t) hiByte; |
| *byteBuffer++ = (uint8_t) loByte; |
| |
| comp->fMode = UNICODEMODE; |
| |
| /* use a goto here for speed, to avoid having |
| to check fMode in the while loop at the top |
| of the case */ |
| goto mainLoop; |
| } |
| } |
| |
| /* if the char is in a currently defined dynamic |
| window, figure out which one, and either switch to |
| it or quote from it*/ |
| else if( (whichWindow = scsu_findDynamicWindow(comp, curUC)) |
| != INVALIDWINDOW ) { |
| /* look ahead*/ |
| if( (unicharBuffer + 1) < sourceLimit ) |
| forwardUC = *(unicharBuffer + 1); |
| else |
| forwardUC = INVALIDCHAR; |
| |
| /* all three chars in same window, switch to that |
| window- inDynamicWindow will return FALSE for |
| INVALIDCHAR*/ |
| if( scsu_inDynamicWindow(comp, nextUC, whichWindow) |
| && scsu_inDynamicWindow(comp, forwardUC, whichWindow)){ |
| /* make sure there is enough room to write |
| both bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 1) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| *byteBuffer++ = (uint8_t) (SCHANGE0 + whichWindow); |
| *byteBuffer++ = (uint8_t) |
| (curUC - comp->fOffsets[whichWindow] |
| + COMPRESSIONOFFSET); |
| comp->fTimeStamps [ whichWindow ] = ++(comp->fTimeStamp); |
| comp->fCurrentWindow = whichWindow; |
| } |
| |
| /* either only next char or neither in same |
| window, so quote*/ |
| else { |
| /* make sure there is enough room to write |
| both bytes and if not, rewind the source stream |
| and break out*/ |
| if( (byteBuffer + 1) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| *byteBuffer++ = (uint8_t) (SQUOTE0 + whichWindow); |
| *byteBuffer++ = (uint8_t) |
| (curUC - comp->fOffsets[whichWindow] |
| + COMPRESSIONOFFSET); |
| } |
| } |
| |
| /* if a static window is defined, and the following |
| character is not in that static window, quote from |
| the static window Note: to quote from a static |
| window, don't add 0x80*/ |
| else if( (whichWindow = scsu_findStaticWindow(curUC)) |
| != INVALIDWINDOW |
| && ! scsu_inStaticWindow(nextUC, whichWindow) ) { |
| /* make sure there is enough room to write both |
| bytes if not, rewind the source stream and |
| break out*/ |
| if( (byteBuffer + 1) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| *byteBuffer++ = (uint8_t) (SQUOTE0 + whichWindow); |
| *byteBuffer++ = (uint8_t) (curUC - sOffsets[whichWindow]); |
| } |
| |
| /* if a window is not defined, decide if we want to |
| define a new one or switch to unicode mode*/ |
| else { |
| /* determine index for current char (char is |
| compressible)*/ |
| curIndex = scsu_makeIndex(curUC); |
| comp->fIndexCount[curIndex]++; |
| |
| /* look ahead*/ |
| if( (unicharBuffer + 1) < sourceLimit ) |
| forwardUC = *(unicharBuffer + 1); |
| else |
| forwardUC = INVALIDCHAR; |
| |
| /* if we have encountered this index at least once |
| before, define a new window*/ |
| if( comp->fIndexCount[curIndex] > 1 ) { |
| /* make sure there is enough room to write all |
| three bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 2) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| /* get least recently defined window*/ |
| whichWindow = scsu_getLRDefinedWindow(comp); |
| |
| *byteBuffer++ = (uint8_t) (SDEFINE0 + whichWindow); |
| *byteBuffer++ = (uint8_t) curIndex; |
| *byteBuffer++ = (uint8_t) |
| (curUC - sOffsetTable[curIndex] |
| + COMPRESSIONOFFSET); |
| |
| comp->fOffsets[whichWindow] = sOffsetTable[curIndex]; |
| comp->fCurrentWindow = whichWindow; |
| comp->fTimeStamps [whichWindow] = ++(comp->fTimeStamp); |
| } |
| |
| /* three chars in a row with same index, define a |
| new window- makeIndex will return RESERVEDINDEX |
| for INVALIDCHAR*/ |
| else if( curIndex == scsu_makeIndex(nextUC) |
| && curIndex == scsu_makeIndex(forwardUC) ) { |
| /* make sure there is enough room to write all |
| three bytes if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 2) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| whichWindow = scsu_getLRDefinedWindow(comp); |
| |
| *byteBuffer++ = (uint8_t) (SDEFINE0 + whichWindow); |
| *byteBuffer++ = (uint8_t) curIndex; |
| *byteBuffer++ = (uint8_t) |
| (curUC - sOffsetTable[curIndex] |
| + COMPRESSIONOFFSET); |
| |
| comp->fOffsets[whichWindow] = sOffsetTable[curIndex]; |
| comp->fCurrentWindow = whichWindow; |
| comp->fTimeStamps [whichWindow] = ++(comp->fTimeStamp); |
| } |
| |
| /* only two chars in a row with same index, so |
| switch to unicode mode makeIndex will return |
| RESERVEDINDEX for INVALIDCHAR*/ |
| else if( curIndex == scsu_makeIndex(nextUC) |
| && curIndex != scsu_makeIndex(forwardUC) ) { |
| /* make sure there is enough room to write all |
| four bytes if not, rewind the source stream |
| and break out*/ |
| if( (byteBuffer + 3) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| *byteBuffer++ = (uint8_t) SCHANGEU; |
| |
| hiByte = curUC >> 8; |
| loByte = curUC; |
| |
| /* add quote Unicode tag */ |
| if( sUnicodeTagTable[hiByte] ) |
| *byteBuffer++ = (uint8_t) UQUOTEU; |
| |
| *byteBuffer++ = (uint8_t) hiByte; |
| *byteBuffer++ = (uint8_t) loByte; |
| |
| comp->fMode = UNICODEMODE; |
| |
| /* use a goto here for speed, to avoid having |
| to check fMode in the while loop at the top |
| of the case */ |
| goto mainLoop; |
| } |
| |
| /* three chars have different indices, so switch |
| to unicode mode*/ |
| else { |
| /* make sure there is enough room to write all |
| four bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 3) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| *byteBuffer++ = (uint8_t) SCHANGEU; |
| |
| hiByte = curUC >> 8; |
| loByte = curUC; |
| |
| /* add quote Unicode tag*/ |
| if( sUnicodeTagTable[ hiByte ] ) |
| *byteBuffer++ = (uint8_t) UQUOTEU; |
| |
| *byteBuffer++ = (uint8_t) hiByte; |
| *byteBuffer++ = (uint8_t) loByte; |
| |
| comp->fMode = UNICODEMODE; |
| |
| /* use a goto here for speed, to avoid having |
| to check fMode in the while loop at the top |
| of the case */ |
| goto mainLoop; |
| } |
| } |
| } |
| break; |
| |
| /* main unicode mode compression loop*/ |
| case UNICODEMODE: |
| while(unicharBuffer < sourceLimit && byteBuffer < targetLimit) { |
| |
| /* get current char*/ |
| curUC = *unicharBuffer++; |
| |
| /* get next char*/ |
| if( unicharBuffer < sourceLimit ) |
| nextUC = *unicharBuffer; |
| else |
| nextUC = INVALIDCHAR; |
| |
| /* if we have two uncompressible unichars in a row, |
| put the current char's bytes in the stream*/ |
| if( ! scsu_isCompressible(curUC) |
| || (nextUC != INVALIDCHAR |
| && ! scsu_isCompressible(nextUC)) ) { |
| /* make sure there is enough room to write all |
| three bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 2) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| hiByte = curUC >> 8; |
| loByte = curUC; |
| |
| /* add quote Unicode tag*/ |
| if( sUnicodeTagTable[ hiByte ] ) |
| *byteBuffer++ = (uint8_t) UQUOTEU; |
| |
| *byteBuffer++ = (uint8_t) hiByte; |
| *byteBuffer++ = (uint8_t) loByte; |
| } |
| |
| /* bytes less than 0x80 can go straight in the stream, |
| but in single-byte mode*/ |
| else if( curUC < 0x0080 ) { |
| loByte = curUC; |
| |
| /* if two chars in a row below 0x80 and the |
| current char is not a single-byte mode tag, |
| switch to single-byte mode*/ |
| if(nextUC != INVALIDCHAR |
| && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) { |
| /* make sure there is enough room to write |
| both bytes and if not, rewind the source stream |
| and break out*/ |
| if( (byteBuffer + 1) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| /* use window 0, but any would work*/ |
| *byteBuffer++ = (uint8_t) UCHANGE0; |
| *byteBuffer++ = (uint8_t) loByte; |
| |
| comp->fCurrentWindow = 0; |
| comp->fTimeStamps [0] = ++(comp->fTimeStamp); |
| comp->fMode = SINGLEBYTEMODE; |
| |
| /* use a goto here for speed, to avoid having |
| to check fMode in the while loop at the top |
| of the case */ |
| goto mainLoop; |
| } |
| |
| /* otherwise, just write the bytes to the stream |
| (this will cover the case of only 1 char less |
| than 0x80 and single-byte mode tags)*/ |
| else { |
| /* make sure there is enough room to write |
| both bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 1) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| /* since the character is less than 0x80, the |
| high byte is always 0x00 - no need for |
| (curUC >> 8)*/ |
| *byteBuffer++ = (uint8_t) 0x00; |
| *byteBuffer++ = (uint8_t) loByte; |
| } |
| } |
| |
| /* figure out if the current unichar is in a defined |
| window*/ |
| else if( (whichWindow = scsu_findDynamicWindow(comp, curUC)) |
| != INVALIDWINDOW ) { |
| /* if two chars in a row in the same window, |
| switch to that window and go to single-byte |
| mode inDynamicWindow will return FALSE for |
| INVALIDCHAR*/ |
| if( scsu_inDynamicWindow(comp, nextUC, whichWindow) ) { |
| /* make sure there is enough room to write |
| both bytes if not, rewind the source stream |
| and break out*/ |
| if( (byteBuffer + 1) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| *byteBuffer++ = (uint8_t) (UCHANGE0 + whichWindow); |
| *byteBuffer++ = (uint8_t) |
| (curUC - comp->fOffsets[whichWindow] |
| + COMPRESSIONOFFSET); |
| |
| comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp); |
| comp->fCurrentWindow = whichWindow; |
| comp->fMode = SINGLEBYTEMODE; |
| |
| /* use a goto here for speed, to avoid having |
| to check fMode in the while loop at the top |
| of the case */ |
| goto mainLoop; |
| } |
| |
| /* otherwise, just quote the unicode for the |
| char*/ |
| else { |
| /* make sure there is enough room to write all |
| three bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 2) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| hiByte = curUC >> 8; |
| loByte = curUC; |
| |
| /* add quote Unicode tag*/ |
| if( sUnicodeTagTable[ hiByte ] ) |
| *byteBuffer++ = (uint8_t) UQUOTEU; |
| |
| *byteBuffer++ = (uint8_t) hiByte; |
| *byteBuffer++ = (uint8_t) loByte; |
| } |
| } |
| |
| /* char is not in a defined window*/ |
| else { |
| /* determine index for current char (char is |
| compressible)*/ |
| curIndex = scsu_makeIndex(curUC); |
| comp->fIndexCount[curIndex]++; |
| |
| /* look ahead*/ |
| if( (unicharBuffer + 1) < sourceLimit ) |
| forwardUC = *unicharBuffer; |
| else |
| forwardUC = INVALIDCHAR; |
| |
| /* if we have encountered this index at least once |
| before, define a new window for it that hasn't |
| previously been redefined*/ |
| if( comp->fIndexCount[curIndex] > 1 ) { |
| /* make sure there is enough room to write all |
| three bytes if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 2) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| /* get least recently defined window*/ |
| whichWindow = scsu_getLRDefinedWindow(comp); |
| |
| *byteBuffer++ = (uint8_t) (UDEFINE0 + whichWindow); |
| *byteBuffer++ = (uint8_t) curIndex; |
| *byteBuffer++ = (uint8_t) |
| (curUC - sOffsetTable[curIndex] |
| + COMPRESSIONOFFSET); |
| |
| comp->fOffsets[whichWindow] = sOffsetTable[curIndex]; |
| comp->fCurrentWindow = whichWindow; |
| comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp); |
| comp->fMode = SINGLEBYTEMODE; |
| |
| /* use a goto here for speed, to avoid having |
| to check fMode in the while loop at the top |
| of the case */ |
| goto mainLoop; |
| } |
| |
| /* if three chars in a row with the same index, |
| define a new window makeIndex will return |
| RESERVEDINDEX for INVALIDCHAR*/ |
| else if( curIndex == scsu_makeIndex(nextUC) |
| && curIndex == scsu_makeIndex(forwardUC) ) { |
| /* make sure there is enough room to write all |
| three bytes if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 2) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| whichWindow = scsu_getLRDefinedWindow(comp); |
| |
| *byteBuffer++ = (uint8_t) (UDEFINE0 + whichWindow); |
| *byteBuffer++ = (uint8_t) curIndex; |
| *byteBuffer++ = (uint8_t) |
| (curUC - sOffsetTable[curIndex] |
| + COMPRESSIONOFFSET); |
| |
| comp->fOffsets[whichWindow] = sOffsetTable[curIndex]; |
| comp->fCurrentWindow = whichWindow; |
| comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp); |
| comp->fMode = SINGLEBYTEMODE; |
| |
| /* use a goto here for speed, to avoid having |
| to check fMode in the while loop at the top |
| of the case */ |
| goto mainLoop; |
| } |
| |
| /* otherwise just quote the unicode, and save our |
| windows for longer runs*/ |
| else { |
| /* make sure there is enough room to write all |
| three bytes and if not, rewind the source |
| stream and break out*/ |
| if( (byteBuffer + 2) >= targetLimit) { |
| --unicharBuffer; |
| goto finish; |
| } |
| |
| hiByte = curUC >> 8; |
| loByte = curUC; |
| |
| /* add quote Unicode tag*/ |
| if( sUnicodeTagTable[ hiByte ] ) |
| *byteBuffer++ = (uint8_t) UQUOTEU; |
| |
| *byteBuffer++ = (uint8_t) hiByte; |
| *byteBuffer++ = (uint8_t) loByte; |
| } |
| } |
| } |
| } /* end switch*/ |
| } |
| |
| finish: |
| |
| /* fill in output parameters*/ |
| *target = byteBuffer; |
| *source = unicharBuffer; |
| |
| if(unicharBuffer < sourceLimit) |
| *status = U_INDEX_OUTOFBOUNDS_ERROR; |
| } |
| |
| void |
| scsu_decompress(UnicodeCompressor *comp, |
| UChar **target, |
| const UChar *targetLimit, |
| const uint8_t **source, |
| const uint8_t *sourceLimit, |
| UErrorCode *status) |
| { |
| /* the current position in the source byte buffer*/ |
| const uint8_t *byteBuffer = *source; |
| |
| /* the current position in the target unichar buffer*/ |
| UChar *unicharBuffer = *target; |
| |
| /* the current byte from the source buffer*/ |
| int32_t aByte = 0x00; |
| |
| /* temporary for calculating surrogate pairs */ |
| int32_t normalizedBase; |
| |
| /* temporary used for look-ahead */ |
| int32_t dByte; |
| |
| |
| /* verify we weren't passed a failing error code */ |
| if(U_FAILURE(*status)) { |
| return; |
| } |
| /* verify the target buffer can hold at least 1 UChar */ |
| else if(targetLimit - unicharBuffer < sizeof(UChar)) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| /* if our internal buffer isn't empty, flush its contents |
| to the output buffer before doing any more decompression */ |
| if(comp->fBufferLength > 0) { |
| |
| int32_t newBytes = 0; |
| const uint8_t *newSource = comp->fBuffer; |
| const uint8_t *newSourceLimit = comp->fBuffer + USCSU_BUFSIZE; |
| |
| /* fill the buffer completely, to guarantee one full character */ |
| if(comp->fBufferLength != USCSU_BUFSIZE) { |
| newBytes = USCSU_BUFSIZE - comp->fBufferLength; |
| |
| /* verify there are newBytes bytes in byteBuffer */ |
| if(sourceLimit - byteBuffer < newBytes) |
| newBytes = sourceLimit - byteBuffer; |
| |
| uprv_memcpy(comp->fBuffer + comp->fBufferLength, byteBuffer, newBytes); |
| } |
| |
| /* reset buffer length to 0 before recursive call */ |
| comp->fBufferLength = 0; |
| |
| /* call self recursively to decompress the buffer */ |
| scsu_decompress(comp, &unicharBuffer, targetLimit, |
| &newSource, newSourceLimit, status); |
| |
| /* update the positions into the arrays */ |
| /* unicharBuffer was updated by the call to decompress above */ |
| byteBuffer += newBytes; |
| } |
| |
| /* the main decompression loop*/ |
| mainLoop: |
| while(byteBuffer < sourceLimit && unicharBuffer < targetLimit) { |
| |
| switch(comp->fMode) { |
| |
| /* single-byte mode decompression loop*/ |
| case SINGLEBYTEMODE: |
| while(byteBuffer < sourceLimit && unicharBuffer < targetLimit) { |
| |
| /* get the next byte */ |
| aByte = *byteBuffer++; |
| |
| switch(aByte) { |
| /* All bytes from 0x80 through 0xFF are remapped to |
| chars or surrogate pairs according to the currently |
| active window */ |
| case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: |
| case 0x85: case 0x86: case 0x87: case 0x88: case 0x89: |
| case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: |
| case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: |
| case 0x94: case 0x95: case 0x96: case 0x97: case 0x98: |
| case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: |
| case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: |
| case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: |
| case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: |
| case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: |
| case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: |
| case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: |
| case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0: |
| case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5: |
| case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA: |
| case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: |
| case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: |
| case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9: |
| case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE: |
| case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3: |
| case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8: |
| case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED: |
| case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2: |
| case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7: |
| case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC: |
| case 0xFD: case 0xFE: case 0xFF: |
| |
| /* For offsets <= 0xFFFF, convert to a single char by |
| adding the window's offset and subtracting the |
| generic compression offset*/ |
| if(comp->fOffsets[ comp->fCurrentWindow ] <= 0xFFFF) { |
| *unicharBuffer++ = (UChar) |
| (aByte + comp->fOffsets[comp->fCurrentWindow] |
| - COMPRESSIONOFFSET); |
| } |
| /* For offsets > 0x10000, convert to a surrogate pair by |
| normBase = window's offset - 0x10000 |
| high surrogate = 0xD800 + (normBase >> 10) |
| low surrogate = 0xDC00 + (normBase & 0x3FF) |
| + (byte & 0x7F) */ |
| else { |
| /* make sure there is enough room to write |
| both characters |
| if not, save state and break out */ |
| if((unicharBuffer + 1) >= targetLimit) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| normalizedBase = comp->fOffsets[comp->fCurrentWindow] |
| - 0x10000; |
| *unicharBuffer++ = |
| (UChar) (0xD800 + (normalizedBase >> 10)); |
| *unicharBuffer++ = (UChar) |
| (0xDC00 + (normalizedBase & 0x3FF) |
| + (aByte & 0x7F)); |
| } |
| break; |
| |
| /* bytes from 0x20 through 0x7F are treated as ASCII |
| and are remapped to chars by padding the high byte |
| (this is the same as quoting from static window 0) |
| NUL (0x00), HT (0x09), LF (0x0A), CR (0x0D) are |
| treated as ASCII as well*/ |
| case 0x00: case 0x09: case 0x0A: case 0x0D: |
| case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: |
| case 0x25: case 0x26: case 0x27: case 0x28: case 0x29: |
| case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: |
| case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33: |
| case 0x34: case 0x35: case 0x36: case 0x37: case 0x38: |
| case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: |
| case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42: |
| case 0x43: case 0x44: case 0x45: case 0x46: case 0x47: |
| case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: |
| case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51: |
| case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: |
| case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B: |
| case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60: |
| case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: |
| case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A: |
| case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F: |
| case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: |
| case 0x75: case 0x76: case 0x77: case 0x78: case 0x79: |
| case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E: |
| case 0x7F: |
| *unicharBuffer++ = (UChar) aByte; |
| break; |
| |
| /* quote unicode*/ |
| case SQUOTEU: |
| /* verify we have two bytes following tag and if not, |
| rewind the source stream and break out */ |
| if( (byteBuffer + 1) >= sourceLimit ) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| aByte = *byteBuffer++; |
| *unicharBuffer++ = |
| (UChar) (aByte << 8 | *byteBuffer++); |
| break; |
| |
| /* switch to Unicode mode*/ |
| case SCHANGEU: |
| comp->fMode = UNICODEMODE; |
| /* use a goto here for speed, to avoid having to check |
| fMode in the while loop at the top of the case */ |
| goto mainLoop; |
| break; |
| |
| /* handle all quote tags*/ |
| case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3: |
| case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7: |
| /* verify there is a byte following the tag and if |
| not, rewind the source stream and break out*/ |
| if( byteBuffer >= sourceLimit ) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| /* if the byte is in the range 0x00 - 0x7F, use static |
| window n- otherwise, use dynamic window n */ |
| dByte = *byteBuffer++; |
| *unicharBuffer++ = (UChar) |
| (dByte + (dByte >= 0x00 && dByte < 0x80 |
| ? sOffsets[aByte - SQUOTE0] |
| : (comp->fOffsets[aByte - SQUOTE0] |
| - COMPRESSIONOFFSET))); |
| break; |
| |
| /* handle all change tags*/ |
| case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3: |
| case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7: |
| comp->fCurrentWindow = (aByte - SCHANGE0); |
| break; |
| |
| /* handle all define tags*/ |
| case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3: |
| case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7: |
| /* verify there is a byte following the tag and if |
| not, rewind the source stream and break out*/ |
| if( byteBuffer >= sourceLimit ) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| comp->fCurrentWindow = (aByte - SDEFINE0); |
| comp->fOffsets[comp->fCurrentWindow] = |
| sOffsetTable[*byteBuffer++]; |
| break; |
| |
| /* handle define extended tag*/ |
| case SDEFINEX: |
| /* verify we have two bytes following tag and if not, |
| rewind the source stream and break out*/ |
| if( (byteBuffer + 1) >= sourceLimit ) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| aByte = *byteBuffer++; |
| comp->fCurrentWindow = (aByte & 0xE0) >> 5; |
| comp->fOffsets[comp->fCurrentWindow] = 0x10000 |
| + (0x80 |
| * (((aByte & 0x1F) << 8) | *byteBuffer++)); |
| break; |
| |
| /* reserved, shouldn't happen*/ |
| case SRESERVED: |
| break; |
| |
| } /* end switch*/ |
| } /* end while*/ |
| break; |
| |
| /* unicode mode decompression loop*/ |
| case UNICODEMODE: |
| while( byteBuffer < sourceLimit && unicharBuffer < targetLimit ) { |
| |
| /* get the next byte */ |
| aByte = *byteBuffer++; |
| |
| switch( aByte ) { |
| /* handle all define tags*/ |
| case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3: |
| case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7: |
| /* verify there is a byte following tag and if not, |
| rewind the source stream and break out*/ |
| if( byteBuffer >= sourceLimit ) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| comp->fCurrentWindow = (aByte - UDEFINE0); |
| comp->fOffsets[comp->fCurrentWindow] = |
| sOffsetTable[*byteBuffer++]; |
| comp->fMode = SINGLEBYTEMODE; |
| /* use a goto here for speed, to avoid having to check |
| fMode in the while loop at the top of the case */ |
| goto mainLoop; |
| break; |
| |
| /* handle define extended tag*/ |
| case UDEFINEX: |
| /* verify we have two bytes following tag if not, |
| rewind the source stream and break out*/ |
| if( (byteBuffer + 1) >= sourceLimit ) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| aByte = *byteBuffer++; |
| comp->fCurrentWindow = (aByte & 0xE0) >> 5; |
| comp->fOffsets[comp->fCurrentWindow] = 0x10000 |
| + (0x80 |
| * (((aByte & 0x1F) << 8) | *byteBuffer++)); |
| comp->fMode = SINGLEBYTEMODE; |
| /* use a goto here for speed, to avoid having to check |
| fMode in the while loop at the top of the case */ |
| goto mainLoop; |
| break; |
| |
| /* handle all change tags*/ |
| case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3: |
| case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7: |
| comp->fCurrentWindow = (aByte - UCHANGE0); |
| comp->fMode = SINGLEBYTEMODE; |
| /* use a goto here for speed, to avoid having to check |
| fMode in the while loop at the top of the case */ |
| goto mainLoop; |
| break; |
| |
| /* quote unicode*/ |
| case UQUOTEU: |
| /* verify we have two bytes following tag if not, |
| rewind the source stream and break out*/ |
| if( byteBuffer >= sourceLimit - 1) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| aByte = *byteBuffer++; |
| *unicharBuffer++ = (UChar) |
| (aByte << 8 | *byteBuffer++); |
| break; |
| |
| default: |
| /* verify there is a byte following tag if not, rewind |
| the source stream and break out*/ |
| if( byteBuffer >= sourceLimit ) { |
| --byteBuffer; |
| uprv_memcpy(comp->fBuffer, byteBuffer, |
| sourceLimit - byteBuffer); |
| comp->fBufferLength = sourceLimit - byteBuffer; |
| byteBuffer += comp->fBufferLength; |
| goto finish; |
| } |
| |
| *unicharBuffer++ = (UChar) (aByte << 8 | *byteBuffer++); |
| break; |
| |
| } /* end switch*/ |
| } /* end while*/ |
| break; |
| |
| } /* end switch( comp->fMode )*/ |
| } /* end while*/ |
| |
| |
| finish: |
| |
| /* fill in return values*/ |
| *target = unicharBuffer; |
| *source = byteBuffer; |
| |
| if(byteBuffer < sourceLimit) |
| *status = U_INDEX_OUTOFBOUNDS_ERROR; |
| } |
| |
| /** Reset the compressor to its initial state. */ |
| void |
| scsu_reset(UnicodeCompressor *comp) |
| { |
| int32_t i; |
| |
| /* reset dynamic windows*/ |
| comp->fOffsets[0] = 0x0080; /* Latin-1*/ |
| comp->fOffsets[1] = 0x00C0; /* Latin-1 Supplement + Latin Extended-A*/ |
| comp->fOffsets[2] = 0x0400; /* Cyrillic*/ |
| comp->fOffsets[3] = 0x0600; /* Arabic*/ |
| comp->fOffsets[4] = 0x0900; /* Devanagari*/ |
| comp->fOffsets[5] = 0x3040; /* Hiragana*/ |
| comp->fOffsets[6] = 0x30A0; /* Katakana*/ |
| comp->fOffsets[7] = 0xFF00; /* Fullwidth ASCII*/ |
| |
| /* reset time stamps*/ |
| for(i = 0; i < USCSU_NUM_WINDOWS; i++) { |
| comp->fTimeStamps[i] = 0; |
| } |
| |
| /* reset count of seen indices*/ |
| for( i = 0; i <= USCSU_MAX_INDEX; i++ ) { |
| comp->fIndexCount[i] = 0; |
| } |
| |
| comp->fTimeStamp = 0; /* Reset current time stamp*/ |
| comp->fCurrentWindow = 0; /* Make current window Latin-1*/ |
| comp->fMode = SINGLEBYTEMODE; /* Start in single-byte mode*/ |
| comp->fBufferLength = 0; /* Empty buffer */ |
| } |
| |
| /** |
| * Create the index value for a character. |
| * For more information on this function, refer to table X-3 |
| * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>. |
| * @param c The character in question. |
| * @return An index for c |
| */ |
| static int32_t |
| scsu_makeIndex(int32_t c) |
| { |
| /* check the predefined indices*/ |
| if( c >= 0x00C0 && c < 0x0140) |
| return LATININDEX; |
| else if( c >= 0x0250 && c < 0x02D0 ) |
| return IPAEXTENSIONINDEX; |
| else if( c >= 0x0370 && c < 0x03F0 ) |
| return GREEKINDEX; |
| else if( c >= 0x0530 && c < 0x0590 ) |
| return ARMENIANINDEX; |
| else if( c >= 0x3040 && c < 0x30A0 ) |
| return HIRAGANAINDEX; |
| else if( c >= 0x30A0 && c < 0x3120) |
| return KATAKANAINDEX; |
| else if( c >= 0xFF60 && c < 0xFF9F ) |
| return HALFWIDTHKATAKANAINDEX; |
| |
| /* calculate index*/ |
| else if( c >= 0x0080 && c < 0x3400 ) |
| return (c / 0x80) & 0xFF; |
| else if( c >= 0xE000 && c <= 0xFFFF ) |
| return ((c - 0xAC00) / 0x80) & 0xFF; |
| |
| /* should never happen*/ |
| else { |
| return RESERVEDINDEX; |
| } |
| } |
| |
| /** |
| * Determine if a character is in a dynamic window. |
| * @param c The character to test |
| * @param whichWindow The dynamic window the test |
| * @return TRUE if <TT>c</TT> will fit in <TT>whichWindow</TT>, FALSE |
| * otherwise. |
| */ |
| static bool_t |
| scsu_inDynamicWindow(const UnicodeCompressor *comp, |
| int32_t c, |
| int32_t whichWindow) |
| { |
| return (c >= comp->fOffsets[whichWindow] |
| && c < (comp->fOffsets[whichWindow] + 0x80)); |
| } |
| |
| /** |
| * Determine if a character is in a static window. |
| * @param c The character to test |
| * @param whichWindow The static window the test |
| * @return TRUE if <TT>c</TT> will fit in <TT>whichWindow</TT>, FALSE |
| * otherwise. |
| */ |
| static bool_t |
| scsu_inStaticWindow(int32_t c, |
| int32_t whichWindow) |
| { |
| return (c >= sOffsets[whichWindow] && c < (sOffsets[whichWindow] + 0x80)); |
| } |
| |
| /** |
| * Determine if a character is compressible. |
| * @param c The character to test. |
| * @return TRUE if the <TT>c</TT> is compressible, FALSE otherwise. |
| */ |
| static bool_t |
| scsu_isCompressible(int32_t c) |
| { |
| return (c < 0x3400 || c >= 0xE000); |
| } |
| |
| /** |
| * Determine if a dynamic window for a certain character is defined |
| * @param c The character in question |
| * @return The dynamic window containing <TT>c</TT>, or INVALIDWINDOW if |
| * not defined. |
| */ |
| static int32_t |
| scsu_findDynamicWindow(const UnicodeCompressor *comp, |
| int32_t c) |
| { |
| int32_t i; |
| |
| for(i = 0; i < USCSU_NUM_WINDOWS; i++) { |
| if(scsu_inDynamicWindow(comp, c, i)) { |
| return i; |
| } |
| } |
| |
| return INVALIDWINDOW; |
| } |
| |
| /** |
| * Determine if a static window for a certain character is defined |
| * @param c The character in question |
| * @return The static window containing <TT>c</TT>, or INVALIDWINDOW if |
| * not defined. |
| */ |
| static int32_t |
| scsu_findStaticWindow(int32_t c) |
| { |
| int32_t i; |
| |
| for(i = 0; i < USCSU_NUM_STATIC_WINDOWS; i++) { |
| if(scsu_inStaticWindow(c, i)) { |
| return i; |
| } |
| } |
| |
| return INVALIDWINDOW; |
| } |
| |
| /** Find the least-recently defined window */ |
| static int32_t |
| scsu_getLRDefinedWindow(const UnicodeCompressor *comp) |
| { |
| int32_t leastRU = LONG_MAX; |
| int32_t whichWindow = INVALIDWINDOW; |
| int32_t i; |
| |
| /* find least recently used window*/ |
| for(i = 0; i < USCSU_NUM_WINDOWS; i++ ) { |
| if(comp->fTimeStamps[i] < leastRU) { |
| leastRU = comp->fTimeStamps[i]; |
| whichWindow = i; |
| } |
| } |
| |
| return whichWindow; |
| } |