blob: 65692da519f75444dc66ecec214ec73f348c9d50 [file] [log] [blame]
/*
********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2003, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*/
#include "ucmp8.h"
#include "cmemory.h"
/* internal constants*/
U_CAPI int32_t U_EXPORT2
ucmp8_getkUnicodeCount() { return UCMP8_kUnicodeCount;}
U_CAPI int32_t U_EXPORT2
ucmp8_getkBlockCount() { return UCMP8_kBlockCount;}
U_CAPI void U_EXPORT2
ucmp8_initBogus(CompactByteArray* array)
{
CompactByteArray* this_obj = array;
if (this_obj == NULL) return;
this_obj->fStructSize = sizeof(CompactByteArray);
this_obj->fArray = NULL;
this_obj->fIndex = NULL;
this_obj->fCount = UCMP8_kUnicodeCount;
this_obj->fCompact = FALSE;
this_obj->fBogus = TRUE;
this_obj->fAlias = FALSE;
this_obj->fIAmOwned = TRUE;
}
/* debug flags*/
/*=======================================================*/
U_CAPI void U_EXPORT2
ucmp8_init(CompactByteArray* array, int8_t defaultValue)
{
/* set up the index array and the data array.
* the index array always points into particular parts of the data array
* it is initially set up to point at regular block boundaries
* The following example uses blocks of 4 for simplicity
* Example: Expanded
* INDEX# 0 1 2 3 4
* INDEX 0 4 8 12 16 ...
* ARRAY abcdeababcedzyabcdea...
* | | | | | |...
* whenever you set an element in the array, it unpacks to this state
* After compression, the index will point to various places in the data array
* wherever there is a runs of the same elements as in the original
* Example: Compressed
* INDEX# 0 1 2 3 4
* INDEX 0 4 1 8 2 ...
* ARRAY abcdeabazyabc...
* If you look at the example, index# 2 in the expanded version points
* to data position number 8, which has elements "bced". In the compressed
* version, index# 2 points to data position 1, which also has "bced"
*/
CompactByteArray* this_obj = array;
int32_t i;
if (this_obj == NULL) return;
this_obj->fStructSize = sizeof(CompactByteArray);
this_obj->fArray = NULL;
this_obj->fIndex = NULL;
this_obj->fCount = UCMP8_kUnicodeCount;
this_obj->fCompact = FALSE;
this_obj->fBogus = FALSE;
this_obj->fAlias = FALSE;
this_obj->fIAmOwned = TRUE;
this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
if (!this_obj->fArray)
{
this_obj->fBogus = TRUE;
return;
}
this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
if (!this_obj->fIndex)
{
uprv_free(this_obj->fArray);
this_obj->fArray = NULL;
this_obj->fBogus = TRUE;
return;
}
for (i = 0; i < UCMP8_kUnicodeCount; ++i)
{
this_obj->fArray[i] = defaultValue;
}
for (i = 0; i < UCMP8_kIndexCount; ++i)
{
this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
}
}
U_CAPI CompactByteArray* U_EXPORT2
ucmp8_open(int8_t defaultValue)
{
/* set up the index array and the data array.
* the index array always points into particular parts of the data array
* it is initially set up to point at regular block boundaries
* The following example uses blocks of 4 for simplicity
* Example: Expanded
* INDEX# 0 1 2 3 4
* INDEX 0 4 8 12 16 ...
* ARRAY abcdeababcedzyabcdea...
* | | | | | |...
* whenever you set an element in the array, it unpacks to this state
* After compression, the index will point to various places in the data array
* wherever there is a runs of the same elements as in the original
* Example: Compressed
* INDEX# 0 1 2 3 4
* INDEX 0 4 1 8 2 ...
* ARRAY abcdeabazyabc...
* If you look at the example, index# 2 in the expanded version points
* to data position number 8, which has elements "bced". In the compressed
* version, index# 2 points to data position 1, which also has "bced"
*/
CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
int32_t i;
if (this_obj == NULL) return NULL;
this_obj->fStructSize = sizeof(CompactByteArray);
this_obj->fArray = NULL;
this_obj->fIndex = NULL;
this_obj->fCount = UCMP8_kUnicodeCount;
this_obj->fCompact = FALSE;
this_obj->fBogus = FALSE;
this_obj->fAlias = FALSE;
this_obj->fIAmOwned = FALSE;
this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
if (!this_obj->fArray)
{
this_obj->fBogus = TRUE;
return NULL;
}
this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
if (!this_obj->fIndex)
{
uprv_free(this_obj->fArray);
this_obj->fArray = NULL;
this_obj->fBogus = TRUE;
return NULL;
}
for (i = 0; i < UCMP8_kUnicodeCount; ++i)
{
this_obj->fArray[i] = defaultValue;
}
for (i = 0; i < UCMP8_kIndexCount; ++i)
{
this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
}
return this_obj;
}
U_CAPI CompactByteArray* U_EXPORT2
ucmp8_openAdopt(uint16_t *indexArray,
int8_t *newValues,
int32_t count)
{
CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
/* test for NULL */
if(this_obj == NULL)
return NULL;
ucmp8_initAdopt(this_obj, indexArray, newValues, count);
this_obj->fIAmOwned = FALSE;
return this_obj;
}
U_CAPI CompactByteArray* U_EXPORT2
ucmp8_openAlias(uint16_t *indexArray,
int8_t *newValues,
int32_t count)
{
CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
/* test for NULL */
if(this_obj == NULL)
return NULL;
ucmp8_initAlias(this_obj, indexArray, newValues, count);
this_obj->fIAmOwned = FALSE;
return this_obj;
}
/*=======================================================*/
U_CAPI CompactByteArray* U_EXPORT2
ucmp8_initAdopt(CompactByteArray *this_obj,
uint16_t *indexArray,
int8_t *newValues,
int32_t count)
{
if (this_obj) {
this_obj->fCount = count;
this_obj->fBogus = FALSE;
this_obj->fStructSize = sizeof(CompactByteArray);
this_obj->fArray = newValues;
this_obj->fIndex = indexArray;
this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
this_obj->fAlias = FALSE;
this_obj->fIAmOwned = TRUE;
}
return this_obj;
}
U_CAPI CompactByteArray* U_EXPORT2
ucmp8_initAlias(CompactByteArray *this_obj,
uint16_t *indexArray,
int8_t *newValues,
int32_t count)
{
if (this_obj) {
this_obj->fArray = NULL;
this_obj->fIndex = NULL;
this_obj->fCount = count;
this_obj->fBogus = FALSE;
this_obj->fStructSize = sizeof(CompactByteArray);
this_obj->fArray = newValues;
this_obj->fIndex = indexArray;
this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
this_obj->fAlias = TRUE;
this_obj->fIAmOwned = TRUE;
}
return this_obj;
}
/*=======================================================*/
U_CAPI void U_EXPORT2
ucmp8_close(CompactByteArray* this_obj)
{
if(this_obj != NULL) {
if(!this_obj->fAlias) {
if(this_obj->fArray != NULL) {
uprv_free(this_obj->fArray);
}
if(this_obj->fIndex != NULL) {
uprv_free(this_obj->fIndex);
}
}
if(!this_obj->fIAmOwned) /* Called if 'init' was called instead of 'open'. */
{
uprv_free(this_obj);
}
}
}
/*=======================================================*/
U_CAPI void U_EXPORT2
ucmp8_expand(CompactByteArray* this_obj)
{
/* can optimize later.
* if we have to expand, then walk through the blocks instead of using Get
* this code unpacks the array by copying the blocks to the normalized position.
* Example: Compressed
* INDEX# 0 1 2 3 4
* INDEX 0 4 1 8 2 ...
* ARRAY abcdeabazyabc...
* turns into
* Example: Expanded
* INDEX# 0 1 2 3 4
* INDEX 0 4 8 12 16 ...
* ARRAY abcdeababcedzyabcdea...
*/
int32_t i;
if (this_obj->fCompact)
{
int8_t* tempArray;
tempArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
if (!tempArray)
{
this_obj->fBogus = TRUE;
return;
}
for (i = 0; i < UCMP8_kUnicodeCount; ++i)
{
tempArray[i] = ucmp8_get(this_obj,(UChar)i); /* HSYS : How expand?*/
}
for (i = 0; i < UCMP8_kIndexCount; ++i)
{
this_obj->fIndex[i] = (uint16_t)(i<< UCMP8_kBlockShift);
}
uprv_free(this_obj->fArray);
this_obj->fArray = tempArray;
this_obj->fCompact = FALSE;
this_obj->fAlias = FALSE;
}
}
/*=======================================================*/
/* this_obj->fArray: an array to be overlapped
* start and count: specify the block to be overlapped
* tempIndex: the overlapped array (actually indices back into inputContents)
* inputHash: an index of hashes for tempIndex, where
* inputHash[i] = XOR of values from i-count+1 to i
*/
static int32_t
findOverlappingPosition(CompactByteArray* this_obj,
uint32_t start,
const UChar* tempIndex,
int32_t tempIndexCount,
uint32_t cycle)
{
/* this_obj is a utility routine for finding blocks that overlap.
* IMPORTANT: the cycle number is very important. Small cycles take a lot
* longer to work. In some cases, they may be able to get better compaction.
*/
int32_t i;
int32_t j;
int32_t currentCount;
for (i = 0; i < tempIndexCount; i += cycle)
{
currentCount = UCMP8_kBlockCount;
if (i + UCMP8_kBlockCount > tempIndexCount)
{
currentCount = tempIndexCount - i;
}
for (j = 0; j < currentCount; ++j)
{
if (this_obj->fArray[start + j] != this_obj->fArray[tempIndex[i + j]])
break;
}
if (j == currentCount)
break;
}
return i;
}
U_CAPI UBool U_EXPORT2
ucmp8_isBogus(const CompactByteArray* this_obj)
{
return (UBool)(this_obj == NULL || this_obj->fBogus);
}
U_CAPI const int8_t* U_EXPORT2
ucmp8_getArray(const CompactByteArray* this_obj)
{
return this_obj->fArray;
}
U_CAPI const uint16_t* U_EXPORT2
ucmp8_getIndex(const CompactByteArray* this_obj)
{
return this_obj->fIndex;
}
U_CAPI int32_t U_EXPORT2
ucmp8_getCount(const CompactByteArray* this_obj)
{
return this_obj->fCount;
}
U_CAPI void U_EXPORT2
ucmp8_set(CompactByteArray* this_obj,
UChar c,
int8_t value)
{
if (this_obj->fCompact == TRUE)
{
ucmp8_expand(this_obj);
if (this_obj->fBogus) return;
}
this_obj->fArray[(int32_t)c] = value;
}
U_CAPI void U_EXPORT2
ucmp8_setRange(CompactByteArray* this_obj,
UChar start,
UChar end,
int8_t value)
{
int32_t i;
if (this_obj->fCompact == TRUE)
{
ucmp8_expand(this_obj);
if (this_obj->fBogus)
return;
}
for (i = start; i <= end; ++i)
{
this_obj->fArray[i] = value;
}
}
/*=======================================================*/
U_CAPI void U_EXPORT2
ucmp8_compact(CompactByteArray* this_obj,
uint32_t cycle)
{
if (!this_obj->fCompact)
{
/* this_obj actually does the compaction.
* it walks throught the contents of the expanded array, finding the
* first block in the data that matches the contents of the current index.
* As it works, it keeps an updated pointer to the last position,
* so that it knows how big to make the final array
* If the matching succeeds, then the index will point into the data
* at some earlier position.
* If the matching fails, then last position pointer will be bumped,
* and the index will point to that last block of data.
*/
UChar* tempIndex;
int32_t tempIndexCount;
int8_t* tempArray;
int32_t iBlock, iIndex;
/* fix cycle, must be 0 < cycle <= blockcount*/
if (cycle <= 0)
cycle = 1;
else if (cycle > (uint32_t)UCMP8_kBlockCount)
cycle = UCMP8_kBlockCount;
/* make temp storage, larger than we need*/
tempIndex = (UChar*) uprv_malloc(sizeof(UChar)* UCMP8_kUnicodeCount);
if (!tempIndex)
{
this_obj->fBogus = TRUE;
return;
}
/* set up first block.*/
tempIndexCount = UCMP8_kBlockCount;
for (iIndex = 0; iIndex < UCMP8_kBlockCount; ++iIndex)
{
tempIndex[iIndex] = (uint16_t)iIndex;
} /* endfor (iIndex = 0; .....)*/
this_obj->fIndex[0] = 0;
/* for each successive block, find out its first position in the compacted array*/
for (iBlock = 1; iBlock < UCMP8_kIndexCount; ++iBlock)
{
int32_t newCount, firstPosition, block;
block = iBlock << UCMP8_kBlockShift;
/* if (debugSmall) if (block > debugSmallLimit) break;*/
firstPosition = findOverlappingPosition(this_obj,
block,
tempIndex,
tempIndexCount,
cycle);
/* if not contained in the current list, copy the remainder
* invariant; cumulativeHash[iBlock] = XOR of values from iBlock-kBlockCount+1 to iBlock
* we do this_obj by XORing out cumulativeHash[iBlock-kBlockCount]
*/
newCount = firstPosition + UCMP8_kBlockCount;
if (newCount > tempIndexCount)
{
for (iIndex = tempIndexCount; iIndex < newCount; ++iIndex)
{
tempIndex[iIndex] = (uint16_t)(iIndex - firstPosition + block);
} /* endfor (iIndex = tempIndexCount....)*/
tempIndexCount = newCount;
} /* endif (newCount > tempIndexCount)*/
this_obj->fIndex[iBlock] = (uint16_t)firstPosition;
} /* endfor (iBlock = 1.....)*/
/* now allocate and copy the items into the array*/
tempArray = (int8_t*) uprv_malloc(tempIndexCount * sizeof(int8_t));
if (!tempArray)
{
this_obj->fBogus = TRUE;
uprv_free(tempIndex);
return;
}
for (iIndex = 0; iIndex < tempIndexCount; ++iIndex)
{
tempArray[iIndex] = this_obj->fArray[tempIndex[iIndex]];
}
uprv_free(this_obj->fArray);
this_obj->fArray = tempArray;
this_obj->fCount = tempIndexCount;
/* free up temp storage*/
uprv_free(tempIndex);
this_obj->fCompact = TRUE;
} /* endif (!this_obj->fCompact)*/
}
U_CAPI uint32_t U_EXPORT2 ucmp8_flattenMem (const CompactByteArray* array, UMemoryStream *MS)
{
int32_t size = 0;
uprv_mstrm_write32(MS, ICU_UCMP8_VERSION);
size += 4;
uprv_mstrm_write32(MS, array->fCount);
size += 4;
uprv_mstrm_writeBlock(MS, array->fIndex, sizeof(array->fIndex[0])*UCMP8_kIndexCount);
size += sizeof(array->fIndex[0])*UCMP8_kIndexCount;
uprv_mstrm_writeBlock(MS, array->fArray, sizeof(array->fArray[0])*array->fCount);
size += sizeof(array->fArray[0])*array->fCount;
while(size%4) /* end padding */
{
uprv_mstrm_writePadding(MS, 1); /* Pad total so far to even size */
size += 1;
}
return size;
}
/* We use sizeof(*array), etc so that this code can be as portable as
possible between the ucmpX_ family.
*/
U_CAPI void U_EXPORT2 ucmp8_initFromData(CompactByteArray *this_obj, const uint8_t **source, UErrorCode *status)
{
uint32_t i;
const uint8_t *oldSource = *source;
if(U_FAILURE(*status))
return;
this_obj->fArray = NULL;
this_obj->fIndex = NULL;
this_obj->fBogus = FALSE;
this_obj->fStructSize = sizeof(CompactByteArray);
this_obj->fCompact = TRUE;
this_obj->fAlias = TRUE;
this_obj->fIAmOwned = TRUE;
i = * ((const uint32_t*) *source);
(*source) += 4;
if(i != ICU_UCMP8_VERSION)
{
*status = U_INVALID_FORMAT_ERROR;
return;
}
this_obj->fCount = * ((const uint32_t*)*source);
(*source) += 4;
this_obj->fIndex = (uint16_t*) *source;
(*source) += sizeof(this_obj->fIndex[0])*UCMP8_kIndexCount;
this_obj->fArray = (int8_t*) *source;
(*source) += sizeof(this_obj->fArray[0])*this_obj->fCount;
/* eat up padding */
while((*source-(oldSource))%4)
(*source)++;
}