| /* |
| ********************************************************************** |
| * Copyright (C) 2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * file name: ucmpe32.c |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2001aug03 |
| * created by: Vladimir Weinstein |
| * |
| * This is basically a rip-off of trie developed by Markus for |
| * normalization data, but using a reduced ucmp interface |
| * Interface is implemented as much as required by the collation |
| * framework. |
| * This table is slow on data addition, but should support surrogates |
| * nicely. |
| */ |
| |
| #include "ucmpe32.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "filestrm.h" |
| |
| /* builder data ------------------------------------------------------------- */ |
| |
| CompactEIntArray* |
| ucmpe32_open(int32_t defaultValue, int32_t surrogateValue, UErrorCode *status) |
| { |
| int32_t *bla; |
| CompactEIntArray* this_obj = (CompactEIntArray*) uprv_malloc(sizeof(CompactEIntArray)); |
| if (U_FAILURE(*status) || this_obj == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| |
| this_obj->fAlias = FALSE; |
| |
| this_obj->fStructSize = sizeof(CompactEIntArray); |
| |
| this_obj->stage1Top = _UCMPE32_STAGE_1_MAX_COUNT; |
| this_obj->stage1 = (uint16_t *)uprv_malloc(_UCMPE32_STAGE_1_MAX_COUNT*sizeof(uint16_t)); |
| if(this_obj->stage1 == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(this_obj); |
| return NULL; |
| } |
| |
| |
| /* reset stage 1 of the trie */ |
| uprv_memset(this_obj->stage1, 0, this_obj->stage1Top*sizeof(uint16_t)); |
| |
| /* allocate stage 2 of the trie and reset the first block */ |
| this_obj->stage2= (int32_t*)uprv_malloc(60000*sizeof(*(this_obj->stage2))); |
| if(this_obj->stage2 == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(this_obj->stage1); |
| uprv_free(this_obj); |
| return NULL; |
| } |
| this_obj->fDefaultValue = defaultValue; |
| this_obj->fSurrogateValue = surrogateValue; |
| for(bla = this_obj->stage2; bla<this_obj->stage2+_UCMPE32_STAGE_2_BLOCK_COUNT; bla++) { |
| *bla = this_obj->fDefaultValue; |
| } |
| this_obj->stage2Top = _UCMPE32_STAGE_2_BLOCK_COUNT; |
| |
| this_obj->fCompact = FALSE; |
| this_obj->fBogus = FALSE; |
| |
| return this_obj; |
| } |
| |
| |
| |
| /* |
| * get or create a Norm unit; |
| * get or create the intermediate trie entries for it as well |
| */ |
| /********* THIS IS THE ADD FUNCTION ********************/ |
| void |
| ucmpe32_set32(CompactEIntArray* this_obj, UChar32 code, int32_t value) |
| { |
| uint16_t stage2Block, k; |
| |
| if (this_obj->fCompact == TRUE) { |
| return; |
| } |
| |
| { |
| uint32_t i; |
| uint16_t j; |
| |
| i=code>>_UCMPE32_TRIE_SHIFT; |
| j=this_obj->stage1[i]; |
| if(j==0) { |
| /* allocate a stage 2 block */ |
| int32_t *p, *bla; |
| p = this_obj->stage2+this_obj->stage2Top; |
| for(bla = p; bla<p+_UCMPE32_STAGE_2_BLOCK_COUNT; bla++) { |
| *bla = this_obj->fDefaultValue; |
| } |
| this_obj->stage2Top += _UCMPE32_STAGE_2_BLOCK_COUNT; |
| |
| this_obj->stage1[i]=j=(uint16_t)(p-this_obj->stage2); |
| } |
| stage2Block=j; |
| } |
| |
| k=(uint16_t)(stage2Block+(code&_UCMPE32_STAGE_2_MASK)); |
| |
| this_obj->stage2[k] = value; |
| } |
| |
| void |
| ucmpe32_setSurrogate(CompactEIntArray* this_obj, UChar lead, UChar trail, int32_t value) |
| { |
| if (this_obj->fCompact == TRUE) { |
| return; |
| } |
| ucmpe32_set(this_obj, (int32_t)UTF16_GET_PAIR_VALUE(lead, trail), value); |
| } |
| |
| |
| /* |
| * Fold the supplementary code point data for one lead surrogate. |
| */ |
| static uint16_t |
| foldLeadSurrogate(uint16_t *parent, int32_t parentCount, |
| int32_t *stage, int32_t *pStageCount, |
| uint32_t base, int32_t surrogateValue) { |
| uint32_t leadNorm32=0; |
| uint32_t i, j, s2; |
| uint32_t leadSurrogate=0xd7c0+(base>>10); |
| |
| #if 0 |
| printf("supplementary data for lead surrogate U+%04lx\n", (long)leadSurrogate); |
| #endif |
| /* calculate the 32-bit data word for the lead surrogate */ |
| for(i=0; i<_UCMPE32_SURROGATE_BLOCK_COUNT; ++i) { |
| s2=parent[(base>>_UCMPE32_TRIE_SHIFT)+i]; |
| if(s2!=0) { |
| for(j=0; j<_UCMPE32_STAGE_2_BLOCK_COUNT; ++j) { |
| /* basically, or all 32-bit data into the one for the lead surrogate */ |
| leadNorm32|=stage[s2+j]; |
| } |
| } |
| } |
| |
| if(leadNorm32==0) { |
| return 0; |
| } |
| |
| /* |
| * For FCD, replace the entire combined value by the surrogate index |
| * and make sure that it is not 0 (by not offsetting it by the BMP top, |
| * since here we have enough bits for this); |
| * lead surrogates are tested at runtime on the character code itself |
| * instead on special values of the trie data - |
| * this is because 16 bits in the FCD trie data do not allow for anything |
| * but the two leading and trailing combining classes of the canonical decomposition. |
| */ |
| leadNorm32= surrogateValue | ((parentCount<<_UCMPE32_TRIE_SHIFT)&~_UCMPE32_STAGE_2_MASK); |
| |
| /* enter the lead surrogate's data */ |
| s2=parent[leadSurrogate>>_UCMPE32_TRIE_SHIFT]; |
| if(s2==0) { |
| /* allocate a new stage 2 block in stage (the memory is there from makeAll32()/makeFCD()) */ |
| s2=parent[leadSurrogate>>_UCMPE32_TRIE_SHIFT]=(uint16_t)*pStageCount; |
| *pStageCount+=_UCMPE32_STAGE_2_BLOCK_COUNT; |
| } |
| stage[s2+(leadSurrogate&_UCMPE32_STAGE_2_MASK)]=leadNorm32; |
| |
| /* move the actual stage 1 indexes from the supplementary position to the new one */ |
| uprv_memmove(parent+parentCount, parent+(base>>_UCMPE32_TRIE_SHIFT), _UCMPE32_SURROGATE_BLOCK_COUNT*2); |
| |
| /* increment stage 1 top */ |
| return _UCMPE32_SURROGATE_BLOCK_COUNT; |
| } |
| |
| /* |
| * Fold the normalization data for supplementary code points into |
| * a compact area on top of the BMP-part of the trie index, |
| * with the lead surrogates indexing this compact area. |
| * |
| * Use after makeAll32(). |
| */ |
| static uint32_t |
| foldSupplementary(uint16_t *parent, int32_t parentCount, |
| int32_t *stage, int32_t *pStageCount, |
| int32_t surrogateValue) { |
| uint32_t c; |
| uint16_t i; |
| |
| /* search for any stage 1 entries for supplementary code points */ |
| for(c=0x10000; c<0x110000;) { |
| i=parent[c>>_UCMPE32_TRIE_SHIFT]; |
| if(i!=0) { |
| /* there is data, treat the full block for a lead surrogate */ |
| c&=~0x3ff; |
| parentCount+=foldLeadSurrogate(parent, parentCount, stage, pStageCount, c, surrogateValue); |
| c+=0x400; |
| } else { |
| c+=_UCMPE32_STAGE_2_BLOCK_COUNT; |
| } |
| } |
| #if 0 |
| printf("trie index count: BMP %u all Unicode %lu folded %u\n", |
| _UCMPE32_STAGE_1_BMP_COUNT, (long)_UCMPE32_STAGE_1_MAX_COUNT, parentCount); |
| #endif |
| return parentCount; |
| } |
| |
| void |
| ucmpe32_compact(CompactEIntArray* this_obj) { |
| |
| if(this_obj->fCompact == FALSE) { /* compacting can be done only once */ |
| /* |
| * This function is the common implementation for compacting |
| * the stage 2 tables of 32-bit values. |
| * It is a copy of genprops/store.c's compactStage() adapted for the 32-bit stage 2 tables. |
| */ |
| static uint16_t map[0x10000>>_UCMPE32_TRIE_SHIFT]; |
| int32_t x; |
| uint16_t i, start, prevEnd, newStart; |
| |
| /* fold supplementary code points into lead surrogates */ |
| this_obj->stage1Top=foldSupplementary(this_obj->stage1, _UCMPE32_STAGE_1_BMP_COUNT, |
| this_obj->stage2, &this_obj->stage2Top, this_obj->fSurrogateValue); |
| |
| map[0]=0; |
| newStart=_UCMPE32_STAGE_2_BLOCK_COUNT; |
| for(start=newStart; start<this_obj->stage2Top;) { |
| prevEnd=(uint16_t)(newStart-1); |
| x=this_obj->stage2[start]; |
| if(x==this_obj->stage2[prevEnd]) { |
| /* overlap by at least one */ |
| for(i=1; i<_UCMPE32_STAGE_2_BLOCK_COUNT |
| && x==this_obj->stage2[start+i] |
| && x==this_obj->stage2[prevEnd-i]; ++i) {} |
| |
| /* overlap by i */ |
| map[start>>_UCMPE32_TRIE_SHIFT]=(uint16_t)(newStart-i); |
| |
| /* move the non-overlapping indexes to their new positions */ |
| start+=i; |
| for(i=(uint16_t)(_UCMPE32_STAGE_2_BLOCK_COUNT-i); i>0; --i) { |
| this_obj->stage2[newStart++]=this_obj->stage2[start++]; |
| } |
| } else if(newStart<start) { |
| /* move the indexes to their new positions */ |
| map[start>>_UCMPE32_TRIE_SHIFT]=newStart; |
| for(i=_UCMPE32_STAGE_2_BLOCK_COUNT; i>0; --i) { |
| this_obj->stage2[newStart++]=this_obj->stage2[start++]; |
| } |
| } else /* no overlap && newStart==start */ { |
| map[start>>_UCMPE32_TRIE_SHIFT]=start; |
| newStart+=_UCMPE32_STAGE_2_BLOCK_COUNT; |
| start=newStart; |
| } |
| } |
| |
| /* now adjust the stage1 table */ |
| for(i=0; i<this_obj->stage1Top; ++i) { |
| this_obj->stage1[i]=map[this_obj->stage1[i]>>_UCMPE32_TRIE_SHIFT]; |
| } |
| #if 0 |
| /* we saved some space */ |
| printf("compacting trie: count of 32-bit words %lu->%lu\n", |
| (long)this_obj->stage2Top, (long)newStart); |
| #endif |
| this_obj->stage2Top = newStart; |
| this_obj->fCompact = TRUE; |
| } |
| } |
| |
| |
| CompactEIntArray* |
| ucmpe32_clone(CompactEIntArray* orig, UErrorCode *status) |
| { |
| CompactEIntArray* this_obj = (CompactEIntArray*) uprv_malloc(sizeof(CompactEIntArray)); |
| if(orig == NULL || orig->fBogus == TRUE || this_obj == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| this_obj->fAlias = FALSE; |
| |
| this_obj->fDefaultValue = orig->fDefaultValue; |
| this_obj->fSurrogateValue = orig->fSurrogateValue; |
| |
| this_obj->stage1Top = orig->stage1Top; |
| this_obj->stage1 = (uint16_t *)uprv_malloc(this_obj->stage1Top*sizeof(uint16_t)); |
| if(this_obj->stage1 == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(this_obj); |
| return NULL; |
| } |
| uprv_memcpy(this_obj->stage1, orig->stage1, this_obj->stage1Top*sizeof(uint16_t)); |
| |
| this_obj->stage2Top = orig->stage2Top; |
| this_obj->stage2 = (int32_t*)uprv_malloc(60000*sizeof(*(this_obj->stage2))); |
| if(this_obj->stage2 == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(this_obj->stage1); |
| uprv_free(this_obj); |
| return NULL; |
| } |
| uprv_memcpy(this_obj->stage2, orig->stage2, this_obj->stage2Top*sizeof(*(this_obj->stage2))); |
| |
| this_obj->fBogus = FALSE; |
| this_obj->fStructSize = sizeof(CompactEIntArray); |
| |
| this_obj->fCompact = orig->fCompact; |
| |
| return this_obj; |
| } |
| |
| |
| CompactEIntArray* |
| ucmpe32_openFromData( const uint8_t **source, |
| UErrorCode *status) |
| { |
| uint32_t i; |
| /* const uint8_t *oldSource = *source;*/ |
| |
| CompactEIntArray* this_obj = (CompactEIntArray*) uprv_malloc(sizeof(CompactEIntArray)); |
| |
| if(U_FAILURE(*status) || *source == NULL || this_obj == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| |
| i = * ((const uint32_t*) *source); |
| (*source) += 4; |
| |
| if(i != ICU_UCMPE32_VERSION) |
| { |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| |
| this_obj->fAlias = TRUE; |
| this_obj->stage1 = NULL; |
| this_obj->stage2 = NULL; |
| this_obj->fBogus = FALSE; |
| this_obj->fStructSize = sizeof(CompactEIntArray); |
| this_obj->fCompact = TRUE; |
| |
| this_obj->stage1Top = * ((const uint32_t*)*source); |
| (*source) += 4; |
| |
| this_obj->stage1 = (uint16_t*) *source; |
| (*source) += sizeof(this_obj->stage1[0])*this_obj->stage1Top; |
| |
| this_obj->stage2Top = * ((const uint32_t*)*source); |
| (*source) += 4; |
| |
| this_obj->stage2 = (int32_t*) *source; |
| (*source) += sizeof(this_obj->stage2[0])*this_obj->stage2Top; |
| |
| return this_obj; |
| } |
| |
| uint32_t |
| ucmpe32_flattenMem (const CompactEIntArray* this_obj, UMemoryStream *MS) |
| { |
| /* This dumps stuff in memory */ |
| /* there is no padding, as there is always an even number of 16-bit values */ |
| /* (stage1), so everything is always 32 bit aligned */ |
| |
| int32_t size = 0; |
| if(this_obj->fCompact == TRUE) { |
| uprv_mstrm_write32(MS, ICU_UCMPE32_VERSION); |
| size += 4; |
| |
| uprv_mstrm_write32(MS, this_obj->stage1Top); |
| size += 4; |
| |
| uprv_mstrm_writeBlock(MS, this_obj->stage1, this_obj->stage1Top*sizeof(this_obj->stage1[0])); |
| size += this_obj->stage1Top*sizeof(this_obj->stage1[0]); |
| |
| uprv_mstrm_write32(MS, this_obj->stage2Top); |
| size += 4; |
| |
| uprv_mstrm_writeBlock(MS, this_obj->stage2, this_obj->stage2Top*sizeof(this_obj->stage2[0])); |
| size += this_obj->stage2Top*sizeof(this_obj->stage2[0]); |
| } |
| return size; |
| } |
| |
| /*=======================================================*/ |
| |
| void ucmpe32_close(CompactEIntArray* this_obj) |
| { |
| if(this_obj != NULL) { |
| if(this_obj->fAlias == FALSE) { |
| if(this_obj->stage1 != NULL) { |
| uprv_free(this_obj->stage1); |
| } |
| if(this_obj->stage2 != NULL) { |
| uprv_free(this_obj->stage2); |
| } |
| } |
| uprv_free(this_obj); |
| } |
| } |
| |
| int32_t |
| ucmpe32_getSurrogateEx(CompactEIntArray *array, UChar lead, UChar trail) { |
| if(array->fCompact == FALSE) { |
| return(ucmpe32_get(array, (int32_t)UTF16_GET_PAIR_VALUE(lead, trail))); |
| } else { |
| return(ucmpe32_getSurrogate(array, ucmpe32_get(array, lead), trail)); |
| } |
| } |
| |
| /*=======================================================*/ |
| /* retrieval stuff as functions */ |
| #if 0 |
| int32_t |
| ucmpe32_get32(CompactEIntArray *this_obj, UChar32 index) { |
| int32_t index_lookup = this_obj->stage1[index >> _UCMPE32_TRIE_SHIFT] ; |
| int32_t addition = (index & _UCMPE32_STAGE_2_MASK); |
| |
| return (this_obj->stage2[index_lookup + addition]); |
| } |
| |
| |
| /* Lead surrogate data needs to be in the following format: */ |
| /* F50XXY000 - where X mask is 1111 (F) and Y mask is 1100 (C) */ |
| /* The ten bits for access will be in the middle of the field */ |
| int32_t |
| ucmpe32_getSurrogate(CompactEIntArray *array, int32_t leadValue32, UChar trail) { |
| int32_t c = ((leadValue32 & 0xffc00) | (trail & 0x3ff)); |
| int32_t index_lookup = array->stage1[(c >> _UCMPE32_TRIE_SHIFT)]; |
| int32_t addition = (c & _UCMPE32_STAGE_2_MASK); |
| return (array->stage2[index_lookup+ addition]); |
| } |
| #endif |