|  | // © 2016 and later: Unicode, Inc. and others. | 
|  | // License & terms of use: http://www.unicode.org/copyright.html | 
|  | /* | 
|  | ******************************************************************************* | 
|  | * | 
|  | *   Copyright (C) 2002-2011, International Business Machines | 
|  | *   Corporation and others.  All Rights Reserved. | 
|  | * | 
|  | ******************************************************************************* | 
|  | *   file name:  propsvec.c | 
|  | *   encoding:   UTF-8 | 
|  | *   tab size:   8 (not used) | 
|  | *   indentation:4 | 
|  | * | 
|  | *   created on: 2002feb22 | 
|  | *   created by: Markus W. Scherer | 
|  | * | 
|  | *   Store bits (Unicode character properties) in bit set vectors. | 
|  | */ | 
|  |  | 
|  | #include <stdlib.h> | 
|  | #include "unicode/utypes.h" | 
|  | #include "cmemory.h" | 
|  | #include "utrie.h" | 
|  | #include "utrie2.h" | 
|  | #include "uarrsort.h" | 
|  | #include "propsvec.h" | 
|  | #include "uassert.h" | 
|  |  | 
|  | struct UPropsVectors { | 
|  | uint32_t *v; | 
|  | int32_t columns;  /* number of columns, plus two for start & limit values */ | 
|  | int32_t maxRows; | 
|  | int32_t rows; | 
|  | int32_t prevRow;  /* search optimization: remember last row seen */ | 
|  | UBool isCompacted; | 
|  | }; | 
|  |  | 
|  | #define UPVEC_INITIAL_ROWS (1<<12) | 
|  | #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) | 
|  | #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) | 
|  |  | 
|  | U_CAPI UPropsVectors * U_EXPORT2 | 
|  | upvec_open(int32_t columns, UErrorCode *pErrorCode) { | 
|  | UPropsVectors *pv; | 
|  | uint32_t *v, *row; | 
|  | uint32_t cp; | 
|  |  | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return NULL; | 
|  | } | 
|  | if(columns<1) { | 
|  | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | 
|  | return NULL; | 
|  | } | 
|  | columns+=2; /* count range start and limit columns */ | 
|  |  | 
|  | pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); | 
|  | v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); | 
|  | if(pv==NULL || v==NULL) { | 
|  | uprv_free(pv); | 
|  | uprv_free(v); | 
|  | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | 
|  | return NULL; | 
|  | } | 
|  | uprv_memset(pv, 0, sizeof(UPropsVectors)); | 
|  | pv->v=v; | 
|  | pv->columns=columns; | 
|  | pv->maxRows=UPVEC_INITIAL_ROWS; | 
|  | pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); | 
|  |  | 
|  | /* set the all-Unicode row and the special-value rows */ | 
|  | row=pv->v; | 
|  | uprv_memset(row, 0, pv->rows*columns*4); | 
|  | row[0]=0; | 
|  | row[1]=0x110000; | 
|  | row+=columns; | 
|  | for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { | 
|  | row[0]=cp; | 
|  | row[1]=cp+1; | 
|  | row+=columns; | 
|  | } | 
|  | return pv; | 
|  | } | 
|  |  | 
|  | U_CAPI void U_EXPORT2 | 
|  | upvec_close(UPropsVectors *pv) { | 
|  | if(pv!=NULL) { | 
|  | uprv_free(pv->v); | 
|  | uprv_free(pv); | 
|  | } | 
|  | } | 
|  |  | 
|  | static uint32_t * | 
|  | _findRow(UPropsVectors *pv, UChar32 rangeStart) { | 
|  | uint32_t *row; | 
|  | int32_t columns, i, start, limit, prevRow; | 
|  |  | 
|  | columns=pv->columns; | 
|  | limit=pv->rows; | 
|  | prevRow=pv->prevRow; | 
|  |  | 
|  | /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ | 
|  | row=pv->v+prevRow*columns; | 
|  | if(rangeStart>=(UChar32)row[0]) { | 
|  | if(rangeStart<(UChar32)row[1]) { | 
|  | /* same row as last seen */ | 
|  | return row; | 
|  | } else if(rangeStart<(UChar32)(row+=columns)[1]) { | 
|  | /* next row after the last one */ | 
|  | pv->prevRow=prevRow+1; | 
|  | return row; | 
|  | } else if(rangeStart<(UChar32)(row+=columns)[1]) { | 
|  | /* second row after the last one */ | 
|  | pv->prevRow=prevRow+2; | 
|  | return row; | 
|  | } else if((rangeStart-(UChar32)row[1])<10) { | 
|  | /* we are close, continue looping */ | 
|  | prevRow+=2; | 
|  | do { | 
|  | ++prevRow; | 
|  | row+=columns; | 
|  | } while(rangeStart>=(UChar32)row[1]); | 
|  | pv->prevRow=prevRow; | 
|  | return row; | 
|  | } | 
|  | } else if(rangeStart<(UChar32)pv->v[1]) { | 
|  | /* the very first row */ | 
|  | pv->prevRow=0; | 
|  | return pv->v; | 
|  | } | 
|  |  | 
|  | /* do a binary search for the start of the range */ | 
|  | start=0; | 
|  | while(start<limit-1) { | 
|  | i=(start+limit)/2; | 
|  | row=pv->v+i*columns; | 
|  | if(rangeStart<(UChar32)row[0]) { | 
|  | limit=i; | 
|  | } else if(rangeStart<(UChar32)row[1]) { | 
|  | pv->prevRow=i; | 
|  | return row; | 
|  | } else { | 
|  | start=i; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* must be found because all ranges together always cover all of Unicode */ | 
|  | pv->prevRow=start; | 
|  | return pv->v+start*columns; | 
|  | } | 
|  |  | 
|  | U_CAPI void U_EXPORT2 | 
|  | upvec_setValue(UPropsVectors *pv, | 
|  | UChar32 start, UChar32 end, | 
|  | int32_t column, | 
|  | uint32_t value, uint32_t mask, | 
|  | UErrorCode *pErrorCode) { | 
|  | uint32_t *firstRow, *lastRow; | 
|  | int32_t columns; | 
|  | UChar32 limit; | 
|  | UBool splitFirstRow, splitLastRow; | 
|  |  | 
|  | /* argument checking */ | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return; | 
|  | } | 
|  | if( pv==NULL || | 
|  | start<0 || start>end || end>UPVEC_MAX_CP || | 
|  | column<0 || column>=(pv->columns-2) | 
|  | ) { | 
|  | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | 
|  | return; | 
|  | } | 
|  | if(pv->isCompacted) { | 
|  | *pErrorCode=U_NO_WRITE_PERMISSION; | 
|  | return; | 
|  | } | 
|  | limit=end+1; | 
|  |  | 
|  | /* initialize */ | 
|  | columns=pv->columns; | 
|  | column+=2; /* skip range start and limit columns */ | 
|  | value&=mask; | 
|  |  | 
|  | /* find the rows whose ranges overlap with the input range */ | 
|  |  | 
|  | /* find the first and last rows, always successful */ | 
|  | firstRow=_findRow(pv, start); | 
|  | lastRow=_findRow(pv, end); | 
|  |  | 
|  | /* | 
|  | * Rows need to be split if they partially overlap with the | 
|  | * input range (only possible for the first and last rows) | 
|  | * and if their value differs from the input value. | 
|  | */ | 
|  | splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); | 
|  | splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); | 
|  |  | 
|  | /* split first/last rows if necessary */ | 
|  | if(splitFirstRow || splitLastRow) { | 
|  | int32_t count, rows; | 
|  |  | 
|  | rows=pv->rows; | 
|  | if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { | 
|  | uint32_t *newVectors; | 
|  | int32_t newMaxRows; | 
|  |  | 
|  | if(pv->maxRows<UPVEC_MEDIUM_ROWS) { | 
|  | newMaxRows=UPVEC_MEDIUM_ROWS; | 
|  | } else if(pv->maxRows<UPVEC_MAX_ROWS) { | 
|  | newMaxRows=UPVEC_MAX_ROWS; | 
|  | } else { | 
|  | /* Implementation bug, or UPVEC_MAX_ROWS too low. */ | 
|  | *pErrorCode=U_INTERNAL_PROGRAM_ERROR; | 
|  | return; | 
|  | } | 
|  | newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4); | 
|  | if(newVectors==NULL) { | 
|  | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | 
|  | return; | 
|  | } | 
|  | uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4); | 
|  | firstRow=newVectors+(firstRow-pv->v); | 
|  | lastRow=newVectors+(lastRow-pv->v); | 
|  | uprv_free(pv->v); | 
|  | pv->v=newVectors; | 
|  | pv->maxRows=newMaxRows; | 
|  | } | 
|  |  | 
|  | /* count the number of row cells to move after the last row, and move them */ | 
|  | count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); | 
|  | if(count>0) { | 
|  | uprv_memmove( | 
|  | lastRow+(1+splitFirstRow+splitLastRow)*columns, | 
|  | lastRow+columns, | 
|  | count*4); | 
|  | } | 
|  | pv->rows=rows+splitFirstRow+splitLastRow; | 
|  |  | 
|  | /* split the first row, and move the firstRow pointer to the second part */ | 
|  | if(splitFirstRow) { | 
|  | /* copy all affected rows up one and move the lastRow pointer */ | 
|  | count = (int32_t)((lastRow-firstRow)+columns); | 
|  | uprv_memmove(firstRow+columns, firstRow, (size_t)count*4); | 
|  | lastRow+=columns; | 
|  |  | 
|  | /* split the range and move the firstRow pointer */ | 
|  | firstRow[1]=firstRow[columns]=(uint32_t)start; | 
|  | firstRow+=columns; | 
|  | } | 
|  |  | 
|  | /* split the last row */ | 
|  | if(splitLastRow) { | 
|  | /* copy the last row data */ | 
|  | uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4); | 
|  |  | 
|  | /* split the range and move the firstRow pointer */ | 
|  | lastRow[1]=lastRow[columns]=(uint32_t)limit; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* set the "row last seen" to the last row for the range */ | 
|  | pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); | 
|  |  | 
|  | /* set the input value in all remaining rows */ | 
|  | firstRow+=column; | 
|  | lastRow+=column; | 
|  | mask=~mask; | 
|  | for(;;) { | 
|  | *firstRow=(*firstRow&mask)|value; | 
|  | if(firstRow==lastRow) { | 
|  | break; | 
|  | } | 
|  | firstRow+=columns; | 
|  | } | 
|  | } | 
|  |  | 
|  | U_CAPI uint32_t U_EXPORT2 | 
|  | upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { | 
|  | uint32_t *row; | 
|  | UPropsVectors *ncpv; | 
|  |  | 
|  | if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { | 
|  | return 0; | 
|  | } | 
|  | ncpv=(UPropsVectors *)pv; | 
|  | row=_findRow(ncpv, c); | 
|  | return row[2+column]; | 
|  | } | 
|  |  | 
|  | U_CAPI uint32_t * U_EXPORT2 | 
|  | upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, | 
|  | UChar32 *pRangeStart, UChar32 *pRangeEnd) { | 
|  | uint32_t *row; | 
|  | int32_t columns; | 
|  |  | 
|  | if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | columns=pv->columns; | 
|  | row=pv->v+rowIndex*columns; | 
|  | if(pRangeStart!=NULL) { | 
|  | *pRangeStart=(UChar32)row[0]; | 
|  | } | 
|  | if(pRangeEnd!=NULL) { | 
|  | *pRangeEnd=(UChar32)row[1]-1; | 
|  | } | 
|  | return row+2; | 
|  | } | 
|  |  | 
|  | static int32_t U_CALLCONV | 
|  | upvec_compareRows(const void *context, const void *l, const void *r) { | 
|  | const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; | 
|  | const UPropsVectors *pv=(const UPropsVectors *)context; | 
|  | int32_t i, count, columns; | 
|  |  | 
|  | count=columns=pv->columns; /* includes start/limit columns */ | 
|  |  | 
|  | /* start comparing after start/limit but wrap around to them */ | 
|  | i=2; | 
|  | do { | 
|  | if(left[i]!=right[i]) { | 
|  | return left[i]<right[i] ? -1 : 1; | 
|  | } | 
|  | if(++i==columns) { | 
|  | i=0; | 
|  | } | 
|  | } while(--count>0); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | U_CAPI void U_EXPORT2 | 
|  | upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { | 
|  | uint32_t *row; | 
|  | int32_t i, columns, valueColumns, rows, count; | 
|  | UChar32 start, limit; | 
|  |  | 
|  | /* argument checking */ | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return; | 
|  | } | 
|  | if(handler==NULL) { | 
|  | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | 
|  | return; | 
|  | } | 
|  | if(pv->isCompacted) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* Set the flag now: Sorting and compacting destroys the builder data structure. */ | 
|  | pv->isCompacted=TRUE; | 
|  |  | 
|  | rows=pv->rows; | 
|  | columns=pv->columns; | 
|  | U_ASSERT(columns>=3); /* upvec_open asserts this */ | 
|  | valueColumns=columns-2; /* not counting start & limit */ | 
|  |  | 
|  | /* sort the properties vectors to find unique vector values */ | 
|  | uprv_sortArray(pv->v, rows, columns*4, | 
|  | upvec_compareRows, pv, FALSE, pErrorCode); | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Find and set the special values. | 
|  | * This has to do almost the same work as the compaction below, | 
|  | * to find the indexes where the special-value rows will move. | 
|  | */ | 
|  | row=pv->v; | 
|  | count=-valueColumns; | 
|  | for(i=0; i<rows; ++i) { | 
|  | start=(UChar32)row[0]; | 
|  |  | 
|  | /* count a new values vector if it is different from the current one */ | 
|  | if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) { | 
|  | count+=valueColumns; | 
|  | } | 
|  |  | 
|  | if(start>=UPVEC_FIRST_SPECIAL_CP) { | 
|  | handler(context, start, start, count, row+2, valueColumns, pErrorCode); | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | row+=columns; | 
|  | } | 
|  |  | 
|  | /* count is at the beginning of the last vector, add valueColumns to include that last vector */ | 
|  | count+=valueColumns; | 
|  |  | 
|  | /* Call the handler once more to signal the start of delivering real values. */ | 
|  | handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, | 
|  | count, row-valueColumns, valueColumns, pErrorCode); | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Move vector contents up to a contiguous array with only unique | 
|  | * vector values, and call the handler function for each vector. | 
|  | * | 
|  | * This destroys the Properties Vector structure and replaces it | 
|  | * with an array of just vector values. | 
|  | */ | 
|  | row=pv->v; | 
|  | count=-valueColumns; | 
|  | for(i=0; i<rows; ++i) { | 
|  | /* fetch these first before memmove() may overwrite them */ | 
|  | start=(UChar32)row[0]; | 
|  | limit=(UChar32)row[1]; | 
|  |  | 
|  | /* add a new values vector if it is different from the current one */ | 
|  | if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) { | 
|  | count+=valueColumns; | 
|  | uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4); | 
|  | } | 
|  |  | 
|  | if(start<UPVEC_FIRST_SPECIAL_CP) { | 
|  | handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode); | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | row+=columns; | 
|  | } | 
|  |  | 
|  | /* count is at the beginning of the last vector, add one to include that last vector */ | 
|  | pv->rows=count/valueColumns+1; | 
|  | } | 
|  |  | 
|  | U_CAPI const uint32_t * U_EXPORT2 | 
|  | upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { | 
|  | if(!pv->isCompacted) { | 
|  | return NULL; | 
|  | } | 
|  | if(pRows!=NULL) { | 
|  | *pRows=pv->rows; | 
|  | } | 
|  | if(pColumns!=NULL) { | 
|  | *pColumns=pv->columns-2; | 
|  | } | 
|  | return pv->v; | 
|  | } | 
|  |  | 
|  | U_CAPI uint32_t * U_EXPORT2 | 
|  | upvec_cloneArray(const UPropsVectors *pv, | 
|  | int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { | 
|  | uint32_t *clonedArray; | 
|  | int32_t byteLength; | 
|  |  | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | return NULL; | 
|  | } | 
|  | if(!pv->isCompacted) { | 
|  | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | 
|  | return NULL; | 
|  | } | 
|  | byteLength=pv->rows*(pv->columns-2)*4; | 
|  | clonedArray=(uint32_t *)uprv_malloc(byteLength); | 
|  | if(clonedArray==NULL) { | 
|  | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | 
|  | return NULL; | 
|  | } | 
|  | uprv_memcpy(clonedArray, pv->v, byteLength); | 
|  | if(pRows!=NULL) { | 
|  | *pRows=pv->rows; | 
|  | } | 
|  | if(pColumns!=NULL) { | 
|  | *pColumns=pv->columns-2; | 
|  | } | 
|  | return clonedArray; | 
|  | } | 
|  |  | 
|  | U_CAPI UTrie2 * U_EXPORT2 | 
|  | upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { | 
|  | UPVecToUTrie2Context toUTrie2={ NULL, 0, 0, 0 }; | 
|  | upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); | 
|  | utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); | 
|  | if(U_FAILURE(*pErrorCode)) { | 
|  | utrie2_close(toUTrie2.trie); | 
|  | toUTrie2.trie=NULL; | 
|  | } | 
|  | return toUTrie2.trie; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts | 
|  | * some 16-bit field and builds and returns a UTrie2. | 
|  | */ | 
|  |  | 
|  | U_CAPI void U_CALLCONV | 
|  | upvec_compactToUTrie2Handler(void *context, | 
|  | UChar32 start, UChar32 end, | 
|  | int32_t rowIndex, uint32_t *row, int32_t columns, | 
|  | UErrorCode *pErrorCode) { | 
|  | (void)row; | 
|  | (void)columns; | 
|  | UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; | 
|  | if(start<UPVEC_FIRST_SPECIAL_CP) { | 
|  | utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); | 
|  | } else { | 
|  | switch(start) { | 
|  | case UPVEC_INITIAL_VALUE_CP: | 
|  | toUTrie2->initialValue=rowIndex; | 
|  | break; | 
|  | case UPVEC_ERROR_VALUE_CP: | 
|  | toUTrie2->errorValue=rowIndex; | 
|  | break; | 
|  | case UPVEC_START_REAL_VALUES_CP: | 
|  | toUTrie2->maxValue=rowIndex; | 
|  | if(rowIndex>0xffff) { | 
|  | /* too many rows for a 16-bit trie */ | 
|  | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | 
|  | } else { | 
|  | toUTrie2->trie=utrie2_open(toUTrie2->initialValue, | 
|  | toUTrie2->errorValue, pErrorCode); | 
|  | } | 
|  | break; | 
|  | default: | 
|  | break; | 
|  | } | 
|  | } | 
|  | } |