| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2008, International Business Machines |
| * Corporation, Google and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| */ |
| // Author : eldawy@google.com (Mohamed Eldawy) |
| // ucnvsel.cpp |
| // |
| // Purpose: To generate a list of encodings capable of handling |
| // a given Unicode text |
| // |
| // Started 09-April-2008 |
| |
| /** |
| * \file |
| * |
| * This is an implementation of an encoding selector. |
| * The goal is, given a unicode string, find the encodings |
| * this string can be mapped to. To make processing faster |
| * a trie is built when you call ucnvsel_open() that |
| * stores all encodings a codepoint can map to |
| */ |
| |
| #include "unicode/ucnvsel.h" |
| |
| #include <string.h> |
| |
| #include "unicode/uchar.h" |
| #include "unicode/uniset.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/ustring.h" |
| #include "unicode/uchriter.h" |
| #include "utrie.h" |
| #include "propsvec.h" |
| #include "uenumimp.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| |
| |
| U_NAMESPACE_USE |
| |
// Upper bound on the size of the serialized trie.
// The figure is empirical: tries were serialized for all fallback mappings
// and for all roundtrip mappings, and the largest result was about 220000.
// Roughly 30KB of headroom is added on top of that so the selector keeps
// working when many more converters are added to ICU.
#define CAPACITY 250000
| |
| |
| struct UConverterSelector { |
| uint8_t* serializedTrie; |
| uint32_t serializedTrieSize; |
| UTrie constructedTrie; // 16 bit trie containing offsets into pv |
| uint32_t* pv; // table of bits! |
| int32_t pvCount; |
| char** encodings; // which encodings did user ask to use? |
| int32_t encodingsCount; |
| }; |
| |
| |
| /* internal function */ |
| void generateSelectorData(UConverterSelector* result, |
| const USet* excludedEncodings, |
| const UConverterUnicodeSet whichSet, |
| UErrorCode* status); |
| |
| |
| U_CAPI int32_t ucnvsel_swap(const UDataSwapper *ds, |
| const void *inData, |
| int32_t length, |
| void *outData, |
| UErrorCode *status); |
| |
| |
| /* open a selector. If converterList is NULL, build for all converters. |
| If excludedCodePoints is NULL, don't exclude any codepoints */ |
| U_CAPI UConverterSelector* ucnvsel_open(const char* const* converterList, |
| int32_t converterListSize, |
| const USet* excludedCodePoints, |
| const UConverterUnicodeSet whichSet, |
| UErrorCode* status ) { |
| // allocate a new converter |
| UConverterSelector* newSelector; |
| int32_t i; // for loop counter |
| |
| // the compiler should realize the tail recursion here and optimize |
| // accordingly. This call is to get around the constness of |
| // converterList by smallest amount of code modification |
| if(converterListSize == 0 && converterList != NULL) { |
| return ucnvsel_open(NULL, 0, excludedCodePoints, whichSet, status); |
| } |
| |
| // check if already failed |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| // ensure args make sense! |
| if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| |
| |
| newSelector = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); |
| if (!newSelector) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| uprv_memset(newSelector, 0, sizeof(UConverterSelector)); |
| |
| // make a backup copy of the list of converters |
| if (converterList != NULL && converterListSize > 0) { |
| newSelector->encodings = |
| (char**)uprv_malloc(converterListSize*sizeof(char*)); |
| // out of memory. Give user back the 100 bytes or so |
| // we allocated earlier, and wish them good luck ;) |
| if (!newSelector->encodings) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(newSelector); |
| return NULL; |
| } |
| |
| char* allStrings = NULL; |
| int32_t totalSize = 0; |
| for (i = 0 ; i < converterListSize ; i++) { |
| totalSize += uprv_strlen(converterList[i])+1; |
| } |
| allStrings = (char*) uprv_malloc(totalSize); |
| //out of memory :( |
| if (!allStrings) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(newSelector->encodings); |
| uprv_free(newSelector); |
| return NULL; |
| } |
| |
| for (i = 0 ; i < converterListSize ; i++) { |
| newSelector->encodings[i] = allStrings; |
| uprv_strcpy(newSelector->encodings[i], converterList[i]); |
| allStrings += uprv_strlen(newSelector->encodings[i]) + 1; // calling strlen |
| // twice per string is probably faster than allocating memory to |
| // cache the lengths! |
| } |
| } else { |
| int32_t count = ucnv_countAvailable(); |
| newSelector->encodings = |
| (char**)uprv_malloc(ucnv_countAvailable()*sizeof(char*)); |
| // out of memory. Give user back the 100 bytes or so |
| // we allocated earlier, and wish them good luck ;) |
| if (!newSelector->encodings) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(newSelector); |
| return NULL; |
| } |
| char* allStrings = NULL; |
| int32_t totalSize = 0; |
| for (i = 0 ; i < count ; i++) { |
| const char* conv_moniker = ucnv_getAvailableName(i); |
| totalSize += uprv_strlen(conv_moniker)+1; |
| } |
| allStrings = (char*) uprv_malloc(totalSize); |
| //out of memory :( |
| if (!allStrings) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(newSelector->encodings); |
| uprv_free(newSelector); |
| return NULL; |
| } |
| for (i = 0 ; i < count ; i++) { |
| const char* conv_moniker = ucnv_getAvailableName(i); |
| newSelector->encodings[i] = allStrings; |
| uprv_strcpy(newSelector->encodings[i], conv_moniker); |
| allStrings += uprv_strlen(conv_moniker) + 1; // calling strlen twice per |
| // string is probably faster than allocating memory to cache the |
| // lengths! |
| } |
| converterListSize = ucnv_countAvailable(); |
| } |
| |
| newSelector->encodingsCount = converterListSize; |
| generateSelectorData(newSelector, excludedCodePoints, whichSet, status); |
| |
| if (U_FAILURE(*status)) { |
| // at this point, we know pv and encodings have been allocated. No harm in |
| // calling ucnv_closeSelector() |
| ucnvsel_close(newSelector); |
| return NULL; |
| } |
| |
| return newSelector; |
| } |
| |
| |
| /* close opened selector */ |
| U_CAPI void ucnvsel_close(UConverterSelector *sel) { |
| if (!sel) { |
| return; |
| } |
| uprv_free(sel->encodings[0]); |
| uprv_free(sel->encodings); |
| upvec_close(sel->pv); |
| if (sel->serializedTrie) { // this can be reached when |
| // generateSelectorData() has failed, and |
| // the trie is not serialized yet! |
| uprv_free(sel->serializedTrie); |
| } |
| uprv_free(sel); |
| } |
| |
| /* unserialize a selector */ |
| U_CAPI UConverterSelector* ucnvsel_unserialize(const char* buffer, |
| int32_t length, |
| UErrorCode* status) { |
| // check if already failed |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| // ensure args make sense! |
| if (buffer == NULL || length <= 0) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| UConverterSelector* sel; |
| int32_t i = 0; // for the for loop |
| // check length! |
| if (length < sizeof(int32_t) * 3) { |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| uint32_t sig, ASCIIness; |
| |
| memcpy(&sig, buffer, sizeof(int32_t)); |
| buffer += sizeof(uint32_t); |
| memcpy(&ASCIIness, buffer, sizeof(int32_t)); |
| buffer += sizeof(uint32_t); |
| // at this point, we don't know what the endianness or Asciiness of |
| // our system or data is. Detect everything! |
| // notice that a little trick is used here to save work. We don't actually |
| // detect endianness of the machine or of the data. We simply detect |
| // if the 2 are reversed. If they are, we send flags to udata_openSwapper() |
| // to indicate we need endian swapping. Those params are not REALLY |
| // the machine and data endianness |
| UBool dataEndianness = FALSE; |
| //if endianness need to be reversed |
| if (sig == 0x99887766) { |
| dataEndianness = TRUE; |
| } else if (sig != 0x66778899) { |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| |
| int32_t dataASCIIness = ASCIIness; |
| if(dataEndianness) { |
| //need to convert ASCIIness before using it! |
| dataASCIIness = ((char*)&ASCIIness)[3]; |
| } |
| int32_t machineASCIIness = U_CHARSET_FAMILY; |
| |
| //now, we have everything!! |
| if(dataEndianness || |
| dataASCIIness != machineASCIIness) { |
| //construct a data swapper! |
| UDataSwapper *ds; |
| |
| ds=udata_openSwapper(dataEndianness, dataASCIIness, FALSE, machineASCIIness, status); |
| char* newBuffer = (char*)uprv_malloc(length); |
| if(!newBuffer) { |
| udata_closeSwapper(ds); |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| //can we pass buffer twice to swap in place? |
| ucnvsel_swap(ds, buffer, length, newBuffer, status); |
| buffer = newBuffer; |
| udata_closeSwapper(ds); |
| } |
| |
| length -= 3 * sizeof(int32_t); //sig, Asciiness, and pvCount |
| // end of check length! |
| |
| sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); |
| //out of memory :( |
| if (!sel) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| uprv_memset(sel, 0, sizeof(UConverterSelector)); |
| |
| memcpy(&sel->pvCount, buffer, sizeof(int32_t)); |
| buffer+=sizeof(int32_t); |
| |
| // check length |
| if (length < (sel->pvCount+1)*sizeof(uint32_t)) { |
| uprv_free(sel); |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| length -= (sel->pvCount+1)*sizeof(uint32_t); |
| // end of check length |
| |
| sel->pv = (uint32_t*)uprv_malloc(sel->pvCount*sizeof(uint32_t)); |
| if(!sel->pv) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(sel); |
| return NULL; |
| } |
| |
| memcpy(sel->pv, buffer, sel->pvCount*sizeof(uint32_t)); |
| buffer += sel->pvCount*sizeof(uint32_t); |
| |
| int32_t encodingsLength; |
| memcpy(&encodingsLength, buffer, sizeof(int32_t)); |
| buffer += sizeof(int32_t); |
| char* tempEncodings = (char*) uprv_malloc(encodingsLength+1); |
| if(!tempEncodings) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(sel); |
| uprv_free(sel->pv); |
| return NULL; |
| } |
| |
| memcpy(tempEncodings, buffer, encodingsLength); |
| tempEncodings[encodingsLength] = 0; |
| buffer += encodingsLength; |
| // count how many strings are there! |
| int32_t numStrings = 0; |
| for (int32_t i = 0 ; i < encodingsLength + 1 ; i++) { |
| if (tempEncodings[i] == 0) { |
| numStrings++; |
| } |
| } |
| sel->encodingsCount = numStrings; |
| sel->encodings = (char**) uprv_malloc(numStrings * sizeof(char*)); |
| if(!sel->encodings) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_free(sel); |
| uprv_free(sel->pv); |
| uprv_free(tempEncodings); |
| return NULL; |
| } |
| |
| int32_t curString = 0; |
| sel->encodings[0] = tempEncodings; |
| for (i = 0 ; i < encodingsLength ; i++) { |
| if (tempEncodings[i] == 0) { |
| sel->encodings[++curString] = tempEncodings+i+1; |
| } |
| } |
| |
| // check length |
| if (length < sizeof(uint32_t)) { |
| uprv_free(sel->pv); |
| uprv_free(tempEncodings); |
| uprv_free(sel->encodings); |
| uprv_free(sel); |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| length -= sizeof(uint32_t); |
| // end of check length |
| |
| // the trie |
| memcpy(&sel->serializedTrieSize, buffer, sizeof(uint32_t)); |
| buffer += sizeof(uint32_t); |
| |
| // check length |
| if (length < sel->serializedTrieSize) { |
| uprv_free(sel->pv); |
| uprv_free(tempEncodings); |
| uprv_free(sel->encodings); |
| uprv_free(sel); |
| *status = U_INVALID_FORMAT_ERROR; |
| return NULL; |
| } |
| length -= sizeof(uint32_t); |
| // end of check length |
| |
| sel->serializedTrie = (uint8_t*) uprv_malloc(sel->serializedTrieSize); |
| if(!sel->serializedTrie) { |
| uprv_free(sel->pv); |
| uprv_free(tempEncodings); |
| uprv_free(sel->encodings); |
| uprv_free(sel); |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| memcpy(sel->serializedTrie, buffer, sel->serializedTrieSize); |
| // unserialize! |
| utrie_unserialize(&sel->constructedTrie, sel->serializedTrie, |
| sel->serializedTrieSize, status); |
| |
| return sel; |
| } |
| |
| /* serialize a selector */ |
| U_CAPI int32_t ucnvsel_serialize(const UConverterSelector* sel, |
| char* buffer, |
| int32_t bufferCapacity, |
| UErrorCode* status) { |
| // compute size and make sure it fits |
| int32_t totalSize; |
| int32_t encodingStrLength = 0; |
| |
| // check if already failed |
| if (U_FAILURE(*status)) { |
| return 0; |
| } |
| // ensure args make sense! |
| if (sel == NULL || bufferCapacity < 0) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| //utrie_swap(ds, inDa |
| totalSize = sizeof(uint32_t) /*signature*/+sizeof(uint32_t) /*ASCIIness*/+ |
| sizeof(uint32_t)*sel->pvCount /*pv*/+ sizeof(uint32_t) /*pvCount*/+ |
| sizeof(uint32_t) /*serializedTrieSize*/+ sel->serializedTrieSize /*trie*/; |
| |
| // this is a multi-string! strlen() will stop at the first one |
| encodingStrLength = |
| uprv_strlen(sel->encodings[sel->encodingsCount-1]) + |
| (sel->encodings[sel->encodingsCount-1] - sel->encodings[0]); |
| |
| totalSize += encodingStrLength + sizeof(uint32_t); |
| |
| if (totalSize > bufferCapacity) { |
| *status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return totalSize; |
| } |
| // ok, save! |
| // 0a. the signature |
| uint32_t sig = 0x66778899; |
| memcpy(buffer, &sig, sizeof(uint32_t)); |
| buffer+=sizeof(uint32_t); |
| // 0b. ASCIIness |
| uint32_t ASCIIness = U_CHARSET_FAMILY; |
| memcpy(buffer, &ASCIIness, sizeof(uint32_t)); |
| buffer+=sizeof(uint32_t); |
| |
| // 1. the array |
| memcpy(buffer, &sel->pvCount, sizeof(int32_t)); |
| buffer+=sizeof(int32_t); |
| memcpy(buffer, sel->pv, sel->pvCount*sizeof(int32_t)); |
| buffer+=sel->pvCount*sizeof(int32_t); |
| memcpy(buffer, &encodingStrLength, sizeof(int32_t)); |
| buffer+=sizeof(int32_t); |
| memcpy(buffer, sel->encodings[0], encodingStrLength); |
| buffer += encodingStrLength; |
| |
| // the trie |
| memcpy(buffer, &sel->serializedTrieSize, sizeof(uint32_t)); |
| buffer+=sizeof(uint32_t); |
| memcpy(buffer, sel->serializedTrie, sel->serializedTrieSize); |
| return totalSize; |
| } |
| |
| /* internal function! */ |
| void generateSelectorData(UConverterSelector* result, |
| const USet* excludedEncodings, |
| const UConverterUnicodeSet whichSet, |
| UErrorCode* status) { |
| const uint32_t encodingsSize = result->encodingsCount; |
| |
| // 66000 as suggested by Markus [I suggest something like 66000 which |
| // exceeds the number of BMP code points. There will be fewer ranges of |
| // combinations of encodings. (I believe there are no encodings that have |
| // interesting mappings for supplementary code points. All encodings either |
| // support all of them or none of them.)] |
| result->pv = upvec_open((encodingsSize+31)/32, 66000); // create for all |
| // unicode codepoints, and have space for all those bits needed! |
| |
| for (uint32_t i = 0; i < encodingsSize; ++i) { |
| uint32_t mask; |
| uint32_t column; |
| int32_t item_count; |
| int32_t j; |
| UConverter* test_converter = ucnv_open(result->encodings[i], status); |
| if (U_FAILURE(*status)) { |
| // status will propagate back to user |
| return; |
| } |
| USet* unicode_point_set; |
| unicode_point_set = uset_open(1, 0); // empty set |
| |
| ucnv_getUnicodeSet(test_converter, unicode_point_set, |
| whichSet, status); |
| |
| column = i / 32; |
| mask = 1 << (i%32); |
| // now iterate over intervals on set i! |
| item_count = uset_getItemCount(unicode_point_set); |
| |
| for (j = 0; j < item_count; ++j) { |
| UChar32 start_char; |
| UChar32 end_char; |
| UErrorCode smallStatus = U_ZERO_ERROR; |
| uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, |
| &smallStatus); |
| if (U_FAILURE(smallStatus)) { |
| // this will be reached for the converters that fill the set with |
| // strings. Those should be ignored by our system |
| } else { |
| // IMPORTANT: the intervals for usets are INCLUSIVE. However, the |
| // intervals for upvec are NOT INCLUSIVE. This is why we need |
| // end_char+1 here! |
| upvec_setValue(result->pv, start_char, end_char + 1, column, ~0, mask, |
| status); |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| } |
| } |
| ucnv_close(test_converter); |
| uset_close(unicode_point_set); |
| } |
| |
| |
| // handle excluded encodings! Simply set their values to all 1's in the upvec |
| if (excludedEncodings) { |
| int32_t item_count = uset_getItemCount(excludedEncodings); |
| for (int32_t j = 0; j < item_count; ++j) { |
| UChar32 start_char; |
| UChar32 end_char; |
| |
| uset_getItem(excludedEncodings, j, &start_char, &end_char, NULL, 0, |
| status); |
| if (U_FAILURE(*status)) { |
| return; |
| } else { |
| for (uint32_t col = 0 ; col < (encodingsSize+31)/32 ; col++) { |
| upvec_setValue(result->pv, start_char, end_char + 1, col, ~0, ~0, |
| status); |
| } |
| } |
| } |
| } |
| |
| // alright. Now, let's put things in the same exact form you'd get when you |
| // unserialize things. |
| UNewTrie* trie = utrie_open(NULL, NULL, CAPACITY, 0, 0, TRUE); |
| result->pvCount = upvec_compact(result->pv, upvec_compactToTrieHandler, |
| trie, status); |
| uint32_t length = utrie_serialize(trie, NULL, 0, NULL, TRUE, status); |
| result->serializedTrie = (uint8_t*) uprv_malloc(length); |
| length = utrie_serialize(trie, result->serializedTrie, length, NULL, TRUE, |
| status); |
| result->serializedTrieSize = length; |
| utrie_unserialize(&result->constructedTrie, result->serializedTrie, length, |
| status); |
| utrie_close(trie); |
| } |
| |
| |
| |
| // a bunch of functions for the enumeration thingie! Nothing fancy here. Just |
| // iterate over the selected encodings |
| struct Enumerator { |
| int16_t* index; |
| int16_t length; |
| int16_t cur; |
| const UConverterSelector* sel; |
| }; |
| |
| |
| static void U_CALLCONV |
| ucnvsel_close_selector_iterator(UEnumeration *enumerator) { |
| uprv_free(((Enumerator*)(enumerator->context))->index); |
| uprv_free(enumerator->context); |
| } |
| |
| static int32_t U_CALLCONV |
| ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { |
| // check if already failed |
| if (U_FAILURE(*status)) { |
| return 0; |
| } |
| return ((Enumerator*)(enumerator->context))->length; |
| } |
| |
| |
| static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, |
| int32_t* resultLength, |
| UErrorCode* status) { |
| // check if already failed |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| |
| int16_t cur = ((Enumerator*)(enumerator->context))->cur; |
| const UConverterSelector* sel; |
| const char* result; |
| if (cur >= ((Enumerator*)(enumerator->context))->length) { |
| return NULL; |
| } |
| sel = ((Enumerator*)(enumerator->context))->sel; |
| result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; |
| ((Enumerator*)(enumerator->context))->cur++; |
| if (resultLength) { |
| *resultLength = uprv_strlen(result); |
| } |
| return result; |
| } |
| |
| static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, |
| UErrorCode* status) { |
| // check if already failed |
| if (U_FAILURE(*status)) { |
| return ; |
| } |
| ((Enumerator*)(enumerator->context))->cur = 0; |
| } |
| |
| static const UEnumeration defaultEncodings = { |
| NULL, |
| NULL, |
| ucnvsel_close_selector_iterator, |
| ucnvsel_count_encodings, |
| uenum_unextDefault, |
| ucnvsel_next_encoding, |
| ucnvsel_reset_iterator |
| }; |
| |
| |
| // internal fn to intersect two sets of masks |
| // returns whether the mask has reduced to all zeros |
| UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { |
| int32_t i; |
| uint32_t oredDest = 0; |
| for (i = 0 ; i < len ; ++i) { |
| oredDest |= (dest[i] &= source1[i]); |
| } |
| return oredDest == 0; |
| } |
| |
// Internal: counts the bits that are set in the first len words of mask.
// Algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html
int16_t countOnes(uint32_t* mask, int32_t len) {
    int32_t bitsSet = 0;
    for (int32_t word = 0; word < len; ++word) {
        uint32_t remaining = mask[word];
        while (remaining != 0) {
            remaining &= remaining - 1;  // drops the lowest bit that is set
            ++bitsSet;
        }
    }
    return (int16_t)bitsSet;
}
| |
| |
| /* internal function! */ |
| UEnumeration *ucnvsel_select(const UConverterSelector* sel, const void *s, |
| int32_t length, UErrorCode *status, UBool isUTF16) { |
| const UChar* utf16buffer = (UChar*) s; |
| const char* utf8buffer = (char*) s; |
| |
| UEnumeration *en = NULL; |
| uint32_t* mask; |
| UChar32 next = 0; |
| int32_t offset = 0; |
| int32_t i, j; |
| |
| // check if already failed |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| // ensure args make sense! |
| if (sel == NULL || (s == NULL && length != 0)) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| |
| // this is the context we will use. Store a table of indices to which |
| // encodings are legit. |
| struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); |
| result->index = NULL; // this will be allocated later! |
| result->length = result->cur = 0; |
| result->sel = sel; |
| |
| en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); |
| memcpy(en, &defaultEncodings, sizeof(UEnumeration)); |
| en->context = result; |
| |
| mask = (uint32_t*) uprv_malloc((sel->encodingsCount+31)/32 * |
| sizeof(uint32_t)); |
| uprv_memset(mask, ~0, (sel->encodingsCount+31)/32 * sizeof(uint32_t)); |
| |
| if(length == -1) { |
| if(isUTF16) |
| length = u_strlen(utf16buffer); |
| else |
| length = uprv_strlen(utf8buffer); |
| } |
| |
| if(s) { |
| while (offset < length) { |
| uint16_t result = 0; |
| if (isUTF16) |
| U16_NEXT(utf16buffer, offset, length, next) |
| else |
| U8_NEXT(utf8buffer, offset, length, next) |
| |
| if (next != -1) { |
| UTRIE_GET16((&sel->constructedTrie), next, result) |
| |
| if (intersectMasks(mask, sel->pv+result, (sel->encodingsCount+31)/32)) { |
| break; |
| } |
| } |
| } |
| } |
| |
| int16_t numOnes = countOnes(mask, (sel->encodingsCount+31)/32); |
| // now, we know the exact space we need for index |
| if (numOnes > 0) { |
| result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); |
| } //otherwise, index will remain NULL (and will never be touched by |
| //the enumerator code anyway) |
| |
| for (j = 0 ; j < (sel->encodingsCount+31)/32 ; j++) { |
| for (i = 0 ; i < 32 ; i++) { |
| uint32_t v = mask[j] & 1; |
| if (v && j*32+i < sel->encodingsCount) { |
| result->index[result->length++] = j*32+i; |
| } |
| mask[j] >>= 1; |
| } |
| } |
| uprv_free(mask); |
| return en; |
| } |
| |
| /* check a string against the selector - UTF16 version */ |
| U_CAPI UEnumeration *ucnvsel_selectForString(const UConverterSelector* sel, |
| const UChar *s, |
| int32_t length, |
| UErrorCode *status) { |
| return ucnvsel_select(sel, s, length, status, TRUE); |
| } |
| |
| /* check a string against the selector - UTF8 version */ |
| U_CAPI UEnumeration *ucnvsel_selectForUTF8(const UConverterSelector* sel, |
| const char *utf8str, |
| int32_t length, |
| UErrorCode *status) { |
| return ucnvsel_select(sel, utf8str, length, status, FALSE); |
| } |
| |
| |
| |
| |
| /** |
| * swap a selector into the desired Endianness and Asciiness of |
| * the system. Just as FYI, selectors are always saved in the format |
| * of the system that created them. They are only converted if used |
| * on another system. In other words, selectors created on different |
| * system can be different even if the params are identical (endianness |
| * and Asciiness differences only) |
| * |
| * @param ds pointer to data swapper containing swapping info |
| * @param inData pointer to incoming data |
| * @param length length of inData in bytes |
| * @param outData pointer to output data. Capacity should |
| * be at least equal to capacity of inData |
| * @param status an in/out ICU UErrorCode |
| * @return 0 on failure, number of bytes swapped on success |
| * number of bytes swapped can be smaller than length |
| * |
| */ |
| U_CAPI int32_t ucnvsel_swap(const UDataSwapper *ds, |
| const void *inData, |
| int32_t length, |
| void *outData, |
| UErrorCode *status) { |
| const char* inDataC = (const char*) inData; |
| char * outDataC = (char*) outData; |
| int32_t passedLength = length; |
| //args check |
| if(U_FAILURE(*status)) { |
| return 0; |
| } |
| if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { |
| *status=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| if(length < 3 * sizeof(uint32_t)) { |
| * status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| } |
| |
| ds->swapArray32(ds, inDataC, 3, outDataC, status); |
| int32_t pvCount = ((int32_t*)outData)[2]; |
| |
| if(((int32_t*)outData)[0] != 0x66778899) |
| return 0; |
| |
| length -= 3 * sizeof(uint32_t); |
| inDataC += 3 * sizeof(uint32_t); |
| outDataC += 3 * sizeof(uint32_t); |
| |
| |
| if(length < pvCount * sizeof(uint32_t)) { |
| * status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| } |
| ds->swapArray32(ds, inDataC, pvCount, outDataC, status); |
| length -= pvCount * sizeof(uint32_t); |
| inDataC += pvCount * sizeof(uint32_t); |
| outDataC += pvCount * sizeof(uint32_t); |
| |
| if(length < 1 * sizeof(uint32_t)) { |
| * status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| } |
| ds->swapArray32(ds, inDataC, 1, outDataC, status); |
| int32_t encodingStrLength = ((int32_t*)outData)[0]; |
| length -= sizeof(uint32_t); |
| inDataC += sizeof(uint32_t); |
| outDataC += sizeof(uint32_t); |
| |
| if(length < encodingStrLength) { |
| * status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| } |
| ds->swapInvChars(ds, inDataC, encodingStrLength, outDataC, status); |
| length -= encodingStrLength; |
| inDataC += encodingStrLength; |
| outDataC += encodingStrLength; |
| |
| if(length < 1 * sizeof(uint32_t)) { |
| * status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| } |
| ds->swapArray32(ds, inDataC, 1, outDataC, status); |
| int32_t trieSize = ((int32_t*)outData)[0]; |
| length -= sizeof(uint32_t); |
| inDataC += sizeof(uint32_t); |
| outDataC += sizeof(uint32_t); |
| |
| if(length < trieSize) { |
| * status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| } |
| utrie_swap(ds, inDataC, trieSize, outDataC, status); |
| length -= trieSize; |
| return passedLength - length; |
| } |