| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2010-2012, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * file name: bytestriebuilder.cpp |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2010sep25 |
| * created by: Markus W. Scherer |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/bytestrie.h" |
| #include "unicode/bytestriebuilder.h" |
| #include "unicode/stringpiece.h" |
| #include "charstr.h" |
| #include "cmemory.h" |
| #include "uhash.h" |
| #include "uarrsort.h" |
| #include "uassert.h" |
| #include "ustr_imp.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| /* |
| * Note: This builder implementation stores (bytes, value) pairs with full copies |
| * of the byte sequences, until the BytesTrie is built. |
| * It might(!) take less memory if we collected the data in a temporary, dynamic trie. |
| */ |
| |
| class BytesTrieElement : public UMemory { |
| public: |
| // Use compiler's default constructor, initializes nothing. |
| |
| void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode); |
| |
| StringPiece getString(const CharString &strings) const { |
| int32_t offset=stringOffset; |
| int32_t length; |
| if(offset>=0) { |
| length=(uint8_t)strings[offset++]; |
| } else { |
| offset=~offset; |
| length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; |
| offset+=2; |
| } |
| return StringPiece(strings.data()+offset, length); |
| } |
| int32_t getStringLength(const CharString &strings) const { |
| int32_t offset=stringOffset; |
| if(offset>=0) { |
| return (uint8_t)strings[offset]; |
| } else { |
| offset=~offset; |
| return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; |
| } |
| } |
| |
| char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } |
| |
| int32_t getValue() const { return value; } |
| |
| int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; |
| |
| private: |
| const char *data(const CharString &strings) const { |
| int32_t offset=stringOffset; |
| if(offset>=0) { |
| ++offset; |
| } else { |
| offset=~offset+2; |
| } |
| return strings.data()+offset; |
| } |
| |
| // If the stringOffset is non-negative, then the first strings byte contains |
| // the string length. |
| // If the stringOffset is negative, then the first two strings bytes contain |
| // the string length (big-endian), and the offset needs to be bit-inverted. |
| // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) |
| int32_t stringOffset; |
| int32_t value; |
| }; |
| |
| void |
| BytesTrieElement::setTo(StringPiece s, int32_t val, |
| CharString &strings, UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { |
| return; |
| } |
| int32_t length=s.length(); |
| if(length>0xffff) { |
| // Too long: We store the length in 1 or 2 bytes. |
| errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| return; |
| } |
| int32_t offset=strings.length(); |
| if(length>0xff) { |
| offset=~offset; |
| strings.append((char)(length>>8), errorCode); |
| } |
| strings.append((char)length, errorCode); |
| stringOffset=offset; |
| value=val; |
| strings.append(s, errorCode); |
| } |
| |
| int32_t |
| BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { |
| // TODO: add StringPiece::compare(), see ticket #8187 |
| StringPiece thisString=getString(strings); |
| StringPiece otherString=other.getString(strings); |
| int32_t lengthDiff=thisString.length()-otherString.length(); |
| int32_t commonLength; |
| if(lengthDiff<=0) { |
| commonLength=thisString.length(); |
| } else { |
| commonLength=otherString.length(); |
| } |
| int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); |
| return diff!=0 ? diff : lengthDiff; |
| } |
| |
| BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) |
| : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), |
| bytes(NULL), bytesCapacity(0), bytesLength(0) { |
| if(U_FAILURE(errorCode)) { |
| return; |
| } |
| strings=new CharString(); |
| if(strings==NULL) { |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| |
| BytesTrieBuilder::~BytesTrieBuilder() { |
| delete strings; |
| delete[] elements; |
| uprv_free(bytes); |
| } |
| |
| BytesTrieBuilder & |
| BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { |
| return *this; |
| } |
| if(bytesLength>0) { |
| // Cannot add elements after building. |
| errorCode=U_NO_WRITE_PERMISSION; |
| return *this; |
| } |
| if(elementsLength==elementsCapacity) { |
| int32_t newCapacity; |
| if(elementsCapacity==0) { |
| newCapacity=1024; |
| } else { |
| newCapacity=4*elementsCapacity; |
| } |
| BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; |
| if(newElements==NULL) { |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| return *this; // error instead of dereferencing null |
| } |
| if(elementsLength>0) { |
| uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement)); |
| } |
| delete[] elements; |
| elements=newElements; |
| elementsCapacity=newCapacity; |
| } |
| elements[elementsLength++].setTo(s, value, *strings, errorCode); |
| return *this; |
| } |
| |
| U_CDECL_BEGIN |
| |
| static int32_t U_CALLCONV |
| compareElementStrings(const void *context, const void *left, const void *right) { |
| const CharString *strings=static_cast<const CharString *>(context); |
| const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left); |
| const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right); |
| return leftElement->compareStringTo(*rightElement, *strings); |
| } |
| |
| U_CDECL_END |
| |
| BytesTrie * |
| BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
| buildBytes(buildOption, errorCode); |
| BytesTrie *newTrie=NULL; |
| if(U_SUCCESS(errorCode)) { |
| newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); |
| if(newTrie==NULL) { |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| } else { |
| bytes=NULL; // The new trie now owns the array. |
| bytesCapacity=0; |
| } |
| } |
| return newTrie; |
| } |
| |
| StringPiece |
| BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
| buildBytes(buildOption, errorCode); |
| StringPiece result; |
| if(U_SUCCESS(errorCode)) { |
| result.set(bytes+(bytesCapacity-bytesLength), bytesLength); |
| } |
| return result; |
| } |
| |
| void |
| BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { |
| return; |
| } |
| if(bytes!=NULL && bytesLength>0) { |
| // Already built. |
| return; |
| } |
| if(bytesLength==0) { |
| if(elementsLength==0) { |
| errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| return; |
| } |
| uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), |
| compareElementStrings, strings, |
| FALSE, // need not be a stable sort |
| &errorCode); |
| if(U_FAILURE(errorCode)) { |
| return; |
| } |
| // Duplicate strings are not allowed. |
| StringPiece prev=elements[0].getString(*strings); |
| for(int32_t i=1; i<elementsLength; ++i) { |
| StringPiece current=elements[i].getString(*strings); |
| if(prev==current) { |
| errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| prev=current; |
| } |
| } |
| // Create and byte-serialize the trie for the elements. |
| bytesLength=0; |
| int32_t capacity=strings->length(); |
| if(capacity<1024) { |
| capacity=1024; |
| } |
| if(bytesCapacity<capacity) { |
| uprv_free(bytes); |
| bytes=static_cast<char *>(uprv_malloc(capacity)); |
| if(bytes==NULL) { |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| bytesCapacity=0; |
| return; |
| } |
| bytesCapacity=capacity; |
| } |
| StringTrieBuilder::build(buildOption, elementsLength, errorCode); |
| if(bytes==NULL) { |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| |
| BytesTrieBuilder & |
| BytesTrieBuilder::clear() { |
| strings->clear(); |
| elementsLength=0; |
| bytesLength=0; |
| return *this; |
| } |
| |
| int32_t |
| BytesTrieBuilder::getElementStringLength(int32_t i) const { |
| return elements[i].getStringLength(*strings); |
| } |
| |
| UChar |
| BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { |
| return (uint8_t)elements[i].charAt(byteIndex, *strings); |
| } |
| |
| int32_t |
| BytesTrieBuilder::getElementValue(int32_t i) const { |
| return elements[i].getValue(); |
| } |
| |
| int32_t |
| BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { |
| const BytesTrieElement &firstElement=elements[first]; |
| const BytesTrieElement &lastElement=elements[last]; |
| int32_t minStringLength=firstElement.getStringLength(*strings); |
| while(++byteIndex<minStringLength && |
| firstElement.charAt(byteIndex, *strings)== |
| lastElement.charAt(byteIndex, *strings)) {} |
| return byteIndex; |
| } |
| |
| int32_t |
| BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const { |
| int32_t length=0; // Number of different bytes at byteIndex. |
| int32_t i=start; |
| do { |
| char byte=elements[i++].charAt(byteIndex, *strings); |
| while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) { |
| ++i; |
| } |
| ++length; |
| } while(i<limit); |
| return length; |
| } |
| |
| int32_t |
| BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const { |
| do { |
| char byte=elements[i++].charAt(byteIndex, *strings); |
| while(byte==elements[i].charAt(byteIndex, *strings)) { |
| ++i; |
| } |
| } while(--count>0); |
| return i; |
| } |
| |
| int32_t |
| BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { |
| char b=(char)byte; |
| while(b==elements[i].charAt(byteIndex, *strings)) { |
| ++i; |
| } |
| return i; |
| } |
| |
| BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) |
| : LinearMatchNode(len, nextNode), s(bytes) { |
| hash=static_cast<int32_t>( |
| static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len))); |
| } |
| |
| UBool |
| BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { |
| if(this==&other) { |
| return TRUE; |
| } |
| if(!LinearMatchNode::operator==(other)) { |
| return FALSE; |
| } |
| const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; |
| return 0==uprv_memcmp(s, o.s, length); |
| } |
| |
| void |
| BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { |
| BytesTrieBuilder &b=(BytesTrieBuilder &)builder; |
| next->write(builder); |
| b.write(s, length); |
| offset=b.write(b.getMinLinearMatch()+length-1); |
| } |
| |
| StringTrieBuilder::Node * |
| BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, |
| Node *nextNode) const { |
| return new BTLinearMatchNode( |
| elements[i].getString(*strings).data()+byteIndex, |
| length, |
| nextNode); |
| } |
| |
| UBool |
| BytesTrieBuilder::ensureCapacity(int32_t length) { |
| if(bytes==NULL) { |
| return FALSE; // previous memory allocation had failed |
| } |
| if(length>bytesCapacity) { |
| int32_t newCapacity=bytesCapacity; |
| do { |
| newCapacity*=2; |
| } while(newCapacity<=length); |
| char *newBytes=static_cast<char *>(uprv_malloc(newCapacity)); |
| if(newBytes==NULL) { |
| // unable to allocate memory |
| uprv_free(bytes); |
| bytes=NULL; |
| bytesCapacity=0; |
| return FALSE; |
| } |
| uprv_memcpy(newBytes+(newCapacity-bytesLength), |
| bytes+(bytesCapacity-bytesLength), bytesLength); |
| uprv_free(bytes); |
| bytes=newBytes; |
| bytesCapacity=newCapacity; |
| } |
| return TRUE; |
| } |
| |
| int32_t |
| BytesTrieBuilder::write(int32_t byte) { |
| int32_t newLength=bytesLength+1; |
| if(ensureCapacity(newLength)) { |
| bytesLength=newLength; |
| bytes[bytesCapacity-bytesLength]=(char)byte; |
| } |
| return bytesLength; |
| } |
| |
| int32_t |
| BytesTrieBuilder::write(const char *b, int32_t length) { |
| int32_t newLength=bytesLength+length; |
| if(ensureCapacity(newLength)) { |
| bytesLength=newLength; |
| uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); |
| } |
| return bytesLength; |
| } |
| |
| int32_t |
| BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { |
| return write(elements[i].getString(*strings).data()+byteIndex, length); |
| } |
| |
| int32_t |
| BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { |
| if(0<=i && i<=BytesTrie::kMaxOneByteValue) { |
| return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); |
| } |
| char intBytes[5]; |
| int32_t length=1; |
| if(i<0 || i>0xffffff) { |
| intBytes[0]=(char)BytesTrie::kFiveByteValueLead; |
| intBytes[1]=(char)((uint32_t)i>>24); |
| intBytes[2]=(char)((uint32_t)i>>16); |
| intBytes[3]=(char)((uint32_t)i>>8); |
| intBytes[4]=(char)i; |
| length=5; |
| // } else if(i<=BytesTrie::kMaxOneByteValue) { |
| // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); |
| } else { |
| if(i<=BytesTrie::kMaxTwoByteValue) { |
| intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); |
| } else { |
| if(i<=BytesTrie::kMaxThreeByteValue) { |
| intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); |
| } else { |
| intBytes[0]=(char)BytesTrie::kFourByteValueLead; |
| intBytes[1]=(char)(i>>16); |
| length=2; |
| } |
| intBytes[length++]=(char)(i>>8); |
| } |
| intBytes[length++]=(char)i; |
| } |
| intBytes[0]=(char)((intBytes[0]<<1)|isFinal); |
| return write(intBytes, length); |
| } |
| |
| int32_t |
| BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { |
| int32_t offset=write(node); |
| if(hasValue) { |
| offset=writeValueAndFinal(value, FALSE); |
| } |
| return offset; |
| } |
| |
| int32_t |
| BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { |
| int32_t i=bytesLength-jumpTarget; |
| U_ASSERT(i>=0); |
| if(i<=BytesTrie::kMaxOneByteDelta) { |
| return write(i); |
| } |
| char intBytes[5]; |
| int32_t length; |
| if(i<=BytesTrie::kMaxTwoByteDelta) { |
| intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); |
| length=1; |
| } else { |
| if(i<=BytesTrie::kMaxThreeByteDelta) { |
| intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); |
| length=2; |
| } else { |
| if(i<=0xffffff) { |
| intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; |
| length=3; |
| } else { |
| intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; |
| intBytes[1]=(char)(i>>24); |
| length=4; |
| } |
| intBytes[1]=(char)(i>>16); |
| } |
| intBytes[1]=(char)(i>>8); |
| } |
| intBytes[length++]=(char)i; |
| return write(intBytes, length); |
| } |
| |
| U_NAMESPACE_END |