| /* |
| ******************************************************************************* |
| * Copyright (C) 1999, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * File unistr.cpp |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 09/25/98 stephen Creation. |
| * 04/20/99 stephen Overhauled per 4/16 code review. |
| * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX |
| * 11/18/99 aliu Added handleReplaceBetween() to make inherit from |
| * Replaceable. |
| ******************************************************************************* |
| */ |
| |
| |
| #include "unicode/utypes.h" |
| #include "unicode/putil.h" |
| #include "unicode/locid.h" |
| #include "cstring.h" |
| #include "cmemory.h" |
| #include "unicode/ustring.h" |
| #include "mutex.h" |
| #include "unicode/unistr.h" |
| |
| #if 0 |
| //DEBUGGING |
| #include <iostream.h> |
| |
| void |
| print(const UnicodeString& s, |
| const char *name) |
| { |
| UChar c; |
| cout << name << ":|"; |
| for(int i = 0; i < s.length(); ++i) { |
| c = s[i]; |
| if(c>= 0x007E || c < 0x0020) |
| cout << "[0x" << hex << s[i] << "]"; |
| else |
| cout << (char) s[i]; |
| } |
| cout << '|' << endl; |
| } |
| |
| void |
| print(const UChar *s, |
| int32_t len, |
| const char *name) |
| { |
| UChar c; |
| cout << name << ":|"; |
| for(int i = 0; i < len; ++i) { |
| c = s[i]; |
| if(c>= 0x007E || c < 0x0020) |
| cout << "[0x" << hex << s[i] << "]"; |
| else |
| cout << (char) s[i]; |
| } |
| cout << '|' << endl; |
| } |
| // END DEBUGGING |
| #endif |
| |
| // Local function definitions for now |
| |
| // need to copy areas that may overlap |
| inline void |
| us_arrayCopy(const UChar *src, int32_t srcStart, |
| UChar *dst, int32_t dstStart, int32_t count) |
| { |
| if(count>0) { |
| uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src))); |
| } |
| } |
| |
| // static initialization |
| const UChar UnicodeString::fgInvalidUChar = 0xFFFF; |
| const int32_t UnicodeString::kGrowSize = 0x80; |
| const int32_t UnicodeString::kInvalidHashCode = 0; |
| const int32_t UnicodeString::kEmptyHashCode = 1; |
| UConverter* UnicodeString::fgDefaultConverter = 0; |
| |
| //======================================== |
| // Constructors |
| //======================================== |
| UnicodeString::UnicodeString() |
| : fArray(fStackBuffer), |
| fLength(0), |
| fCapacity(US_STACKBUF_SIZE), |
| fRefCounted(FALSE), |
| fHashCode(kEmptyHashCode), |
| fBogus(FALSE) |
| {} |
| |
| UnicodeString::UnicodeString(int32_t capacity) |
| : fArray(0), |
| fLength(0), |
| fCapacity(0), |
| fRefCounted(FALSE), |
| fHashCode(kEmptyHashCode), |
| fBogus(FALSE) |
| { |
| fArray = allocate(capacity, fCapacity); |
| if(! fArray) { |
| setToBogus(); |
| return; |
| } |
| |
| setRefCount(1); |
| } |
| |
| UnicodeString::UnicodeString(UChar ch) |
| : fArray(fStackBuffer), |
| fLength(0), |
| fCapacity(US_STACKBUF_SIZE), |
| fRefCounted(FALSE), |
| fHashCode(kEmptyHashCode), |
| fBogus(FALSE) |
| { |
| doReplace(0, 0, &ch, 0, 1); |
| } |
| |
| UnicodeString::UnicodeString(const UChar *text) |
| : fArray(fStackBuffer), |
| fLength(0), |
| fCapacity(US_STACKBUF_SIZE), |
| fRefCounted(FALSE), |
| fHashCode(kEmptyHashCode), |
| fBogus(FALSE) |
| { |
| doReplace(0, 0, text, 0, u_strlen(text)); |
| } |
| |
| UnicodeString::UnicodeString( const UChar *text, |
| int32_t textLength) |
| : fArray(fStackBuffer), |
| fLength(0), |
| fCapacity(US_STACKBUF_SIZE), |
| fRefCounted(FALSE), |
| fHashCode(kEmptyHashCode), |
| fBogus(FALSE) |
| { |
| doReplace(0, 0, text, 0, textLength); |
| } |
| |
| UnicodeString::UnicodeString(bool_t isTerminated, |
| UChar *text, |
| int32_t textLength) |
| : fArray(text), |
| fLength(textLength != -1 || !isTerminated ? textLength : u_strlen(text)), |
| fCapacity(isTerminated ? fLength + 1 : fLength), |
| fRefCounted(FALSE), |
| fHashCode(kInvalidHashCode), |
| fBogus(FALSE) |
| { |
| if(fLength < 0) { |
| setToBogus(); |
| } |
| } |
| |
| UnicodeString::UnicodeString(const char *codepageData, |
| const char *codepage) |
| : fArray(fStackBuffer), |
| fLength(0), |
| fCapacity(US_STACKBUF_SIZE), |
| fRefCounted(FALSE), |
| fHashCode(kEmptyHashCode), |
| fBogus(FALSE) |
| { |
| if(codepageData != 0) |
| doCodepageCreate(codepageData, uprv_strlen(codepageData), codepage); |
| } |
| |
| |
| UnicodeString::UnicodeString(const char *codepageData, |
| int32_t dataLength, |
| const char *codepage) |
| : fArray(fStackBuffer), |
| fLength(0), |
| fCapacity(US_STACKBUF_SIZE), |
| fRefCounted(FALSE), |
| fHashCode(kEmptyHashCode), |
| fBogus(FALSE) |
| { |
| if(codepageData != 0) { |
| doCodepageCreate(codepageData, dataLength, codepage); |
| } |
| } |
| |
| //======================================== |
| // Destructor |
| //======================================== |
| UnicodeString::~UnicodeString() |
| { |
| // decrement ref count and reclaim storage, if owned |
| if(fRefCounted && removeRef() == 0) |
| delete [] fArray; |
| } |
| |
| //======================================== |
| // Assignment |
| //======================================== |
| UnicodeString& |
| UnicodeString::operator= (const UnicodeString& src) |
| { |
| // if assigning to ourselves, do nothing |
| if(this == &src) { |
| return *this; |
| } |
| |
| // if src is bogus, set ourselves to bogus |
| if(src.isBogus()) { |
| setToBogus(); |
| return *this; |
| } |
| |
| // if src is aliased or ref counted, point ourselves at its array |
| if(src.fArray != src.fStackBuffer) { |
| |
| // if we're ref counted, decrement our current ref count |
| if(fRefCounted && removeRef() == 0) |
| delete [] fArray; |
| |
| fArray = src.fArray; |
| fLength = src.fLength; |
| fCapacity = src.fCapacity; |
| fHashCode = src.fHashCode; |
| fRefCounted = src.fRefCounted; |
| if(fRefCounted) { |
| addRef(); |
| } |
| fBogus = FALSE; |
| } |
| // if src isn't ref counted, just do a replace |
| else { |
| doReplace(0, fLength, src.fArray, 0, src.fLength); |
| fHashCode = src.fHashCode; |
| } |
| |
| return *this; |
| } |
| |
| //======================================== |
| // Miscellaneous operations |
| //======================================== |
| int32_t |
| UnicodeString::numDisplayCells( UTextOffset start, |
| int32_t length, |
| bool_t asian) const |
| { |
| // pin indices to legal values |
| pinIndices(start, length); |
| |
| UChar c; |
| int32_t result = 0; |
| UTextOffset limit = start + length; |
| |
| while(start < limit) { |
| c = getArrayStart()[start]; |
| switch(Unicode::getCellWidth(c)) { |
| case Unicode::ZERO_WIDTH: |
| break;; |
| |
| case Unicode::HALF_WIDTH: |
| result += 1; |
| break; |
| |
| case Unicode::FULL_WIDTH: |
| result += 2; |
| break; |
| |
| case Unicode::NEUTRAL: |
| result += (asian ? 2 : 1); |
| break; |
| } |
| ++start; |
| } |
| |
| return result; |
| } |
| |
| UCharReference |
| UnicodeString::operator[] (UTextOffset pos) |
| { |
| return UCharReference(this, pos); |
| } |
| |
| //======================================== |
| // Read-only implementation |
| //======================================== |
| int8_t |
| UnicodeString::doCompare( UTextOffset start, |
| int32_t length, |
| const UChar *srcChars, |
| UTextOffset srcStart, |
| int32_t srcLength) const |
| { |
| // compare illegal string values |
| if(isBogus()) { |
| if(srcChars==0) { |
| return 0; |
| } else { |
| return -1; |
| } |
| } else if(srcChars==0) { |
| return 1; |
| } |
| |
| // pin indices to legal values |
| pinIndices(start, length); |
| |
| // get the correct pointer |
| const UChar *chars = getArrayStart(); |
| |
| UTextOffset minLength; |
| int8_t lengthResult; |
| |
| // are we comparing different lengths? |
| if(length != srcLength) { |
| if(length < srcLength) { |
| minLength = length; |
| lengthResult = -1; |
| } else { |
| minLength = srcLength; |
| lengthResult = 1; |
| } |
| } else { |
| minLength = length; |
| lengthResult = 0; |
| } |
| |
| /* |
| * note that uprv_memcmp() returns an int but we return an int8_t; |
| * we need to take care not to truncate the result - |
| * one way to do this is to right-shift the value to |
| * move the sign bit into the lower 8 bits and making sure that this |
| * does not become 0 itself |
| */ |
| |
| if(minLength > 0) { |
| int32_t result; |
| |
| if(U_IS_BIG_ENDIAN) { |
| // big-endian: byte comparison works |
| result = uprv_memcmp(chars + start, srcChars + srcStart, minLength * sizeof(UChar)); |
| if(result != 0) { |
| return (int8_t)(result >> 15 | 1); |
| } |
| } else { |
| // little-endian: compare UChar units |
| chars += start; |
| srcChars += srcStart; |
| do { |
| result = ((int32_t)*chars - (int32_t)*srcChars); |
| if(result != 0) { |
| return (int8_t)(result >> 15 | 1); |
| } |
| ++chars; |
| ++srcChars; |
| } while(--minLength > 0); |
| } |
| } |
| return lengthResult; |
| } |
| |
| void |
| UnicodeString::doExtract(UTextOffset start, |
| int32_t length, |
| UChar *dst, |
| UTextOffset dstStart) const |
| { |
| // pin indices to legal values |
| pinIndices(start, length); |
| us_arrayCopy(getArrayStart(), start, dst, dstStart, length); |
| } |
| |
| |
| UTextOffset |
| UnicodeString::indexOf(const UChar *srcChars, |
| UTextOffset srcStart, |
| int32_t srcLength, |
| UTextOffset start, |
| int32_t length) const |
| { |
| if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength <= 0) { |
| return -1; |
| } |
| |
| // now we will only work with srcLength-1 |
| --srcLength; |
| |
| // get the indices within bounds |
| pinIndices(start, length); |
| |
| // set length for the last possible match start position |
| // note the --srcLength above |
| length -= srcLength; |
| |
| if(length <= 0) { |
| return -1; |
| } |
| |
| const UChar *array = getArrayStart(); |
| UTextOffset limit = start + length; |
| |
| // search for the first char, then compare the rest of the string |
| // increment srcStart here for that, matching the --srcLength above |
| UChar ch = srcChars[srcStart++]; |
| |
| do { |
| if(array[start] == ch && (srcLength == 0 || compare(start + 1, srcLength, srcChars, srcStart, srcLength) == 0)) { |
| return start; |
| } |
| } while(++start < limit); |
| |
| return -1; |
| } |
| |
| UTextOffset |
| UnicodeString::doIndexOf(UChar c, |
| UTextOffset start, |
| int32_t length) const |
| { |
| // pin indices |
| pinIndices(start, length); |
| if(length == 0) { |
| return -1; |
| } |
| |
| // find the first occurrence of c |
| const UChar *begin = getArrayStart() + start; |
| const UChar *limit = begin + length; |
| |
| do { |
| if(*begin == c) { |
| return begin - getArrayStart(); |
| } |
| } while(++begin < limit); |
| |
| return -1; |
| } |
| |
| UTextOffset |
| UnicodeString::lastIndexOf(const UChar *srcChars, |
| UTextOffset srcStart, |
| int32_t srcLength, |
| UTextOffset start, |
| int32_t length) const |
| { |
| if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength <= 0) { |
| return -1; |
| } |
| |
| // now we will only work with srcLength-1 |
| --srcLength; |
| |
| // get the indices within bounds |
| pinIndices(start, length); |
| |
| // set length for the last possible match start position |
| // note the --srcLength above |
| length -= srcLength; |
| |
| if(length <= 0) { |
| return -1; |
| } |
| |
| const UChar *array = getArrayStart(); |
| UTextOffset pos; |
| |
| // search for the first char, then compare the rest of the string |
| // increment srcStart here for that, matching the --srcLength above |
| UChar ch = srcChars[srcStart++]; |
| |
| pos = start + length; |
| do { |
| if(array[--pos] == ch && (srcLength == 0 || compare(pos + 1, srcLength, srcChars, srcStart, srcLength) == 0)) { |
| return pos; |
| } |
| } while(pos > start); |
| |
| return -1; |
| } |
| |
| UTextOffset |
| UnicodeString::doLastIndexOf(UChar c, |
| UTextOffset start, |
| int32_t length) const |
| { |
| if(isBogus()) { |
| return -1; |
| } |
| |
| // pin indices |
| pinIndices(start, length); |
| if(length == 0) { |
| return -1; |
| } |
| |
| const UChar *begin = getArrayStart() + start; |
| const UChar *limit = begin + length; |
| |
| do { |
| if(*--limit == c) { |
| return limit - getArrayStart(); |
| } |
| } while(limit > begin); |
| |
| return -1; |
| } |
| |
| UnicodeString& |
| UnicodeString::findAndReplace(UTextOffset start, |
| int32_t length, |
| const UnicodeString& oldText, |
| UTextOffset oldStart, |
| int32_t oldLength, |
| const UnicodeString& newText, |
| UTextOffset newStart, |
| int32_t newLength) |
| { |
| if(isBogus() || oldText.isBogus() || newText.isBogus()) { |
| return *this; |
| } |
| |
| pinIndices(start, length); |
| oldText.pinIndices(oldStart, oldLength); |
| newText.pinIndices(newStart, newLength); |
| |
| if(oldLength == 0 || newLength == 0) { |
| return *this; |
| } |
| |
| while(length >= oldLength) { |
| UTextOffset pos = indexOf(oldText, oldStart, oldLength, start, length); |
| if(pos < 0) { |
| // no more oldText's here: done |
| break; |
| } else { |
| // we found oldText, replace it by newText and go beyond it |
| replace(pos, oldLength, newText, newStart, newLength); |
| length -= pos + oldLength - start; |
| start = pos + newLength; |
| } |
| } |
| |
| return *this; |
| } |
| |
| |
| //======================================== |
| // Write implementation |
| //======================================== |
| |
| UnicodeString& |
| UnicodeString::setCharAt(UTextOffset offset, |
| UChar c) |
| { |
| if(offset < 0) |
| offset = 0; |
| else if(offset >= fLength) |
| offset = fLength - 1; |
| |
| doSetCharAt(offset, c); |
| fHashCode = kInvalidHashCode; |
| return *this; |
| } |
| |
| UnicodeString& |
| UnicodeString::toUpper() |
| { return toUpper(Locale::getDefault()); } |
| |
| UnicodeString& |
| UnicodeString::toLower() |
| { return toLower(Locale::getDefault()); } |
| |
| UnicodeString& |
| UnicodeString::toUpper(const Locale& locale) |
| { |
| UTextOffset start = 0; |
| UTextOffset limit = fLength; |
| UChar c; |
| UnicodeString lang; |
| |
| locale.getLanguage(lang); |
| |
| // The German sharp S character (U+00DF)'s uppercase equivalent is |
| // "SS", making it the only character that expands to two characters |
| // when its case is changed (we don't automatically convert "SS" to |
| // U+00DF going to lowercase because it can only be determined from |
| // knowing the language whether a particular "SS" should map to |
| // U+00DF or "ss"). So we make a preliminary pass through the |
| // string looking for sharp S characters and then go back and make |
| // room for the extra capital Ses if we find any. [For performance, |
| // we only do this extra work if the language is actually German] |
| if(lang == "de") { |
| UChar SS [] = { 0x0053, 0x0053 }; |
| while(start < limit) { |
| |
| c = getArrayStart()[start]; |
| |
| // A sharp s needs to be replaced with two capital S's. |
| if(c == 0x00DF) { |
| doReplace(start, 1, SS, 0, 2); |
| start++; |
| limit++; |
| } |
| |
| // Otherwise, the case conversion can be handled by the Unicode unit. |
| else if(Unicode::isLowerCase(c)) |
| doSetCharAt(start, Unicode::toUpperCase(c)); |
| |
| // If no conversion is necessary, do nothing |
| ++start; |
| } |
| } |
| |
| // If the specfied language is Turkish, then we have to special-case |
| // for the Turkish dotted and dotless Is. The regular lowercase i |
| // maps to the capital I with a dot (U+0130), and the lowercase i |
| // without the dot (U+0131) maps to the regular capital I |
| else if(lang == "tr") { |
| while(start < limit) { |
| c = getArrayStart()[start]; |
| |
| if(c == 0x0069/*'i'*/) |
| doSetCharAt(start, 0x0130); |
| else if(c == 0x0131) |
| doSetCharAt(start, 0x0049/*'I'*/); |
| else if(Unicode::isLowerCase(c)) |
| doSetCharAt(start, Unicode::toUpperCase(c)); |
| ++start; |
| } |
| } |
| |
| else { |
| // clone our array, if necessary |
| cloneArrayIfNeeded(); |
| UChar *array = getArrayStart(); |
| |
| while(start < limit) { |
| c = array[start]; |
| if(Unicode::isLowerCase(c)) { |
| array[start] = Unicode::toUpperCase(c); |
| } |
| ++start; |
| } |
| } |
| |
| fHashCode = kInvalidHashCode; |
| |
| return *this; |
| } |
| |
| UnicodeString& |
| UnicodeString::toLower(const Locale& locale) |
| { |
| UTextOffset start = 0; |
| UTextOffset limit = fLength; |
| UChar c; |
| UnicodeString lang; |
| |
| locale.getLanguage(lang); |
| |
| // if the specfied language is Turkish, then we have to special-case |
| // for the Turkish dotted and dotless Is. The capital I with a dot |
| // (U+0130) maps to the regular lowercase i, and the regular capital |
| // I maps to the lowercase i without the dot (U+0131) |
| if(lang == "tr") { |
| while(start < limit) { |
| c = getArrayStart()[start]; |
| if(c == 0x0049) // 'I' |
| doSetCharAt(start, 0x0131); |
| else if(c == 0x0130) |
| doSetCharAt(start, 0x0069); // 'i' |
| else if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c)) |
| doSetCharAt(start, Unicode::toLowerCase(c)); |
| ++start; |
| } |
| } |
| |
| // if the specfied language is Greek, then we have to special-case |
| // for the capital letter sigma (U+3A3), which has two lower-case |
| // forms. If the character following the capital sigma is a letter, |
| // we use the medial form (U+3C3); otherwise, we use the final form |
| // (U+3C2). |
| else if(lang == "el") { |
| while(start < limit) { |
| c = getArrayStart()[start]; |
| if(c == 0x3a3) { |
| if(start + 1 < limit && Unicode::isLetter(getArrayStart()[start + 1])) |
| doSetCharAt(start, 0x3C3); |
| else |
| doSetCharAt(start, 0x3C2); |
| } |
| else if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c)) |
| doSetCharAt(start, Unicode::toLowerCase(c)); |
| ++start; |
| } |
| } |
| |
| // if the specified language is anything other than Turkish or |
| // Greek, we rely on the Unicode class to do all our case mapping-- |
| // there are no other special cases |
| else { |
| // clone our array, if necessary |
| cloneArrayIfNeeded(); |
| UChar *array = getArrayStart(); |
| |
| while(start < limit) { |
| c = array[start]; |
| if(Unicode::isUpperCase(c) || Unicode::isTitleCase(c)) { |
| array[start] = Unicode::toLowerCase(c); |
| } |
| ++start; |
| } |
| } |
| |
| fHashCode = kInvalidHashCode; |
| |
| return *this; |
| } |
| |
| // for speed, no bounds checking is performed and the hash code isn't changed |
| UnicodeString& |
| UnicodeString::doSetCharAt(UTextOffset offset, |
| UChar c) |
| { |
| // clone our array, if necessary |
| cloneArrayIfNeeded(); |
| |
| // set the character |
| fArray[ (fRefCounted ? offset + 1 : offset) ] = c; |
| return *this; |
| } |
| |
| UnicodeString& |
| UnicodeString::doReplace( UTextOffset start, |
| int32_t length, |
| const UnicodeString& src, |
| UTextOffset srcStart, |
| int32_t srcLength) |
| { |
| if(!src.isBogus()) { |
| // pin the indices to legal values |
| src.pinIndices(srcStart, srcLength); |
| |
| // get the characters from src |
| // and replace the range in ourselves with them |
| return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); |
| } else { |
| // remove the range |
| return doReplace(start, length, 0, 0, 0); |
| } |
| } |
| |
| UnicodeString& |
| UnicodeString::doReplace(UTextOffset start, |
| int32_t length, |
| const UChar *srcChars, |
| UTextOffset srcStart, |
| int32_t srcLength) |
| { |
| // if we're bogus, do nothing |
| if(fBogus) |
| return *this; |
| |
| if(srcChars == 0) { |
| srcStart = srcLength = 0; |
| } |
| |
| bool_t deleteWhenDone = FALSE; |
| UChar *bufferToDelete = 0; |
| |
| // clone our array, if necessary |
| cloneArrayIfNeeded(); |
| |
| // pin the indices to legal values |
| pinIndices(start, length); |
| |
| // calculate the size of the string after the replace |
| int32_t newSize = fLength - length + srcLength; |
| |
| // allocate a bigger array if needed |
| if( newSize > getCapacity() ) { |
| |
| // allocate at minimum needed space |
| int32_t tempLength; |
| UChar *temp = allocate(newSize + 1, tempLength); |
| if(! temp) { |
| setToBogus(); |
| return *this; |
| } |
| |
| // if we're not currently ref counted, shift the array right by one |
| if(fRefCounted == FALSE) |
| us_arrayCopy(fArray, 0, temp, 1, fLength); |
| // otherwise, copy the old array into temp, including the ref count |
| else |
| us_arrayCopy(fArray, 0, temp, 0, fLength + 1); |
| |
| // delete the old array if we were ref counted |
| if(fRefCounted && removeRef() == 0) { |
| // if the srcChars array is the same as this object's array, |
| // don't delete it until the end of the method. this can happen |
| // in code like UnicodeString s = "foo"; s += s; |
| if(srcChars != getArrayStart()) |
| delete [] fArray; |
| else { |
| deleteWhenDone = TRUE; |
| bufferToDelete = fArray; |
| } |
| } |
| |
| // use the new array |
| fCapacity = tempLength; |
| fArray = temp; |
| setRefCount(1); |
| } |
| |
| // now do the replace |
| |
| // first copy the portion that isn't changing, leaving a hole |
| if(length != srcLength) { |
| us_arrayCopy(getArrayStart(), start + length, |
| getArrayStart(), start + srcLength, |
| fLength - (start + length)); |
| } |
| |
| // now fill in the hole with the new string |
| us_arrayCopy(srcChars, srcStart, getArrayStart(), start, srcLength); |
| |
| fLength = newSize; |
| fHashCode = kInvalidHashCode; |
| |
| if(deleteWhenDone) |
| delete [] bufferToDelete; |
| |
| return *this; |
| } |
| |
| /** |
| * Replaceable API |
| */ |
| void |
| UnicodeString::handleReplaceBetween(UTextOffset start, |
| UTextOffset limit, |
| const UnicodeString& text) { |
| replaceBetween(start, limit, text); |
| } |
| |
| UnicodeString& |
| UnicodeString::doReverse(UTextOffset start, |
| int32_t length) |
| { |
| // if we're bogus, do nothing |
| if(fBogus) |
| return *this; |
| |
| // clone our array, if necessary |
| cloneArrayIfNeeded(); |
| |
| // pin the indices to legal values |
| pinIndices(start, length); |
| |
| UChar *left = getArrayStart() + start; |
| UChar *right = getArrayStart() + start + length; |
| UChar swap; |
| |
| while(left < --right) { |
| swap = *left; |
| *left++ = *right; |
| *right = swap; |
| } |
| |
| fHashCode = kInvalidHashCode; |
| |
| return *this; |
| } |
| |
| //======================================== |
| // Hashing |
| //======================================== |
| int32_t |
| UnicodeString::doHashCode() |
| { |
| const UChar *key = getArrayStart(); |
| int32_t len = fLength; |
| int32_t hash = kInvalidHashCode; |
| const UChar *limit = key + len; |
| int32_t inc = (len >= 128 ? len/64 : 1); |
| |
| /* |
| We compute the hash by iterating sparsely over 64 (at most) |
| characters spaced evenly through the string. For each character, |
| we multiply the previous hash value by a prime number and add the |
| new character in, in the manner of an additive linear congruential |
| random number generator, thus producing a pseudorandom |
| deterministic value which should be well distributed over the |
| output range. [LIU] */ |
| |
| while(key < limit) { |
| hash = (hash * 37) + *key; |
| key += inc; |
| } |
| |
| if(hash == kInvalidHashCode) |
| hash = kEmptyHashCode; |
| |
| fHashCode = hash; |
| return fHashCode; |
| } |
| |
| //======================================== |
| // Bogusify? |
| //======================================== |
| void |
| UnicodeString::setToBogus() |
| { |
| if(fRefCounted && removeRef() == 0) { |
| delete [] fArray; |
| } |
| |
| fArray = 0; |
| fCapacity = fLength = 0; |
| fHashCode = kInvalidHashCode; |
| fRefCounted = FALSE; |
| fBogus = TRUE; |
| } |
| |
| //======================================== |
| // Codeset conversion |
| //======================================== |
| int32_t |
| UnicodeString::extract(UTextOffset start, |
| int32_t length, |
| char *dst, |
| const char *codepage) const |
| { |
| // if we're bogus or there's nothing to convert, do nothing |
| if(fBogus || length == 0) |
| return 0; |
| |
| // pin the indices to legal values |
| pinIndices(start, length); |
| |
| int32_t convertedLen = 0; |
| |
| // set up the conversion parameters |
| int32_t sourceLen = length; |
| const UChar *mySource = getArrayStart() + start; |
| const UChar *mySourceEnd = mySource + length; |
| char *myTarget = dst; |
| char *myTargetLimit; |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t arraySize = 0x0FFFFFFF; |
| |
| // create the converter |
| UConverter *converter; |
| |
| // if the codepage is the default, use our cache |
| if(codepage == 0) { |
| converter = getDefaultConverter(status); |
| } else if(*codepage == 0) { |
| converter = 0; |
| } else { |
| converter = ucnv_open(codepage, &status); |
| } |
| |
| // if we failed, set the appropriate flags and return |
| // if it is an empty string, then use the "invariant character" conversion |
| if(U_FAILURE(status)) { |
| // close the converter |
| if(codepage == 0) |
| releaseDefaultConverter(converter); |
| else |
| ucnv_close(converter); |
| return 0; |
| } |
| |
| // perform the conversion |
| if(converter == 0) { |
| // use the "invariant characters" conversion |
| if(length > fLength - start) { |
| length = fLength - start; |
| } |
| u_UCharsToChars(mySource, myTarget, length); |
| return length; |
| } |
| |
| // there is no loop here since we assume the buffer is large enough |
| myTargetLimit = myTarget + arraySize; |
| |
| /* Pin the limit to U_MAX_PTR. NULL check is for AS/400. */ |
| if((myTargetLimit < myTarget) || (myTargetLimit == NULL)) |
| myTargetLimit = (char*)U_MAX_PTR; |
| |
| ucnv_fromUnicode(converter, &myTarget, myTargetLimit, |
| &mySource, mySourceEnd, NULL, TRUE, &status); |
| |
| // close the converter |
| if(codepage == 0) |
| releaseDefaultConverter(converter); |
| else |
| ucnv_close(converter); |
| |
| return (myTarget - dst); |
| } |
| |
| void |
| UnicodeString::doCodepageCreate(const char *codepageData, |
| int32_t dataLength, |
| const char *codepage) |
| { |
| // if there's nothing to convert, do nothing |
| if(codepageData == 0 || dataLength == 0) |
| return; |
| |
| // set up the conversion parameters |
| int32_t sourceLen = dataLength; |
| const char *mySource = codepageData; |
| const char *mySourceEnd = mySource + sourceLen; |
| UChar *myTarget; |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t arraySize = getCapacity(); |
| |
| // create the converter |
| UConverter *converter = 0; |
| |
| // if the codepage is the default, use our cache |
| // if it is an empty string, then use the "invariant character" conversion |
| converter = (codepage == 0 ? |
| getDefaultConverter(status) : |
| *codepage == 0 ? |
| 0 : |
| ucnv_open(codepage, &status)); |
| |
| // if we failed, set the appropriate flags and return |
| if(U_FAILURE(status)) { |
| // close the converter |
| if(codepage == 0) |
| releaseDefaultConverter(converter); |
| else |
| ucnv_close(converter); |
| setToBogus(); |
| return; |
| } |
| |
| fHashCode = kInvalidHashCode; |
| |
| // perform the conversion |
| if(converter == 0) { |
| // use the "invariant characters" conversion |
| if(arraySize < dataLength) { |
| int32_t tempCapacity; |
| // allocate enough space for the dataLength, the refCount, and a NUL |
| UChar *temp = allocate(dataLength + 2, tempCapacity); |
| |
| if(temp == 0) { |
| // set flags and return |
| setToBogus(); |
| return; |
| } |
| |
| fArray = temp; |
| fCapacity = tempCapacity; |
| |
| setRefCount(1); |
| |
| u_charsToUChars(codepageData, fArray + 1, dataLength); |
| fArray[dataLength + 1] = 0; |
| } else { |
| u_charsToUChars(codepageData, getArrayStart(), dataLength); |
| } |
| fLength = dataLength; |
| return; |
| } |
| |
| myTarget = getArrayStart(); |
| for(;;) { |
| // reset the error code |
| status = U_ZERO_ERROR; |
| |
| // perform the conversion |
| ucnv_toUnicode(converter, &myTarget, myTarget + arraySize, |
| &mySource, mySourceEnd, NULL, TRUE, &status); |
| |
| // update the conversion parameters |
| fLength = myTarget - getArrayStart(); |
| |
| // allocate more space and copy data, if needed |
| if(status == U_INDEX_OUTOFBOUNDS_ERROR) { |
| int32_t tempCapacity; |
| UChar *temp = allocate(fCapacity, tempCapacity); |
| |
| if(! temp) { |
| // set flags and return |
| setToBogus(); |
| break; |
| } |
| |
| if(fRefCounted) { |
| // copy the old array into temp |
| us_arrayCopy(fArray, 1, temp, 1, fLength); |
| delete [] fArray; |
| } else { |
| // if we're not currently ref counted, shift the array right by one |
| us_arrayCopy(fArray, 0, temp, 1, fLength); |
| } |
| |
| fArray = temp; |
| fCapacity = tempCapacity; |
| |
| setRefCount(1); |
| |
| myTarget = getArrayStart() + fLength; |
| arraySize = getCapacity() - fLength; |
| } else { |
| break; |
| } |
| } |
| |
| // close the converter |
| if(codepage == 0) |
| releaseDefaultConverter(converter); |
| else |
| ucnv_close(converter); |
| } |
| |
| //======================================== |
| // External Buffer |
| //======================================== |
| UnicodeString::UnicodeString(UChar *buff, |
| int32_t bufLength, |
| int32_t buffCapacity) |
| : fArray(buff), |
| fLength(bufLength), |
| fCapacity(buffCapacity), |
| fRefCounted(FALSE), |
| fHashCode(kInvalidHashCode), |
| fBogus(FALSE) |
| {} |
| |
| const UChar* |
| UnicodeString::getUChars() const |
| { |
| // if we're bogus, do nothing |
| if(fBogus) |
| return 0; |
| |
| // no room for null, resize |
| if(getCapacity() <= fLength) { |
| // allocate at minimum the current capacity + needed space |
| int32_t tempLength; |
| UChar *temp = allocate(fCapacity + 1, tempLength); |
| if(! temp) { |
| ((UnicodeString*)this)->setToBogus(); |
| return 0; |
| } |
| |
| // if we're not currently ref counted, shift the array right by one |
| if(fRefCounted == FALSE) |
| us_arrayCopy(fArray, 0, temp, 1, fLength); |
| // otherwise, copy the old array into temp, including the ref count |
| else |
| us_arrayCopy(fArray, 0, temp, 0, fLength + 1); |
| |
| // delete the old array |
| if(fRefCounted && ((UnicodeString*)this)->removeRef() == 0) |
| delete [] ((UnicodeString*)this)->fArray; |
| |
| // use the new array |
| ((UnicodeString*)this)->fCapacity = tempLength; |
| ((UnicodeString*)this)->fArray = temp; |
| ((UnicodeString*)this)->setRefCount(1); |
| } |
| |
| if(getArrayStart()[fLength] != 0) { |
| // tack on a trailing null |
| ((UChar *)getArrayStart())[fLength] = 0; |
| } |
| |
| return getArrayStart(); |
| } |
| |
| UChar* |
| UnicodeString::orphanStorage() |
| { |
| // if we're bogus, do nothing |
| if(fBogus) |
| return 0; |
| |
| UChar *retVal; |
| |
| // if we're ref counted, get rid of the leading ref count |
| if(fRefCounted && removeRef() == 0) { |
| retVal = fArray; |
| } else { |
| // if we don't own the memory, then we have to allocate it |
| retVal = new UChar[fLength + 1]; |
| if(retVal == 0) { |
| return 0; |
| } |
| } |
| |
| // shift or copy characters |
| us_arrayCopy(getArrayStart(), 0, retVal, 0, fLength); |
| retVal[fLength] = 0; |
| |
| // set self to empty |
| fArray = fStackBuffer; |
| fLength = 0; |
| fCapacity = US_STACKBUF_SIZE; |
| fHashCode = kEmptyHashCode; |
| fRefCounted = FALSE; |
| |
| return retVal; |
| } |
| |
| //======================================== |
| // Miscellaneous |
| //======================================== |
| void |
| UnicodeString::pinIndices(UTextOffset& start, |
| int32_t& length) const |
| { |
| // pin indices |
| if(length < 0 || start < 0) |
| start = length = 0; |
| else { |
| if(length > (fLength - start)) |
| length = (fLength - start); |
| } |
| } |
| |
| // Makes sure this string has an array of its own before it is modified: |
| // an array that is neither the stack buffer nor exclusively ref-counted by |
| // this string is replaced by a private copy. On allocation failure the |
| // string is set to bogus. |
| void |
| UnicodeString::cloneArrayIfNeeded() |
| { |
|   // already private: the stack buffer, or a ref-counted array with no |
|   // other reference |
|   if(fArray == fStackBuffer || (fRefCounted && refCount() <= 1)) { |
|     return; |
|   } |
| |
|   UChar *newArray; |
|   bool_t newIsRefCounted; |
| |
|   if(fLength > US_STACKBUF_SIZE) { |
|     // too long for the stack buffer, a heap array is required |
|     if(!fRefCounted) { |
|       // the new array needs one more slot for the ref count |
|       ++fCapacity; |
|     } |
|     if(fCapacity - 1 <= fLength) { |
|       // leave space for a terminating NUL as well |
|       fCapacity = fLength + 2; |
|     } |
|     newArray = new UChar [ fCapacity ]; |
|     if(newArray == 0) { |
|       setToBogus(); |
|       return; |
|     } |
|     newIsRefCounted = TRUE; |
|   } else { |
|     // short contents go into the stack buffer, no allocation needed |
|     fCapacity = US_STACKBUF_SIZE; |
|     newArray = fStackBuffer; |
|     newIsRefCounted = FALSE; |
|   } |
| |
|   // copy the characters; in a ref-counted array they start at index 1 |
|   us_arrayCopy(getArrayStart(), 0, newArray, newIsRefCounted ? 1 : 0, fLength); |
| |
|   // drop our reference to the old array and free it if that was the last |
|   if(fRefCounted && removeRef() == 0) { |
|     delete [] fArray; |
|   } |
| |
|   // switch over to the copy; a fresh heap array starts with one reference |
|   fArray = newArray; |
|   fRefCounted = newIsRefCounted; |
|   if(newIsRefCounted) { |
|     setRefCount(1); |
|   } |
| } |
| |
| // private function for C API |
| // C-linkage helper: returns the result of s->getUChars(). |
| U_CFUNC const UChar* |
| T_UnicodeString_getUChars(const UnicodeString *s) |
| { |
|   const UChar *chars = s->getUChars(); |
|   return chars; |
| } |
| |
| // private function for C API |
| // C-linkage helper: extracts the whole string into dst through |
| // UnicodeString::extract(), passing "" as the last argument, and returns |
| // what extract() returns. |
| U_CFUNC int32_t |
| T_UnicodeString_extract(const UnicodeString *s, char *dst) |
| { |
|   const UnicodeString& str = *s; |
|   return str.extract(0, str.length(), dst, ""); |
| } |
| |
| |
| //======================================== |
| // Default converter caching |
| //======================================== |
| |
| // Hands out a converter: the cached one in fgDefaultConverter if there is |
| // one (the cache slot is then cleared), otherwise a newly opened one from |
| // ucnv_open(0, &status). Returns 0 if opening fails. |
| UConverter* |
| UnicodeString::getDefaultConverter(UErrorCode &status) |
| { |
|   UConverter *result = 0; |
| |
|   // look at the cache without the lock first |
|   if(fgDefaultConverter != 0) { |
|     Mutex lock; |
| |
|     // look again while holding the Mutex object: the cached converter may |
|     // have been taken in the meantime |
|     if(fgDefaultConverter != 0) { |
|       result = fgDefaultConverter; |
|       fgDefaultConverter = 0; |
|     } |
|   } |
| |
|   // got the cached converter |
|   if(result != 0) { |
|     return result; |
|   } |
| |
|   // the cache was empty, open a new converter |
|   result = ucnv_open(0, &status); |
|   return U_FAILURE(status) ? 0 : result; |
| } |
| |
| // Takes back a converter obtained from getDefaultConverter(): it is stored |
| // in fgDefaultConverter if that slot is empty, otherwise it is closed. |
| void |
| UnicodeString::releaseDefaultConverter(UConverter *converter) |
| { |
|   // whatever is left in here at the end gets closed |
|   UConverter *toClose = converter; |
| |
|   if(fgDefaultConverter == 0) { |
|     Mutex lock; |
| |
|     // check again while holding the Mutex object before filling the slot |
|     if(fgDefaultConverter == 0) { |
|       fgDefaultConverter = converter; |
|       toClose = 0; |
|     } |
|   } |
| |
|   // it's safe to close a NULL converter |
|   ucnv_close(toClose); |
| } |
| |
| //======================================== |
| // Streaming (to be removed) |
| //======================================== |
| |
| #include <iostream.h> |
| #include "unistrm.h" |
| #include "filestrm.h" |
| |
| |
| // Returns the upper eight bits of a 16-bit value. |
| inline uint8_t |
| uprv_hibyte(uint16_t x) |
| { |
|   const uint16_t shifted = (uint16_t)(x >> 8); |
|   return (uint8_t)shifted; |
| } |
| |
| // Returns the lower eight bits of a 16-bit value. |
| inline uint8_t |
| uprv_lobyte(uint16_t x) |
| { |
|   const uint16_t masked = (uint16_t)(x & 0x00ff); |
|   return (uint8_t)masked; |
| } |
| |
| // Returns the upper sixteen bits of a 32-bit value. |
| inline uint16_t |
| uprv_hiword(uint32_t x) |
| { |
|   const uint32_t shifted = x >> 16; |
|   return (uint16_t)shifted; |
| } |
| |
| // Returns the lower sixteen bits of a 32-bit value. |
| inline uint16_t |
| uprv_loword(uint32_t x) |
| { |
|   const uint32_t masked = x & 0x0000ffff; |
|   return (uint16_t)masked; |
| } |
| |
| // Writes a 32-bit value to the stream as four bytes, most significant |
| // byte first. |
| inline void |
| writeLong(FileStream *os, |
|           int32_t x) |
| { |
|   // upper half first, then lower half; within each half the high byte |
|   // goes out before the low byte |
|   const uint16_t halves[2] = { |
|     uprv_hiword((uint32_t)x), |
|     uprv_loword((uint32_t)x) |
|   }; |
| |
|   for(int32_t i = 0; i < 2; ++i) { |
|     T_FileStream_putc(os, uprv_hibyte(halves[i])); |
|     T_FileStream_putc(os, uprv_lobyte(halves[i])); |
|   } |
| } |
| |
| // Reads four bytes from the stream and assembles them, first byte most |
| // significant, into a 32-bit value (the inverse of writeLong()). |
| // This function does not report read failures itself; callers check the |
| // stream state afterwards. |
| inline int32_t |
| readLong(FileStream *is) |
| { |
|   // Accumulate unsigned so that shifting never moves a bit into the sign |
|   // position of a signed type. |
|   uint32_t bits = 0; |
| |
|   for(int32_t i = 0; i < 4; ++i) { |
|     // Keep only the low eight bits of what T_FileStream_getc() returns: |
|     // a value outside 0..255 (presumably an end-of-file/error result - |
|     // TODO confirm in filestrm.h) must not spill into the neighbouring |
|     // byte positions. |
|     const uint32_t byte = (uint32_t)T_FileStream_getc(is) & 0xff; |
|     bits = (bits << 8) | byte; |
|   } |
| |
|   return (int32_t)bits; |
| } |
| |
| // Writes one UChar to the stream as two bytes, high byte first. |
| inline void |
| writeUChar(FileStream *os, |
|            UChar c) |
| { |
|   const uint8_t high = uprv_hibyte(c); |
|   const uint8_t low = uprv_lobyte(c); |
| |
|   T_FileStream_putc(os, high); |
|   T_FileStream_putc(os, low); |
| } |
| |
| // Reads two bytes from the stream and combines them, first byte high, |
| // into one UChar (the inverse of writeUChar()). |
| inline UChar |
| readUChar(FileStream *is) |
| { |
|   // two separate statements so the reads happen in stream order |
|   const uint16_t first = T_FileStream_getc(is); |
|   const uint16_t second = T_FileStream_getc(is); |
| |
|   return (UChar)((first << 8) | second); |
| } |
| |
| // Writes a string to the stream: its length via writeLong(), followed by |
| // each character via writeUChar(). Writing stops as soon as the stream |
| // reports an error. |
| void |
| UnicodeStringStreamer::streamOut(const UnicodeString *s, |
|                                  FileStream *os) |
| { |
|   // length prefix |
|   if(!T_FileStream_error(os)) { |
|     writeLong(os, s->fLength); |
|   } |
| |
|   // characters, one at a time while the stream is healthy |
|   const UChar *chars = s->getArrayStart(); |
|   const int32_t count = s->fLength; |
| |
|   for(int32_t i = 0; i != count && ! T_FileStream_error(os); ++i) { |
|     writeUChar(os, chars[i]); |
|   } |
| } |
| |
| // Reads a string written by streamOut(): a length read with readLong() |
| // followed by that many characters read with readUChar(). The string is |
| // set to bogus if the stream is in error or at end of file before the |
| // data is complete, if the length is negative, or if memory runs out. |
| void |
| UnicodeStringStreamer::streamIn(UnicodeString *s, |
|                                 FileStream *is) |
| { |
|   // nothing can be read from a failed or exhausted stream |
|   if(T_FileStream_error(is) || T_FileStream_eof(is)) { |
|     s->setToBogus(); |
|     return; |
|   } |
| |
|   // length prefix; end of file right after it is tolerated only for an |
|   // empty string |
|   const int32_t newSize = readLong(is); |
|   if((newSize < 0) || T_FileStream_error(is) |
|      || ((newSize > 0) && T_FileStream_eof(is))) { |
|     s->setToBogus(); //error condition |
|     return; |
|   } |
| |
|   // make sure s has an array of its own before writing into it |
|   s->cloneArrayIfNeeded(); |
| |
|   // grow the array if it cannot hold newSize characters |
|   if(s->getCapacity() < newSize) { |
| |
|     int32_t grownCapacity; |
|     UChar *grown = s->allocate(newSize, grownCapacity); |
|     if(grown == 0) { |
|       s->setToBogus(); |
|       return; |
|     } |
| |
|     if(s->fRefCounted) { |
|       // carry the old array over including its leading ref count, then |
|       // give up our reference and free the old array if it was the last |
|       us_arrayCopy(s->fArray, 0, grown, 0, s->fLength + 1); |
|       if(s->removeRef() == 0) |
|         delete [] s->fArray; |
|     } else { |
|       // no ref count in the old array: shift the characters right by one |
|       us_arrayCopy(s->fArray, 0, grown, 1, s->fLength); |
|     } |
| |
|     // use the new array |
|     s->fCapacity = grownCapacity; |
|     s->fArray = grown; |
|     s->setRefCount(1); |
|   } |
| |
|   // read the characters, giving up if the stream fails or ends early |
|   UChar *dst = s->getArrayStart(); |
|   UChar *const limit = dst + newSize; |
| |
|   for(; dst < limit; ++dst) { |
|     if(T_FileStream_error(is) || T_FileStream_eof(is)) { |
|       // couldn't read all chars |
|       s->setToBogus(); |
|       return; |
|     } |
|     *dst = readUChar(is); |
|   } |
| |
|   s->fLength = newSize; |
| } |
| |
| // console IO |
| |
| // Prints a string to an ostream for console output. Characters from ' ' |
| // to '~' and '\n' are written as single chars; every other character is |
| // written as its hexadecimal value wrapped in "[0x" and "]". The stream is |
| // flushed and its number base setting is restored before returning. |
| ostream& |
| operator<<(ostream& stream, |
|            const UnicodeString& s) |
| { |
|   // remember the flags so the base can be put back after switching to hex |
|   const int32_t savedFlags = stream.flags(); |
|   const int32_t count = s.length(); |
|   UTextOffset pos; |
| |
|   stream << hex; |
| |
|   for(pos = 0; pos < count; ++pos) { |
|     const UChar ch = s.charAt(pos); |
|     const bool_t printable = (ch >= ' ' && ch <= '~') || ch == '\n'; |
| |
|     if(!printable) { |
|       stream << "[0x" << ch << "]"; |
|     } else { |
|       stream << (char)ch; |
|     } |
|   } |
| |
|   stream.flush(); |
|   stream.setf(savedFlags & ios::basefield, ios::basefield); |
|   return stream; |
| } |