source/common/unistr.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 *******************************************************************************
 * Copyright (C) 1999, International Business Machines Corporation and         *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * File unistr.cpp
 *
 * Modification History:
 *
 *   Date        Name        Description
 *   09/25/98    stephen     Creation.
 *   04/20/99    stephen     Overhauled per 4/16 code review.
 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
 *                           Replaceable.
 *******************************************************************************
 */

 #include "unicode/utypes.h"
 #include "unicode/putil.h"
 #include "unicode/locid.h"
 #include "cstring.h"
 #include "cmemory.h"
 #include "unicode/ustring.h"
 #include "mutex.h"
 #include "unicode/unistr.h"
 #include "uhash.h"

 #if U_IOSTREAM_SOURCE >= 199711
 #include <iostream>
 using namespace std;
 #elif U_IOSTREAM_SOURCE >= 198506
 #include <iostream.h>
 #endif

 #if 0
 //DEBUGGING
 // NOTE: everything up to the matching #endif is compiled out by "#if 0";
 // it is kept only as a debugging aid.

 // Writes "<name>:|...|" to cout for the string s. Code units below 0x0020
 // or at/above 0x007E are written as "[0x<value>]", all others as a char.
 void
 print(const UnicodeString& s,
       const char *name)
 {
   UChar c;
   cout << name << ":|";
   for(int i = 0; i < s.length(); ++i) {
     c = s[i];
     if(c>= 0x007E || c < 0x0020)
       cout << "[0x" << hex << s[i] << "]";
     else
       cout << (char) s[i];
   }
   cout << '|' << endl;
 }

 // Same output format as above, for the first len UChars of a raw array.
 void
 print(const UChar *s,
       int32_t len,
       const char *name)
 {
   UChar c;
   cout << name << ":|";
   for(int i = 0; i < len; ++i) {
     c = s[i];
     if(c>= 0x007E || c < 0x0020)
       cout << "[0x" << hex << s[i] << "]";
     else
       cout << (char) s[i];
   }
   cout << '|' << endl;
 }
 // END DEBUGGING
 #endif

 // Local function definitions for now

 // need to copy areas that may overlap
 inline void
 us_arrayCopy(const UChar *src, int32_t srcStart,
          UChar *dst, int32_t dstStart, int32_t count)
 {
   if(count>0) {
     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
   }
 }

 // Definition of the static default-converter pointer; it starts out null.
 UConverter* UnicodeString::fgDefaultConverter  = 0;

 //========================================
 // Unescaping
 //========================================
 // Placeholder: unescaping is not implemented here yet (see the TBD marker);
 // the input string is returned unchanged.
 U_COMMON_API UnicodeString
 u_unescape(const UnicodeString &s) {
   /* ### TBD */
   return s;
 }

 // Placeholder: not implemented yet (see the TBD marker). Neither argument is
 // read, offset is left untouched, and the result is always 0xffff.
 U_COMMON_API UChar32
 u_unescape(const UnicodeString &s, int32_t &offset) {
   /* ### TBD */
   return 0xffff;
 }

 //========================================
 // Constructors
 //========================================
 // Default constructor: an empty string that uses the embedded fStackBuffer
 // (capacity US_STACKBUF_SIZE, flags kShortString).
 UnicodeString::UnicodeString()
   : fArray(fStackBuffer),
     fLength(0),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(kShortString)
 {}

 // Constructs a string with room for at least capacity code units and, when
 // count > 0, fills it with count copies of the code point c.
 // With count <= 0 the buffer is only allocated and the string stays empty.
 // If allocate() fails, the string is left in the bogus state that allocate()
 // sets up and its length stays 0.
 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
   : fArray(0),
     fLength(0),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(0)
 {
   if(count <= 0) {
     // just allocate and do not do anything else
     allocate(capacity);
   } else {
     // count > 0, allocate and fill the new string with count c's
     int32_t unitCount = UTF_CHAR_LENGTH(c), length = count * unitCount;
     if(capacity < length) {
       capacity = length;
     }
     if(allocate(capacity)) {
       int32_t i = 0;

       if(unitCount == 1) {
         // single code unit: fill with length UChars
         while(i < length) {
           fArray[i++] = (UChar)c;
         }
       } else {
         // get the code units for c
         UChar units[UTF_MAX_CHAR_LENGTH];
         UTF_APPEND_CHAR_UNSAFE(units, i, c);

         // now it must be i==unitCount; write the unit sequence count times
         // (length is a multiple of unitCount, so the inner loop stays in bounds)
         i = 0;
         while(i < length) {
           for(int32_t unit = 0; unit < unitCount; ++unit) {
             fArray[i++] = units[unit];
           }
         }
       }

       // record the length only when the buffer really exists; setting it
       // after a failed allocation would leave a bogus string with a null
       // array and a non-zero length
       fLength = length;
     }
   }
 }

 // Constructs a one-code-unit string holding ch in the embedded fStackBuffer.
 UnicodeString::UnicodeString(UChar ch)
   : fArray(fStackBuffer),
     fLength(1),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(kShortString)
 {
   fStackBuffer[0] = ch;
 }

 // Constructs a string holding the single code point ch in the embedded
 // fStackBuffer. The initial fLength(1) is overwritten below with the offset
 // that UTF_APPEND_CHAR leaves in i.
 UnicodeString::UnicodeString(UChar32 ch)
   : fArray(fStackBuffer),
     fLength(1),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(kShortString)
 {
   UTextOffset i = 0;
   UTF_APPEND_CHAR(fStackBuffer, i, US_STACKBUF_SIZE, ch);
   fLength = i;
 }

 // Constructs a string by copying the NUL-terminated UChar string text.
 // A null text pointer yields an empty string; it must not reach u_strlen().
 UnicodeString::UnicodeString(const UChar *text)
   : fArray(fStackBuffer),
     fLength(0),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(kShortString)
 {
   if(text != 0) {
     doReplace(0, 0, text, 0, u_strlen(text));
   }
 }

 // Constructs a string by copying textLength UChars from text; the copy is
 // done by doReplace() on the initially empty string.
 UnicodeString::UnicodeString(const UChar *text,
                              int32_t textLength)
   : fArray(fStackBuffer),
     fLength(0),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(kShortString)
 {
   doReplace(0, 0, text, 0, textLength);
 }

 // Readonly-aliasing constructor: the string points at text instead of
 // copying it (flags kReadonlyAlias).
 // textLength may be -1 only when isTerminated is set; the length is then
 // taken from u_strlen(). A null text, a length below -1, or an unknown
 // length on unterminated text makes the string bogus.
 UnicodeString::UnicodeString(UBool isTerminated,
                              const UChar *text,
                              int32_t textLength)
   : fArray((UChar *)text),
     fLength(textLength),
     fCapacity(isTerminated ? textLength + 1 : textLength),
     fFlags(kReadonlyAlias)
 {
   if(text == 0 || textLength < -1 || (textLength == -1 && !isTerminated)) {
     setToBogus();
     return;
   }

   if(textLength == -1) {
     // the text must be terminated here, or the check above would have failed
     fLength = u_strlen(text);
     fCapacity = fLength + 1;
   }
 }

 // Writeable-aliasing constructor: the string uses the caller's buffer buff
 // directly (flags kWriteableAlias), with bufLength code units in use out of
 // buffCapacity. A null buffer, a negative length, or a length larger than
 // the capacity makes the string bogus.
 UnicodeString::UnicodeString(UChar *buff,
                              int32_t bufLength,
                              int32_t buffCapacity)
   : fArray(buff),
     fLength(bufLength),
     fCapacity(buffCapacity),
     fFlags(kWriteableAlias)
 {
   if(buff == 0 || bufLength < 0 || bufLength > buffCapacity) {
     setToBogus();
   }
 }

 // Constructs a string from NUL-terminated char data in the given codepage.
 // The data length is taken with uprv_strlen() and the conversion is done by
 // doCodepageCreate(). A null codepageData pointer leaves the string empty.
 UnicodeString::UnicodeString(const char *codepageData,
                              const char *codepage)
   : fArray(fStackBuffer),
     fLength(0),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(kShortString)
 {
   if(codepageData != 0) {
     doCodepageCreate(codepageData, uprv_strlen(codepageData), codepage);
   }
 }


 // Constructs a string from dataLength chars of data in the given codepage;
 // the conversion is done by doCodepageCreate(). A null codepageData pointer
 // leaves the string empty.
 UnicodeString::UnicodeString(const char *codepageData,
                              int32_t dataLength,
                              const char *codepage)
   : fArray(fStackBuffer),
     fLength(0),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(kShortString)
 {
   if(codepageData != 0) {
     doCodepageCreate(codepageData, dataLength, codepage);
   }
 }

 // Copy constructor: starts out as an empty short string so that the
 // assignment operator finds a consistent object, then delegates to it.
 UnicodeString::UnicodeString(const UnicodeString& that)
   : Replaceable(),
     fArray(fStackBuffer),
     fLength(0),
     fCapacity(US_STACKBUF_SIZE),
     fFlags(kShortString)
 {
   *this = that;
 }

 //========================================
 // array allocation
 //========================================

 // Points fArray at a buffer with room for at least capacity UChars and sets
 // fCapacity and fFlags to match. The previous fArray is not released and
 // nothing is copied.
 // Requests of up to US_STACKBUF_SIZE use the embedded fStackBuffer; larger
 // ones get a heap block whose first int32_t holds the reference count.
 // Returns FALSE, with the string marked bogus, if the heap allocation fails.
 UBool
 UnicodeString::allocate(int32_t capacity) {
   // small request: the embedded buffer is enough
   if(capacity <= US_STACKBUF_SIZE) {
     fArray = fStackBuffer;
     fCapacity = US_STACKBUF_SIZE;
     fFlags = kShortString;
     return TRUE;
   }

   // count bytes for the refCounter and the string capacity, and
   // round up to a multiple of 16; then divide by 4 and allocate int32_t's
   // to be safely aligned for the refCount
   int32_t words = ((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2;
   int32_t *block = new int32_t[words];
   if(block == 0) {
     fArray = 0;
     fCapacity = 0;
     fFlags = kIsBogus;
     return FALSE;
   }

   // the first word is the refCount, initially 1
   block[0] = 1;

   // the UChars start right behind the refCount
   fArray = (UChar *)(block + 1);
   fCapacity = (words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR);
   fFlags = kLongString;
   return TRUE;
 }

 //========================================
 // Destructor
 //========================================
 // Destructor: hands the current array to releaseArray().
 UnicodeString::~UnicodeString()
 {
   releaseArray();
 }

 //========================================
 // Assignment
 //========================================
 // Assignment. What happens depends on how src holds its text:
 //  - short string:    the UChars are copied into our own fStackBuffer
 //  - long string:     src's buffer is used too, after addRef() on src
 //  - readonly alias:  we alias the same text
 //  - writeable alias: we allocate our own buffer and copy the UChars
 // A bogus src, or a failed allocation, leaves this string bogus.
 UnicodeString&
 UnicodeString::operator= (const UnicodeString& src)
 {
   // if assigning to ourselves, do nothing
   if(this == 0 || this == &src) {
     return *this;
   }

   // is the right side bogus?
   if(&src == 0 || src.isBogus()) {
     setToBogus();
     return *this;
   }

   // delete the current contents
   releaseArray();

   // the length is always copied (only fLength is taken over here)
   fLength = src.fLength;

   switch(src.fFlags) {
   case kShortString:
     // short string using the stack buffer, do the same
     fArray = fStackBuffer;
     fCapacity = US_STACKBUF_SIZE;
     fFlags = kShortString;
     if(fLength > 0) {
       uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR);
     }
     break;
   case kLongString:
     // src uses a refCounted string buffer, use that buffer with refCount
     // src is const, use a cast - we don't really change it
     ((UnicodeString &)src).addRef();
     // fall through to readonly alias copying: copy all fields
   case kReadonlyAlias:
     // src is a readonly alias, do the same
     fArray = src.fArray;
     fCapacity = src.fCapacity;
     fFlags = src.fFlags;
     break;
   case kWriteableAlias:
     // src is a writeable alias; we make a copy of that instead
     if(allocate(fLength)) {
       if(fLength > 0) {
         uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR);
       }
       break;
     }
     // if there is not enough memory, then fall through to setting to bogus
   default:
     // unknown flags or failed allocation: set ourselves to bogus
     // do not call setToBogus() here because fArray and fFlags are not consistent here
     fArray = 0;
     fLength = 0;
     fCapacity = 0;
     fFlags = kIsBogus;
     break;
   }

   return *this;
 }

 //========================================
 // Miscellaneous operations
 //========================================
 // Adds up the display cells taken by the code points in the range
 // [start, start + length), after pinning the range to legal values.
 // Per code point: zero-width adds 0, half-width 1, full-width 2, and
 // neutral 2 when asian is set, otherwise 1.
 int32_t
 UnicodeString::numDisplayCells(UTextOffset start,
                                int32_t length,
                                UBool asian) const
 {
   pinIndices(start, length);

   const UTextOffset limit = start + length;
   const int32_t neutralCells = asian ? 2 : 1;
   int32_t cells = 0;
   UChar32 c;

   while(start < limit) {
     // reads one code point and moves start past it
     UTF_NEXT_CHAR(fArray, start, limit, c);
     switch(Unicode::getCellWidth(c)) {
     case Unicode::HALF_WIDTH:
       cells += 1;
       break;

     case Unicode::FULL_WIDTH:
       cells += 2;
       break;

     case Unicode::NEUTRAL:
       cells += neutralCells;
       break;

     case Unicode::ZERO_WIDTH:
     default:
       // contributes nothing
       break;
     }
   }

   return cells;
 }

 // Non-const subscript: returns a UCharReference built from this string and
 // pos rather than a UChar value.
 UCharReference
 UnicodeString::operator[] (UTextOffset pos)
 {
   return UCharReference(this, pos);
 }

 // Returns a copy of this string in which every backslash escape sequence is
 // replaced by the code point that unescapeAt() decodes for it.
 // Conversion stops at the first invalid escape sequence; the text converted
 // up to that point is returned.
 UnicodeString UnicodeString::unescape() const {
     UnicodeString result;
     for (int32_t i=0; i<length(); ) {
         // Walk the string one code unit at a time. Reading a whole code
         // point here while advancing i by only one unit would visit the
         // second unit of a two-unit code point again and duplicate text.
         // The backslash is a single code unit, so nothing is lost.
         UChar32 c = charAt(i++);
         if (c == 0x005C /*'\\'*/) {
             c = unescapeAt(i); // advances i
             if (c == (UChar32)0xFFFFFFFF) {
                 break; // invalid escape sequence
             }
         }
         result.append(c);
     }
     return result;
 }

 // u_unescapeAt() callback to get a UChar from a UnicodeString.
 // context must point to the UnicodeString to read from; the result is that
 // string's charAt(offset).
 U_CFUNC UChar _charAt(int32_t offset, void *context) {
     return ((UnicodeString*) context)->charAt(offset);
 }

 // Decodes one escape sequence by calling u_unescapeAt() with _charAt as the
 // character source, this string as its context and length() as the limit.
 // offset is passed by address, so u_unescapeAt() can update it.
 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
     return u_unescapeAt(_charAt, &offset, length(), (void*)this);
 }

 //========================================
 // Read-only implementation
 //========================================
 // Compares the range [start, start + length) of this string, after pinning,
 // with srcLength UChars of srcChars starting at srcStart, by code unit value.
 // Returns a negative value, 0, or a positive value if this range sorts
 // before, equal to, or after the source range. When one range is a prefix
 // of the other, the shorter one sorts first.
 // A bogus string equals a null srcChars and sorts before everything else.
 int8_t
 UnicodeString::doCompare( UTextOffset start,
               int32_t length,
               const UChar *srcChars,
               UTextOffset srcStart,
               int32_t srcLength) const
 {
   // compare illegal string values
   if(isBogus()) {
     if(srcChars==0) {
       return 0;
     } else {
       return -1;
     }
   } else if(srcChars==0) {
     return 1;
   }

   // pin indices to legal values
   pinIndices(start, length);

   // get the correct pointers
   const UChar *chars = getArrayStart() + start;
   srcChars += srcStart;

   UTextOffset minLength;
   int8_t lengthResult;

   // are we comparing different lengths?
   if(length != srcLength) {
     if(length < srcLength) {
       minLength = length;
       lengthResult = -1;
     } else {
       minLength = srcLength;
       lengthResult = 1;
     }
   } else {
     minLength = length;
     lengthResult = 0;
   }

   // If both ranges start at the same address, their common prefix is
   // trivially equal and only the lengths decide. The shortcut must not
   // return 0 outright, because the two lengths may still differ.
   if(minLength > 0 && chars != srcChars) {
     int32_t result;

     // uprv_memcmp() returns an int but we return an int8_t: reduce the
     // result to -1/+1 explicitly so that a narrowing cast cannot flip or
     // lose the sign
     if(U_IS_BIG_ENDIAN) {
       // big-endian: byte comparison works
       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
       if(result != 0) {
         return (int8_t)(result < 0 ? -1 : 1);
       }
     } else {
       // little-endian: compare UChar units
       do {
         result = ((int32_t)*chars - (int32_t)*srcChars);
         if(result != 0) {
           return (int8_t)(result < 0 ? -1 : 1);
         }
         ++chars;
         ++srcChars;
       } while(--minLength > 0);
     }
   }
   return lengthResult;
 }

 // Copies the range [start, start + length) of this string, after pinning,
 // to dst starting at dstStart. Nothing is copied when the destination is
 // exactly the source location.
 // dst must have room for the pinned length from dstStart on.
 void
 UnicodeString::doExtract(UTextOffset start,
              int32_t length,
              UChar *dst,
              UTextOffset dstStart) const
 {
   // pin indices to legal values first, so that the alias test below never
   // forms a pointer from an out-of-range start
   pinIndices(start, length);

   // do not copy anything if we alias dst itself
   const UChar *array = getArrayStart();
   if(array + start != dst + dstStart) {
     us_arrayCopy(array, start, dst, dstStart, length);
   }
 }

 // Finds the first occurrence of srcChars[srcStart, srcStart + srcLength)
 // that lies completely inside the range [start, start + length) of this
 // string, after pinning. Returns the index of the match, or -1 if there is
 // none, the string is bogus, or the pattern is null, empty or has a
 // negative start.
 UTextOffset
 UnicodeString::indexOf(const UChar *srcChars,
                UTextOffset srcStart,
                int32_t srcLength,
                UTextOffset start,
                int32_t length) const
 {
   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength <= 0) {
     return -1;
   }

   // the first pattern char is matched directly; what remains to compare is
   // one char shorter and starts one position later
   const UChar first = srcChars[srcStart];
   const UTextOffset restStart = srcStart + 1;
   const int32_t restLength = srcLength - 1;

   // get the indices within bounds
   pinIndices(start, length);

   // number of positions at which a complete match can still begin
   const int32_t candidates = length - restLength;
   if(candidates <= 0) {
     return -1;
   }

   const UChar *array = getArrayStart();
   const UTextOffset limit = start + candidates;

   for(UTextOffset pos = start; pos < limit; ++pos) {
     if(array[pos] != first) {
       continue;
     }
     if(restLength == 0 || compare(pos + 1, restLength, srcChars, restStart, restLength) == 0) {
       return pos;
     }
   }

   return -1;
 }

 // Finds the first occurrence of the code unit c in the range
 // [start, start + length) of this string, after pinning.
 // Returns its index, or -1 if it does not occur or the range is empty.
 UTextOffset
 UnicodeString::doIndexOf(UChar c,
              UTextOffset start,
              int32_t length) const
 {
   pinIndices(start, length);
   if(length == 0) {
     return -1;
   }

   const UChar *base = getArrayStart();
   const UChar *cur = base + start;
   const UChar *end = cur + length;

   // scan forward; the first unit is always examined
   for(;;) {
     if(*cur == c) {
       return cur - base;
     }
     if(++cur >= end) {
       return -1;
     }
   }
 }

 // Finds the last occurrence of srcChars[srcStart, srcStart + srcLength)
 // that lies completely inside the range [start, start + length) of this
 // string, after pinning. Returns the index of the match, or -1 if there is
 // none, the string is bogus, or the pattern is null, empty or has a
 // negative start.
 UTextOffset
 UnicodeString::lastIndexOf(const UChar *srcChars,
                UTextOffset srcStart,
                int32_t srcLength,
                UTextOffset start,
                int32_t length) const
 {
   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength <= 0) {
     return -1;
   }

   // the first pattern char is matched directly; what remains to compare is
   // one char shorter and starts one position later
   const UChar first = srcChars[srcStart];
   const UTextOffset restStart = srcStart + 1;
   const int32_t restLength = srcLength - 1;

   // get the indices within bounds
   pinIndices(start, length);

   // number of positions at which a complete match can still begin
   const int32_t candidates = length - restLength;
   if(candidates <= 0) {
     return -1;
   }

   const UChar *array = getArrayStart();

   // walk backwards from the last possible match position down to start
   for(UTextOffset pos = start + candidates - 1; pos >= start; --pos) {
     if(array[pos] != first) {
       continue;
     }
     if(restLength == 0 || compare(pos + 1, restLength, srcChars, restStart, restLength) == 0) {
       return pos;
     }
   }

   return -1;
 }

 // Finds the last occurrence of the code unit c in the range
 // [start, start + length) of this string, after pinning.
 // Returns its index, or -1 if it does not occur, the range is empty, or
 // the string is bogus.
 UTextOffset
 UnicodeString::doLastIndexOf(UChar c,
                  UTextOffset start,
                  int32_t length) const
 {
   if(isBogus()) {
     return -1;
   }

   pinIndices(start, length);
   if(length == 0) {
     return -1;
   }

   const UChar *base = getArrayStart();
   const UChar *first = base + start;
   const UChar *cur = first + length;

   // scan backwards; the last unit of the range is always examined
   for(;;) {
     --cur;
     if(*cur == c) {
       return cur - base;
     }
     if(cur <= first) {
       return -1;
     }
   }
 }

 // Replaces every occurrence of oldText[oldStart, oldStart + oldLength)
 // inside the range [start, start + length) of this string with
 // newText[newStart, newStart + newLength). All three ranges are pinned
 // first. Does nothing if any of the strings is bogus or the old range is
 // empty. Returns *this.
 UnicodeString&
 UnicodeString::findAndReplace(UTextOffset start,
                   int32_t length,
                   const UnicodeString& oldText,
                   UTextOffset oldStart,
                   int32_t oldLength,
                   const UnicodeString& newText,
                   UTextOffset newStart,
                   int32_t newLength)
 {
   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
     return *this;
   }

   pinIndices(start, length);
   oldText.pinIndices(oldStart, oldLength);
   newText.pinIndices(newStart, newLength);

   if(oldLength == 0) {
     return *this;
   }

   // keep going while the remaining range can still hold one occurrence
   while(length > 0 && length >= oldLength) {
     const UTextOffset hit = indexOf(oldText, oldStart, oldLength, start, length);
     if(hit < 0) {
       // no more oldText's here: done
       break;
     }

     // swap in newText, then continue right behind it: the remaining range
     // shrinks by what was skipped plus the replaced text
     replace(hit, oldLength, newText, newStart, newLength);
     length -= hit + oldLength - start;
     start = hit + newLength;
   }

   return *this;
 }


 //========================================
 // Write implementation
 //========================================

 // Puts this string into the bogus state: the current array is handed to
 // releaseArray(), then the array pointer, capacity and length are zeroed
 // and the flags are set to kIsBogus.
 void
 UnicodeString::setToBogus()
 {
   releaseArray();

   fArray = 0;
   fCapacity = fLength = 0;
   fFlags = kIsBogus;
 }

 // setTo() analogous to the readonly-aliasing constructor with the same
 // signature: releases the current contents and aliases text read-only.
 // textLength may be -1 only when isTerminated is set; the length is then
 // taken from u_strlen(). A null text, a length below -1, or an unknown
 // length on unterminated text makes the string bogus.
 UnicodeString &
 UnicodeString::setTo(UBool isTerminated,
                      const UChar *text,
                      int32_t textLength)
 {
   if(text == 0 || textLength < -1 || (textLength == -1 && !isTerminated)) {
     setToBogus();
     return *this;
   }

   releaseArray();

   fArray = (UChar *)text;
   // a length of -1 can only get here for terminated text (see check above)
   fLength = (textLength == -1) ? u_strlen(text) : textLength;
   // terminated text has one more readable unit, the terminator
   fCapacity = isTerminated ? fLength + 1 : fLength;
   fFlags = kReadonlyAlias;
   return *this;
 }

 // setTo() analogous to the writeable-aliasing constructor with the same signature:
 // releases the current contents and uses the caller's buffer directly,
 // with buffLength code units in use out of buffCapacity.
 // A null buffer, a negative length, or a length larger than the capacity
 // makes the string bogus.
 UnicodeString &
 UnicodeString::setTo(UChar *buffer,
                      int32_t buffLength,
                      int32_t buffCapacity) {
   if(buffer == 0 || buffLength < 0 || buffLength > buffCapacity) {
     setToBogus();
     return *this;
   }

   releaseArray();

   fArray = buffer;
   fLength = buffLength;
   fCapacity = buffCapacity;
   fFlags = kWriteableAlias;
   return *this;
 }

 // Overwrites the code unit at offset with c. An offset outside the string
 // is pinned to the first or last code unit.
 // Does nothing if the string is empty or if cloneArrayIfNeeded() fails.
 // Returns *this.
 UnicodeString&
 UnicodeString::setCharAt(UTextOffset offset,
              UChar c)
 {
   // an empty string has no code unit to overwrite; without the length test
   // the pinning below would turn the offset into -1
   if(cloneArrayIfNeeded() && fLength > 0) {
     if(offset < 0) {
       offset = 0;
     } else if(offset >= fLength) {
       offset = fLength - 1;
     }

     fArray[offset] = c;
   }
   return *this;
 }

 // Uppercases this string using Locale::getDefault().
 UnicodeString&
 UnicodeString::toUpper()
 { return toUpper(Locale::getDefault()); }

 // Lowercases this string using Locale::getDefault().
 UnicodeString&
 UnicodeString::toLower()
 { return toLower(Locale::getDefault()); }

 /*
  * The following toUpper() and toLower() implementations are designed
  * for UTF-16 and UTF-32, not for UTF-8.
  * In UTF-16 and UTF-32, the number of code units per code point is fixed,
  * and a case mapping is assumed to always stay within the same plane
  * (64k code range) as the original code point. This allows the mapping
  * to be written into the same space as the source character without
  * expansions or contractions, except in the special cases.
  *
  * For UTF-8, where a source code point may take up a variable number
  * of code units, it is more efficient to get the mapping and write
  * the result only if it is a different code point from the original.
  * Also, a sharp s and the "SS" string typically both take up 2 bytes in UTF-8,
  * while the Turkish i's typically result in expansions and contractions.
  * Therefore, for UTF-8, these functions should be reimplemented.
  * One single implementation for all UTFs would be either clumsy
  * or inefficient.
  */
 #if UTF_SIZE==8
 # error reimplement toUpper() and toLower() for UTF-8, see comment above
 #endif

 // Uppercases this string in place and returns *this.
 // The language of locale selects special handling: "de" expands the sharp s
 // (U+00DF) to "SS", "tr" maps the dotted and dotless i's, and every other
 // language maps each code point with Unicode::toUpperCase().
 // Does nothing if cloneArrayIfNeeded() fails.
 // In each loop, next is the read position and start the write position.
 UnicodeString&
 UnicodeString::toUpper(const Locale& locale)
 {
   if(!cloneArrayIfNeeded()) {
     return *this;
   }

   const char *langChars = locale.getLanguage();

   UTextOffset start = 0, next = 0;
   UTextOffset limit = fLength;
   UChar32 c;

   // The German sharp S character (U+00DF)'s uppercase equivalent is
   // "SS", making it the only character that expands to two characters
   // when its case is changed (we don't automatically convert "SS" to
   // U+00DF going to lowercase because it can only be determined from
   // knowing the language whether a particular "SS" should map to
   // U+00DF or "ss").  The sharp S is expanded right where it is found,
   // with doReplace() making room for the extra capital S.  [For
   // performance, we only do this extra work if the language is actually
   // German]
   if(uprv_strcmp(langChars, "de") == 0) {
     static UChar SS [] = { 0x0053, 0x0053 };
     while(start < limit) {
       // start == next here by design
       UTF_NEXT_CHAR(fArray, next, limit, c);

       // A sharp s needs to be replaced with two capital S's.
       if(c == 0x00DF) {
         doReplace(start, 1, SS, 0, 2);
         start += 2;
         ++next; // the string expanded by one
         ++limit;
       } else {
         // Otherwise, the case conversion can be handled by the Unicode code point.
         c = Unicode::toUpperCase(c);
         UTF_APPEND_CHAR(fArray, start, limit, c);
       }
     }
   } else if(uprv_strcmp(langChars, "tr") == 0) {
     // If the specified language is Turkish, then we have to special-case
     // for the Turkish dotted and dotless Is.  The regular lowercase i
     // maps to the capital I with a dot (U+0130), and the lowercase i
     // without the dot (U+0131) maps to the regular capital I
     while(start < limit) {
       // start == next here by design
       UTF_NEXT_CHAR(fArray, next, limit, c);
       if(c == 0x0069/*'i'*/) {
         fArray[start++] = 0x0130;
       } else if(c == 0x0131) {
         fArray[start++] = 0x0049/*'I'*/;
       } else {
         c = Unicode::toUpperCase(c);
         UTF_APPEND_CHAR(fArray, start, limit, c);
       }
     }
   } else {
     // any other language: plain per-code-point mapping
     while(start < limit) {
       // start == next here by design
       UTF_NEXT_CHAR(fArray, next, limit, c);
       c = Unicode::toUpperCase(c);
       UTF_APPEND_CHAR(fArray, start, limit, c);
     }
   }

   return *this;
 }

 // Lowercases this string in place and returns *this.
 // The language of locale selects special handling: "tr" maps the dotted and
 // dotless I's, "el" picks the medial or final form for capital sigma, and
 // every other language maps each code point with Unicode::toLowerCase().
 // Does nothing if cloneArrayIfNeeded() fails.
 // In each loop, next is the read position and start the write position.
 UnicodeString&
 UnicodeString::toLower(const Locale& locale)
 {
   if(!cloneArrayIfNeeded()) {
     return *this;
   }

   const char *langChars = locale.getLanguage();

   UTextOffset start = 0, next = 0;
   UTextOffset limit = fLength;
   UChar32 c;

   // if the specified language is Turkish, then we have to special-case
   // for the Turkish dotted and dotless Is.  The capital I with a dot
   // (U+0130) maps to the regular lowercase i, and the regular capital
   // I maps to the lowercase i without the dot (U+0131)
   if(uprv_strcmp(langChars, "tr") == 0) {
     while(start < limit) {
       // start == next here by design
       UTF_NEXT_CHAR(fArray, next, limit, c);
       if(c == 0x0049) { // 'I'
         fArray[start++] = 0x0131;
       } else if(c == 0x0130) {
         fArray[start++] = 0x0069; // 'i'
       } else {
         c = Unicode::toLowerCase(c);
         UTF_APPEND_CHAR(fArray, start, limit, c);
       }
     }
   } else if(uprv_strcmp(langChars, "el") == 0) {
     // if the specified language is Greek, then we have to special-case
     // for the capital letter sigma (U+3A3), which has two lower-case
     // forms.  If the character following the capital sigma is a letter,
     // we use the medial form (U+3C3); otherwise, we use the final form
     // (U+3C2).
     while(start < limit) {
       // start == next here by design
       UTF_NEXT_CHAR(fArray, next, limit, c);
       if(c == 0x3a3) {
         if(next < limit) {
           // peek at the following code point without moving next
           UTextOffset next2 = next;
           UChar32 c2;
           UTF_NEXT_CHAR(fArray, next2, limit, c2);
           if(Unicode::isLetter(c2)) {
             fArray[start++] = 0x3C3;
           } else {
             fArray[start++] = 0x3C2;
           }
         } else {
           // sigma is the last character of the string: final form
           fArray[start++] = 0x3C2;
         }
       } else {
         c = Unicode::toLowerCase(c);
         UTF_APPEND_CHAR(fArray, start, limit, c);
       }
     }
   } else {
     // if the specified language is anything other than Turkish or
     // Greek, we rely on the Unicode class to do all our case mapping--
     // there are no other special cases
     while(start < limit) {
       // start == next here by design
       UTF_NEXT_CHAR(fArray, next, limit, c);
       c = Unicode::toLowerCase(c);
       UTF_APPEND_CHAR(fArray, start, limit, c);
     }
   }

   return *this;
 }

 // Replaces the range [start, start + length) of this string with the range
 // [srcStart, srcStart + srcLength) of src, after pinning the source range.
 // A bogus src only removes the range. The work is done by the UChar array
 // overload of doReplace().
 UnicodeString&
 UnicodeString::doReplace( UTextOffset start,
               int32_t length,
               const UnicodeString& src,
               UTextOffset srcStart,
               int32_t srcLength)
 {
   if(src.isBogus()) {
     // nothing to insert: remove the range
     return doReplace(start, length, 0, 0, 0);
   }

   // pin the indices to legal values
   src.pinIndices(srcStart, srcLength);

   // get the characters from src
   // and replace the range in ourselves with them
   return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
 }

 // Replaces the range [start, start + length) of this string, after pinning,
 // with srcLength UChars from srcChars starting at srcStart.
 // A bogus string is reset to an empty short string first, and a null
 // srcChars means that the range is only removed.
 // Returns *this; the string is left as cloneArrayIfNeeded() leaves it if
 // that call fails.
 UnicodeString&
 UnicodeString::doReplace(UTextOffset start,
              int32_t length,
              const UChar *srcChars,
              UTextOffset srcStart,
              int32_t srcLength)
 {
   // if we're bogus, set us to empty first
   if(isBogus()) {
     fArray = fStackBuffer;
     fLength = 0;
     fCapacity = US_STACKBUF_SIZE;
     fFlags = kShortString;
   }

   if(srcChars == 0) {
     srcStart = srcLength = 0;
   }

   // receives a buffer to delete from cloneArrayIfNeeded(); it stays null
   // unless that call sets it
   int32_t *bufferToDelete = 0;

   // the following may change fArray but will not copy the current contents;
   // therefore we need to keep the current fArray
   UChar *oldArray = fArray;
   int32_t oldLength = fLength;

   // pin the indices to legal values
   pinIndices(start, length);

   // calculate the size of the string after the replace
   int32_t newSize = oldLength - length + srcLength;

   // clone our array and allocate a bigger array if needed
   // (the third argument adds a quarter of the new size plus kGrowSize
   // to the requested size)
   if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize,
                          FALSE, &bufferToDelete)
   ) {
     return *this;
   }

   // now do the replace

   if(fArray != oldArray) {
     // if fArray changed, then we need to copy everything except what will change
     us_arrayCopy(oldArray, 0, fArray, 0, start);
     us_arrayCopy(oldArray, start + length,
                  fArray, start + srcLength,
                  oldLength - (start + length));
   } else if(length != srcLength) {
     // fArray did not change; copy only the portion that isn't changing, leaving a hole
     us_arrayCopy(oldArray, start + length,
                  fArray, start + srcLength,
                  oldLength - (start + length));
   }

   // now fill in the hole with the new string
   us_arrayCopy(srcChars, srcStart, getArrayStart(), start, srcLength);

   fLength = newSize;

   // delayed delete in case srcChars == fArray when we started, and
   // to keep oldArray alive for the above operations
   delete [] bufferToDelete;

   return *this;
 }

 /**
  * Replaceable API: replaces the text between start and limit with text
  * by forwarding to replaceBetween().
  */
 void
 UnicodeString::handleReplaceBetween(UTextOffset start,
                                     UTextOffset limit,
                                     const UnicodeString& text) {
     replaceBetween(start, limit, text);
 }

 /**
  * Replaceable API: copies the text between start and limit of this string
  * into a temporary buffer and inserts that copy at index dest.
  * Does nothing if limit <= start or if the temporary buffer cannot be
  * allocated.
  */
 void
 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
     if(limit <= start) {
         // nothing to copy; this also keeps a zero or negative size away
         // from the array allocation below
         return;
     }

     const int32_t count = limit - start;
     UChar* text = new UChar[count];
     // same null test as allocate() uses for its own array allocation
     if(text != 0) {
         extractBetween(start, limit, text, 0);
         insert(dest, text, 0, count);
         delete[] text;
     }
 }

 // Reverses the order of the code units in the range [start, start + length)
 // of this string, after pinning. Does nothing if the string is bogus or if
 // cloneArrayIfNeeded() fails. Returns *this.
 UnicodeString&
 UnicodeString::doReverse(UTextOffset start,
              int32_t length)
 {
   // if we're bogus, do nothing
   if(isBogus() || !cloneArrayIfNeeded()) {
     return *this;
   }

   // pin the indices to legal values
   pinIndices(start, length);

   // swap the outermost pair and move both ends inwards until they meet
   UChar *lo = getArrayStart() + start;
   UChar *hi = getArrayStart() + start + length;

   for(--hi; lo < hi; ++lo, --hi) {
     const UChar held = *lo;
     *lo = *hi;
     *hi = held;
   }

   return *this;
 }

 // Pads this string at the front with padChar until it is targetLength code
 // units long. Returns TRUE if padding was added. Returns FALSE if the
 // string is bogus, is already at least targetLength long, or if
 // cloneArrayIfNeeded(targetLength) fails.
 UBool
 UnicodeString::padLeading(int32_t targetLength,
                           UChar padChar)
 {
   if(isBogus() || fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
     return FALSE;
   }

   // move contents up by padding width
   const int32_t padWidth = targetLength - fLength;
   us_arrayCopy(fArray, 0, fArray, padWidth, fLength);

   // fill the freed front part with the padding character
   for(int32_t i = 0; i < padWidth; ++i) {
     fArray[i] = padChar;
   }

   fLength = targetLength;
   return TRUE;
 }

 // Pads this string at the end with padChar until it is targetLength code
 // units long. Returns TRUE if padding was added. Returns FALSE if the
 // string is bogus, is already at least targetLength long, or if
 // cloneArrayIfNeeded(targetLength) fails.
 UBool
 UnicodeString::padTrailing(int32_t targetLength,
                            UChar padChar)
 {
   if(isBogus() || fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
     return FALSE;
   }

   // fill everything behind the current contents with the padding character
   for(int32_t i = fLength; i < targetLength; ++i) {
     fArray[i] = padChar;
   }

   fLength = targetLength;
   return TRUE;
 }

 // Removes white space from both ends of this string and returns *this.
 // A code point counts as white space if it is U+0020 or if
 // Unicode::isWhitespace() accepts it. Does nothing if the string is bogus.
 UnicodeString&
 UnicodeString::trim()
 {
   if(isBogus()) {
     return *this;
   }

   UChar32 c;
   UTextOffset i = fLength, length;

   // first cut off trailing white space
   // (length follows i and ends up just behind the last code point to keep)
   for(;;) {
     length = i;
     if(i <= 0) {
       break;
     }
     UTF_PREV_CHAR(fArray, 0, i, c);
     if(!(c == 0x20 || Unicode::isWhitespace(c))) {
       break;
     }
   }
   if(length < fLength) {
     fLength = length;
   }

   // find leading white space
   // (start follows i and ends up at the first code point to keep)
   UTextOffset start;
   i = 0;
   for(;;) {
     start = i;
     if(i >= length) {
       break;
     }
     UTF_NEXT_CHAR(fArray, i, length, c);
     if(!(c == 0x20 || Unicode::isWhitespace(c))) {
       break;
     }
   }

   // move string forward over leading white space
   if(start > 0) {
     doReplace(0, start, 0, 0, 0);
   }

   return *this;
 }

 //========================================
 // Hashing
 //========================================
 // Computes the hash code of the current contents.
 // The computation is delegated to uhash, which keeps UnicodeString hashing
 // consistent with UChar* hashing. A result equal to kInvalidHashCode is
 // replaced by kEmptyHashCode, so kInvalidHashCode is never returned.
 int32_t
 UnicodeString::doHashCode() const
 {
     const int32_t computed = uhash_hashUCharsN(getArrayStart(), fLength);
     return (computed == kInvalidHashCode) ? (int32_t)kEmptyHashCode : computed;
 }

 //========================================
 // Codeset conversion
 //========================================
 // Convert the range [start, start + length) of this string to bytes
 // in dst and return the number of bytes written.
 //   codepage == 0  : use the cached default converter
 //   codepage == "" : use the "invariant characters" conversion
 //   otherwise      : open a converter for the named codepage
 // Returns 0 if the string is bogus, length is not positive, or the
 // converter could not be obtained. dst is assumed to be large enough;
 // its size is not known here.
 int32_t
 UnicodeString::extract(UTextOffset start,
                        int32_t length,
                        char *dst,
                        const char *codepage) const
 {
   // nothing to convert
   if(isBogus()) {
     return 0;
   }
   if(length <= 0) {
     return 0;
   }

   // bring start/length into the legal range for this string
   pinIndices(start, length);

   const UChar *src      = getArrayStart() + start;
   const UChar *srcLimit = src + length;
   char *out             = dst;
   UErrorCode status     = U_ZERO_ERROR;

   // pick the converter: cached default, none (invariant), or a new one
   UConverter *converter = (codepage == 0 ?
                              getDefaultConverter(status) :
                              *codepage == 0 ?
                                0 :
                                ucnv_open(codepage, &status));

   if(U_FAILURE(status)) {
     // give back whatever we may have got and report nothing converted
     if(codepage == 0) {
       releaseDefaultConverter(converter);
     } else {
       ucnv_close(converter);
     }
     return 0;
   }

   if(converter == 0) {
     // "invariant characters" conversion: one char per UChar
     if(length > fLength - start) {
       length = fLength - start;
     }
     u_UCharsToChars(src, out, length);
     return length;
   }

   // the destination size is unknown, so pretend it is very large;
   // there is no retry loop since we assume the buffer is large enough
   char *outLimit = out + 0x0FFFFFFF;

   /* Pin the limit to U_MAX_PTR.  NULL check is for AS/400. */
   if((outLimit < out) || (outLimit == NULL)) {
     outLimit = (char*)U_MAX_PTR;
   }

   ucnv_fromUnicode(converter, &out, outLimit,
            &src, srcLimit, 0, TRUE, &status);

   // hand the converter back
   if(codepage == 0) {
     releaseDefaultConverter(converter);
   } else {
     ucnv_close(converter);
   }

   return (out - dst);
 }

 void
 UnicodeString::doCodepageCreate(const char *codepageData,
                 int32_t dataLength,
                 const char *codepage)
 {
   // if there's nothing to convert, do nothing
   if(codepageData == 0 || dataLength <= 0) {
     return;
   }

   UErrorCode status = U_ZERO_ERROR;

   // create the converter
   // if the codepage is the default, use our cache
   // if it is an empty string, then use the "invariant character" conversion
   UConverter *converter = (codepage == 0 ?
                              getDefaultConverter(status) :
                              *codepage == 0 ?
                                0 :
                                ucnv_open(codepage, &status));

   // if we failed, set the appropriate flags and return
   if(U_FAILURE(status)) {
     // close the converter
     if(codepage == 0) {
       releaseDefaultConverter(converter);
     } else {
       ucnv_close(converter);
     }
     setToBogus();
     return;
   }

   // perform the conversion
   if(converter == 0) {
     // use the "invariant characters" conversion
     if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
       u_charsToUChars(codepageData, getArrayStart(), dataLength);
       fLength = dataLength;
     } else {
       setToBogus();
     }
     return;
   }

   // set up the conversion parameters
   const char *mySource     = codepageData;
   const char *mySourceEnd  = mySource + dataLength;
   UChar *myTarget;

   // estimate the size needed:
   // 1.25 UChar's per source byte should cover most cases
   int32_t arraySize = dataLength + (dataLength >> 2);

   // we do not care about the current contents
   UBool doCopyArray = FALSE;
   for(;;) {
     if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
       setToBogus();
       break;
     }

     // perform the conversion
     myTarget = fArray + fLength;
     ucnv_toUnicode(converter, &myTarget,  fArray + fCapacity,
            &mySource, mySourceEnd, 0, FALSE, &status);

     // update the conversion parameters
     fLength = myTarget - fArray;

     // allocate more space and copy data, if needed
     if(status == U_INDEX_OUTOFBOUNDS_ERROR) {
       // reset the error code
       status = U_ZERO_ERROR;

       // keep the previous conversion results
       doCopyArray = TRUE;

       // estimate the new size needed, larger than before
       // try 2 UChar's per remaining source byte
       arraySize = fLength + 2 * (mySourceEnd - mySource);
     } else {
       break;
     }
   }

   // close the converter
   if(codepage == 0) {
     releaseDefaultConverter(converter);
   } else {
     ucnv_close(converter);
   }
 }

 //========================================
 // External Buffer
 //========================================
 // ### TODO:
 // this is very, very dirty: we should not ever expose our array to the outside,
 // and this also violates the const-ness of this object
 // this must be removed when the resource bundle implementation does not need it any more!
 // Return a pointer to the internal array, trying to make sure that the
 // UChar right behind the contents (at index fLength) is 0.
 // Returns 0 for a bogus string. Although the method is const, it may
 // reallocate the buffer through cloneArrayIfNeeded().
 const UChar*
 UnicodeString::getUChars() const {
   if(isBogus()) {
     return 0;
   }

   // is there already room for, and a value of, 0 behind the contents?
   if(!(fLength < fCapacity && fArray[fLength] == 0)) {
     // cast away const-ness: we need a writable buffer with one spare UChar
     UnicodeString &self = (UnicodeString &)*this;
     if(self.cloneArrayIfNeeded(fLength + 1)) {
       fArray[fLength] = 0;
     }
   }
   return fArray;
 }

 //========================================
 // Miscellaneous
 //========================================
 // Make sure that this string has its own writable buffer with room for
 // at least newCapacity UChars, allocating a new array if necessary.
 //
 // newCapacity     minimum capacity required; -1 means "the current capacity"
 // growCapacity    capacity to try to allocate first; -1 means "same as
 //                 newCapacity". If that allocation fails and newCapacity is
 //                 smaller, newCapacity is tried as a fallback.
 // doCopyArray     when a new array is allocated: if TRUE, the old contents
 //                 are copied over (truncated to the new capacity if needed);
 //                 if FALSE, fLength is reset to 0.
 // pBufferToDelete if not 0 and the old refCounted buffer's count drops to 0,
 //                 the buffer is not deleted here; its address is stored in
 //                 *pBufferToDelete for the caller to delete.
 //
 // Returns TRUE if the buffer is usable afterwards. Returns FALSE after
 // setting the string to bogus if memory could not be allocated.
 // Note: if no new array is needed, nothing is changed - in particular
 // fLength is left alone even when doCopyArray is FALSE.
 UBool
 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                   int32_t growCapacity,
                                   UBool doCopyArray,
                                   int32_t **pBufferToDelete) {
   // default parameters need to be static, therefore
   // the defaults are -1 to have convenience defaults
   if(newCapacity == -1) {
     newCapacity = fCapacity;
   }

   /*
    * We need to make a copy of the array if
    * the buffer is read-only, or
    * the buffer is refCounted (shared), and refCount>1, or
    * the buffer is too small.
    * Return FALSE if memory could not be allocated.
    */
   if(fFlags & kBufferIsReadonly ||
      fFlags & kRefCounted && refCount() > 1 ||
      newCapacity > fCapacity
   ) {
     // save old values
     // (presumably allocate() overwrites fArray and fFlags - see the
     // restore in the failure branch below; confirm against allocate())
     UChar *array = fArray;
     uint16_t flags = fFlags;

     // check growCapacity for default value and use of the stack buffer:
     // if the required capacity fits into US_STACKBUF_SIZE, do not ask
     // for more than that
     if(growCapacity == -1) {
       growCapacity = newCapacity;
     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
       growCapacity = US_STACKBUF_SIZE;
     }

     // allocate a new array:
     // try the generous growCapacity first, then fall back to the
     // minimum newCapacity if that is smaller
     if(allocate(growCapacity) ||
        newCapacity < growCapacity && allocate(newCapacity)
     ) {
       if(doCopyArray) {
         // copy the contents
         // do not copy more than what fits - it may be smaller than before
         if(fCapacity < fLength) {
           fLength = fCapacity;
         }
         us_arrayCopy(array, 0, fArray, 0, fLength);
       } else {
         // the caller does not need the old contents
         fLength = 0;
       }

       // release the old array
       if(flags & kRefCounted) {
         // the array is refCounted; decrement and release if 0
         // (the counter is the int32_t directly in front of the array)
         int32_t *pRefCount = ((int32_t *)array - 1);
         if(--*pRefCount == 0) {
           if(pBufferToDelete == 0) {
             delete [] pRefCount;
           } else {
             // the caller asked to delete the buffer itself
             *pBufferToDelete = pRefCount;
           }
         }
       }
     } else {
       // not enough memory for growCapacity and not even for the smaller newCapacity
       // reset the old values for setToBogus() to release the array
       fArray = array;
       fFlags = flags;
       setToBogus();
       return FALSE;
     }
   }
   return TRUE;
 }

 // private function for C API:
 // returns s->length() for callers that cannot call the C++ method
 U_CFUNC int32_t
 T_UnicodeString_length(const UnicodeString *s)
 {
   const UnicodeString &str = *s;
   return str.length();
 }

 // private function for C API:
 // extracts all of *s into dst, passing "" as the codepage, and
 // returns the result of UnicodeString::extract()
 U_CFUNC int32_t
 T_UnicodeString_extract(const UnicodeString *s, char *dst)
 {
   const int32_t wholeLength = s->length();
   return s->extract(0, wholeLength, dst, "");
 }


 //========================================
 // Default converter caching
 //========================================

 // Get a converter for the default codepage.
 // Takes the cached converter out of fgDefaultConverter if there is one,
 // otherwise opens a new one with ucnv_open(0, ...).
 // Returns 0 if a new converter had to be opened and status reports
 // a failure afterwards.
 UConverter*
 UnicodeString::getDefaultConverter(UErrorCode &status)
 {
   UConverter *cached = 0;

   // peek at the cache first, then look again while holding the mutex
   if(fgDefaultConverter != 0) {
     Mutex lock;

     // it may have been taken out from under us in the meantime
     if(fgDefaultConverter != 0) {
       cached = fgDefaultConverter;
       fgDefaultConverter = 0;
     }
   }

   if(cached != 0) {
     return cached;
   }

   // the cache was empty, create a converter
   UConverter *fresh = ucnv_open(0, &status);
   return U_FAILURE(status) ? 0 : fresh;
 }

 // Give back a converter obtained from getDefaultConverter().
 // If the cache slot fgDefaultConverter is empty, the converter is reset
 // and stored there for reuse; otherwise it is closed.
 void
 UnicodeString::releaseDefaultConverter(UConverter *converter)
 {
   // what is left in here at the end gets closed
   UConverter *toClose = converter;

   if(fgDefaultConverter == 0) {
     if (toClose != 0) {
       ucnv_reset(toClose);
     }

     Mutex lock;

     // look again while holding the mutex
     if(fgDefaultConverter == 0) {
       fgDefaultConverter = toClose;
       toClose = 0;
     }
   }

   // it's safe to close a 0 converter
   ucnv_close(toClose);
 }

 //========================================
 // Streaming (to be removed)
 //========================================

 #include "unistrm.h"
 #include "filestrm.h"


 // Returns the upper 8 bits of a 16-bit value.
 inline uint8_t
 uprv_hibyte(uint16_t x)
 {
   return (uint8_t)((x & 0xff00) >> 8);
 }

 // Returns the lower 8 bits of a 16-bit value.
 inline uint8_t
 uprv_lobyte(uint16_t x)
 {
   const unsigned int lowBits = x & 0x00ffu;
   return (uint8_t)lowBits;
 }

 // Returns the upper 16 bits of a 32-bit value.
 inline uint16_t
 uprv_hiword(uint32_t x)
 {
   const uint32_t shifted = x >> 16;
   return (uint16_t)shifted;
 }

 // Returns the lower 16 bits of a 32-bit value.
 inline uint16_t
 uprv_loword(uint32_t x)
 {
   const uint32_t lowBits = x & 0x0000ffffu;
   return (uint16_t)lowBits;
 }

 // Writes the 32-bit value x to the file stream as four bytes,
 // most significant byte first.
 inline void
 writeLong(FileStream *os,
       int32_t x)
 {
   const uint32_t bits = (uint32_t)x;

   // peel off one byte at a time, starting with the top one
   for(int shift = 24; shift >= 0; shift -= 8) {
     T_FileStream_putc(os, (uint8_t)(bits >> shift));
   }
 }

 // Reads four bytes from the file stream and combines them into a
 // 32-bit value, first byte most significant (the counterpart of
 // writeLong()). The result of each T_FileStream_getc() call is used
 // as is; the caller has to check the stream for errors/end of file.
 inline int32_t
 readLong(FileStream *is)
 {
   int32_t value = 0;

   for(int count = 0; count < 4; ++count) {
     // make room for the next byte and append it
     const uint16_t next = T_FileStream_getc(is);
     value = (value << 8) | next;
   }

   return value;
 }

 // Writes one UChar to the file stream as two bytes, high byte first.
 inline void
 writeUChar(FileStream *os,
        UChar c)
 {
   const uint16_t unit = c;
   T_FileStream_putc(os, (uint8_t)(unit >> 8));
   T_FileStream_putc(os, (uint8_t)(unit & 0xff));
 }

 // Reads two bytes from the file stream and combines them into one
 // UChar, first byte high (the counterpart of writeUChar()).
 // The caller has to check the stream for errors/end of file.
 inline UChar
 readUChar(FileStream *is)
 {
   const uint16_t first = T_FileStream_getc(is);
   const uint16_t second = T_FileStream_getc(is);

   return (UChar)((first << 8) | second);
 }

 void
 UnicodeStringStreamer::streamOut(const UnicodeString *s,
                  FileStream *os)
 {
   if(!T_FileStream_error(os)) {
     writeLong(os, s->fLength);
   }

   const UChar *c   = s->getArrayStart();
   const UChar *end = c + s->fLength;

   while(c != end && ! T_FileStream_error(os)) {
     writeUChar(os, *c++);
   }
 }

 // Reads a string from the file stream into *s: a length written by
 // writeLong() followed by that many UChars written by writeUChar().
 // *s is set to bogus if the stream is in an error/end-of-file state,
 // the length is negative, or not all UChars could be read.
 void
 UnicodeStringStreamer::streamIn(UnicodeString *s,
                 FileStream *is)
 {
   // a stream that is already broken or exhausted gives us nothing
   if(T_FileStream_error(is) || T_FileStream_eof(is)) {
     s->setToBogus();
     return;
   }

   // length header
   const int32_t newSize = readLong(is);
   if(newSize < 0) {
     s->setToBogus(); //error condition
     return;
   }
   if(T_FileStream_error(is)) {
     s->setToBogus(); //error condition
     return;
   }
   if(newSize > 0 && T_FileStream_eof(is)) {
     s->setToBogus(); //error condition
     return;
   }

   // get a writable array that is large enough; the old contents are not needed
   if(!s->cloneArrayIfNeeded(newSize, newSize, FALSE)) {
     return;
   }

   // contents
   UChar *dest = s->getArrayStart();
   int32_t count = 0;

   while(count < newSize && !T_FileStream_error(is) && !T_FileStream_eof(is)) {
     dest[count++] = readUChar(is);
   }

   // couldn't read all chars
   if(count < newSize) {
     s->setToBogus();
     return;
   }

   s->fLength = newSize;
 }

 void
 UnicodeStringStreamer::streamOut(const UnicodeString *s,
                  UMemoryStream *os)
 {
   if(!uprv_mstrm_error(os)) {
     uprv_mstrm_write(os, (uint8_t*)&s->fLength, sizeof(s->fLength));
   }

   const UChar *c   = s->getArrayStart();
   const UChar *end = c + s->fLength;

   while(c != end && ! uprv_mstrm_error(os)) {
     uprv_mstrm_write(os, (uint8_t*)c, sizeof(*c));
     c++;
   }
 }

 // Reads a string from the memory stream into *s: the raw bytes of an
 // int32_t length followed by that many raw UChars (the counterpart of
 // streamOut() for UMemoryStream).
 // *s is set to bogus if the stream is in an error/end-of-file state,
 // the length is negative, or not all UChars could be read.
 void
 UnicodeStringStreamer::streamIn(UnicodeString *s,
                 UMemoryStream *is)
 {
   // Start with an invalid length: newSize is tested below before the
   // stream's error state, so it must hold a defined value even if
   // uprv_mstrm_read() does not fill it in. A read that stores nothing
   // is then treated as an error instead of using an indeterminate value.
   int32_t newSize = -1;

   // handle error conditions
   if(uprv_mstrm_error(is) || uprv_mstrm_eof(is)) {
     s->setToBogus();
     return;
   }
   uprv_mstrm_read(is, (uint8_t *)&newSize, sizeof(int32_t));
   if((newSize < 0) || uprv_mstrm_error(is)
      || ((newSize > 0) && uprv_mstrm_eof(is))) {
     s->setToBogus(); //error condition
     return;
   }

   // clone s's array, if needed; the old contents are not needed
   if(!s->cloneArrayIfNeeded(newSize, newSize, FALSE)) {
     return;
   }

   UChar *c = s->getArrayStart();
   UChar *end = c + newSize;

   // read one UChar at a time until done or the stream gives up
   while(c < end && ! (uprv_mstrm_error(is) || uprv_mstrm_eof(is))) {
     uprv_mstrm_read(is, (uint8_t *)c, sizeof(*c));
     c++;
   }

   // couldn't read all chars
   if(c < end) {
     s->setToBogus();
     return;
   }

   s->fLength = newSize;
 }

 // console IO

 #if U_IOSTREAM_SOURCE >= 198506

 // Writes s to the stream, converted with the default converter,
 // and flushes the stream. The text is converted and written in
 // chunks through a small fixed-size buffer. Nothing is written if s
 // is empty or the default converter cannot be obtained.

 #if U_IOSTREAM_SOURCE >= 199711

 U_COMMON_API std::ostream &
 operator<<(std::ostream& stream, const UnicodeString& s)

 #else

 U_COMMON_API ostream &
 operator<<(ostream& stream, const UnicodeString& s)

 #endif

 {
   if(s.length() > 0) {
     UErrorCode errorCode = U_ZERO_ERROR;

     // use the default converter to convert chunks of text
     UConverter *converter = UnicodeString::getDefaultConverter(errorCode);
     if(U_SUCCESS(errorCode)) {
       char chunk[200];
       char *const chunkLimit = chunk + sizeof(chunk);
       const UChar *src = s.getArrayStart();
       const UChar *const srcLimit = src + s.length();

       for(;;) {
         // convert as much as fits into the chunk buffer
         errorCode = U_ZERO_ERROR;
         char *dest = chunk;
         ucnv_fromUnicode(converter, &dest, chunkLimit, &src, srcLimit, 0, FALSE, &errorCode);

         // write this chunk
         if(dest > chunk) {
           stream.write(chunk, dest - chunk);
         }

         // go around again only if the chunk buffer was too small
         if(errorCode != U_INDEX_OUTOFBOUNDS_ERROR) {
           break;
         }
       }
       UnicodeString::releaseDefaultConverter(converter);
     }
   }

   stream.flush();
   return stream;
 }

 #endif