source/common/normlzr.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 *******************************************************************************
 *                                                                             *
 * COPYRIGHT:                                                                  *
 *   IBM Open Class Library                                                    *
 *   (C) Copyright Taligent, Inc.,  1996                                       *
 *   (C) Copyright International Business Machines Corporation,  1996-1998     *
 *   Licensed Material - Program-Property of IBM - All Rights Reserved.        *
 *   US Government Users Restricted Rights - Use, duplication, or disclosure   *
 *   restricted by GSA ADP Schedule Contract with IBM Corp.                    *
 *                                                                             *
 *******************************************************************************
 */


 #include "ucmp16.h"
 #include "dcmpdata.h"
 #include "compdata.h"

 #include "normlzr.h"
 #include "utypes.h"
 #include "unistr.h"
 #include "chariter.h"
 #include "schriter.h"
 #include "unicode.h"
 #include "mutex.h"


 // Number of elements in a true array (not a pointer).  The argument is
 // parenthesized in both sizeof operands so that an expression argument
 // (for example a conditional) is measured as a whole.
 #define ARRAY_LENGTH(array) (sizeof (array) / sizeof ((array)[0]))

 // File-local helper: inserts the single code unit ch into dest at offset
 // pos by replacing the empty range [pos, pos) with that one unit.
 inline static void insert(UnicodeString& dest, UTextOffset pos, UChar ch)
 {
   dest.replace(pos,   // start of the range being replaced
                0,     // nothing is removed
                &ch,   // source of the inserted text
                1);    // exactly one code unit is inserted
 }

 // Sentinel used throughout this file to mean "no character available".
 const UChar   Normalizer::DONE         = 0xFFFF;

 // Hangul syllables are tested against the half-open range
 // [HANGUL_BASE, HANGUL_LIMIT).
 const UChar   Normalizer::HANGUL_BASE  = 0xAC00;
 const UChar   Normalizer::HANGUL_LIMIT = 0xD7A4;

 // Bases subtracted from initial (L), medial (V) and final (T) Jamo when
 // building a Hangul syllable arithmetically.
 const UChar   Normalizer::JAMO_LBASE   = 0x1100;
 const UChar   Normalizer::JAMO_VBASE   = 0x1161;
 const UChar   Normalizer::JAMO_TBASE   = 0x11A7;

 // Jamo counts; JAMO_NCOUNT is the product of the V and T counts.
 const int16_t Normalizer::JAMO_LCOUNT  = 19;
 const int16_t Normalizer::JAMO_VCOUNT  = 21;
 const int16_t Normalizer::JAMO_TCOUNT  = 28;
 const int16_t Normalizer::JAMO_NCOUNT  = JAMO_VCOUNT * JAMO_TCOUNT;


 //-------------------------------------------------------------------------
 // Constructors and other boilerplate
 //-------------------------------------------------------------------------

 // Iterates over str through a newly allocated StringCharacterIterator,
 // which init() adopts; no options are set.
 Normalizer::Normalizer(const UnicodeString& str, EMode mode)
 {
   CharacterIterator* source = new StringCharacterIterator(str);
   init(source, mode, 0);
 }

 // Iterates over str through a newly allocated StringCharacterIterator,
 // which init() adopts, using the given mode and option bits.
 Normalizer::Normalizer(const UnicodeString& str, EMode mode, int32_t opt)
 {
   CharacterIterator* source = new StringCharacterIterator(str);
   init(source, mode, opt);
 }

 // Iterates over a clone of iter (the caller's iterator is left alone);
 // init() adopts the clone.  No options are set.
 Normalizer::Normalizer(const CharacterIterator& iter, EMode mode)
 {
   CharacterIterator* source = iter.clone();
   init(source, mode, 0);
 }

 // Iterates over a clone of iter (the caller's iterator is left alone);
 // init() adopts the clone and records the given mode and option bits.
 Normalizer::Normalizer(const CharacterIterator& iter, EMode mode, int32_t opt)
 {
   CharacterIterator* source = iter.clone();
   init(source, mode, opt);
 }

 // Shared constructor body.  Takes ownership of adoptIter (the destructor
 // deletes it) and puts the output buffer bookkeeping into its initial state.
 void Normalizer::init(CharacterIterator* adoptIter,
               EMode mode,
               int32_t options)
 {
   // Input side: the adopted iterator and the requested settings.
   text     = adoptIter;
   fMode    = mode;
   fOptions = options;

   // Output side: nothing has been produced yet.
   currentChar = DONE;
   bufferPos   = 0;
   bufferLimit = 0;

   // In a compatibility mode every decomposition offset is accepted;
   // otherwise only offsets above MAX_COMPAT are.
   if (fMode & COMPAT_BIT) {
     minDecomp = 0;
   } else {
     minDecomp = DecompData::MAX_COMPAT;
   }
 }

 // Copy constructor: clones the source's input iterator and then copies
 // its output state on top of the freshly initialized object.
 Normalizer::Normalizer(const Normalizer& copy)
 {
   CharacterIterator* clonedText = copy.text->clone();
   init(clonedText, copy.fMode, copy.fOptions);

   // init() reset these; overwrite them with the source's values.
   currentChar = copy.currentChar;
   bufferPos   = copy.bufferPos;
   bufferLimit = copy.bufferLimit;
   buffer      = copy.buffer;
   explodeBuf  = copy.explodeBuf;
 }

 // Releases the input iterator that init() or setText() adopted.
 Normalizer::~Normalizer()
 {
   delete text;
 }

 // Returns a heap-allocated copy of this object made with the copy
 // constructor; the caller receives the raw pointer.
 Normalizer*
 Normalizer::clone() const
 {
   Normalizer* duplicate = new Normalizer(*this);
   return duplicate;
 }

 /**
  * Generates a hash code for this iterator by summing the input iterator's
  * hash code with the mode, the option bits and the two buffer positions.
  */
 int32_t Normalizer::hashCode() const
 {
   int32_t hash = text->hashCode();
   hash = hash + fMode;
   hash = hash + fOptions;
   hash = hash + bufferPos;
   hash = hash + bufferLimit;
   return hash;
 }

 /**
  * Two normalizers are equal when they iterate over equal input, use the
  * same mode and options, and hold the same output state.
  *
  * The mode and options are part of the comparison because two objects that
  * differ only in those settings produce different output from the same
  * input, and hashCode() already folds both values in.
  */
 bool_t Normalizer::operator==(const Normalizer& that) const
 {
   return *text == *(that.text)
   && fMode == that.fMode
   && fOptions == that.fOptions
   && currentChar == that.currentChar
   && buffer == that.buffer
   && explodeBuf == that.explodeBuf
   && bufferPos == that.bufferPos
   && bufferLimit == that.bufferLimit;
 }

 //-------------------------------------------------------------------------
 // Static utility methods
 //-------------------------------------------------------------------------

 void
 Normalizer::normalize(const UnicodeString& source,
               EMode mode,
               int32_t options,
               UnicodeString& result,
               UErrorCode &status)
 {
   switch (mode) {
   case NO_OP:
     result = source;
     break;
   case COMPOSE:
   case COMPOSE_COMPAT:
     compose(source, mode & COMPAT_BIT, options, result, status);
     break;
   case DECOMP:
   case DECOMP_COMPAT:
     decompose(source, mode & COMPAT_BIT, options, result, status);
     break;
   }
 }

 //-------------------------------------------------------------------------
 // Compose methods
 //-------------------------------------------------------------------------

 /**
  * Composes all of <tt>source</tt> into <tt>result</tt> in a single pass.
  * <p>
  * Characters are taken from <tt>source</tt>, or from a local "explode"
  * buffer whenever an earlier character was replaced by a sequence that has
  * to be re-processed.  The position and table index of the most recent
  * base character are tracked so that a following combining character (or
  * Jamo) can be merged into it in place.
  *
  * @param source  the text to compose
  * @param compat  when true, the explosion/decomposition thresholds are 0;
  *                otherwise they are the MAX_COMPAT values of the data tables
  * @param options not consulted anywhere in this function body
  * @param result  emptied first, then receives the output
  * @param status  if it already indicates a failure on entry the function
  *                returns without touching <tt>result</tt>; it is never
  *                written here
  */
 void
 Normalizer::compose(const UnicodeString& source,
             bool_t compat,
             int32_t options,
             UnicodeString& result,
             UErrorCode &status)
 {
   if (U_FAILURE(status)) {
     return;
   }
   result.truncate(0);
   UnicodeString explodeBuf;

   // EMPTY (defined outside this view) marks "nothing pending in explodeBuf".
   UTextOffset  explodePos = EMPTY;         // Position in input buffer
   UTextOffset  basePos = 0;                // Position of last base in output string
   uint16_t    baseIndex = 0;              // Index of last base in "actions" array
   uint32_t    classesSeen = 0;            // Combining classes seen since last base
   uint16_t    action;

   // Compatibility explosions have lower indices; skip them if necessary
   uint16_t minExplode = compat ? 0 : ComposeData::MAX_COMPAT;
   uint16_t minDecomp = compat ? 0 : DecompData::MAX_COMPAT;

     UTextOffset i = 0;
     // Keep going while there is source text left OR pending exploded text.
     while (i < source.size() || explodePos != EMPTY) {
         // Get the next char from either the buffer or the source
       UChar ch;
       if (explodePos == EMPTY) {
     ch = source[i++];
       } else {
     ch = explodeBuf[explodePos++];
     // Once the last pending char has been taken, reset the buffer so the
     // next read comes from the source again.
     if (explodePos >= explodeBuf.size()) {
       explodePos = EMPTY;
       explodeBuf.truncate(0);
     }
       }

       // Get the basic info for the character
       // (type in the low bits, table index in the bits above INDEX_SHIFT)
       uint16_t charInfo = composeLookup(ch);
       uint16_t type = charInfo & ComposeData::TYPE_MASK;
       uint16_t index = charInfo >> ComposeData::INDEX_SHIFT;

       if (type == ComposeData::BASE) {
     // New base: remember where it lands so later marks can replace it.
     classesSeen = 0;
     baseIndex = index;
     basePos = result.size();
     result += ch;
       }
       else if (type == ComposeData::COMBINING || type == ComposeData::NON_COMPOSING_COMBINING)
         {
       uint32_t cclass = ComposeData::typeMask[index];

       // We can only combine a character with the base if we haven't
       // already seen a combining character with the same canonical class.
       if (type == ComposeData::COMBINING && (classesSeen & cclass) == 0
           && (action = composeAction(baseIndex, index)) > 0)
             {
           if (action > ComposeData::MAX_COMPOSED) {
         // Pairwise explosion.  Actions above this value are really
         // indices into an array that in turn contains indices
         // into the exploding string table
         // TODO: What if there are unprocessed chars in the explode buffer?
         // NOTE(review): explodePos is reset to 0 here without discarding
         // chars already consumed from explodeBuf — confirm that is safe.
         UChar newBase = pairExplode(explodeBuf, action);
         explodePos = 0;
         result[basePos] = newBase;

         baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
           } else {
         // Normal pairwise combination.  Replace the base char
         UChar newBase = (UChar) action;
         result[basePos] = newBase;

         baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
           }
           //
           // Since there are Unicode characters that cannot be combined in arbitrary
           // order, we have to re-process any combining marks that go with this
           // base character.  There are only four characters in Unicode that have
           // this problem.  If they are fixed in Unicode 3.0, this code can go away.
           //
           UTextOffset len = result.size();
           if (len - basePos > 1) {
         // Move every mark after the base back into the pending buffer
         // and cut the output off just after the base.
         for (UTextOffset j = basePos+1; j < len; j++) {
           explodeBuf += result[j];
         }
         result.truncate(basePos+1);
         classesSeen = 0;
         if (explodePos == EMPTY) explodePos = 0;
           }
             } else {
           // No combination with this character
           bubbleAppend(result, ch, cclass);
           classesSeen |= cclass;
             }
         }
       else if (index > minExplode) {
     // Single exploding character
     explode(explodeBuf, index);
     explodePos = 0;
       }
       else if (type == ComposeData::HANGUL && minExplode == 0) {
     // If we're in compatibility mode we need to decompose Hangul to Jamo,
     // because some of the Jamo might have compatibility decompositions.
     hangulToJamo(ch, explodeBuf, minDecomp);
     explodePos = 0;
       }
       else if (type == ComposeData::INITIAL_JAMO) {
     // An initial Jamo acts as a base for a following medial Jamo.
     classesSeen = 0;
     baseIndex = ComposeData::INITIAL_JAMO_INDEX;
     basePos = result.size();
     result += ch;
       }
       else if (type == ComposeData::MEDIAL_JAMO && classesSeen == 0
            && baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
     // If the last character was an initial jamo, we can combine it with this
     // one to create a Hangul character.
     uint16_t l = result[basePos] - JAMO_LBASE;
     uint16_t v = ch - JAMO_VBASE;
     result[basePos] = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);

     baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
       }
       else if (type == ComposeData::FINAL_JAMO && classesSeen == 0
            && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
     // If the last character was a medial jamo that we turned into Hangul,
     // we can add this character too.
     result[basePos] = (UChar)(result[basePos] + (ch - JAMO_TBASE));

     // The syllable is complete; there is no current base any more.
     baseIndex = 0;
     basePos = -1;
     classesSeen = 0;
       } else {
     // Anything else is copied through and clears the current base.
     baseIndex = 0;
     basePos = -1;
     classesSeen = 0;
     result += ch;
       }
     }
 }

 /**
  * Compose starting with current input character and continuing
  * until just before the next base char.
  * <p>
  * <b>Input</b>:
  * <ul>
  *  <li>underlying char iter points to first character to compose
  * </ul>
  * <p>
  * <b>Output:</b>
  * <ul>
  *  <li>returns first char of the composed sequence or DONE if at end
  *  <li>Underlying char iter is pointing at next base char or past end
  *  <li>the composed sequence is left in <tt>buffer</tt>, with
  *      <tt>bufferLimit</tt> set to the index of its last char
  * </ul>
  */
 UChar Normalizer::nextCompose()
 {
     // EMPTY (defined outside this view) marks "nothing pending in explodeBuf".
     UTextOffset  explodePos = EMPTY;         // Position in input buffer
     UTextOffset  basePos = 0;                // Position of last base in output string
     uint16_t    baseIndex = 0;              // Index of last base in "actions" array
     uint32_t    classesSeen = 0;            // Combining classes seen since last base
     uint16_t    action;
     UChar        lastBase = 0;              // Assigned below but never read in this function
     bool_t        chFromText = TRUE;        // TRUE when ch came from the input, not explodeBuf

     // Compatibility explosions have lower indices; skip them if necessary
     uint16_t minExplode = (fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT;
     uint16_t minDecomp = (fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT;

     initBuffer();
     explodeBuf.truncate(0);

     // curForward() is defined outside this view; presumably it returns the
     // input character at the current position.
     UChar ch = curForward();

     while (ch != DONE) {
         // Get the basic info for the character
         // (type in the low bits, table index in the bits above INDEX_SHIFT)
         uint16_t charInfo = composeLookup(ch);
         uint16_t type = charInfo & ComposeData::TYPE_MASK;
         uint16_t index = charInfo >> ComposeData::INDEX_SHIFT;

         if (type == ComposeData::BASE) {
             if (buffer.size() > 0 && chFromText && explodePos == EMPTY) {
                 // When we hit a base char in the source text, we can return the text
                 // that's been composed so far.  We'll re-process this char next time through.
                 break;
             }
             classesSeen = 0;
             baseIndex = index;
             basePos = buffer.size();
             buffer += ch;
             lastBase = ch;
         }
         else if (type == ComposeData::COMBINING || type == ComposeData::NON_COMPOSING_COMBINING)
         {
             uint32_t cclass = ComposeData::typeMask[index];

             // We can only combine a character with the base if we haven't
             // already seen a combining character with the same canonical class.
             if (type == ComposeData::COMBINING && (classesSeen & cclass) == 0
                 && (action = composeAction(baseIndex, index)) > 0)
             {
                 if (action > ComposeData::MAX_COMPOSED) {
                     // Pairwise explosion.  Actions above this value are really
                     // indices into an array that in turn contains indices
                     // into the exploding string table
                     // TODO: What if there are unprocessed chars in the explode buffer?
                     // NOTE(review): explodePos is reset to 0 here without discarding
                     // chars already consumed from explodeBuf — confirm that is safe.
                     UChar newBase = pairExplode(explodeBuf, action);
                     explodePos = 0;
                     buffer[basePos] = newBase;

                     baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
                     lastBase = newBase;
                 } else {
                     // Normal pairwise combination.  Replace the base char
                     UChar newBase = (UChar) action;
                     buffer[basePos] = newBase;

                     baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
                     lastBase = newBase;
                 }
                 //
                 // Since there are Unicode characters that cannot be combined in arbitrary
                 // order, we have to re-process any combining marks that go with this
                 // base character.  There are only four characters in Unicode that have
                 // this problem.  If they are fixed in Unicode 3.0, this code can go away.
                 //
                 UTextOffset len = buffer.size();
                 if (len - basePos > 1) {
                     // Move every mark after the base back into the pending
                     // buffer and cut the output off just after the base.
                     for (UTextOffset j = basePos+1; j < len; j++) {
                         explodeBuf += buffer[j];
                     }
                     buffer.truncate(basePos+1);
                     classesSeen = 0;
                     if (explodePos == EMPTY) explodePos = 0;
                 }
             } else {
                 // No combination with this character
                 bubbleAppend(buffer, ch, cclass);
                 classesSeen |= cclass;
             }
         }
         else if (index > minExplode) {
             // Single exploding character
             explode(explodeBuf, index);
             explodePos = 0;
         }
         else if (type == ComposeData::HANGUL && minExplode == 0) {
             // If we're in compatibility mode we need to decompose Hangul to Jamo,
             // because some of the Jamo might have compatibility decompositions.
             hangulToJamo(ch, explodeBuf, minDecomp);
             explodePos = 0;
         }
         else if (type == ComposeData::INITIAL_JAMO) {
             if (buffer.size() > 0 && chFromText && explodePos == EMPTY) {
                 // When we hit a base char in the source text, we can return the text
                 // that's been composed so far.  We'll re-process this char next time through.
                 break;
             }
             classesSeen = 0;
             baseIndex = ComposeData::INITIAL_JAMO_INDEX;
             basePos = buffer.size();
             buffer += ch;
         }
         else if (type == ComposeData::MEDIAL_JAMO && classesSeen == 0
                     && baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
             // If the last character was an initial jamo, we can combine it with this
             // one to create a Hangul character.
             uint16_t l = buffer[basePos] - JAMO_LBASE;
             uint16_t v = ch - JAMO_VBASE;
             UChar newCh = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
             buffer[basePos] = newCh;

             baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
         }
         else if (type == ComposeData::FINAL_JAMO && classesSeen == 0
                     && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
             // If the last character was a medial jamo that we turned into Hangul,
             // we can add this character too.
             UChar newCh = (UChar)(buffer[basePos] + (ch - JAMO_TBASE));
             buffer[basePos] = newCh;

             // The syllable is complete; there is no current base any more.
             baseIndex = 0;
             basePos = -1;
             classesSeen = 0;
         } else {
             // TODO: deal with JAMO character types
             baseIndex = 0;
             basePos = -1;
             classesSeen = 0;
             buffer += ch;
         }

         // Fetch the next character: from the input when nothing is pending,
         // otherwise from the explode buffer.
         if (explodePos == EMPTY) {
             ch = text->next();
             chFromText = TRUE;
         } else {
             ch = explodeBuf[explodePos++];
             if (explodePos >= explodeBuf.size()) {
                 explodePos = EMPTY;
                 explodeBuf.truncate(0);
             }
             chFromText = FALSE;
         }
     }
     // Hand back the first composed char; next() serves the rest of the
     // buffer up to bufferLimit.
     if (buffer.size() > 0) {
         bufferLimit = buffer.size() - 1;
         ch = buffer[0];
     } else {
         ch = DONE;
         bufferLimit = 0;
     }
     return ch;
 }

 /**
  * Compose starting with the input UChar just before the current position
  * and continuing backward until (and including) the previous base char.
  * <p>
  * <b>Input</b>:
  * <ul>
  *  <li>underlying char iter points just after last char to compose
  * </ul>
  * <p>
  * <b>Output:</b>
  * <ul>
  *  <li>returns last char of the resulting composed sequence, or DONE if
  *      no input character could be read
  *  <li>underlying iter points to lowest-index char that was consumed,
  *      i.e. the base char
  * </ul>
  */
 UChar Normalizer::prevCompose()
 {
     initBuffer();

     // Collect characters, walking backward, up to and including the first
     // one whose type ends a combining sequence.
     for (;;) {
         UChar prev = curBackward();
         if (prev == DONE) {
             break;
         }
         insert(buffer, 0, prev);

         uint16_t info = composeLookup(prev);
         uint16_t kind = info & ComposeData::TYPE_MASK;

         if (kind == ComposeData::BASE
             || kind == ComposeData::HANGUL
             || kind == ComposeData::INITIAL_JAMO
             || kind == ComposeData::IGNORE)
         {
             break;
         }
     }

     // Nothing was collected: we were already at the start of the input.
     if (!(buffer.size() > 0)) {
         return DONE;
     }

     // TODO: The performance of this is awful; add a way to compose
     // a UnicodeString& in place.
     UErrorCode status = U_ZERO_ERROR;
     UnicodeString composed;
     compose(buffer, (fMode & COMPAT_BIT), fOptions, composed, status);
     buffer.truncate(0);
     buffer += composed;

     // A single-char result is returned directly and the buffer positions
     // are left as they were; a longer one is iterated from its last char.
     if (buffer.size() > 1) {
         bufferLimit = bufferPos = buffer.size() - 1;
         return buffer[bufferPos];
     }
     return buffer[0];
 }

 /**
  * Inserts <tt>ch</tt> into <tt>target</tt>, moving it backward from the end
  * past every character whose compose class is greater than <tt>cclass</tt>.
  * The character at index 0 is never examined, so <tt>ch</tt> is never
  * placed before it unless <tt>target</tt> is empty.
  */
 void Normalizer::bubbleAppend(UnicodeString& target, UChar ch, uint32_t cclass) {
     UTextOffset slot = target.size() - 1;

     while (slot > 0) {
         uint32_t seen = getComposeClass(target[slot]);

         // A value of 1 means combining class 0; neither that nor a class
         // that is not greater than ours may be bubbled past.
         bool_t blocked = (seen == 1 || seen <= cclass);
         if (blocked) {
             break;
         }
         --slot;
     }

     // The new character goes immediately after position "slot".
     insert(target, slot + 1, ch);
 }


 /**
  * Returns the typeMask entry for <tt>ch</tt> when its compose type is
  * COMBINING or NON_COMPOSING_COMBINING, and 0 for every other type.
  */
 uint32_t Normalizer::getComposeClass(UChar ch) {
     const uint16_t info = composeLookup(ch);
     const uint16_t kind = info & ComposeData::TYPE_MASK;

     if (kind != ComposeData::COMBINING
         && kind != ComposeData::NON_COMPOSING_COMBINING) {
         return 0;
     }
     return ComposeData::typeMask[info >> ComposeData::INDEX_SHIFT];
 }

 /**
  * Looks up the packed compose info for <tt>ch</tt> in ComposeData::lookup.
  * Callers split it into a type (TYPE_MASK) and an index (INDEX_SHIFT).
  */
 uint16_t Normalizer::composeLookup(UChar ch) {
   const uint16_t info = ucmp16_getu(ComposeData::lookup, ch);
   return info;
 }

 /**
  * Looks up the action for combining the base with table index
  * <tt>baseIndex</tt> and the combining character with table index
  * <tt>comIndex</tt>.  The key into ComposeData::actions is
  * baseIndex + MAX_BASES * comIndex, narrowed to a UChar.
  */
 uint16_t Normalizer::composeAction(uint16_t baseIndex, uint16_t comIndex)
 {
   const UChar key = (UChar)(baseIndex + ComposeData::MAX_BASES*comIndex);
   return ucmp16_getu(ComposeData::actions, key);
 }

 /**
  * Appends to <tt>target</tt> the zero-terminated run of characters that
  * starts at ComposeData::replace[index]; the terminator is not appended.
  */
 void Normalizer::explode(UnicodeString& target, uint16_t index) {
     for (UChar unit = ComposeData::replace[index];
          unit != 0;
          unit = ComposeData::replace[++index])
     {
         target += unit;
     }
 }

 /**
  * Handles an action value above MAX_COMPOSED.  The action selects, via
  * ComposeData::actionIndex, a run in ComposeData::replace: the first char
  * of the run is returned as the new base and the rest is appended to
  * <tt>target</tt> by explode().
  */
 UChar Normalizer::pairExplode(UnicodeString& target, uint16_t action) {
     const uint16_t start = ComposeData::actionIndex[action - ComposeData::MAX_COMPOSED];
     const UChar newBase = ComposeData::replace[start];

     explode(target, start + 1);
     return newBase;
 }

 //-------------------------------------------------------------------------
 // Decompose methods
 //-------------------------------------------------------------------------

 void
 Normalizer::decompose(const UnicodeString& source,
               bool_t compat,
               int32_t options,
               UnicodeString& result,
               UErrorCode &status)
 {
   if (U_FAILURE(status)) {
     return;
   }
   bool_t     hangul = (options & IGNORE_HANGUL) == 0;
   uint16_t     limit  = compat ? 0 : DecompData::MAX_COMPAT;

   result.truncate(0);

   for (UTextOffset i = 0; i < source.size(); ++i) {
     UChar ch = source[i];

     uint16_t offset = ucmp16_getu(DecompData::offsets, ch);


     if (offset > limit) {
       doAppend(DecompData::contents, offset, result);
     } else if (ch >= HANGUL_BASE && ch < HANGUL_LIMIT && hangul) {
       hangulToJamo(ch, result, limit);
     } else {
       result += ch;
     }
   }
   fixCanonical(result);
 }

 /**
  * Decompose starting with current input character and continuing
  * until just before the next base char.
  * <p>
  * <b>Input</b>:
  * <ul>
  *  <li>underlying char iter points to first character to decompose
  * </ul>
  * <p>
  * <b>Output:</b>
  * <ul>
  *  <li>returns first char of decomposition or DONE if at end
  *  <li>Underlying char iter is pointing at next base char or past end
  *  <li>when more than the returned char was produced, the sequence is
  *      left in <tt>buffer</tt> with <tt>bufferLimit</tt> set to the index
  *      of its last char
  * </ul>
  */
 UChar Normalizer::nextDecomp()
 {
   bool_t hangul = ((fOptions & IGNORE_HANGUL) == 0);
   // curForward() is defined outside this view; presumably it returns the
   // input character at the current position.
   UChar ch = curForward();

   uint16_t offset = ucmp16_getu(DecompData::offsets, ch);

   // Take the buffered path if the character has a decomposition we are
   // allowed to use, or if it is not a base character.
   if (offset > minDecomp || ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
     {
       initBuffer();

       if (offset > minDecomp) {
     doAppend(DecompData::contents, offset, buffer);
       } else {
     buffer += ch;
       }
       bool_t needToReorder = FALSE;

       // Any other combining characters that immediately follow the decomposed
       // character must be included in the buffer too, because they're
       // conceptually part of the same logical character.
       //
       // TODO: Might these need to be decomposed too?
       // (i.e. are there non-BASE characters with decompositions?)
       //
       while ((ch = text->next()) != DONE
          && ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
         {
       needToReorder = TRUE;
       buffer += ch;
         }

       if (buffer.size() > 1 && needToReorder) {
     // If there is more than one combining character in the buffer,
     // put them into the canonical order.
     // But we don't need to sort if the only characters are the ones that
     // resulted from decomposing the base character.
     fixCanonical(buffer);
       }
       // Return the first buffered char; next() serves the rest.
       bufferLimit = buffer.size() - 1;
       ch = buffer[0];
     } else {
       // Just use this character, but first advance to the next one
       text->next();

       // Do Hangul -> Jamo decomposition if necessary
       if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
     initBuffer();
     hangulToJamo(ch, buffer, minDecomp);
     bufferLimit = buffer.size() - 1;
     ch = buffer[0];
       }
     }
   return ch;
 }


 /**
  * Decompose starting with the input char just before the current position
  * and continuing backward until (and including) the previous base char.
  * <p>
  * <b>Input</b>:
  * <ul>
  *  <li>underlying char iter points just after last char to decompose
  * </ul>
  * <p>
  * <b>Output:</b>
  * <ul>
  *  <li>returns last char of resulting decomposition sequence
  *  <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
  *  <li>when a sequence was buffered, <tt>bufferPos</tt> and
  *      <tt>bufferLimit</tt> both index its last char
  * </ul>
  */
 UChar Normalizer::prevDecomp() {
     bool_t hangul = (fOptions & IGNORE_HANGUL) == 0;

     // curBackward() is defined outside this view; presumably it steps the
     // input back one position and returns the character there.
     UChar ch = curBackward();

     uint16_t offset = ucmp16_getu(DecompData::offsets, ch);

     // Take the buffered path if the character has a decomposition we are
     // allowed to use, or if it is not a base character.
     if (offset > minDecomp || ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
     {
         initBuffer();

         // Slurp up any combining characters till we get to a base char.
         while (ch != DONE && ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE) {
             insert(buffer, 0, ch);
             ch = text->previous();
         }

         // Now decompose this base character
         offset = ucmp16_getu(DecompData::offsets, ch);
         if (offset > minDecomp) {
             doInsert(DecompData::contents, offset, buffer, 0);
         } else {
             // This is a base character that doesn't decompose
             // and isn't involved in reordering, so throw it back
             text->next();
         }

         if (buffer.size() > 1) {
             // If there is more than one combining character in the buffer,
             // put them into the canonical order.
             fixCanonical(buffer);
         }
         // Iterating backward, so start from the last buffered char.
         bufferLimit = bufferPos = buffer.size() - 1;
         ch = buffer[bufferPos];
     }
     else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
         // Hangul syllable with no table decomposition: expand it to Jamo.
         initBuffer();
         hangulToJamo(ch, buffer, minDecomp);
         bufferLimit = bufferPos = buffer.size() - 1;
         ch = buffer[bufferPos];
     }
     return ch;
 }

 /**
  * Returns the DecompData::canonClass table entry for <tt>ch</tt>.
  */
 uint8_t Normalizer::getClass(UChar ch) {
     const uint8_t canonical = ucmp8_get(DecompData::canonClass, ch);
     return canonical;
 }

 /**
  * Fixes the sorting sequence of non-spacing characters according to
  * their combining class.  The algorithm is listed on p.3-11 in the
  * Unicode Standard 2.0.  The table of combining classes is on p.4-2
  * in the Unicode Standard 2.0.
  * <p>
  * The string is scanned from the end toward the start.  Whenever a
  * character has a higher class than the character after it, and that
  * following character is not a base, the two are swapped and the scan
  * backs up so the swapped pair's neighbours are re-examined.
  *
  * @param result the string to fix; strings shorter than two characters
  *               are returned unchanged
  */
 void Normalizer::fixCanonical(UnicodeString& result) {
     // decompose() calls this unconditionally, so the string may be empty.
     // With fewer than two characters there is nothing to reorder, and
     // returning here avoids indexing an empty string at position -1.
     if (result.size() < 2) {
         return;
     }

     UTextOffset i = result.size() - 1;
     uint8_t currentType = getClass(result[i]);
     uint8_t lastType;

     for (--i; i >= 0; --i) {
         lastType = currentType;
         currentType = getClass(result[i]);

         //
         // a swap is presumed to be rare (and a double-swap very rare),
         // so don't worry about efficiency here.
         //
         if (currentType > lastType && lastType != DecompData::BASE) {
             // swap characters
             UChar temp = result[i];
             result[i] = result[i+1];
             result[i+1] = temp;

             // if not at end, backup (one further, to compensate for for-loop)
             if (i < result.size() - 2) {
                 i += 2;
             }
             // reset type, since we swapped.
             currentType = getClass(result[i]);
         }
     }
 }


 //-------------------------------------------------------------------------
 // CharacterIterator overrides
 //-------------------------------------------------------------------------

 /**
  * Return the current character in the normalized text.
  * <p>
  * If no current character is cached (currentChar is DONE) it is computed
  * according to the mode and stored, which is why this const method writes
  * to the object through a non-const alias.
  */
 UChar Normalizer:: current() const
 {
   // Fast path: the current character is already known.
   if (currentChar != DONE) {
     return currentChar;
   }

   // TODO: make this method const and guarantee that currentChar is always set?
   Normalizer *self = (Normalizer*)this;

   if (fMode == NO_OP) {
     self->currentChar = text->current();
   } else if (fMode == COMPOSE || fMode == COMPOSE_COMPAT) {
     self->currentChar = self->nextCompose();
   } else if (fMode == DECOMP || fMode == DECOMP_COMPAT) {
     self->currentChar = self->nextDecomp();
   }
   return currentChar;
 }

 /**
  * Return the first character in the normalized text.  This resets
  * the <tt>Normalizer's</tt> position to the start index of the input
  * iterator by way of setIndex().
  */
 UChar Normalizer::first() {
     const UTextOffset start = text->startIndex();
     return setIndex(start);
 }

 /**
  * Return the last character in the normalized text.  The input position
  * is moved to the end index, all cached output is discarded, and the
  * result of previous() from there is returned.
  */
 UChar Normalizer::last() {
   const UTextOffset end = text->endIndex();
   text->setIndex(end);

   // Neither the cached current char nor the buffer applies any more.
   currentChar = DONE;
   clearBuffer();

   return previous();
 }

 /**
  * Return the next character in the normalized text and advance
  * the iteration position by one.  If the end
  * of the text has already been reached, DONE is returned.
  */
 UChar Normalizer::next() {
   // Serve from the buffer while output characters remain in it.
   if (bufferPos < bufferLimit) {
     ++bufferPos;
     currentChar = buffer[bufferPos];
     return currentChar;
   }

   // Buffer is now out of date; produce more output from the input.
   bufferPos = 0;
   bufferLimit = 0;

   if (fMode == NO_OP) {
     currentChar = text->next();
   } else if (fMode == COMPOSE || fMode == COMPOSE_COMPAT) {
     currentChar = nextCompose();
   } else if (fMode == DECOMP || fMode == DECOMP_COMPAT) {
     currentChar = nextDecomp();
   }
   return currentChar;
 }

 /**
  * Return the previous character in the normalized text and decrement
  * the iteration position by one.  If the beginning
  * of the text has already been reached, DONE is returned.
  */
 UChar Normalizer::previous()
 {
   // Serve from the buffer while output characters remain before bufferPos.
   if (bufferPos > 0) {
     --bufferPos;
     currentChar = buffer[bufferPos];
     return currentChar;
   }

   // Buffer is now out of date; produce more output from the input.
   bufferPos = 0;
   bufferLimit = 0;

   if (fMode == NO_OP) {
     currentChar = text->previous();
   } else if (fMode == COMPOSE || fMode == COMPOSE_COMPAT) {
     currentChar = prevCompose();
   } else if (fMode == DECOMP || fMode == DECOMP_COMPAT) {
     currentChar = prevDecomp();
   }
   return currentChar;
 }

 void Normalizer::reset()
 {
     text->setIndex(text->startIndex());
     currentChar = DONE;     // The current char hasn't been processed
     clearBuffer();          // The buffer is empty too
 }

 /**
  * Set the iteration position in the input text that is being normalized
  * and return the first normalized character at that position.
  * <p>
  * <b>Note:</b> This method sets the position in the <em>input</em> text,
  * while next() and previous() iterate through characters in the
  * normalized <em>output</em>.  This means that there is not necessarily a
  * one-to-one correspondence between characters returned by <tt>next</tt>
  * and <tt>previous</tt> and the indices passed to and returned from
  * <tt>setIndex</tt> and getIndex().
  * <p>
  * Range checking of <tt>index</tt> is left to the underlying iterator's
  * own setIndex().
  *
  * @param index the desired index in the input text.
  *
  * @return      the first normalized character that is the result of iterating
  *              forward starting at the given index.
  */
 UChar Normalizer::setIndex(UTextOffset index)
 {
     // Reposition the input; the underlying iterator checks the range.
     text->setIndex(index);

     // Whatever was cached belongs to the old position.
     currentChar = DONE;
     clearBuffer();

     const UChar firstNormalized = current();
     return firstNormalized;
 }

 /**
  * Retrieve the current iteration position in the input text that is
  * being normalized.  This method is useful in applications such as
  * searching, where you need to be able to determine the position in
  * the input text that corresponds to a given normalized output character.
  * <p>
  * <b>Note:</b> This method reports the position in the <em>input</em>,
  * while next() and previous() iterate through characters in the
  * <em>output</em>.  This means that there is not necessarily a one-to-one
  * correspondence between characters returned by <tt>next</tt> and
  * <tt>previous</tt> and the indices passed to and returned from
  * <tt>setIndex</tt> and <tt>getIndex</tt>.
  */
 UTextOffset Normalizer::getIndex() const {
     const UTextOffset inputPos = text->getIndex();
     return inputPos;
 }

 /**
  * Report where the input text begins: the begin index of the
  * <tt>CharacterIterator</tt>, or the start (i.e. 0) of the <tt>String</tt>,
  * over which this <tt>Normalizer</tt> is iterating.
  *
  * @return the start index reported by the underlying input iterator.
  */
 UTextOffset Normalizer::startIndex() const
 {
     const UTextOffset first = text->startIndex();
     return first;
 }

 /**
  * Report where the input text ends: the end index of the
  * <tt>CharacterIterator</tt>, or the length of the <tt>String</tt>, over
  * which this <tt>Normalizer</tt> is iterating.
  *
  * @return the end index reported by the underlying input iterator.
  */
 UTextOffset Normalizer::endIndex() const
 {
     const UTextOffset last = text->endIndex();
     return last;
 }

 //-------------------------------------------------------------------------
 // Property access methods
 //-------------------------------------------------------------------------

 /**
  * Select the normalization mode used by this <tt>Normalizer</tt>.
  * <p>
  * Besides storing the mode, this refreshes <tt>minDecomp</tt>: it becomes 0
  * when the mode has <tt>COMPAT_BIT</tt> set, and
  * <tt>DecompData::MAX_COMPAT</tt> otherwise.
  *
  * @param newMode the mode to switch to.
  */
 void
 Normalizer::setMode(EMode newMode)
 {
   fMode = newMode;

   if ((fMode & COMPAT_BIT) != 0) {
     minDecomp = 0;
   } else {
     minDecomp = DecompData::MAX_COMPAT;
   }
 }

 /**
  * Report the normalization mode currently in effect.
  *
  * @return the mode most recently stored in <tt>fMode</tt>.
  */
 Normalizer::EMode
 Normalizer::getMode() const
 {
     const EMode activeMode = fMode;
     return activeMode;
 }

 /**
  * Switch one or more option bits on or off.
  *
  * @param option bit mask of the option(s) to change.
  * @param value  non-zero to set the bits of <tt>option</tt> in the option
  *               word, zero to clear them.
  */
 void
 Normalizer::setOption(int32_t option,
               bool_t value)
 {
   if (!value) {
     // Clearing: mask the requested bits out of the option word.
     fOptions &= (~option);
     return;
   }

   // Setting: merge the requested bits into the option word.
   fOptions |= option;
 }

 /**
  * Test whether an option is currently switched on.
  *
  * @param option bit mask of the option(s) to test.
  * @return       non-zero if at least one bit of <tt>option</tt> is set in
  *               the option word, zero otherwise.
  */
 bool_t
 Normalizer::getOption(int32_t option) const
 {
     const bool_t isSet = ((fOptions & option) != 0);
     return isSet;
 }

 /**
  * Set the input text over which this <tt>Normalizer</tt> will iterate.
  * The iteration position is set to the beginning of the input text.
  */
 void
 Normalizer::setText(const UnicodeString& newText,
             UErrorCode &status)
 {
   if (U_FAILURE(status)) {
     return;
   }
   CharacterIterator *newIter = new StringCharacterIterator(newText);
   if (newIter == NULL) {
     status = U_MEMORY_ALLOCATION_ERROR;
     return;
   }
   delete text;
   text = newIter;
   reset();
 }

 /**
  * Set the input text over which this <tt>Normalizer</tt> will iterate.
  * The iteration position is set to the beginning of the string.
  */
 void
 Normalizer::setText(const CharacterIterator& newText,
             UErrorCode &status)
 {
   if (U_FAILURE(status)) {
     return;
   }
   CharacterIterator *newIter = newText.clone();
   if (newIter == NULL) {
     status = U_MEMORY_ALLOCATION_ERROR;
     return;
   }
   delete text;
   text = newIter;
   reset();
 }


 /**
  * Hand the text under iteration to the caller.  The work is delegated to
  * the underlying input iterator's <tt>getText</tt>.
  *
  * @param result receives a copy of the text under iteration.
  */
 void Normalizer::getText(UnicodeString& result)
 {
     // The input iterator knows how to export its own text.
     text->getText(result);
 }


 //-------------------------------------------------------------------------
 // Private utility methods
 //-------------------------------------------------------------------------


 /**
  * Fetch the input character used when iterating forward.
  *
  * @return the result of calling <tt>current()</tt> on the input iterator.
  */
 UChar Normalizer::curForward()
 {
     return text->current();
 }

 /**
  * Fetch the input character used when iterating backward.
  *
  * @return the result of calling <tt>previous()</tt> on the input iterator.
  */
 UChar Normalizer::curBackward()
 {
     return text->previous();
 }

 void Normalizer::doAppend(const UChar source[], uint16_t offset, UnicodeString& dest) {
     uint16_t index = offset >> STR_INDEX_SHIFT;
     uint16_t length = offset & STR_LENGTH_MASK;

     if (length == 0) {
         UChar ch;
         while ((ch = source[index++]) != 0x0000) {
             dest += ch;
         }
     } else {
         while (length-- > 0) {
             dest += source[index++];
         }
     }
 }

 void Normalizer::doInsert(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos)
 {
     uint16_t index = offset >> STR_INDEX_SHIFT;
     uint16_t length = offset & STR_LENGTH_MASK;

     if (length == 0) {
         UChar ch;
         while ((ch = source[index++]) != 0x0000) {
             insert(dest, pos++, ch);
         }
     } else {
         while (length-- > 0) {
             insert(dest, pos++, source[index++]);
         }
     }
 }

 /**
  * Return the output buffer to its pristine state: the position and limit
  * are zeroed and the buffered text itself is discarded.
  */
 void Normalizer::initBuffer()
 {
     clearBuffer();          // bufferPos and bufferLimit back to 0
     buffer.truncate(0);     // and no characters left behind
 }

 /**
  * Mark the output buffer as empty by zeroing its position and limit.
  * The characters stored in <tt>buffer</tt> are not touched here.
  */
 void Normalizer::clearBuffer()
 {
     bufferPos = 0;
     bufferLimit = 0;
 }

 //-----------------------------------------------------------------------------
 // Hangul / Jamo conversion utilities for internal use
 // See section 3.10 of The Unicode Standard, v 2.0.
 //
 /**
  * Break a single Hangul syllable into its Jamo and append them to
  * <tt>result</tt>.
  * <p>
  * The syllable is split arithmetically into a leading consonant, a vowel
  * and an optional trailing consonant.  Each Jamo is appended through
  * <tt>jamoAppend</tt>, which may expand it further depending on
  * <tt>decompLimit</tt>.  A trailing index of 0 means the syllable has no
  * trailing consonant, so nothing is appended for it.
  *
  * @param ch          the Hangul syllable to decompose.  No range check is
  *                    done here; the caller is presumably expected to pass
  *                    a character in the Hangul syllable range.
  * @param result      the string the Jamo are appended to.
  * @param decompLimit threshold forwarded to <tt>jamoAppend</tt> for each
  *                    Jamo.
  */
 void Normalizer::hangulToJamo(UChar ch, UnicodeString& result, uint16_t decompLimit)
 {
     const UChar syllable = (UChar)(ch - HANGUL_BASE);

     // Positions of the three components within their respective Jamo blocks.
     const int32_t leadingIndex  = syllable / JAMO_NCOUNT;
     const int32_t vowelIndex    = (syllable % JAMO_NCOUNT) / JAMO_TCOUNT;
     const int32_t trailingIndex = syllable % JAMO_TCOUNT;

     jamoAppend((UChar)(JAMO_LBASE + leadingIndex), decompLimit, result);
     jamoAppend((UChar)(JAMO_VBASE + vowelIndex), decompLimit, result);

     // Index 0 stands for "no trailing consonant".
     if (trailingIndex != 0) {
         jamoAppend((UChar)(JAMO_TBASE + trailingIndex), decompLimit, result);
     }
 }

 /**
  * Append a Jamo character to <tt>dest</tt>, expanded if required.
  * <p>
  * The character's entry is looked up in <tt>DecompData::offsets</tt>.  If
  * that entry is greater than <tt>decompLimit</tt>, the string it designates
  * in <tt>DecompData::contents</tt> is appended instead of the character;
  * otherwise the character is appended as is.
  *
  * @param ch          the Jamo character to append.
  * @param decompLimit entries at or below this value are not expanded.
  * @param dest        the string to append to.
  */
 void Normalizer::jamoAppend(UChar ch, uint16_t decompLimit, UnicodeString& dest)
 {
     const uint16_t offset = ucmp16_getu(DecompData::offsets, ch);

     if (offset <= decompLimit) {
         // Nothing to expand: keep the character itself.
         dest += ch;
         return;
     }

     doAppend(DecompData::contents, offset, dest);
 }

 /**
  * Compose conjoining Jamo in <tt>buffer</tt> into precomposed Hangul
  * syllables, in place.
  * <p>
  * Scanning begins at <tt>start</tt>.  A leading consonant immediately
  * followed by a vowel is replaced by a single syllable; a trailing consonant
  * directly after that vowel is folded into the same syllable.  Every other
  * character is copied through unchanged.  Because the output never grows
  * past the input, the result is written over the scanned text and the
  * buffer is truncated to the composed length at the end.
  * <p>
  * Trailing index 0 encodes "no trailing consonant", so a character equal to
  * <tt>JAMO_TBASE</tt> itself is <em>not</em> treated as a trailing consonant;
  * it stays in the buffer rather than being absorbed into (and lost from)
  * the preceding syllable.
  *
  * @param buffer the text to compose; modified in place.
  * @param start  index of the first character to examine.  Characters
  *               before it are left untouched.
  */
 void Normalizer::jamoToHangul(UnicodeString& buffer, UTextOffset start) {
     UTextOffset out = start;                        // next position to write
     const UTextOffset limit = buffer.size() - 1;    // index of the last character
     UTextOffset in;

     // Only positions that still have a successor can start a syllable.
     for (in = start; in < limit; in++) {
         const UChar ch = buffer[in];

         // Use signed indices so that characters below a Jamo base yield a
         // negative value instead of relying on unsigned wrap-around.
         const int32_t l = (int32_t)ch - JAMO_LBASE;
         int32_t v = -1;
         if (l >= 0 && l < JAMO_LCOUNT) {
             // Only look at the next character once a leading consonant is found.
             v = (int32_t)buffer[in+1] - JAMO_VBASE;
         }

         if (v >= 0 && v < JAMO_VCOUNT) {
             //
             // We've found a leading consonant / vowel pair to compose.
             // Consume the vowel and see if a trailing consonant follows.
             //
             in++;

             int32_t t = 0;
             if (in < limit) {
                 t = (int32_t)buffer.charAt(in+1) - JAMO_TBASE;
             }

             // Index 0 is JAMO_TBASE itself, which is not a trailing consonant.
             if (t > 0 && t < JAMO_TCOUNT) {
                 in++;   // Consume the trailing consonant too
             } else {
                 t = 0;  // No trailing consonant
             }
             buffer[out++] = (UChar)((l*JAMO_VCOUNT + v) * JAMO_TCOUNT + t + HANGUL_BASE);
         } else {
             buffer[out++] = ch;
         }
     }

     // Copy whatever the loop did not reach (at most the final character).
     while (in < buffer.size()) {
         buffer[out++] = buffer[in++];
     }

     buffer.truncate(out);
 }