// source/common/unicode/normlzr.h - external/github.com/unicode-org/icu - Git at Google

 /*
  ********************************************************************
  * COPYRIGHT:
  * Copyright (c) 1996-1999, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************
  */

 #ifndef NORMLZR_H
 #define NORMLZR_H

 #include "unicode/utypes.h"
 #include "unicode/unistr.h"
 #include "unicode/chariter.h"

 /* Forward declaration: ComposedCharIter is named as a friend of Normalizer
  * in this header, so only the class name is needed here. */
 class ComposedCharIter;

 /**
  * <tt>Normalizer</tt> transforms Unicode text into an equivalent composed or
  * decomposed form, allowing for easier sorting and searching of text.
  * <tt>Normalizer</tt> supports the standard normalization forms described in
  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
  * Unicode Technical Report #15</a>.
  * <p>
  * Characters with accents or other adornments can be encoded in
  * several different ways in Unicode.  For example, take the character "Á"
  * (A-acute).   In Unicode, this can be encoded as a single character (the
  * "composed" form):
  * <pre>
  * \code
  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
  * \endcode
  * </pre>
  * or as two separate characters (the "decomposed" form):
  * <pre>
  * \code
  *      0041    LATIN CAPITAL LETTER A
  *      0301    COMBINING ACUTE ACCENT
  * \endcode
  * </pre>
  * <p>
  * To a user of your program, however, both of these sequences should be
  * treated as the same "user-level" character "Á".  When you are searching or
  * comparing text, you must ensure that these two sequences are treated
  * equivalently.  In addition, you must handle characters with more than one
  * accent.  Sometimes the order of a character's combining accents is
  * significant, while in other cases accent sequences in different orders are
  * really equivalent.
  * <p>
  * Similarly, the string "ffi" can be encoded as three separate letters:
  * <pre>
  * \code
  *      0066    LATIN SMALL LETTER F
  *      0066    LATIN SMALL LETTER F
  *      0069    LATIN SMALL LETTER I
  * \endcode
  * </pre>
  * or as the single character
  * <pre>
  * \code
  *      FB03    LATIN SMALL LIGATURE FFI
  * \endcode
  * </pre>
  * <p>
  * The ffi ligature is not a distinct semantic character, and strictly speaking
  * it shouldn't be in Unicode at all, but it was included for compatibility
  * with existing character sets that already provided it.  The Unicode standard
  * identifies such characters by giving them "compatibility" decompositions
  * into the corresponding semantic characters.  When sorting and searching, you
  * will often want to use these mappings.
  * <p>
  * <tt>Normalizer</tt> helps solve these problems by transforming text into the
  * canonical composed and decomposed forms as shown in the first example above.
  * In addition, you can have it perform compatibility decompositions so that
  * you can treat compatibility characters the same as their equivalents.
  * Finally, <tt>Normalizer</tt> rearranges accents into the proper canonical
  * order, so that you do not have to worry about accent rearrangement on your
  * own.
  * <p>
  * <tt>Normalizer</tt> adds one optional behavior, {@link #IGNORE_HANGUL},
  * that differs from
  * the standard Unicode Normalization Forms.  This option can be passed
  * to the {@link #Normalizer constructors} and to the static
  * {@link #compose compose} and {@link #decompose decompose} methods.  This
  * option, and any that are added in the future, will be turned off by default.
  * <p>
  * There are three common usage models for <tt>Normalizer</tt>.  In the first,
  * the static {@link #normalize normalize()} method is used to process an
  * entire input string at once.  Second, you can create a <tt>Normalizer</tt>
  * object and use it to iterate through the normalized form of a string by
  * calling {@link #first} and {@link #next}.  Finally, you can use the
  * {@link #setIndex setIndex()} and {@link #getIndex} methods to perform
  * random-access iteration, which is very useful for searching.
  * <p>
  * <b>Note:</b> <tt>Normalizer</tt> objects behave like iterators and have
  * methods such as <tt>setIndex</tt>, <tt>next</tt>, <tt>previous</tt>, etc.
  * You should note that while the <tt>setIndex</tt> and <tt>getIndex</tt> refer
  * to indices in the underlying <em>input</em> text being processed, the
  * <tt>next</tt> and <tt>previous</tt> methods iterate through characters
  * in the normalized <em>output</em>.  This means that there is not
  * necessarily a one-to-one correspondence between characters returned
  * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
  * returned from <tt>setIndex</tt> and <tt>getIndex</tt>.  It is for this
  * reason that <tt>Normalizer</tt> does not implement the
  * {@link CharacterIterator} interface.
  * <p>
  * <b>Note:</b> <tt>Normalizer</tt> is currently based on version 2.1.8
  * of the <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
  * It will be updated as later versions of Unicode are released.  If you are
  * using this class on a JDK that supports an earlier version of Unicode, it
  * is possible that <tt>Normalizer</tt> may generate composed or decomposed
  * characters for which your JDK's {@link java.lang.Character} class does not
  * have any data.
  * <p>
  * @author Laura Werner, Mark Davis
  */
 class U_COMMON_API Normalizer
 {

  public:
   // This tells us what the bits in the "mode" mean.
   // The EMode values below are built by OR-ing these bits together.
   enum {
     COMPAT_BIT         = 1,
     DECOMP_BIT         = 2,
     COMPOSE_BIT     = 4
   };


   /** If DONE is returned, then there are no more normalization results available. */
   enum {
       DONE=0xffff
   };

   /** The mode of a Normalizer object */
   enum EMode {

     /**
      * Null operation for use with the {@link #Normalizer constructors}
      * and the static {@link #normalize normalize} method.  This value tells
      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
      * from the underlying String or CharacterIterator.  If you have code which
      * requires raw text at some times and normalized text at others, you can
      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
      * than having a separate code path that bypasses <tt>Normalizer</tt>
      * altogether.
      * <p>
      * @see #setMode
      */
     NO_OP         = 0,

     /**
      * Canonical decomposition followed by canonical composition.  Used with
      * the {@link #Normalizer constructors} and the static
      * {@link #normalize normalize}
      * method to determine the operation to be performed.
      * <p>
      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
      * off, this operation produces output that is in
      * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
      * Form</a>
      * <b>C</b>.
      * <p>
      * @see #setMode
      */
     COMPOSE         = COMPOSE_BIT,

     /**
      * Compatibility decomposition followed by canonical composition.
      * Used with the {@link #Normalizer constructors} and the static
      * {@link #normalize normalize} method to determine the operation to be
      * performed.
      * <p>
      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
      * off, this operation produces output that is in
      * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
      * Form</a>
      * <b>KC</b>.
      * <p>
      * @see #setMode
      */
     COMPOSE_COMPAT     = COMPOSE_BIT | COMPAT_BIT,

     /**
      * Canonical decomposition.  This value is passed to the
      * {@link #Normalizer constructors} and the static
      * {@link #normalize normalize}
      * method to determine the operation to be performed.
      * <p>
      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
      * off, this operation produces output that is in
      * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
      * Form</a>
      * <b>D</b>.
      * <p>
      * @see #setMode
      */
     DECOMP         = DECOMP_BIT,

     /**
      * Compatibility decomposition.  This value is passed to the
      * {@link #Normalizer constructors} and the static
      * {@link #normalize normalize}
      * method to determine the operation to be performed.
      * <p>
      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
      * off, this operation produces output that is in
      * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
      * Form</a>
      * <b>KD</b>.
      * <p>
      * @see #setMode
      */
     DECOMP_COMPAT     = DECOMP_BIT | COMPAT_BIT
   };

   /** The options for a Normalizer object */
   enum {

     /**
      * Option to disable Hangul/Jamo composition and decomposition.
      * This option applies to Korean text,
      * which can be represented either in the Jamo alphabet or in Hangul
      * characters, which are really just two or three Jamo combined
      * into one visual glyph.  Since Jamo takes up more storage space than
      * Hangul, applications that process only Hangul text may wish to turn
      * this option on when decomposing text.
      * <p>
      * The Unicode standard treats Hangul to Jamo conversion as a
      * canonical decomposition, so this option must be turned <b>off</b> if you
      * wish to transform strings into one of the standard
      * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
      * Unicode Normalization Forms</a>.
      * <p>
      * @see #setOption
      */
     IGNORE_HANGUL     = 0x001
   };

   // Constructors

   /**
    * Creates a new <tt>Normalizer</tt> object for iterating over the
    * normalized form of a given string.
    * <p>
    * @param str   The string to be normalized.  The normalization
    *              will start at the beginning of the string.
    *
    * @param mode  The normalization mode.
    * @stable
    */
   Normalizer(const UnicodeString& str,
          EMode mode);

   /**
    * Creates a new <tt>Normalizer</tt> object for iterating over the
    * normalized form of a given string.
    * <p>
    * The <tt>opt</tt> parameter specifies which optional
    * <tt>Normalizer</tt> features are to be enabled for this object.
    * <p>
    * @param str   The string to be normalized.  The normalization
    *              will start at the beginning of the string.
    *
    * @param mode  The normalization mode.
    *
    * @param opt   Any optional features to be enabled.
    *              Currently the only available option is {@link #IGNORE_HANGUL}.
    *              If you want the default behavior corresponding to one of the
    *              standard Unicode Normalization Forms, use 0 for this argument.
    * @stable
    */
   Normalizer(const UnicodeString& str,
          EMode mode,
          int32_t opt);

   /**
    * Creates a new <tt>Normalizer</tt> object for iterating over the
    * normalized form of a given UChar string.
    * <p>
    * @param str   The string to be normalized.  The normalization
    *              will start at the beginning of the string.
    *
    * @param length Length of the string
    * @param mode  The normalization mode.
    * @stable
    *
    */
   Normalizer(const UChar* str,
          int32_t length,
          EMode mode);

   /**
    * Creates a new <tt>Normalizer</tt> object for iterating over the
    * normalized form of a given UChar string.
    * <p>
    * @param str   The string to be normalized.  The normalization
    *              will start at the beginning of the string.
    *
    * @param length Length of the string
    * @param mode  The normalization mode.
    * @param option Any optional features to be enabled.
    *              Currently the only available option is {@link #IGNORE_HANGUL}.
    *              If you want the default behavior corresponding to one of the
    *              standard Unicode Normalization Forms, use 0 for this argument.
    * @unimplemented
    *
    */
   Normalizer(const UChar* str,
          int32_t length,
          EMode mode,
          int32_t option);

   /**
    * Creates a new <tt>Normalizer</tt> object for iterating over the
    * normalized form of the given text.
    * <p>
    * @param iter  The input text to be normalized.  The normalization
    *              will start at the beginning of the string.
    *
    * @param mode  The normalization mode.
    * @stable
    *
    */
   Normalizer(const CharacterIterator& iter,
          EMode mode);

   /**
    * Creates a new <tt>Normalizer</tt> object for iterating over the
    * normalized form of the given text.
    * <p>
    * @param iter  The input text to be normalized.  The normalization
    *              will start at the beginning of the string.
    *
    * @param mode  The normalization mode.
    *
    * @param opt   Any optional features to be enabled.
    *              Currently the only available option is {@link #IGNORE_HANGUL}.
    *              If you want the default behavior corresponding to one of the
    *              standard Unicode Normalization Forms, use 0 for this argument.
    * @stable
    */
   Normalizer(const CharacterIterator& iter,
          EMode mode,
          int32_t opt);

   /**
    * Copy constructor.
    * @stable
    */
   Normalizer(const Normalizer& copy);

   /**
    * Destructor
    * @stable
    */
   ~Normalizer();

   // NOTE(review): a copy constructor and a destructor are declared, and the
   // class holds a raw CharacterIterator* member (text), but no copy-assignment
   // operator is declared.  Confirm that the compiler-generated assignment is
   // safe for this class, or declare one explicitly.


   //-------------------------------------------------------------------------
   // Static utility methods
   //-------------------------------------------------------------------------

   /**
    * Normalizes a <tt>String</tt> using the given normalization operation.
    * <p>
    * The <tt>options</tt> parameter specifies which optional
    * <tt>Normalizer</tt> features are to be enabled for this operation.
    * Currently the only available option is {@link #IGNORE_HANGUL}.
    * If you want the default behavior corresponding to one of the standard
    * Unicode Normalization Forms, use 0 for this argument.
    * <p>
    * @param source    the input string to be normalized.
    *
    * @param mode      the normalization mode
    *
    * @param options   the optional features to be enabled.
    *
    * @param result    The normalized string (on output).
    *
    * @param status    The error code.
    * @stable
    */
   static void normalize(const UnicodeString& source,
             EMode mode,
             int32_t options,
             UnicodeString& result,
             UErrorCode &status);

   /**
    * Compose a <tt>String</tt>.
    * <p>
    * The <tt>options</tt> parameter specifies which optional
    * <tt>Normalizer</tt> features are to be enabled for this operation.
    * Currently the only available option is {@link #IGNORE_HANGUL}.
    * If you want the default behavior corresponding
    * to Unicode Normalization Form <b>C</b> or <b>KC</b>,
    * use 0 for this argument.
    * <p>
    * @param source    the string to be composed.
    *
    * @param compat    Perform compatibility decomposition before composition.
    *                  If this argument is <tt>false</tt>, only canonical
    *                  decomposition will be performed.
    *
    * @param options   the optional features to be enabled.
    *
    * @param result    The composed string (on output).
    *
    * @param status    The error code.
    * @stable
    */
   static void compose(const UnicodeString& source,
               UBool compat,
               int32_t options,
               UnicodeString& result,
               UErrorCode &status);

   /**
    * Static method to decompose a <tt>String</tt>.
    * <p>
    * The <tt>options</tt> parameter specifies which optional
    * <tt>Normalizer</tt> features are to be enabled for this operation.
    * Currently the only available option is {@link #IGNORE_HANGUL}.
    * The desired options should be OR'ed together to determine the value
    * of this argument.  If you want the default behavior corresponding
    * to Unicode Normalization Form <b>D</b> or <b>KD</b>,
    * use 0 for this argument.
    * <p>
    * @param source    the string to be decomposed.
    *
    * @param compat    Perform compatibility decomposition.
    *                  If this argument is <tt>false</tt>, only canonical
    *                  decomposition will be performed.
    *
    * @param options   the optional features to be enabled.
    *
    * @param result    The decomposed string (on output).  The method itself
    *                  returns <tt>void</tt>.
    *
    * @param status    The error code.
    * @stable
    */
   static void decompose(const UnicodeString& source,
             UBool compat,
             int32_t options,
             UnicodeString& result,
             UErrorCode &status);


   //-------------------------------------------------------------------------
   // CharacterIterator-style iteration methods
   // (Normalizer declares no base class; these are not virtual overrides.)
   //-------------------------------------------------------------------------

   /**
    * Return the current character in the normalized text.
    * @draft
    */
   UChar32              current(void) const;

   /**
    * Return the first character in the normalized text.  This resets
    * the <tt>Normalizer's</tt> position to the beginning of the text.
    * @draft
    */
   UChar32              first(void);

   /**
    * Return the last character in the normalized text.  This resets
    * the <tt>Normalizer's</tt> position to be just before the
    * input text corresponding to that normalized character.
    * @draft
    */
   UChar32              last(void);

   /**
    * Return the next character in the normalized text and advance
    * the iteration position by one.  If the end
    * of the text has already been reached, {@link #DONE} is returned.
    * @draft
    */
   UChar32              next(void);

   /**
    * Return the previous character in the normalized text and decrement
    * the iteration position by one.  If the beginning
    * of the text has already been reached, {@link #DONE} is returned.
    * @draft
    */
   UChar32              previous(void);

   /**
    * Set the iteration position in the input text that is being normalized
    * and return the first normalized character at that position.
    * <p>
    * <b>Note:</b> This method sets the position in the <em>input</em> text,
    * while {@link #next} and {@link #previous} iterate through characters
    * in the normalized <em>output</em>.  This means that there is not
    * necessarily a one-to-one correspondence between characters returned
    * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
    * returned from <tt>setIndex</tt> and {@link #getIndex}.
    * <p>
    * @param index the desired index in the input text.
    *
    * @return      the first normalized character that is the result of iterating
    *              forward starting at the given index.
    * @draft
    */
   UChar32              setIndex(UTextOffset index);

   /**
    * Reset the iterator so that it is in the same state that it was just after
    * it was constructed.  A subsequent call to <tt>next</tt> will return the first
    * character in the normalized text.  In contrast, calling <tt>setIndex(0)</tt> followed
    * by <tt>next</tt> will return the <em>second</em> character in the normalized text,
    * because <tt>setIndex</tt> itself returns the first character.
    * @stable
    */
   void                reset(void);

   /**
    * Retrieve the current iteration position in the input text that is
    * being normalized.  This method is useful in applications such as
    * searching, where you need to be able to determine the position in
    * the input text that corresponds to a given normalized output character.
    * <p>
    * <b>Note:</b> This method returns the position in the <em>input</em>, while
    * {@link #next} and {@link #previous} iterate through characters in the
    * <em>output</em>.  This means that there is not necessarily a one-to-one
    * correspondence between characters returned by <tt>next</tt> and
    * <tt>previous</tt> and the indices passed to and returned from
    * {@link #setIndex} and <tt>getIndex</tt>.
    * @stable
    */
   UTextOffset            getIndex(void) const;

   /**
    * Retrieve the index of the start of the input text.  This is the begin index
    * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
    * over which this <tt>Normalizer</tt> is iterating
    * @stable
    */
   UTextOffset            startIndex(void) const;

   /**
    * Retrieve the index of the end of the input text.  This is the end index
    * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
    * over which this <tt>Normalizer</tt> is iterating
    * @stable
    */
   UTextOffset            endIndex(void) const;


   /**
    * Returns true when both iterators refer to the same character in the same
    * character-storage object.
    * @stable
    */
   //  virtual UBool    operator==(const CharacterIterator& that) const;
   UBool        operator==(const Normalizer& that) const;
   // Inequality; defined inline after the class as the negation of operator==.
   inline UBool        operator!=(const Normalizer& that) const;

   /**
    * Returns a pointer to a new Normalizer that is a clone of this one.
    * The caller is responsible for deleting the new clone.
    * @stable
    */
   Normalizer*        clone(void) const;

   /**
    * Generates a hash code for this iterator.
    * @stable
    */
   int32_t                hashCode(void) const;

   //-------------------------------------------------------------------------
   // Property access methods
   //-------------------------------------------------------------------------

   /**
    * Set the normalization mode for this object.
    * <p>
    * <b>Note:</b>If the normalization mode is changed while iterating
    * over a string, calls to {@link #next} and {@link #previous} may
    * return previously buffered characters in the old normalization mode
    * until the iteration is able to re-sync at the next base character.
    * It is safest to call {@link #setText setText()}, {@link #first},
    * {@link #last}, etc. after calling <tt>setMode</tt>.
    * <p>
    * @param newMode the new mode for this <tt>Normalizer</tt>.
    * The supported modes are:
    * <ul>
    *  <li>{@link #COMPOSE}        - Unicode canonical decomposition
    *                                  followed by canonical composition.
    *  <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
    *                                  followed by canonical composition.
    *  <li>{@link #DECOMP}         - Unicode canonical decomposition
    *  <li>{@link #DECOMP_COMPAT}  - Unicode compatibility decomposition.
    *  <li>{@link #NO_OP}          - Do nothing but return characters
    *                                  from the underlying input text.
    * </ul>
    *
    * @see #getMode
    * @stable
    */
   void setMode(EMode newMode);

   /**
    * Return the basic operation performed by this <tt>Normalizer</tt>
    *
    * @see #setMode
    * @stable
    */
   EMode getMode(void) const;

   /**
    * Set options that affect this <tt>Normalizer</tt>'s operation.
    * Options do not change the basic composition or decomposition operation
    * that is being performed, but they control whether
    * certain optional portions of the operation are done.
    * Currently the only available option is:
    * <p>
    * <ul>
    *   <li>{@link #IGNORE_HANGUL} - Do not decompose Hangul syllables into the
    *       Jamo alphabet and vice-versa.  This option is off by default
    *       (<i>i.e.</i> Hangul processing is enabled) since the Unicode
    *       standard specifies that Hangul to Jamo is a canonical decomposition.
    *       For any of the standard Unicode Normalization
    *       Forms, you should leave this option off.
    * </ul>
    * <p>
    * @param   option  the option whose value is to be set.
    * @param   value   the new setting for the option.  Use <tt>true</tt> to
    *                  turn the option on and <tt>false</tt> to turn it off.
    *
    * @see #getOption
    * @stable
    */
   void setOption(int32_t option,
          UBool value);

   /**
    * Determine whether an option is turned on or off.
    * <p>
    * @param   option  the option to query (see {@link #setOption}).
    * @see #setOption
    * @stable
    */
   UBool getOption(int32_t option) const;

   /**
    * Set the input text over which this <tt>Normalizer</tt> will iterate.
    * The iteration position is set to the beginning.
    * @param newText  the new input text.
    * @param status   the error code.
    * @stable
    */
   void setText(const UnicodeString& newText,
            UErrorCode &status);

   /**
    * Set the input text over which this <tt>Normalizer</tt> will iterate.
    * The iteration position is set to the beginning.
    * @param newText  the new input text.
    * @param status   the error code.
    * @stable
    */
   void setText(const CharacterIterator& newText,
            UErrorCode &status);

   /**
    * Set the input text over which this <tt>Normalizer</tt> will iterate.
    * The iteration position is set to the beginning.
    * @param newText  the new input text.
    * @param length   length of the string.
    * @param status   the error code.
    * @stable
    */
   void setText(const UChar* newText,
                     int32_t length,
             UErrorCode &status);
   /**
    * Copies the text under iteration into the UnicodeString referred to by
    * "result".
    * @param result Receives a copy of the text under iteration.
    * @draft should also return the result UnicodeString &
    */
   void            getText(UnicodeString&  result);

   /**
    * Returns the text under iteration into the UChar* buffer pointer.
    * @param count  NOTE(review): presumably receives the number of UChars in
    *               the returned buffer -- confirm once this is implemented.
    * @unimplemented
    */
   const UChar*     getText(int32_t&  count);

 private:
   // Private utility methods for iteration
   // For documentation, see the source code
   UChar nextCompose(void);
   UChar prevCompose(void);
   UChar nextDecomp(void);
   UChar prevDecomp(void);

   UChar curForward(void);
   UChar curBackward(void);

   void    init(CharacterIterator* iter,
          EMode mode,
          int32_t option);
   void    initBuffer(void);
   void    clearBuffer(void);

   // Utilities used by Compose
   static void        bubbleAppend(UnicodeString& target,
                      UChar ch,
                      uint32_t cclass);
   static uint32_t     getComposeClass(UChar ch);
   static uint16_t    composeLookup(UChar ch);
   static uint16_t    composeAction(uint16_t baseIndex,
                       uint16_t comIndex);
   static void        explode(UnicodeString& target,
                 uint16_t index);
   static UChar    pairExplode(UnicodeString& target,
                     uint16_t action);

   // Utilities used by Decompose
   static void        fixCanonical(UnicodeString& result);    // Reorders combining marks
   static uint8_t    getClass(UChar ch);                    // Gets char's combining class

   // Other static utility methods
   static void doAppend(const UChar source[],
                uint16_t offset,
                UnicodeString& dest);
   static void doInsert(const UChar source[],
                uint16_t offset,
                UnicodeString& dest,
                UTextOffset pos);
   static uint16_t doReplace(const UChar source[],
                uint16_t offset,
                UnicodeString& dest,
                UTextOffset pos);

   static void hangulToJamo(UChar ch,
                UnicodeString& result,
                uint16_t decompLimit);
   static void jamoAppend(UChar ch,
              uint16_t decompLimit,
              UnicodeString& dest);
   static void jamoToHangul(UnicodeString& buffer,
                UTextOffset start);

   //-------------------------------------------------------------------------
   // Private data
   //-------------------------------------------------------------------------

   // Current normalization mode (see setMode/getMode) and option bits
   // (see setOption/getOption).
   EMode         fMode;
   int32_t       fOptions;
   // NOTE(review): purpose of minDecomp is not documented in this header;
   // see the implementation file.
   int16_t    minDecomp;

   // The input text and our position in it
   CharacterIterator*  text;

   // A buffer for holding intermediate results
   UnicodeString       buffer;
   UTextOffset          bufferPos;
   UTextOffset          bufferLimit;
   UChar             currentChar;

   // Another buffer for use during iterative composition
   UnicodeString       explodeBuf;

   enum {
     EMPTY = -1,
     STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder
     STR_LENGTH_MASK = 0x0003
   };

   // Constants for Hangul syllables and Jamo (presumably consumed by
   // hangulToJamo / jamoToHangul above -- see the implementation file).
   // JAMO_NCOUNT is derived as JAMO_VCOUNT * JAMO_TCOUNT.
   enum {
     HANGUL_BASE = 0xac00,
     HANGUL_LIMIT = 0xd7a4,
     JAMO_LBASE = 0x1100,
     JAMO_VBASE = 0x1161,
     JAMO_TBASE = 0x11a7,
     JAMO_LCOUNT = 19,
     JAMO_VCOUNT = 21,
     JAMO_TCOUNT = 28,
     JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT
   };

   // ComposedCharIter is granted access to the private members above.
   friend class ComposedCharIter;
 };

 /**
  * Inequality comparison: yields the logical negation of operator== applied
  * to the same right-hand operand.
  */
 inline UBool
 Normalizer::operator!= (const Normalizer& other) const
 {
     return !(*this == other);
 }

 #endif // NORMLZR_H