| /* |
| ******************************************************************** |
| * COPYRIGHT: |
| * Copyright (c) 1996-1999, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************** |
| */ |
| |
| #ifndef NORMLZR_H |
| #define NORMLZR_H |
| |
| #include "unicode/utypes.h" |
| #include "unicode/unistr.h" |
| #include "unicode/chariter.h" |
| |
| /* forward declaration */ |
| class ComposedCharIter; |
| |
| /** |
| * <tt>Normalizer</tt> transforms Unicode text into an equivalent composed or |
| * decomposed form, allowing for easier sorting and searching of text. |
| * <tt>Normalizer</tt> supports the standard normalization forms described in |
| * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> |
| * Unicode Technical Report #15</a>. |
| * <p> |
| * Characters with accents or other adornments can be encoded in |
| * several different ways in Unicode. For example, take the character "Á" |
| * (A-acute). In Unicode, this can be encoded as a single character (the |
| * "composed" form): |
| * <pre> |
| * \code |
| * 00C1 LATIN CAPITAL LETTER A WITH ACUTE</pre> |
| * \endcode |
| * or as two separate characters (the "decomposed" form): |
| * <pre> |
| * \code |
| * 0041 LATIN CAPITAL LETTER A |
| * 0301 COMBINING ACUTE ACCENT</pre> |
| * \endcode |
| * <p> |
| * To a user of your program, however, both of these sequences should be |
| * treated as the same "user-level" character "Á". When you are searching or |
| * comparing text, you must ensure that these two sequences are treated |
| * equivalently. In addition, you must handle characters with more than one |
| * accent. Sometimes the order of a character's combining accents is |
| * significant, while in other cases accent sequences in different orders are |
| * really equivalent. |
| * <p> |
| * Similarly, the string "ffi" can be encoded as three separate letters: |
| * <pre> |
| * \code |
| * 0066 LATIN SMALL LETTER F |
| * 0066 LATIN SMALL LETTER F |
| * 0069 LATIN SMALL LETTER I</pre> |
| * \endcode |
| * or as the single character |
| * <pre> |
| * \code |
| * FB03 LATIN SMALL LIGATURE FFI</pre> |
| * \endcode |
| * <p> |
| * The ffi ligature is not a distinct semantic character, and strictly speaking |
| * it shouldn't be in Unicode at all, but it was included for compatibility |
| * with existing character sets that already provided it. The Unicode standard |
| * identifies such characters by giving them "compatibility" decompositions |
| * into the corresponding semantic characters. When sorting and searching, you |
| * will often want to use these mappings. |
| * <p> |
| * <tt>Normalizer</tt> helps solve these problems by transforming text into the |
| * canonical composed and decomposed forms as shown in the first example above. |
| * In addition, you can have it perform compatibility decompositions so that |
| * you can treat compatibility characters the same as their equivalents. |
| * Finally, <tt>Normalizer</tt> rearranges accents into the proper canonical |
| * order, so that you do not have to worry about accent rearrangement on your |
| * own. |
| * <p> |
| * <tt>Normalizer</tt> adds one optional behavior, {@link #IGNORE_HANGUL}, |
| * that differs from |
| * the standard Unicode Normalization Forms. This option can be passed |
| * to the {@link #Normalizer constructors} and to the static |
| * {@link #compose compose} and {@link #decompose decompose} methods. This |
| * option, and any that are added in the future, will be turned off by default. |
| * <p> |
| * There are three common usage models for <tt>Normalizer</tt>. In the first, |
| * the static {@link #normalize normalize()} method is used to process an |
| * entire input string at once. Second, you can create a <tt>Normalizer</tt> |
| * object and use it to iterate through the normalized form of a string by |
| * calling {@link #first} and {@link #next}. Finally, you can use the |
| * {@link #setIndex setIndex()} and {@link #getIndex} methods to perform |
| * random-access iteration, which is very useful for searching. |
| * <p> |
| * <b>Note:</b> <tt>Normalizer</tt> objects behave like iterators and have |
| * methods such as <tt>setIndex</tt>, <tt>next</tt>, <tt>previous</tt>, etc. |
| * You should note that while the <tt>setIndex</tt> and <tt>getIndex</tt> refer |
| * to indices in the underlying <em>input</em> text being processed, the |
| * <tt>next</tt> and <tt>previous</tt> methods it iterate through characters |
| * in the normalized <em>output</em>. This means that there is not |
| * necessarily a one-to-one correspondence between characters returned |
| * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and |
| * returned from <tt>setIndex</tt> and <tt>getIndex</tt>. It is for this |
| * reason that <tt>Normalizer</tt> does not implement the |
| * {@link CharacterIterator} interface. |
| * <p> |
| * <b>Note:</b> <tt>Normalizer</tt> is currently based on version 2.1.8 |
| * of the <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>. |
| * It will be updated as later versions of Unicode are released. If you are |
| * using this class on a JDK that supports an earlier version of Unicode, it |
| * is possible that <tt>Normalizer</tt> may generate composed or dedecomposed |
| * characters for which your JDK's {@link java.lang.Character} class does not |
| * have any data. |
| * <p> |
| * @author Laura Werner, Mark Davis |
| */ |
| class U_COMMON_API Normalizer |
| { |
| |
| public: |
| // This tells us what the bits in the "mode" mean. |
| enum { |
| COMPAT_BIT = 1, |
| DECOMP_BIT = 2, |
| COMPOSE_BIT = 4 |
| }; |
| |
| |
| |
| /** If DONE is returned, then there are no more normalization results available. */ |
| enum { |
| DONE=0xffff |
| }; |
| |
| /** The mode of a Normalizer object */ |
| enum EMode { |
| |
| /** |
| * Null operation for use with the {@link #Normalizer constructors} |
| * and the static {@link #normalize normalize} method. This value tells |
| * the <tt>Normalizer</tt> to do nothing but return unprocessed characters |
| * from the underlying String or CharacterIterator. If you have code which |
| * requires raw text at some times and normalized text at others, you can |
| * use <tt>NO_OP</tt> for the cases where you want raw text, rather |
| * than having a separate code path that bypasses <tt>Normalizer</tt> |
| * altogether. |
| * <p> |
| * @see #setMode |
| */ |
| NO_OP = 0, |
| |
| /** |
| * Canonical decomposition followed by canonical composition. Used with |
| * the {@link #Normalizer constructors} and the static |
| * {@link #normalize normalize} |
| * method to determine the operation to be performed. |
| * <p> |
| * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned |
| * off, this operation produces output that is in |
| * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical |
| * Form</a> |
| * <b>C</b>. |
| * <p> |
| * @see #setMode |
| */ |
| COMPOSE = COMPOSE_BIT, |
| |
| /** |
| * Compatibility decomposition followed by canonical composition. |
| * Used with the {@link #Normalizer constructors} and the static |
| * {@link #normalize normalize} method to determine the operation to be |
| * performed. |
| * <p> |
| * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned |
| * off, this operation produces output that is in |
| * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical |
| * Form</a> |
| * <b>KC</b>. |
| * <p> |
| * @see #setMode |
| */ |
| COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT, |
| |
| /** |
| * Canonical decomposition. This value is passed to the |
| * {@link #Normalizer constructors} and the static |
| * {@link #normalize normalize} |
| * method to determine the operation to be performed. |
| * <p> |
| * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned |
| * off, this operation produces output that is in |
| * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical |
| * Form</a> |
| * <b>D</b>. |
| * <p> |
| * @see #setMode |
| */ |
| DECOMP = DECOMP_BIT, |
| |
| /** |
| * Compatibility decomposition. This value is passed to the |
| * {@link #Normalizer constructors} and the static |
| * {@link #normalize normalize} |
| * method to determine the operation to be performed. |
| * <p> |
| * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned |
| * off, this operation produces output that is in |
| * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical |
| * Form</a> |
| * <b>KD</b>. |
| * <p> |
| * @see #setMode |
| */ |
| DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT |
| }; |
| |
| /** The options for a Normalizer object */ |
| enum { |
| |
| /** |
| * Option to disable Hangul/Jamo composition and decomposition. |
| * This option applies to Korean text, |
| * which can be represented either in the Jamo alphabet or in Hangul |
| * characters, which are really just two or three Jamo combined |
| * into one visual glyph. Since Jamo takes up more storage space than |
| * Hangul, applications that process only Hangul text may wish to turn |
| * this option on when decomposing text. |
| * <p> |
| * The Unicode standard treates Hangul to Jamo conversion as a |
| * canonical decomposition, so this option must be turned <b>off</b> if you |
| * wish to transform strings into one of the standard |
| * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> |
| * Unicode Normalization Forms</a>. |
| * <p> |
| * @see #setOption |
| */ |
| IGNORE_HANGUL = 0x001 |
| }; |
| |
| // Constructors |
| |
| /** |
| * Creates a new <tt>Normalizer</tt> object for iterating over the |
| * normalized form of a given string. |
| * <p> |
| * @param str The string to be normalized. The normalization |
| * will start at the beginning of the string. |
| * |
| * @param mode The normalization mode. |
| * @stable |
| */ |
| Normalizer(const UnicodeString& str, |
| EMode mode); |
| |
| /** |
| * Creates a new <tt>Normalizer</tt> object for iterating over the |
| * normalized form of a given string. |
| * <p> |
| * The <tt>options</tt> parameter specifies which optional |
| * <tt>Normalizer</tt> features are to be enabled for this object. |
| * <p> |
| * @param str The string to be normalized. The normalization |
| * will start at the beginning of the string. |
| * |
| * @param mode The normalization mode. |
| * |
| * @param opt Any optional features to be enabled. |
| * Currently the only available option is {@link #IGNORE_HANGUL} |
| * If you want the default behavior corresponding to one of the |
| * standard Unicode Normalization Forms, use 0 for this argument |
| * @stable |
| */ |
| Normalizer(const UnicodeString& str, |
| EMode mode, |
| int32_t opt); |
| |
| /** |
| * Creates a new <tt>Normalizer</tt> object for iterating over the |
| * normalized form of a given UChar string. |
| * <p> |
| * @param str The string to be normalized. The normalization |
| * will start at the beginning of the string. |
| * |
| * @param length Lenght of the string |
| * @param mode The normalization mode. |
| * @stable |
| * |
| */ |
| Normalizer(const UChar* str, |
| int32_t length, |
| EMode mode); |
| |
| /** |
| * Creates a new <tt>Normalizer</tt> object for iterating over the |
| * normalized form of a given UChar string. |
| * <p> |
| * @param str The string to be normalized. The normalization |
| * will start at the beginning of the string. |
| * |
| * @param length Lenght of the string |
| * @param mode The normalization mode. |
| * @param opt Any optional features to be enabled. |
| * Currently the only available option is {@link #IGNORE_HANGUL} |
| * If you want the default behavior corresponding to one of the |
| * standard Unicode Normalization Forms, use 0 for this argument |
| * @unimplemented |
| * |
| */ |
| Normalizer(const UChar* str, |
| int32_t length, |
| EMode mode, |
| int32_t option); |
| |
| /** |
| * Creates a new <tt>Normalizer</tt> object for iterating over the |
| * normalized form of the given text. |
| * <p> |
| * @param iter The input text to be normalized. The normalization |
| * will start at the beginning of the string. |
| * |
| * @param mode The normalization mode. |
| * @stable |
| * |
| */ |
| Normalizer(const CharacterIterator& iter, |
| EMode mode); |
| |
| /** |
| * Creates a new <tt>Normalizer</tt> object for iterating over the |
| * normalized form of the given text. |
| * <p> |
| * @param iter The input text to be normalized. The normalization |
| * will start at the beginning of the string. |
| * |
| * @param mode The normalization mode. |
| * |
| * @param opt Any optional features to be enabled. |
| * Currently the only available option is {@link #IGNORE_HANGUL} |
| * If you want the default behavior corresponding to one of the |
| * standard Unicode Normalization Forms, use 0 for this argument |
| * @stable |
| */ |
| Normalizer(const CharacterIterator& iter, |
| EMode mode, |
| int32_t opt); |
| |
| /** |
| * Copy constructor. |
| * @stable |
| */ |
| Normalizer(const Normalizer& copy); |
| |
| /** |
| * Destructor |
| * @stable |
| */ |
| ~Normalizer(); |
| |
| |
| //------------------------------------------------------------------------- |
| // Static utility methods |
| //------------------------------------------------------------------------- |
| |
| /** |
| * Normalizes a <tt>String</tt> using the given normalization operation. |
| * <p> |
| * The <tt>options</tt> parameter specifies which optional |
| * <tt>Normalizer</tt> features are to be enabled for this operation. |
| * Currently the only available option is {@link #IGNORE_HANGUL}. |
| * If you want the default behavior corresponding to one of the standard |
| * Unicode Normalization Forms, use 0 for this argument. |
| * <p> |
| * @param source the input string to be normalized. |
| * |
| * @param aMode the normalization mode |
| * |
| * @param options the optional features to be enabled. |
| * |
| * @param result The normalized string (on output). |
| * |
| * @param status The error code. |
| * @stable |
| */ |
| static void normalize(const UnicodeString& source, |
| EMode mode, |
| int32_t options, |
| UnicodeString& result, |
| UErrorCode &status); |
| |
| /** |
| * Compose a <tt>String</tt>. |
| * <p> |
| * The <tt>options</tt> parameter specifies which optional |
| * <tt>Normalizer</tt> features are to be enabled for this operation. |
| * Currently the only available option is {@link #IGNORE_HANGUL}. |
| * If you want the default behavior corresponding |
| * to Unicode Normalization Form <b>C</b> or <b>KC</b>, |
| * use 0 for this argument. |
| * <p> |
| * @param source the string to be composed. |
| * |
| * @param compat Perform compatibility decomposition before composition. |
| * If this argument is <tt>false</tt>, only canonical |
| * decomposition will be performed. |
| * |
| * @param options the optional features to be enabled. |
| * |
| * @param result The composed string (on output). |
| * |
| * @param status The error code. |
| * @stable |
| */ |
| static void compose(const UnicodeString& source, |
| UBool compat, |
| int32_t options, |
| UnicodeString& result, |
| UErrorCode &status); |
| |
| /** |
| * Static method to decompose a <tt>String</tt>. |
| * <p> |
| * The <tt>options</tt> parameter specifies which optional |
| * <tt>Normalizer</tt> features are to be enabled for this operation. |
| * Currently the only available option is {@link #IGNORE_HANGUL}. |
| * The desired options should be OR'ed together to determine the value |
| * of this argument. If you want the default behavior corresponding |
| * to Unicode Normalization Form <b>D</b> or <b>KD</b>, |
| * use 0 for this argument. |
| * <p> |
| * @param str the string to be decomposed. |
| * |
| * @param compat Perform compatibility decomposition. |
| * If this argument is <tt>false</tt>, only canonical |
| * decomposition will be performed. |
| * |
| * @param options the optional features to be enabled. |
| * |
| * @param result The composed string (on output). |
| * |
| * @param status The error code. |
| * |
| * @return the decomposed string. |
| * @stable |
| */ |
| static void decompose(const UnicodeString& source, |
| UBool compat, |
| int32_t options, |
| UnicodeString& result, |
| UErrorCode &status); |
| |
| |
| //------------------------------------------------------------------------- |
| // CharacterIterator overrides |
| //------------------------------------------------------------------------- |
| |
| /** |
| * Return the current character in the normalized text. |
| * @draft |
| */ |
| UChar32 current(void) const; |
| |
| /** |
| * Return the first character in the normalized text. This resets |
| * the <tt>Normalizer's</tt> position to the beginning of the text. |
| * @draft |
| */ |
| UChar32 first(void); |
| |
| /** |
| * Return the last character in the normalized text. This resets |
| * the <tt>Normalizer's</tt> position to be just before the |
| * the input text corresponding to that normalized character. |
| * @draft |
| */ |
| UChar32 last(void); |
| |
| /** |
| * Return the next character in the normalized text and advance |
| * the iteration position by one. If the end |
| * of the text has already been reached, {@link #DONE} is returned. |
| * @draft |
| */ |
| UChar32 next(void); |
| |
| /** |
| * Return the previous character in the normalized text and decrement |
| * the iteration position by one. If the beginning |
| * of the text has already been reached, {@link #DONE} is returned. |
| * @draft |
| */ |
| UChar32 previous(void); |
| |
| /** |
| * Set the iteration position in the input text that is being normalized |
| * and return the first normalized character at that position. |
| * <p> |
| * <b>Note:</b> This method sets the position in the <em>input</em> text, |
| * while {@link #next} and {@link #previous} iterate through characters |
| * in the normalized <em>output</em>. This means that there is not |
| * necessarily a one-to-one correspondence between characters returned |
| * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and |
| * returned from <tt>setIndex</tt> and {@link #getIndex}. |
| * <p> |
| * @param index the desired index in the input text. |
| * |
| * @return the first normalized character that is the result of iterating |
| * forward starting at the given index. |
| * @draft |
| */ |
| UChar32 setIndex(UTextOffset index); |
| |
| /** |
| * Reset the iterator so that it is in the same state that it was just after |
| * it was constructed. A subsequent call to <tt>next</tt> will return the first |
| * character in the normalized text. In contrast, calling <tt>setIndex(0)</tt> followed |
| * by <tt>next</tt> will return the <em>second</em> character in the normalized text, |
| * because <tt>setIndex</tt> itself returns the first character |
| * @stable |
| */ |
| void reset(void); |
| |
| /** |
| * Retrieve the current iteration position in the input text that is |
| * being normalized. This method is useful in applications such as |
| * searching, where you need to be able to determine the position in |
| * the input text that corresponds to a given normalized output character. |
| * <p> |
| * <b>Note:</b> This method sets the position in the <em>input</em>, while |
| * {@link #next} and {@link #previous} iterate through characters in the |
| * <em>output</em>. This means that there is not necessarily a one-to-one |
| * correspondence between characters returned by <tt>next</tt> and |
| * <tt>previous</tt> and the indices passed to and returned from |
| * <tt>setIndex</tt> and {@link #getIndex}. |
| * @stable |
| */ |
| UTextOffset getIndex(void) const; |
| |
| /** |
| * Retrieve the index of the start of the input text. This is the begin index |
| * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> |
| * over which this <tt>Normalizer</tt> is iterating |
| * @stable |
| */ |
| UTextOffset startIndex(void) const; |
| |
| /** |
| * Retrieve the index of the end of the input text. This is the end index |
| * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> |
| * over which this <tt>Normalizer</tt> is iterating |
| * @stable |
| */ |
| UTextOffset endIndex(void) const; |
| |
| |
| /** |
| * Returns true when both iterators refer to the same character in the same |
| * character-storage object. |
| * @stable |
| */ |
| // virtual UBool operator==(const CharacterIterator& that) const; |
| UBool operator==(const Normalizer& that) const; |
| inline UBool operator!=(const Normalizer& that) const; |
| |
| /** |
| * Returns a pointer to a new Normalizer that is a clone of this one. |
| * The caller is responsible for deleting the new clone. |
| * @stable |
| */ |
| Normalizer* clone(void) const; |
| |
| /** |
| * Generates a hash code for this iterator. |
| * @stable |
| */ |
| int32_t hashCode(void) const; |
| |
| //------------------------------------------------------------------------- |
| // Property access methods |
| //------------------------------------------------------------------------- |
| |
| /** |
| * Set the normalization mode for this object. |
| * <p> |
| * <b>Note:</b>If the normalization mode is changed while iterating |
| * over a string, calls to {@link #next} and {@link #previous} may |
| * return previously buffers characters in the old normalization mode |
| * until the iteration is able to re-sync at the next base character. |
| * It is safest to call {@link #setText setText()}, {@link #first}, |
| * {@link #last}, etc. after calling <tt>setMode</tt>. |
| * <p> |
| * @param newMode the new mode for this <tt>Normalizer</tt>. |
| * The supported modes are: |
| * <ul> |
| * <li>{@link #COMPOSE} - Unicode canonical decompositiion |
| * followed by canonical composition. |
| * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion |
| * follwed by canonical composition. |
| * <li>{@link #DECOMP} - Unicode canonical decomposition |
| * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition. |
| * <li>{@link #NO_OP} - Do nothing but return characters |
| * from the underlying input text. |
| * </ul> |
| * |
| * @see #getMode |
| * @stable |
| */ |
| void setMode(EMode newMode); |
| |
| /** |
| * Return the basic operation performed by this <tt>Normalizer</tt> |
| * |
| * @see #setMode |
| * @stable |
| */ |
| EMode getMode(void) const; |
| |
| /** |
| * Set options that affect this <tt>Normalizer</tt>'s operation. |
| * Options do not change the basic composition or decomposition operation |
| * that is being performed , but they control whether |
| * certain optional portions of the operation are done. |
| * Currently the only available option is: |
| * <p> |
| * <ul> |
| * <li>{@link #IGNORE_HANGUL} - Do not decompose Hangul syllables into the |
| * Jamo alphabet and vice-versa. This option is off by default |
| * (<i>i.e.</i> Hangul processing is enabled) since the Unicode |
| * standard specifies that Hangul to Jamo is a canonical decomposition. |
| * For any of the standard Unicode Normalization |
| * Forms, you should leave this option off. |
| * </ul> |
| * <p> |
| * @param option the option whose value is to be set. |
| * @param value the new setting for the option. Use <tt>true</tt> to |
| * turn the option on and <tt>false</tt> to turn it off. |
| * |
| * @see #getOption |
| * @stable |
| */ |
| void setOption(int32_t option, |
| UBool value); |
| |
| /** |
| * Determine whether an option is turned on or off. |
| * <p> |
| * @see #setOption |
| * @stable |
| */ |
| UBool getOption(int32_t option) const; |
| |
| /** |
| * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| * The iteration position is set to the beginning. |
| * @stable |
| */ |
| void setText(const UnicodeString& newText, |
| UErrorCode &status); |
| |
| /** |
| * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| * The iteration position is set to the beginning. |
| * @stable |
| */ |
| void setText(const CharacterIterator& newText, |
| UErrorCode &status); |
| |
| /** |
| * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| * The iteration position is set to the beginning. |
| * @stable |
| */ |
| void setText(const UChar* newText, |
| int32_t length, |
| UErrorCode &status); |
| /** |
| * Copies the text under iteration into the UnicodeString referred to by |
| * "result". |
| * @param result Receives a copy of the text under iteration. |
| * @draft should also return the result UnicodeString & |
| */ |
| void getText(UnicodeString& result); |
| |
| /** |
| * Returns the text under iteration into the UChar* buffer pointer. |
| * @param result Receives a copy of the text under iteration. |
| * @unimplemented |
| */ |
| const UChar* getText(int32_t& count); |
| |
| private: |
| // Private utility methods for iteration |
| // For documentation, see the source code |
| UChar nextCompose(void); |
| UChar prevCompose(void); |
| UChar nextDecomp(void); |
| UChar prevDecomp(void); |
| |
| UChar curForward(void); |
| UChar curBackward(void); |
| |
| void init(CharacterIterator* iter, |
| EMode mode, |
| int32_t option); |
| void initBuffer(void); |
| void clearBuffer(void); |
| |
| // Utilities used by Compose |
| static void bubbleAppend(UnicodeString& target, |
| UChar ch, |
| uint32_t cclass); |
| static uint32_t getComposeClass(UChar ch); |
| static uint16_t composeLookup(UChar ch); |
| static uint16_t composeAction(uint16_t baseIndex, |
| uint16_t comIndex); |
| static void explode(UnicodeString& target, |
| uint16_t index); |
| static UChar pairExplode(UnicodeString& target, |
| uint16_t action); |
| |
| // Utilities used by Decompose |
| static void fixCanonical(UnicodeString& result); // Reorders combining marks |
| static uint8_t getClass(UChar ch); // Gets char's combining class |
| |
| // Other static utility methods |
| static void doAppend(const UChar source[], |
| uint16_t offset, |
| UnicodeString& dest); |
| static void doInsert(const UChar source[], |
| uint16_t offset, |
| UnicodeString& dest, |
| UTextOffset pos); |
| static uint16_t doReplace(const UChar source[], |
| uint16_t offset, |
| UnicodeString& dest, |
| UTextOffset pos); |
| |
| static void hangulToJamo(UChar ch, |
| UnicodeString& result, |
| uint16_t decompLimit); |
| static void jamoAppend(UChar ch, |
| uint16_t decompLimit, |
| UnicodeString& dest); |
| static void jamoToHangul(UnicodeString& buffer, |
| UTextOffset start); |
| |
| //------------------------------------------------------------------------- |
| // Private data |
| //------------------------------------------------------------------------- |
| |
| EMode fMode; |
| int32_t fOptions; |
| int16_t minDecomp; |
| |
| // The input text and our position in it |
| CharacterIterator* text; |
| |
| // A buffer for holding intermediate results |
| UnicodeString buffer; |
| UTextOffset bufferPos; |
| UTextOffset bufferLimit; |
| UChar currentChar; |
| |
| // Another buffer for use during iterative composition |
| UnicodeString explodeBuf; |
| |
| enum { |
| EMPTY = -1, |
| STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder |
| STR_LENGTH_MASK = 0x0003 |
| }; |
| |
| enum { |
| HANGUL_BASE = 0xac00, |
| HANGUL_LIMIT = 0xd7a4, |
| JAMO_LBASE = 0x1100, |
| JAMO_VBASE = 0x1161, |
| JAMO_TBASE = 0x11a7, |
| JAMO_LCOUNT = 19, |
| JAMO_VCOUNT = 21, |
| JAMO_TCOUNT = 28, |
| JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT |
| }; |
| |
| friend class ComposedCharIter; |
| }; |
| |
| inline UBool |
| Normalizer::operator!= (const Normalizer& other) const |
| { return ! operator==(other); } |
| |
| #endif // _NORMLZR |