blob: 95f0c867564513d97a799ad6c5448329e8855c24 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (c) 1996-2001, International Business Machines Corporation
* and others. All Rights Reserved.
*******************************************************************************
* File unorm.h
*
* Created by: Vladimir Weinstein 12052000
*
* Modification history :
*
* Date Name Description
* 02/01/01 synwee Added normalization quickcheck enum and method.
*/
#ifndef UNORM_H
#define UNORM_H
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Unicode Normalization
*
* <h2>Unicode normalization API</h2>
*
* <code>unorm_normalize</code> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <code>unorm_normalize</code> supports the standard normalization forms described in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
*
* Characters with accents or other adornments can be encoded in
* several different ways in Unicode. For example, take the character A-acute.
* In Unicode, this can be encoded as a single character (the
* "composed" form):
*
* \code
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
* \endcode
*
* or as two separate characters (the "decomposed" form):
*
* \code
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT
* \endcode
*
* To a user of your program, however, both of these sequences should be
* treated as the same "user-level" character "A with acute accent". When you are searching or
* comparing text, you must ensure that these two sequences are treated
* equivalently. In addition, you must handle characters with more than one
* accent. Sometimes the order of a character's combining accents is
* significant, while in other cases accent sequences in different orders are
* really equivalent.
*
* Similarly, the string "ffi" can be encoded as three separate letters:
*
* \code
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I
* \endcode
*
* or as the single character
*
* \code
* FB03 LATIN SMALL LIGATURE FFI
* \endcode
*
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility
* with existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions
* into the corresponding semantic characters. When sorting and searching, you
* will often want to use these mappings.
*
* <code>unorm_normalize</code> helps solve these problems by transforming text into the
* canonical composed and decomposed forms as shown in the first example above.
* In addition, you can have it perform compatibility decompositions so that
* you can treat compatibility characters the same as their equivalents.
* Finally, <code>unorm_normalize</code> rearranges accents into the proper canonical
* order, so that you do not have to worry about accent rearrangement on your
* own.
*
* Form FCD, "Fast C or D", is also designed for collation.
* It allows to work on strings that are not necessarily normalized
* with an algorithm (like in collation) that works under "canonical closure", i.e., it treats precomposed
* characters and their decomposed equivalents the same.
*
* It is not a normalization form because it does not provide for uniqueness of representation. Multiple strings
* may be canonically equivalent (their NFDs are identical) and may all conform to FCD without being identical
* themselves.
*
* The form is defined such that the "raw decomposition", the recursive canonical decomposition of each character,
* results in a string that is canonically ordered. This means that precomposed characters are allowed for as long
* as their decompositions do not need canonical reordering.
*
* Its advantage for a process like collation is that all NFD and most NFC texts - and many unnormalized texts -
* already conform to FCD and do not need to be normalized (NFD) for such a process. The FCD quick check will
* return UNORM_YES for most strings in practice.
*
* unorm_normalize(UNORM_FCD) may be implemented with UNORM_NFD.
*
* For more details on FCD see the collation design document:
* http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
*
* ICU collation performs either NFD or FCD normalization automatically if normalization
* is turned on for the collator object.
* Beyond collation and string search, normalized strings may be useful for string equivalence comparisons,
* transliteration/transcription, unique representations, etc.
*
* The W3C generally recommends to exchange texts in NFC.
* Note also that most legacy character encodings use only precomposed forms and often do not
* encode any combining marks by themselves. For conversion to such character encodings the
* Unicode text needs to be normalized to NFC.
* For more usage examples, see the Unicode Standard Annex.
*/
/**
* Constants for normalization modes.
* @draft ICU 1.8
*/
typedef enum {
/** No decomposition/composition. @draft ICU 1.8 */
UNORM_NONE = 1,
/** Canonical decomposition. @draft ICU 1.8 */
UNORM_NFD = 2,
/** Compatibility decomposition. @draft ICU 1.8 */
UNORM_NFKD = 3,
/** Canonical decomposition followed by canonical composition. @draft ICU 1.8 */
UNORM_NFC = 4,
/** Default normalization. @draft ICU 1.8 */
UNORM_DEFAULT = UNORM_NFC,
/** Compatibility decomposition followed by canonical composition. @draft ICU 1.8 */
UNORM_NFKC =5,
/** "Fast C or D" form. @draft ICU 2.0 */
UNORM_FCD = 6,
/** One more than the highest normalization mode constant. @draft ICU 1.8 */
UNORM_MODE_COUNT,
/* *** The rest of this enum is entirely deprecated. *** */
/**
* No decomposition/composition
* @deprecated To be removed after 2002-sep-30, use UNORM_NONE.
*/
UCOL_NO_NORMALIZATION = 1,
/**
* Canonical decomposition
* @deprecated To be removed after 2002-sep-30, use UNORM_NFD.
*/
UCOL_DECOMP_CAN = 2,
/**
* Compatibility decomposition
* @deprecated To be removed after 2002-sep-30, use UNORM_NFKD.
*/
UCOL_DECOMP_COMPAT = 3,
/**
* Default normalization
* @deprecated To be removed after 2002-sep-30, use UNORM_NFKD or UNORM_DEFAULT.
*/
UCOL_DEFAULT_NORMALIZATION = UCOL_DECOMP_COMPAT,
/**
* Canonical decomposition followed by canonical composition
* @deprecated To be removed after 2002-sep-30, use UNORM_NFC.
*/
UCOL_DECOMP_CAN_COMP_COMPAT = 4,
/**
* Compatibility decomposition followed by canonical composition
* @deprecated To be removed after 2002-sep-30, use UNORM_NFKC.
*/
UCOL_DECOMP_COMPAT_COMP_CAN =5,
/**
* Do not normalize Hangul.
* @deprecated To be removed without replacement after 2002-mar-31.
*/
UCOL_IGNORE_HANGUL = 16,
/**
* Do not normalize Hangul.
* @deprecated To be removed without replacement after 2002-mar-31.
*/
UNORM_IGNORE_HANGUL = 16
} UNormalizationMode;
/**
* Normalize a string.
* The string will be normalized according the specified normalization mode
* and options (there are currently no options defined).
*
* @param source The string to normalize.
* @param sourceLength The length of source, or -1 if NUL-terminated.
* @param mode The normalization mode; one of UNORM_NONE,
* UNORM_NFD, UNORM_NFC, UNORM_NFKC, UNORM_NFKD, UNORM_DEFAULT.
* @param options The normalization options, ORed together (0 for no options);
* currently there is no option defined.
* @param result A pointer to a buffer to receive the result string.
* The result string is NUL-terminated if possible.
* @param resultLength The maximum size of result.
* @param status A pointer to a UErrorCode to receive any errors.
* @return The total buffer size needed; if greater than resultLength,
* the output was truncated, and the error code is set to U_BUFFER_OVERFLOW_ERROR.
* @stable
*/
U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar *source, int32_t sourceLength,
UNormalizationMode mode, int32_t options,
UChar *result, int32_t resultLength,
UErrorCode *status);
/**
* The function u_normalize() has been renamed to unorm_normalize()
* for consistency. The old name is deprecated.
* @deprecated To be removed after 2002-mar-31.
*/
#define u_normalize unorm_normalize
/**
* Result values for unorm_quickCheck().
* For details see Unicode Technical Report 15.
* @draft ICU 1.8
*/
typedef enum UNormalizationCheckResult {
/**
* Indicates that string is not in the normalized format
*/
UNORM_NO,
/**
* Indicates that string is in the normalized format
*/
UNORM_YES,
/**
* Indicates that string cannot be determined if it is in the normalized
* format without further thorough checks.
*/
UNORM_MAYBE
} UNormalizationCheckResult;
/**
* Performing quick check on a string, to quickly determine if the string is
* in a particular normalization format.
* Three types of result can be returned UNORM_YES, UNORM_NO or
* UNORM_MAYBE. Result UNORM_YES indicates that the argument
* string is in the desired normalized format, UNORM_NO determines that
* argument string is not in the desired normalized format. A
* UNORM_MAYBE result indicates that a more thorough check is required,
* the user may have to put the string in its normalized form and compare the
* results.
*
* @param source string for determining if it is in a normalized format
* @param sourcelength length of source to test, or -1 if NUL-terminated
* @paran mode which normalization form to test for
* @param status a pointer to a UErrorCode to receive any errors
* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
* @draft ICU 1.8
*/
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar *source, int32_t sourcelength,
UNormalizationMode mode,
UErrorCode *status);
#endif