| /******************************************************************************* |
| * |
| * Copyright (C) 1998-1999, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| * |
| * Change history: |
| * |
| * 06/29/2000 helena Major rewrite of the callback APIs. |
| *******************************************************************************/ |
| |
| #ifndef CONVERT_H |
| #define CONVERT_H |
| |
| |
| #include "unicode/unistr.h" |
| #include "unicode/ucnv.h" |
| /** |
| * UnicodeConverter is a C++ wrapper class for UConverter. |
| * You need one UnicodeConverter object in place of one UConverter object. |
| * For details on the API and implementation of the |
| * codepage converter interface, see ucnv.h. |
| * |
| * @see UConverter |
| * @stable |
| */ |
| |
| class U_COMMON_API UnicodeConverter |
| { |
| private: |
| /* Handle to the underlying C converter (UConverter) that this object wraps. */ |
| UConverter* myUnicodeConverter; |
| /* Debug method. */ |
| void printRef(void) const; |
| |
| /* List of converter and alias names (and, presumably, its element count). */ |
| static const char **availableConverterNames; |
| static int32_t availableConverterNamesCount; |
| |
| public: |
| |
| //Constructors and a destructor |
| |
| /** |
| * Creates a Unicode conversion object using the default encoding, which the |
| * original documentation gives as LATIN1. |
| * This constructor takes no UErrorCode argument, so a failure cannot be |
| * reported through an error status parameter. |
| * @stable |
| */ |
| UnicodeConverter(); |
| |
| /** |
| * Creates a Unicode conversion object by specifying the codepage name. The name |
| * string is in ASCII format. |
| * @param name the pointer to a char[] object containing a codepage name. (I) |
| * @param err Error status (I/O). U_ILLEGAL_ARGUMENT_ERROR will be returned if the string is empty. |
| * If the internal program does not work correctly, for example, if there's no such codepage, |
| * U_INTERNAL_PROGRAM_ERROR will be returned. |
| * A constructor has no return value; inspect err to learn whether creation succeeded. |
| * @stable |
| */ |
| UnicodeConverter(const char* name, |
| UErrorCode& err); |
| |
| /** |
| * Creates a UnicodeConverter object with the name specified as a Unicode string. The name should be |
| * limited to the ASCII-7 alphanumerics. Dash and underscore characters are allowed for readability, |
| * but are ignored in the search. |
| * @param name name of the uconv table as a Unicode string (I) |
| * @param err error status (I/O). U_ILLEGAL_ARGUMENT_ERROR will be returned if the string is empty. |
| * If the internal program does not work correctly, for example, if there's no such codepage, |
| * U_INTERNAL_PROGRAM_ERROR will be returned. |
| * A constructor has no return value; inspect err to learn whether creation succeeded. |
| * @stable |
| */ |
| UnicodeConverter(const UnicodeString& name, |
| UErrorCode& err); |
| |
| /** |
| * Creates a Unicode conversion object using the codepage ID number. |
| * @param codepageNumber a codepage number (I) |
| * @param platform the platform the codepage number belongs to, as a UConverterPlatform value (I) |
| * @param err Error status (I/O). The original documentation lists U_ILLEGAL_ARGUMENT_ERROR for |
| * an invalid argument and U_INTERNAL_PROGRAM_ERROR if the internal program does not work |
| * correctly, for example, if there's no such codepage. |
| * A constructor has no return value; inspect err to learn whether creation succeeded. |
| * @stable |
| */ |
| UnicodeConverter(int32_t codepageNumber, |
| UConverterPlatform platform, |
| UErrorCode& err); |
| /** Destructor. */ |
| ~UnicodeConverter(); |
| |
| |
| /** |
| * Transcodes the source UnicodeString to the target string in a codepage encoding |
| * with the specified Unicode converter. For example, if a Unicode to/from JIS |
| * converter is specified, the source string in Unicode will be transcoded to JIS |
| * encoding. The result will be stored in JIS encoding. |
| * |
| * @param target the target string in codepage encoding |
| * @param targetSize I/O parameter. Input: the number of bytes available in the "target" buffer. |
| * Output: the number of bytes copied to it. |
| * @param source the source Unicode string |
| * @param err the error status code. U_MEMORY_ALLOCATION_ERROR will be returned if |
| * the internal process buffer cannot be allocated for transcoding. U_ILLEGAL_ARGUMENT_ERROR |
| * is returned if the converter is null or the source or target string is empty. |
| * @draft backslash versus Yen sign in shift-JIS |
| */ |
| void fromUnicodeString(char* target, |
| int32_t& targetSize, |
| const UnicodeString& source, |
| UErrorCode& err) const; |
| |
| /** |
| * Transcodes the source string in codepage encoding to the target string in |
| * Unicode encoding. For example, if a Unicode to/from JIS |
| * converter is specified, the source string in JIS encoding will be transcoded |
| * to Unicode encoding. The result will be stored in Unicode encoding. |
| * @param target the target string in Unicode encoding |
| * @param source the source string in codepage encoding |
| * @param sourceSize the size of the source string |
| * (NOTE(review): presumably a byte count - confirm against the implementation) |
| * @param err the error status code. U_MEMORY_ALLOCATION_ERROR will be returned if |
| * the internal process buffer cannot be allocated for transcoding. U_ILLEGAL_ARGUMENT_ERROR |
| * is returned if the converter is null or the source or target string is empty. |
| * @stable |
| */ |
| void toUnicodeString(UnicodeString& target, |
| const char* source, |
| int32_t sourceSize, |
| UErrorCode& err) const; |
| |
| /** |
| * Transcodes an array of Unicode characters to an array of codepage characters. |
| * The source pointer is an I/O parameter: it starts out pointing at the place |
| * to begin translating, and ends up pointing after the first sequence |
| * that it encounters that is semantically invalid. |
| * If the missing-character callback is set to an action other than STOP |
| * before a call is made to this API, consumed and source should point to the same place |
| * (unless target ends with an incomplete sequence of bytes and flush is FALSE). |
| * NOTE(review): the original text named T_UnicodeConverter_setMissingCharAction here; judging |
| * by the callback types declared in this class, the from-Unicode callback is the one installed |
| * with setMissingUnicodeAction - confirm. "consumed" is not a parameter of this signature. |
| * @param target I/O parameter. Input: points to the beginning of the buffer to copy |
| * codepage characters to. Output: points to after the last codepage character copied |
| * to target. |
| * @param targetLimit the pointer to the end of the target array |
| * @param source I/O parameter, the source Unicode character array (see above) |
| * @param sourceLimit the pointer to the end of the source array |
| * @param offsets not described by the original documentation; see ucnv.h for the |
| * meaning of the offsets array |
| * @param flush TRUE if the buffer is the last buffer and the conversion will finish |
| * in this call, FALSE otherwise. (future feature pending) |
| * @param err the error status. U_ILLEGAL_ARGUMENT_ERROR will be returned if the |
| * converter is null. |
| * @draft backslash versus Yen sign in shift-JIS |
| */ |
| void fromUnicode(char*& target, |
| const char* targetLimit, |
| const UChar*& source, |
| const UChar* sourceLimit, |
| int32_t * offsets, |
| UBool flush, |
| UErrorCode& err); |
| |
| |
| /** |
| * Converts an array of codepage characters into an array of Unicode characters. |
| * The source pointer is an I/O parameter: it starts out pointing at the place |
| * to begin translating, and ends up pointing after the first sequence of bytes |
| * that it encounters that is semantically invalid. |
| * If the missing-character callback is set to an action other than STOP |
| * before a call is made to this API, consumed and source should point to the same place |
| * (unless target ends with an incomplete sequence of bytes and flush is FALSE). |
| * NOTE(review): the original text named T_UnicodeConverter_setMissingUnicodeAction here; judging |
| * by the callback types declared in this class, the to-Unicode callback is the one installed |
| * with setMissingCharAction - confirm. "consumed" is not a parameter of this signature. |
| * @param target I/O parameter. Input: points to the beginning of the buffer to copy |
| * Unicode characters to. Output: points to after the last UChar copied to target. |
| * @param targetLimit the pointer to the end of the target array |
| * @param source I/O parameter, the source codepage character array (see above) |
| * @param sourceLimit the pointer to the end of the source array |
| * @param offsets not described by the original documentation; see ucnv.h for the |
| * meaning of the offsets array |
| * @param flush TRUE if the buffer is the last buffer and the conversion will finish |
| * in this call, FALSE otherwise. (future feature pending) |
| * @param err the error code status. U_ILLEGAL_ARGUMENT_ERROR will be returned if the |
| * converter is null, targetLimit < target, or sourceLimit < source. |
| * @stable |
| */ |
| void toUnicode(UChar*& target, |
| const UChar* targetLimit, |
| const char*& source, |
| const char* sourceLimit, |
| int32_t * offsets, |
| UBool flush, |
| UErrorCode& err); |
| |
| |
| /** |
| * Returns the maximum length in bytes used by a character. This varies between 1 and 4. |
| * @return the maximum number of bytes per codepage character |
| * @stable |
| */ |
| int8_t getMaxBytesPerChar(void) const; |
| |
| /** |
| * Returns the minimum byte length for characters in this codepage. This is either |
| * 1 or 2 for all supported codepages. |
| * @return the minimum number of bytes per codepage character |
| * @stable |
| */ |
| int8_t getMinBytesPerChar(void) const; |
| |
| /** |
| * Gets the type of conversion associated with the converter, |
| * e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022, EBCDIC_STATEFUL, LATIN_1. |
| * @return the type of the converter |
| * @stable |
| */ |
| UConverterType getType(void) const; |
| |
| /** |
| * Gets the "starter" bytes for converters of type MBCS. |
| * Fills in an array of booleans, with the value of the byte as offset into the array. |
| * On return, if TRUE is found at offset 0x20, it means that the byte 0x20 is a starter byte |
| * in this converter. |
| * @param starters an array of size 256 to be filled in |
| * @param err the error status; set to <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> if the converter |
| * is not MBCS |
| * @see ucnv_getType |
| * @stable |
| */ |
| void getStarters(UBool starters[256], |
| UErrorCode& err) const; |
| /** |
| * Fills in the output parameter, subChars, with the substitution characters |
| * as multiple bytes. |
| * @param subChars the substitution characters |
| * @param len the number of bytes of the substitution character array; passed by reference |
| * (NOTE(review): presumably updated to the number of bytes copied to subChars - confirm) |
| * @param err the error status code. U_ILLEGAL_ARGUMENT_ERROR will be returned if |
| * the converter is null. If the substitution character array is too small, an |
| * U_INDEX_OUTOFBOUNDS_ERROR will be returned. |
| * @stable |
| */ |
| void getSubstitutionChars(char* subChars, |
| int8_t& len, |
| UErrorCode& err) const; |
| /** |
| * Sets the substitution chars when converting from Unicode to a codepage. The |
| * substitution is specified as a string of 1-4 bytes, and may contain a null byte. |
| * The fill-in parameter err will get the error status on return. |
| * @param subChars the substitution character array to be set with |
| * @param len the number of bytes of the substitution character array (passed by value, |
| * so it is input-only) |
| * @param err the error status code. U_ILLEGAL_ARGUMENT_ERROR if the converter is |
| * null, or if the number of bytes provided is not in the codepage's range (e.g. length 1 for ucs-2) |
| * @stable |
| */ |
| void setSubstitutionChars(const char* subChars, |
| int8_t len, |
| UErrorCode& err); |
| |
| /** |
| * Resets the state of stateful conversion to the default state. This is used |
| * in the case of error to restart a conversion from a known default state. |
| * @stable |
| */ |
| void resetState(void); |
| |
| /** |
| * Gets the name of the converter (zero-terminated). |
| * The name will be the internal name of the converter. |
| * @param err the error status code. The original documentation mentions |
| * U_INDEX_OUTOFBOUNDS_ERROR for a name buffer that is too small, but this signature |
| * takes no caller-supplied buffer (NOTE(review): confirm which errors can still occur). |
| * @return the converter name |
| * @stable |
| */ |
| const char* getName( UErrorCode& err) const; |
| |
| |
| /** |
| * Gets a codepage number associated with the converter. This is not guaranteed |
| * to be the one used to create the converter. Some converters do not represent |
| * IBM registered codepages and return zero for the codepage number. |
| * The error code fill-in parameter indicates if the codepage number is available. |
| * @param err the error status code. U_ILLEGAL_ARGUMENT_ERROR will be returned if |
| * the converter is null or if the converter's data table is null. |
| * @return the codepage number. The original documentation says "null" is returned if any |
| * error occurs (NOTE(review): presumably 0 for this int32_t return - confirm). |
| * @stable |
| */ |
| int32_t getCodepage(UErrorCode& err) const; |
| |
| /** |
| * Returns the current setting action taken when a character from a codepage |
| * is missing or a byte sequence is illegal etc. |
| * @param action receives the callback function pointer |
| * @param context receives the callback function state |
| * @stable |
| */ |
| void getMissingCharAction(UConverterToUCallback *action, |
| void **context) const; |
| |
| /** |
| * Returns the current setting action taken when a Unicode character is missing |
| * or there is an unpaired surrogate etc. |
| * @param action receives the callback function pointer |
| * @param context receives the callback function state |
| * @stable |
| */ |
| void getMissingUnicodeAction(UConverterFromUCallback *action, |
| void **context) const; |
| |
| /** |
| * Sets the current setting action taken when a character from a codepage is |
| * missing. (Currently STOP or SUBSTITUTE). |
| * @param newAction the action (toUnicode callback) to use if an equivalent codepage character is missing |
| * @param newContext the new toUnicode callback function state |
| * @param oldAction the original action, saved for later restoration. |
| * @param oldContext the old toUnicode callback function state |
| * @param err the error status code |
| * @stable |
| */ |
| void setMissingCharAction(UConverterToUCallback newAction, |
| void* newContext, |
| UConverterToUCallback *oldAction, |
| void** oldContext, |
| UErrorCode& err); |
| |
| /** |
| * Sets the current setting action taken when a Unicode character is missing. |
| * (Currently the action is either STOP or SUBSTITUTE; |
| * SKIP, CLOSEST_MATCH, ESCAPE_SEQ may be added in the future). |
| * @param newAction the action (fromUnicode callback) to use if an equivalent Unicode character is missing |
| * @param newContext the new fromUnicode callback function state |
| * @param oldAction the original action, saved for later restoration. |
| * @param oldContext the old fromUnicode callback function state |
| * @param err the error status code |
| * @stable |
| */ |
| void setMissingUnicodeAction(UConverterFromUCallback newAction, |
| void* newContext, |
| UConverterFromUCallback *oldAction, |
| void** oldContext, |
| UErrorCode& err); |
| /** |
| * Fills displayName with the localized name of the UnicodeConverter. If for any |
| * reason the localized name is unavailable, the internal name is used instead. |
| * @param displayLocale the valid Locale for which the name should be localized |
| * @param displayName a UnicodeString that is going to be filled in. |
| * @stable |
| */ |
| void getDisplayName(const Locale& displayLocale, |
| UnicodeString& displayName) const; |
| |
| /** |
| * Returns the platform (a UConverterPlatform value, an ICU defined enum) of this |
| * UnicodeConverter. |
| * @param err the error code status |
| * @return the codepage's platform |
| * @stable |
| */ |
| UConverterPlatform getCodepagePlatform(UErrorCode& err) const; |
| |
| /* Copy assignment, equality/inequality comparison and copy construction from another UnicodeConverter. */ |
| UnicodeConverter& operator=(const UnicodeConverter& that); |
| UBool operator==(const UnicodeConverter& that) const; |
| UBool operator!=(const UnicodeConverter& that) const; |
| UnicodeConverter(const UnicodeConverter& that); |
| |
| /** |
| * Returns the available names. Lazily evaluated; the library owns the storage, |
| * so the caller must not free the returned array. |
| * @param num receives the number of available converters |
| * @param err the error code status |
| * @return the name array |
| * @stable |
| */ |
| static const char* const* getAvailableNames(int32_t& num, |
| UErrorCode& err); |
| |
| /** |
| * Iterates through every cached converter and frees all the unused ones. |
| * @return the number of cached converters successfully deleted |
| * @stable |
| */ |
| static int32_t flushCache(void); |
| /** |
| * Fixes the backslash character mismapping. For example, in SJIS, the backslash |
| * character in the ASCII portion is also used to represent the yen currency sign. |
| * When mapping from Unicode character 0x005C, it's unclear whether to map the |
| * character back to yen or backslash in SJIS. This function will take the input |
| * buffer and replace all the yen sign characters with backslash. This is necessary |
| * when the user tries to open a file with the input buffer on Windows. |
| * @param source the input buffer to be fixed (modified in place) |
| * @draft |
| */ |
| void fixFileSeparator(UnicodeString& source) const; |
| |
| /** |
| * Determines whether the converter contains ambiguous mappings of the same |
| * character. |
| * @return TRUE if the converter contains ambiguous mappings of the same |
| * character, FALSE otherwise. |
| * @draft |
| */ |
| UBool isAmbiguous(void) const; |
| |
| }; |
| |
| typedef UnicodeConverter UnicodeConverterCPP; /* Alias of UnicodeConverter kept for backwards compatibility with the UnicodeConverterCPP name. */ |
| |
| #endif |