| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 1999-2000, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: utf.h |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 1999sep09 |
| * created by: Markus W. Scherer |
| * |
| * This file defines the UChar and UChar32 data types for Unicode code units |
| * and code points, as well as macros for efficiently getting code points |
| * in and out of a string. |
| * utf.h is included by utypes.h and itself includes the utfXX.h after some |
| * common definitions. Those files define the macros for each UTF-size. |
| */ |
| |
| #ifndef __UTF_H__ |
| #define __UTF_H__ |
| |
| /* |
| * ANSI C headers: |
| * stddef.h defines wchar_t |
| * limits.h defines CHAR_MAX |
| */ |
| #include <stddef.h> |
| #include <limits.h> |
| #include "unicode/umachine.h" |
| /* include the utfXX.h after the following definitions */ |
| |
| /* If there is no compiler option for the preferred UTF size, then default to UTF-16. */ |
| #ifndef UTF_SIZE |
| # define UTF_SIZE 16 |
| #endif |
| |
| #define U_SIZEOF_UCHAR (UTF_SIZE>>3) |
| |
| /* Do we have wchar.h on this platform? It is there on most platforms. */ |
| #ifndef U_HAVE_WCHAR_H |
| # define U_HAVE_WCHAR_H 1 |
| #endif |
| |
| /* U_SIZEOF_WCHAR_T==sizeof(wchar_t) (0 means it is not defined or autoconf could not set it) */ |
| #if U_SIZEOF_WCHAR_T==0 |
| # undef U_SIZEOF_WCHAR_T |
| # define U_SIZEOF_WCHAR_T 4 |
| #endif |
| |
| /* Define UChar32 to be compatible with wchar_t if possible. */ |
| #if U_SIZEOF_WCHAR_T==4 |
| typedef wchar_t UChar32; |
| #else |
| typedef uint32_t UChar32; |
| #endif |
| |
| /* Unicode string and array offset and index type */ |
| typedef int32_t UTextOffset; |
| |
| /* Specify which macro versions are the default ones - safe or fast. */ |
| #if !defined(UTF_SAFE) && !defined(UTF_STRICT) && !defined(UTF_UNSAFE) |
| # define UTF_SAFE |
| #endif |
| |
| /* internal definitions ----------------------------------------------------- */ |
| |
| /* |
| * Special error values for UTF-8, |
| * which need 1 or 2 bytes in UTF-8: |
| * U+0015 = NAK = Negative Acknowledge, C0 control character |
| * U+009f = highest C1 control character |
| * |
| * These are used by ("safe") UTF-8 macros so that they can return an error value |
| * that needs the same number of code units (bytes) as were seen by |
| * a macro. |
| */ |
| #define UTF8_ERROR_VALUE_1 0x15 |
| #define UTF8_ERROR_VALUE_2 0x9f |
| |
| /* error value for all UTFs */ |
| #define UTF_ERROR_VALUE 0xffff |
| |
| /* single-code point definitions -------------------------------------------- */ |
| |
| /* is this code unit or code point a surrogate? */ |
| #define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800) |
| |
| /* |
| * Is a given 32-bit code point/Unicode scalar value |
| * actually a valid Unicode (abstract) character? |
| */ |
| #define UTF_IS_UNICODE_CHAR(c) \ |
| ((uint32_t)(c)<=0x10ffff && \ |
| !UTF_IS_SURROGATE(c) && ((c)&0xfffe)!=0xfffe) |
| |
| /* |
| * Is a given 32-bit code an error value |
| * as returned by one of the macros for any UTF? |
| */ |
| #define UTF_IS_ERROR(c) \ |
| (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) |
| |
| /* This is a combined macro: is c a valid Unicode value _and_ not an error code? */ |
| #define UTF_IS_VALID(c) \ |
| ((uint32_t)(c)<=0x10ffff && \ |
| !UTF_IS_SURROGATE(c) && \ |
| ((c)&0xfffe)!=0xfffe && \ |
| (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2) |
| |
| /* include the utfXX.h ------------------------------------------------------ */ |
| |
| #include "unicode/utf8.h" |
| #include "unicode/utf16.h" |
| #include "unicode/utf32.h" |
| |
| /* Define types and macros according to the selected UTF size. -------------- */ |
| |
| #if UTF_SIZE==8 |
| |
| # error UTF-8 is not implemented, undefine UTF_SIZE or define it to 16 |
| |
| /* Define UChar to be compatible with char if possible. */ |
| # if CHAR_MAX>=255 |
| typedef char UChar; |
| # else |
| typedef uint8_t UChar; |
| # endif |
| |
| #elif UTF_SIZE==16 |
| |
| /* Define UChar to be compatible with wchar_t if possible. */ |
| # if U_SIZEOF_WCHAR_T==2 |
| typedef wchar_t UChar; |
| # else |
| typedef uint16_t UChar; |
| # endif |
| |
| # define UTF_IS_SINGLE(uchar) UTF16_IS_SINGLE(uchar) |
| # define UTF_IS_LEAD(uchar) UTF16_IS_LEAD(uchar) |
| # define UTF_IS_TRAIL(uchar) UTF16_IS_TRAIL(uchar) |
| |
| # define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c) |
| # define UTF_CHAR_LENGTH(c) UTF16_CHAR_LENGTH(c) |
| # define UTF_MAX_CHAR_LENGTH UTF16_MAX_CHAR_LENGTH |
| # define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size) |
| |
| # define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c) |
| # define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) |
| |
| # define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c) |
| # define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) |
| |
| # define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c) |
| # define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) |
| |
| # define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i) |
| # define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length) |
| |
| # define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n) |
| # define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n) |
| |
| # define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i) |
| # define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i) |
| |
| # define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c) |
| # define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) |
| |
| # define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i) |
| # define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i) |
| |
| # define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n) |
| # define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n) |
| |
| # define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) |
| # define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) |
| |
| #elif UTF_SIZE==32 |
| |
| # error UTF-32 is not implemented, undefine UTF_SIZE or define it to 16 |
| |
| typedef UChar32 UChar; |
| |
| #else |
| # error UTF_SIZE must be undefined or one of { 8, 16, 32 } - only 16 is implemented |
| #endif |
| |
| /* Define the default macros for handling UTF characters. ------------------- */ |
| |
| #ifdef UTF_SAFE |
| |
| # define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, FALSE) |
| |
| # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, FALSE) |
| # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c) |
| # define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length) |
| # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n) |
| # define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i) |
| |
| # define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, FALSE) |
| # define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i) |
| # define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n) |
| # define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) |
| |
| #elif defined(UTF_STRICT) |
| |
| # define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, TRUE) |
| |
| # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, TRUE) |
| # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c) |
| # define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length) |
| # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n) |
| # define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i) |
| |
| # define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, TRUE) |
| # define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i) |
| # define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n) |
| # define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) |
| |
| #else /* UTF_UNSAFE */ |
| |
| # define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c) |
| |
| # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_UNSAFE(s, i, c) |
| # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_UNSAFE(s, i, c) |
| # define UTF_FWD_1(s, i, length) UTF_FWD_1_UNSAFE(s, i) |
| # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_UNSAFE(s, i, n) |
| # define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_UNSAFE(s, i) |
| |
| # define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_UNSAFE(s, i, c) |
| # define UTF_BACK_1(s, start, i) UTF_BACK_1_UNSAFE(s, i) |
| # define UTF_BACK_N(s, start, i, n) UTF_BACK_N_UNSAFE(s, i, n) |
| # define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i) |
| |
| #endif |
| |
| #endif |