// source/common/unicode.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 *******************************************************************************
 * Copyright (C) 1996-1999, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
 //  FILE NAME : unicode.cpp
 //
 //  CREATED
 //      Wednesday, December 11, 1996
 //
 //  CHANGES
 //      Wednesday, February 4,  1998
 //      Changed logic in toUpperCase and toLowerCase in order
 //      to avoid 0xFFFF to be returned when receiving
 //      confusing Unichar  to lowercase or to uppercase
 //      (e.g. Letterlike symbols)
 //
 //  CHANGES BY
 //  Bertramd A. DAMIBA
 //
 //  CREATED BY
 //      Helena Shih
 //
 //  CHANGES
 //      Thursday, April 15, 1999
 //      Modified the definitions of all the functions
 //      C++ Wrappers for Unicode
 //  CHANGES BY
 //      Madhu Katragadda
 //   5/20/99     Madhu		Added the function u_getVersion()
 //  07/09/99     stephen        Added definition for {MIN,MAX}_VALUE
 //  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
 //********************************************************************************************

 #include "unicode/unicode.h"

 #include "unicode/uchar.h"


 // Lowest and highest UChar values published by this class.
 const UChar  Unicode::MIN_VALUE = 0x0000;
 const UChar  Unicode::MAX_VALUE = 0xFFFF;
 // Bounds of the radix range checked by digit() and forDigit().
 const int8_t Unicode::MIN_RADIX = 2;
 const int8_t Unicode::MAX_RADIX = 36;

 // Default constructor; the body is intentionally empty.
 Unicode::Unicode() {}

 // Copy constructor; the body is empty, so nothing is read from other.
 Unicode::Unicode(const Unicode& other) {}

 // Destructor; the body is intentionally empty.
 Unicode::~Unicode() {}

 // Assignment operator. Nothing is copied from other; the call simply
 // returns a reference to the object being assigned to.
 const Unicode& Unicode::operator=(const Unicode& other)
 {
     const Unicode& self = *this;
     return self;
 }

 // Reports whether ch is a lower case letter.
 // The answer is whatever u_islower() returns for ch.
 bool_t Unicode::isLowerCase(UChar ch)
 {
     const bool_t lower = u_islower(ch);
     return lower;
 }

 // Reports whether ch is an upper case letter.
 // The answer is whatever u_isupper() returns for ch.
 bool_t Unicode::isUpperCase(UChar ch)
 {
     const bool_t upper = u_isupper(ch);
     return upper;
 }

 // Reports whether ch is a title case letter (usually an upper case letter).
 // The answer is whatever u_istitle() returns for ch.
 bool_t Unicode::isTitleCase(UChar ch)
 {
     const bool_t title = u_istitle(ch);
     return title;
 }

 // Reports whether ch is a decimal digit.
 // The answer is whatever u_isdigit() returns for ch.
 bool_t Unicode::isDigit(UChar ch)
 {
     const bool_t decimalDigit = u_isdigit(ch);
     return decimalDigit;
 }

 // Reports whether ch is a Unicode character with an assigned character type.
 // The answer is whatever u_isdefined() returns for ch.
 bool_t Unicode::isDefined(UChar ch)
 {
     const bool_t assigned = u_isdefined(ch);
     return assigned;
 }


 // Returns the linguistic directionality of ch.
 // The value reported by u_charDirection() is cast to EDirectionProperty.
 Unicode::EDirectionProperty Unicode::characterDirection(UChar ch)
 {
     const EDirectionProperty direction = (EDirectionProperty)u_charDirection(ch);
     return direction;
 }

 // Returns the script associated with ch.
 // The value reported by u_charScript() is cast to EUnicodeScript.
 Unicode::EUnicodeScript Unicode::getScript(UChar ch)
 {
     const EUnicodeScript script = (EUnicodeScript)u_charScript(ch);
     return script;
 }

 // Reports whether ch is a base form character, i.e. one that can take a
 // diacritic. The answer is whatever u_isbase() returns for ch.
 bool_t Unicode::isBaseForm(UChar ch)
 {
     const bool_t baseForm = u_isbase(ch);
     return baseForm;
 }

 // Reports whether ch is a control character.
 // The answer is whatever u_iscntrl() returns for ch.
 bool_t Unicode::isControl(UChar ch)
 {
     const bool_t control = u_iscntrl(ch);
     return control;
 }

 // Reports whether ch is printable.
 // The answer is whatever u_isprint() returns for ch.
 bool_t Unicode::isPrintable(UChar ch)
 {
     const bool_t printable = u_isprint(ch);
     return printable;
 }

 // Reports whether ch is a letter.
 // The answer is whatever u_isalpha() returns for ch.
 bool_t Unicode::isLetter(UChar ch)
 {
     const bool_t letter = u_isalpha(ch);
     return letter;
 }

 // Reports whether ch may be the first character of a Java identifier.
 // The answer is whatever u_isJavaIDStart() returns for ch.
 bool_t Unicode::isJavaIdentifierStart(UChar ch)
 {
     const bool_t startsIdentifier = u_isJavaIDStart(ch);
     return startsIdentifier;
 }

 // Reports whether ch may appear in a Java identifier at a position other
 // than the first. The answer is whatever u_isJavaIDPart() returns for ch.
 bool_t Unicode::isJavaIdentifierPart(UChar ch)
 {
     const bool_t continuesIdentifier = u_isJavaIDPart(ch);
     return continuesIdentifier;
 }

 // Reports whether ch may be the first character of a Unicode identifier.
 // The answer is whatever u_isIDStart() returns for ch.
 bool_t Unicode::isUnicodeIdentifierStart(UChar ch)
 {
     const bool_t startsIdentifier = u_isIDStart(ch);
     return startsIdentifier;
 }

 // Reports whether ch may appear in a Unicode identifier at a position other
 // than the first. The answer is whatever u_isIDPart() returns for ch.
 bool_t Unicode::isUnicodeIdentifierPart(UChar ch)
 {
     const bool_t continuesIdentifier = u_isIDPart(ch);
     return continuesIdentifier;
 }

 // Reports whether ch is ignorable inside a Java or Unicode identifier.
 // The answer is whatever u_isIDIgnorable() returns for ch.
 bool_t Unicode::isIdentifierIgnorable(UChar ch)
 {
     const bool_t ignorable = u_isIDIgnorable(ch);
     return ignorable;
 }

 // Returns the lower case equivalent of ch, as computed by u_tolower().
 UChar Unicode::toLowerCase(UChar ch)
 {
     const UChar lowered = u_tolower(ch);
     return lowered;
 }

 // Returns the upper case equivalent of ch, as computed by u_toupper().
 UChar Unicode::toUpperCase(UChar ch)
 {
     const UChar raised = u_toupper(ch);
     return raised;
 }

 // Returns the title case equivalent of ch, as computed by u_totitle().
 UChar Unicode::toTitleCase(UChar ch)
 {
     const UChar titled = u_totitle(ch);
     return titled;
 }

 // Reports whether ch is a space character.
 // The answer is whatever u_isspace() returns for ch.
 bool_t Unicode::isSpaceChar(UChar ch)
 {
     const bool_t space = u_isspace(ch);
     return space;
 }

 // Determines if the specified character is white space according to ICU.
 //
 // Returns true when ch
 //   - has type SPACE_SEPARATOR (Zs) and is neither U+00A0 nor U+FEFF, or
 //   - has type LINE_SEPARATOR (Zl) or PARAGRAPH_SEPARATOR (Zp), or
 //   - lies in U+0009 - U+000D or U+001C - U+001F.
 // Returns false for every other character.
 bool_t
 Unicode::isWhitespace(UChar ch) {
     // TODO Move this implementation to C, and make this call the C
     //      implementation.
     // TODO Optional -- reimplement in terms of modified category
     //      code -- see Mark Davis's note (below).  If this is done,
     //      the implementation still must conform to the specified
     //      semantics.  That is, U+00A0 and U+FEFF must return false,
     //      and the ranges U+0009 - U+000D and U+001C - U+001F must
     //      return true.  Characters other than these in Zs, Zl, or Zp
     //      must return true.
     //
     // From Mark Davis:
     //| What we should do is to make sure that the special Cc characters like CR
     //| have either Zs, Zl, or Zp in the property database. We can then just call
     //| the equivalent of:
     //|
     //|  public static boolean isWhiteSpace(char ch) {
     //|   return ((1 << Character.getType(ch)) & WHITESPACE_MASK) != 0; }
     //|
     //| where WHITESPACE_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
     //|
     //| This is much faster code, since it just looks up the property value and
     //| does a couple of arithmetic operations to get the right answer.
     //
     // (We still have to make sure U+00A0 and U+FEFF are excluded, so the code
     //  might not be as simple as this. - aliu)

     const int8_t cat = Unicode::getType(ch);

     // Plain comparisons are used instead of shifted bit masks on purpose:
     // a mask containing int32_t(1) << 0x1F pushes a bit into the sign
     // position of a signed integer, and shifting by cat is undefined when
     // cat falls outside 0..31.  The comparisons give the same answer for
     // every valid category and character.
     return
         (cat == SPACE_SEPARATOR && ch != 0x00A0 && ch != 0xFEFF) ||
         cat == LINE_SEPARATOR ||
         cat == PARAGRAPH_SEPARATOR ||
         (ch >= 0x0009 && ch <= 0x000D) ||
         (ch >= 0x001C && ch <= 0x001F);
 }

 // Returns the character type (general category code) of ch, as reported
 // by u_charType().
 int8_t Unicode::getType(UChar ch)
 {
     const int8_t category = u_charType(ch);
     return category;
 }


 // Returns the table cell width of ch, as reported by u_charCellWidth().
 uint16_t Unicode::getCellWidth(UChar ch)
 {
     const uint16_t width = u_charCellWidth(ch);
     return width;
 }

 // Returns the digit value of ch, as reported by u_charDigitValue().
 int32_t Unicode::digitValue(UChar ch)
 {
     const int32_t value = u_charDigitValue(ch);
     return value;
 }

 // Returns the numeric value of ch when read as a digit in the given radix.
 //
 // The value comes from u_charDigitValue(); when that is negative, the
 // letters 'A'-'Z' and 'a'-'z' are mapped to 10..35.  The result is -1 when
 // radix lies outside [MIN_RADIX, MAX_RADIX] or when the value found is not
 // smaller than radix.
 int8_t
 Unicode::digit(UChar ch, int8_t radix) {
     // An unsupported radix can never produce a digit.
     if (radix < MIN_RADIX || radix > MAX_RADIX) {
         return -1;
     }

     int8_t result = (int8_t) u_charDigitValue(ch);
     if (result < 0) {
         // Not a digit character: fall back to letters standing for 10..35.
         if (ch >= (UChar)'A' && ch <= (UChar)'Z') {
             result = (int8_t)(ch - (UChar)'A' + 10);
         } else if (ch >= (UChar)'a' && ch <= (UChar)'z') {
             result = (int8_t)(ch - (UChar)'a' + 10);
         }
     }

     // Values that do not fit the radix are rejected.
     if (result >= radix) {
         return -1;
     }
     return result;
 }

 // Returns the character that represents digit in the given radix:
 // '0'-'9' for values below 10, lower case 'a' onwards for 10 and above.
 // Returns 0 when radix lies outside [MIN_RADIX, MAX_RADIX] or when digit
 // is not in [0, radix).
 UChar
 Unicode::forDigit(int32_t digit, int8_t radix) {
     if (radix < MIN_RADIX || radix > MAX_RADIX) {
         return (UChar)0;
     }
     if (digit < 0 || digit >= radix) {
         return (UChar)0;
     }
     if (digit < 10) {
         return (UChar)((UChar)'0' + digit);
     }
     return (UChar)((UChar)'a' + (digit - 10));
 }

 // Hands versionArray to u_getUnicodeVersion().
 // NOTE(review): presumably that call fills versionArray with the supported
 // Unicode version -- confirm against unicode/uchar.h.
 void Unicode::getUnicodeVersion(UVersionInfo versionArray) {
     u_getUnicodeVersion(versionArray);
 }
	/*
	*******************************************************************************
	* Copyright (C) 1996-1999, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*/
	// FILE NAME : unicode.cpp
	//
	// CREATED
	// Wednesday, December 11, 1996
	//
	// CHANGES
	// Wednesday, February 4, 1998
	// Changed logic in toUpperCase and toLowerCase in order
	// to avoid 0xFFFF to be returned when receiving
	// confusing Unichar to lowercase or to uppercase
	// (e.g. Letterlike symbols)
	//
	// CHANGES BY
	// Bertramd A. DAMIBA
	//
	// CREATED BY
	// Helena Shih
	//
	// CHANGES
	// Thursday, April 15, 1999
	// Modified the definitions of all the functions
	// C++ Wrappers for Unicode
	// CHANGES BY
	// Madhu Katragadda
	// 5/20/99 Madhu Added the function u_getVersion()
	// 07/09/99 stephen Added definition for {MIN,MAX}_VALUE
	// 11/22/99 aliu Added MIN_RADIX, MAX_RADIX, digit, forDigit
	//********************************************************************************************

	#include "unicode/unicode.h"

	#include "unicode/uchar.h"


	// Lowest and highest UChar values published by this class.
	const UChar  Unicode::MIN_VALUE = 0x0000;
	const UChar  Unicode::MAX_VALUE = 0xFFFF;
	// Bounds of the radix range checked by digit() and forDigit().
	const int8_t Unicode::MIN_RADIX = 2;
	const int8_t Unicode::MAX_RADIX = 36;

	// Default constructor; the body is intentionally empty.
	Unicode::Unicode() {}

	// Copy constructor; the body is empty, so nothing is read from other.
	Unicode::Unicode(const Unicode& other) {}

	// Destructor; the body is intentionally empty.
	Unicode::~Unicode() {}

	// Assignment operator. Nothing is copied from other; the call simply
	// returns a reference to the object being assigned to.
	const Unicode& Unicode::operator=(const Unicode& other)
	{
	    const Unicode& self = *this;
	    return self;
	}

	// Reports whether ch is a lower case letter.
	// The answer is whatever u_islower() returns for ch.
	bool_t Unicode::isLowerCase(UChar ch)
	{
	    const bool_t lower = u_islower(ch);
	    return lower;
	}

	// Reports whether ch is an upper case letter.
	// The answer is whatever u_isupper() returns for ch.
	bool_t Unicode::isUpperCase(UChar ch)
	{
	    const bool_t upper = u_isupper(ch);
	    return upper;
	}

	// Reports whether ch is a title case letter (usually an upper case letter).
	// The answer is whatever u_istitle() returns for ch.
	bool_t Unicode::isTitleCase(UChar ch)
	{
	    const bool_t title = u_istitle(ch);
	    return title;
	}

	// Reports whether ch is a decimal digit.
	// The answer is whatever u_isdigit() returns for ch.
	bool_t Unicode::isDigit(UChar ch)
	{
	    const bool_t decimalDigit = u_isdigit(ch);
	    return decimalDigit;
	}

	// Reports whether ch is a Unicode character with an assigned character type.
	// The answer is whatever u_isdefined() returns for ch.
	bool_t Unicode::isDefined(UChar ch)
	{
	    const bool_t assigned = u_isdefined(ch);
	    return assigned;
	}


	// Returns the linguistic directionality of ch.
	// The value reported by u_charDirection() is cast to EDirectionProperty.
	Unicode::EDirectionProperty Unicode::characterDirection(UChar ch)
	{
	    const EDirectionProperty direction = (EDirectionProperty)u_charDirection(ch);
	    return direction;
	}

	// Returns the script associated with ch.
	// The value reported by u_charScript() is cast to EUnicodeScript.
	Unicode::EUnicodeScript Unicode::getScript(UChar ch)
	{
	    const EUnicodeScript script = (EUnicodeScript)u_charScript(ch);
	    return script;
	}

	// Reports whether ch is a base form character, i.e. one that can take a
	// diacritic. The answer is whatever u_isbase() returns for ch.
	bool_t Unicode::isBaseForm(UChar ch)
	{
	    const bool_t baseForm = u_isbase(ch);
	    return baseForm;
	}

	// Reports whether ch is a control character.
	// The answer is whatever u_iscntrl() returns for ch.
	bool_t Unicode::isControl(UChar ch)
	{
	    const bool_t control = u_iscntrl(ch);
	    return control;
	}

	// Reports whether ch is printable.
	// The answer is whatever u_isprint() returns for ch.
	bool_t Unicode::isPrintable(UChar ch)
	{
	    const bool_t printable = u_isprint(ch);
	    return printable;
	}

	// Reports whether ch is a letter.
	// The answer is whatever u_isalpha() returns for ch.
	bool_t Unicode::isLetter(UChar ch)
	{
	    const bool_t letter = u_isalpha(ch);
	    return letter;
	}

	// Reports whether ch may be the first character of a Java identifier.
	// The answer is whatever u_isJavaIDStart() returns for ch.
	bool_t Unicode::isJavaIdentifierStart(UChar ch)
	{
	    const bool_t startsIdentifier = u_isJavaIDStart(ch);
	    return startsIdentifier;
	}

	// Reports whether ch may appear in a Java identifier at a position other
	// than the first. The answer is whatever u_isJavaIDPart() returns for ch.
	bool_t Unicode::isJavaIdentifierPart(UChar ch)
	{
	    const bool_t continuesIdentifier = u_isJavaIDPart(ch);
	    return continuesIdentifier;
	}

	// Reports whether ch may be the first character of a Unicode identifier.
	// The answer is whatever u_isIDStart() returns for ch.
	bool_t Unicode::isUnicodeIdentifierStart(UChar ch)
	{
	    const bool_t startsIdentifier = u_isIDStart(ch);
	    return startsIdentifier;
	}

	// Reports whether ch may appear in a Unicode identifier at a position other
	// than the first. The answer is whatever u_isIDPart() returns for ch.
	bool_t Unicode::isUnicodeIdentifierPart(UChar ch)
	{
	    const bool_t continuesIdentifier = u_isIDPart(ch);
	    return continuesIdentifier;
	}

	// Reports whether ch is ignorable inside a Java or Unicode identifier.
	// The answer is whatever u_isIDIgnorable() returns for ch.
	bool_t Unicode::isIdentifierIgnorable(UChar ch)
	{
	    const bool_t ignorable = u_isIDIgnorable(ch);
	    return ignorable;
	}

	// Returns the lower case equivalent of ch, as computed by u_tolower().
	UChar Unicode::toLowerCase(UChar ch)
	{
	    const UChar lowered = u_tolower(ch);
	    return lowered;
	}

	// Returns the upper case equivalent of ch, as computed by u_toupper().
	UChar Unicode::toUpperCase(UChar ch)
	{
	    const UChar raised = u_toupper(ch);
	    return raised;
	}

	// Returns the title case equivalent of ch, as computed by u_totitle().
	UChar Unicode::toTitleCase(UChar ch)
	{
	    const UChar titled = u_totitle(ch);
	    return titled;
	}

	// Reports whether ch is a space character.
	// The answer is whatever u_isspace() returns for ch.
	bool_t Unicode::isSpaceChar(UChar ch)
	{
	    const bool_t space = u_isspace(ch);
	    return space;
	}

	// Determines if the specified character is white space according to ICU.
	//
	// Returns true when ch
	//   - has type SPACE_SEPARATOR (Zs) and is neither U+00A0 nor U+FEFF, or
	//   - has type LINE_SEPARATOR (Zl) or PARAGRAPH_SEPARATOR (Zp), or
	//   - lies in U+0009 - U+000D or U+001C - U+001F.
	// Returns false for every other character.
	bool_t
	Unicode::isWhitespace(UChar ch) {
	    // TODO Move this implementation to C, and make this call the C
	    //      implementation.
	    // TODO Optional -- reimplement in terms of modified category
	    //      code -- see Mark Davis's note (below).  If this is done,
	    //      the implementation still must conform to the specified
	    //      semantics.  That is, U+00A0 and U+FEFF must return false,
	    //      and the ranges U+0009 - U+000D and U+001C - U+001F must
	    //      return true.  Characters other than these in Zs, Zl, or Zp
	    //      must return true.
	    //
	    // From Mark Davis:
	    //| What we should do is to make sure that the special Cc characters like CR
	    //| have either Zs, Zl, or Zp in the property database. We can then just call
	    //| the equivalent of:
	    //|
	    //|  public static boolean isWhiteSpace(char ch) {
	    //|   return ((1 << Character.getType(ch)) & WHITESPACE_MASK) != 0; }
	    //|
	    //| where WHITESPACE_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp);
	    //|
	    //| This is much faster code, since it just looks up the property value and
	    //| does a couple of arithmetic operations to get the right answer.
	    //
	    // (We still have to make sure U+00A0 and U+FEFF are excluded, so the code
	    //  might not be as simple as this. - aliu)

	    const int8_t cat = Unicode::getType(ch);

	    // Plain comparisons are used instead of shifted bit masks on purpose:
	    // a mask containing int32_t(1) << 0x1F pushes a bit into the sign
	    // position of a signed integer, and shifting by cat is undefined when
	    // cat falls outside 0..31.  The comparisons give the same answer for
	    // every valid category and character.
	    return
	        (cat == SPACE_SEPARATOR && ch != 0x00A0 && ch != 0xFEFF) ||
	        cat == LINE_SEPARATOR ||
	        cat == PARAGRAPH_SEPARATOR ||
	        (ch >= 0x0009 && ch <= 0x000D) ||
	        (ch >= 0x001C && ch <= 0x001F);
	}

	// Returns the character type (general category code) of ch, as reported
	// by u_charType().
	int8_t Unicode::getType(UChar ch)
	{
	    const int8_t category = u_charType(ch);
	    return category;
	}



	// Returns the table cell width of ch, as reported by u_charCellWidth().
	uint16_t Unicode::getCellWidth(UChar ch)
	{
	    const uint16_t width = u_charCellWidth(ch);
	    return width;
	}

	// Returns the digit value of ch, as reported by u_charDigitValue().
	int32_t Unicode::digitValue(UChar ch)
	{
	    const int32_t value = u_charDigitValue(ch);
	    return value;
	}

	// Returns the numeric value of ch when read as a digit in the given radix.
	//
	// The value comes from u_charDigitValue(); when that is negative, the
	// letters 'A'-'Z' and 'a'-'z' are mapped to 10..35.  The result is -1 when
	// radix lies outside [MIN_RADIX, MAX_RADIX] or when the value found is not
	// smaller than radix.
	int8_t
	Unicode::digit(UChar ch, int8_t radix) {
	    // An unsupported radix can never produce a digit.
	    if (radix < MIN_RADIX || radix > MAX_RADIX) {
	        return -1;
	    }

	    int8_t result = (int8_t) u_charDigitValue(ch);
	    if (result < 0) {
	        // Not a digit character: fall back to letters standing for 10..35.
	        if (ch >= (UChar)'A' && ch <= (UChar)'Z') {
	            result = (int8_t)(ch - (UChar)'A' + 10);
	        } else if (ch >= (UChar)'a' && ch <= (UChar)'z') {
	            result = (int8_t)(ch - (UChar)'a' + 10);
	        }
	    }

	    // Values that do not fit the radix are rejected.
	    if (result >= radix) {
	        return -1;
	    }
	    return result;
	}

	// Returns the character that represents digit in the given radix:
	// '0'-'9' for values below 10, lower case 'a' onwards for 10 and above.
	// Returns 0 when radix lies outside [MIN_RADIX, MAX_RADIX] or when digit
	// is not in [0, radix).
	UChar
	Unicode::forDigit(int32_t digit, int8_t radix) {
	    // Reject an unsupported radix or a digit that does not fit it.
	    if ((radix < MIN_RADIX) || (radix > MAX_RADIX) ||
	        (digit < 0) || (digit >= radix)) {
	        return (UChar)0;
	    }
	    // Below 10 the offset is from '0'; from 10 upwards it is from 'a'.
	    return (UChar)(((digit < 10) ? (UChar)'0' : ((UChar)'a' - 10))
	                   + digit);
	}

	// Hands versionArray to u_getUnicodeVersion().
	// NOTE(review): presumably that call fills versionArray with the supported
	// Unicode version -- confirm against unicode/uchar.h.
	void Unicode::getUnicodeVersion(UVersionInfo versionArray) {
	    u_getUnicodeVersion(versionArray);
	}