source/i18n/name2uni.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 2001, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   06/07/01    aliu        Creation.
 **********************************************************************
 */

 #include "name2uni.h"
 #include "unicode/unifilt.h"
 #include "unicode/uchar.h"


 // As of Unicode 3.0.0, the longest name is 83 characters long.
 #define LONGEST_NAME 83

 U_NAMESPACE_BEGIN

 const char NameUnicodeTransliterator::_ID[] = "Name-Any";

 /**
  * Creates a transliterator that uses the supplied pair of delimiters.
  *
  * @param openDelim     code point stored as the opening delimiter
  * @param closeDelim    code point stored as the closing delimiter
  * @param adoptedFilter filter forwarded, together with the class ID,
  *                      to the Transliterator base-class constructor
  */
 NameUnicodeTransliterator::NameUnicodeTransliterator(UChar32 openDelim,
                                                      UChar32 closeDelim,
                                                      UnicodeFilter* adoptedFilter)
     : Transliterator(_ID, adoptedFilter),
       openDelimiter(openDelim),
       closeDelimiter(closeDelim)
 {
     // All state is set up by the initializer list.
 }

 /**
  * Creates a transliterator that uses the default delimiters,
  * U+007B '{' to open and U+007D '}' to close.
  *
  * @param adoptedFilter filter forwarded, together with the class ID,
  *                      to the Transliterator base-class constructor
  */
 NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter)
     : Transliterator(_ID, adoptedFilter),
       openDelimiter((UChar) 0x007B),   // '{'
       closeDelimiter((UChar) 0x007D)   // '}'
 {
     // All state is set up by the initializer list.
 }

 /**
  * Destructor.  This class performs no cleanup of its own here.
  */
 NameUnicodeTransliterator::~NameUnicodeTransliterator()
 {
 }

 /**
  * Copy constructor.  Copies the base-class state and both delimiters
  * from the given transliterator.
  *
  * @param o the transliterator to copy
  */
 NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o)
     : Transliterator(o),
       openDelimiter(o.openDelimiter),
       closeDelimiter(o.closeDelimiter)
 {
 }

 /**
  * Assignment operator.  Assigns the base-class state and then both
  * delimiters from the given transliterator.
  *
  * @param o the transliterator to copy from
  * @return a reference to this object
  */
 NameUnicodeTransliterator&
 NameUnicodeTransliterator::operator=(const NameUnicodeTransliterator& o)
 {
     // Let the base class assign its own state first.
     Transliterator::operator=(o);

     // Then the two fields owned by this class.
     closeDelimiter = o.closeDelimiter;
     openDelimiter = o.openDelimiter;

     return *this;
 }

 /**
  * Transliterator API.  Returns a newly allocated copy of this object,
  * made with the copy constructor.
  *
  * @return pointer to the new copy
  */
 Transliterator* NameUnicodeTransliterator::clone(void) const
 {
     Transliterator* duplicate = new NameUnicodeTransliterator(*this);
     return duplicate;
 }

 /**
  * Implements {@link Transliterator#handleTransliterate}.
  *
  * Scans the text from offsets.start up to offsets.limit for sequences
  * of the form openDelimiter, name, closeDelimiter.  Inside the
  * delimiters only '-', A-Z, 0-9 and whitespace are accepted; leading
  * whitespace is dropped, each run of whitespace becomes one space and
  * a trailing space is removed.  The collected name is passed to
  * u_charFromName(); if that succeeds, the whole delimited sequence is
  * replaced by the resulting code point.  If the lookup fails, or the
  * name is invalid or too long, the text is left unchanged.
  *
  * @param text          the text being scanned and modified in place
  * @param offsets       start/limit of the range to process; on return
  *                      limit and contextLimit are shifted by the change
  *                      in text length and start is moved forward
  * @param isIncremental if true and the scan ends inside an unclosed
  *                      name, offsets.start is left at the open
  *                      delimiter instead of the end of the scan
  */
 void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                     UBool isIncremental) const {
     // Accommodate the longest possible name plus padding
     UChar buf[LONGEST_NAME + 8];
     char cbuf[LONGEST_NAME + 8]; // Default converter

     // The only characters used in names are (as of Unicode 3.0.0):
     //  -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
     // (first character is a space).

     int32_t cursor = offsets.start;
     int32_t limit = offsets.limit;

     // Modes:
     // 0 - looking for open delimiter
     // 1 - after open delimiter
     int32_t mode = 0;
     int32_t ibuf = 0;
     int32_t openPos = offsets.start; // position of openDelimiter

     UnicodeString str;

     UChar32 c;
     // Note: every 'continue' and 'break' below still runs the loop
     // increment, which advances by the code unit length of c.
     for (; cursor < limit; cursor+=UTF_CHAR_LENGTH(c)) {
         c = text.char32At(cursor);

         switch (mode) {
         case 0: // looking for open delimiter
             if (c == openDelimiter) {
                 openPos = cursor;
                 mode = 1;
                 ibuf = 0;
             }
             break;

         case 1: // after open delimiter
             // Look for [-A-Z0-9].  If \s+ is found, convert it
             // to a single space.  If closeDelimiter is found, try
             // to look up the name.  If any other character is found,
             // abandon the name and go back to mode 0.
             if (u_isWhitespace(c)) {
                 // Ignore leading whitespace
                 if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {
                     buf[ibuf++] = (UChar)0x0020 /* */;
                     // If we go a bit past the longest possible name then abort
                     if (ibuf == (LONGEST_NAME + 4)) {
                         mode = 0;
                     }
                 }
                 continue;
             }

             if (c == closeDelimiter) {
                 // Delete trailing space, if any
                 if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {
                     --ibuf;
                 }
                 buf[ibuf] = 0; // Add terminating zero
                 UErrorCode status = U_ZERO_ERROR;

                 // Convert UChar to char
                 u_UCharsToChars(buf, cbuf, ibuf+1);

                 UChar32 ch = u_charFromName(U_UNICODE_CHAR_NAME, cbuf, &status);
                 if (ch != (UChar32) 0xFFFF && U_SUCCESS(status)) {
                     // Lookup succeeded

                     // closeDelimiter is a UChar32, so it may occupy
                     // two code units; do not assume a length of 1.
                     int32_t closeLen = UTF_CHAR_LENGTH(c);

                     str.truncate(0);
                     str.append(ch);
                     text.handleReplaceBetween(openPos, cursor + closeLen, str);

                     // Adjust indices for the change in the length of
                     // the string.  Do not assume that str.length() ==
                     // 1, in case of surrogates.
                     int32_t delta = cursor + closeLen - openPos - str.length();
                     cursor -= delta;
                     limit -= delta;
                     // Once the loop increment has added closeLen:
                     // assert(cursor == openPos + str.length());
                 }
                 // If the lookup failed, we leave things as-is and
                 // still switch to mode 0 and continue.
                 mode = 0;
                 continue;
             }

             //if (c >= (UChar)0x0061 && c <= (UChar)0x007A) {
             //    c -= 0x0020; // [a-z] => [A-Z]
             //}

             // Check if c =~ [-A-Z0-9]
             if (c == (UChar)0x002D ||
                 (c >= (UChar)0x0041 && c <= (UChar)0x005A) ||
                 (c >= (UChar)0x0030 && c <= (UChar)0x0039)) {
                 buf[ibuf++] = (UChar) c;
                 // If we go a bit past the longest possible name then abort
                 if (ibuf == (LONGEST_NAME + 4)) {
                     mode = 0;
                 }
             }

             // Invalid character
             else {
                 // Backup and reprocess this character in mode 0.  Back
                 // up by the full length of c so that the loop increment
                 // lands on c again even when c is a surrogate pair.
                 cursor -= UTF_CHAR_LENGTH(c);
                 mode = 0;
             }

             break;
         }
     }

     offsets.contextLimit += limit - offsets.limit;
     offsets.limit = limit;
     // In incremental mode, only advance the cursor up to the last
     // open delimiter, if we are in mode 1.
     offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
 }

 U_NAMESPACE_END
	/*
	**********************************************************************
	* Copyright (C) 2001, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 06/07/01 aliu Creation.
	**********************************************************************
	*/

	#include "name2uni.h"
	#include "unicode/unifilt.h"
	#include "unicode/uchar.h"


	// As of Unicode 3.0.0, the longest name is 83 characters long.
	#define LONGEST_NAME 83

	U_NAMESPACE_BEGIN

	const char NameUnicodeTransliterator::_ID[] = "Name-Any";

	/**
	 * Creates a transliterator that uses the supplied pair of delimiters.
	 *
	 * @param openDelim     code point stored as the opening delimiter
	 * @param closeDelim    code point stored as the closing delimiter
	 * @param adoptedFilter filter forwarded, together with the class ID,
	 *                      to the Transliterator base-class constructor
	 */
	NameUnicodeTransliterator::NameUnicodeTransliterator(UChar32 openDelim,
	                                                     UChar32 closeDelim,
	                                                     UnicodeFilter* adoptedFilter)
	    : Transliterator(_ID, adoptedFilter),
	      openDelimiter(openDelim),
	      closeDelimiter(closeDelim)
	{
	    // All state is set up by the initializer list.
	}

	/**
	 * Constructs a transliterator with the default delimiters,
	 * U+007B '{' to open and U+007D '}' to close.
	 *
	 * @param adoptedFilter filter forwarded, together with the class ID,
	 *                      to the Transliterator base-class constructor
	 */
	NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
	    Transliterator(_ID, adoptedFilter),
	    openDelimiter((UChar) 0x007B /*{*/),
	    closeDelimiter((UChar) 0x007D /*}*/) {
	}

	/**
	 * Destructor.  This class performs no cleanup of its own here.
	 */
	NameUnicodeTransliterator::~NameUnicodeTransliterator()
	{
	}

	/**
	 * Copy constructor.  Copies the base-class state and both delimiters
	 * from the given transliterator.
	 *
	 * @param o the transliterator to copy
	 */
	NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o)
	    : Transliterator(o),
	      openDelimiter(o.openDelimiter),
	      closeDelimiter(o.closeDelimiter)
	{
	}

	/**
	 * Assignment operator.  Assigns the base-class state and then both
	 * delimiters from the given transliterator.
	 *
	 * @param o the transliterator to copy from
	 * @return a reference to this object
	 */
	NameUnicodeTransliterator&
	NameUnicodeTransliterator::operator=(const NameUnicodeTransliterator& o)
	{
	    // Let the base class assign its own state first.
	    Transliterator::operator=(o);

	    // Then the two fields owned by this class.
	    closeDelimiter = o.closeDelimiter;
	    openDelimiter = o.openDelimiter;

	    return *this;
	}

	/**
	 * Transliterator API.  Returns a newly allocated copy of this object,
	 * made with the copy constructor.
	 *
	 * @return pointer to the new copy
	 */
	Transliterator* NameUnicodeTransliterator::clone(void) const
	{
	    Transliterator* duplicate = new NameUnicodeTransliterator(*this);
	    return duplicate;
	}

	/**
	 * Implements {@link Transliterator#handleTransliterate}.
	 *
	 * Scans the text from offsets.start up to offsets.limit for sequences
	 * of the form openDelimiter, name, closeDelimiter.  Inside the
	 * delimiters only '-', A-Z, 0-9 and whitespace are accepted; leading
	 * whitespace is dropped, each run of whitespace becomes one space and
	 * a trailing space is removed.  The collected name is passed to
	 * u_charFromName(); if that succeeds, the whole delimited sequence is
	 * replaced by the resulting code point.  If the lookup fails, or the
	 * name is invalid or too long, the text is left unchanged.
	 *
	 * @param text          the text being scanned and modified in place
	 * @param offsets       start/limit of the range to process; on return
	 *                      limit and contextLimit are shifted by the change
	 *                      in text length and start is moved forward
	 * @param isIncremental if true and the scan ends inside an unclosed
	 *                      name, offsets.start is left at the open
	 *                      delimiter instead of the end of the scan
	 */
	void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	                                                    UBool isIncremental) const {
	    // Accommodate the longest possible name plus padding
	    UChar buf[LONGEST_NAME + 8];
	    char cbuf[LONGEST_NAME + 8]; // Default converter

	    // The only characters used in names are (as of Unicode 3.0.0):
	    //  -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
	    // (first character is a space).

	    int32_t cursor = offsets.start;
	    int32_t limit = offsets.limit;

	    // Modes:
	    // 0 - looking for open delimiter
	    // 1 - after open delimiter
	    int32_t mode = 0;
	    int32_t ibuf = 0;
	    int32_t openPos = offsets.start; // position of openDelimiter

	    UnicodeString str;

	    UChar32 c;
	    // Note: every 'continue' and 'break' below still runs the loop
	    // increment, which advances by the code unit length of c.
	    for (; cursor < limit; cursor+=UTF_CHAR_LENGTH(c)) {
	        c = text.char32At(cursor);

	        switch (mode) {
	        case 0: // looking for open delimiter
	            if (c == openDelimiter) {
	                openPos = cursor;
	                mode = 1;
	                ibuf = 0;
	            }
	            break;

	        case 1: // after open delimiter
	            // Look for [-A-Z0-9].  If \s+ is found, convert it
	            // to a single space.  If closeDelimiter is found, try
	            // to look up the name.  If any other character is found,
	            // abandon the name and go back to mode 0.
	            if (u_isWhitespace(c)) {
	                // Ignore leading whitespace
	                if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {
	                    buf[ibuf++] = (UChar)0x0020 /* */;
	                    // If we go a bit past the longest possible name then abort
	                    if (ibuf == (LONGEST_NAME + 4)) {
	                        mode = 0;
	                    }
	                }
	                continue;
	            }

	            if (c == closeDelimiter) {
	                // Delete trailing space, if any
	                if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {
	                    --ibuf;
	                }
	                buf[ibuf] = 0; // Add terminating zero
	                UErrorCode status = U_ZERO_ERROR;

	                // Convert UChar to char
	                u_UCharsToChars(buf, cbuf, ibuf+1);

	                UChar32 ch = u_charFromName(U_UNICODE_CHAR_NAME, cbuf, &status);
	                if (ch != (UChar32) 0xFFFF && U_SUCCESS(status)) {
	                    // Lookup succeeded

	                    // closeDelimiter is a UChar32, so it may occupy
	                    // two code units; do not assume a length of 1.
	                    int32_t closeLen = UTF_CHAR_LENGTH(c);

	                    str.truncate(0);
	                    str.append(ch);
	                    text.handleReplaceBetween(openPos, cursor + closeLen, str);

	                    // Adjust indices for the change in the length of
	                    // the string.  Do not assume that str.length() ==
	                    // 1, in case of surrogates.
	                    int32_t delta = cursor + closeLen - openPos - str.length();
	                    cursor -= delta;
	                    limit -= delta;
	                    // Once the loop increment has added closeLen:
	                    // assert(cursor == openPos + str.length());
	                }
	                // If the lookup failed, we leave things as-is and
	                // still switch to mode 0 and continue.
	                mode = 0;
	                continue;
	            }

	            //if (c >= (UChar)0x0061 && c <= (UChar)0x007A) {
	            //    c -= 0x0020; // [a-z] => [A-Z]
	            //}

	            // Check if c =~ [-A-Z0-9]
	            if (c == (UChar)0x002D ||
	                (c >= (UChar)0x0041 && c <= (UChar)0x005A) ||
	                (c >= (UChar)0x0030 && c <= (UChar)0x0039)) {
	                buf[ibuf++] = (UChar) c;
	                // If we go a bit past the longest possible name then abort
	                if (ibuf == (LONGEST_NAME + 4)) {
	                    mode = 0;
	                }
	            }

	            // Invalid character
	            else {
	                // Backup and reprocess this character in mode 0.  Back
	                // up by the full length of c so that the loop increment
	                // lands on c again even when c is a surrogate pair.
	                cursor -= UTF_CHAR_LENGTH(c);
	                mode = 0;
	            }

	            break;
	        }
	    }

	    offsets.contextLimit += limit - offsets.limit;
	    offsets.limit = limit;
	    // In incremental mode, only advance the cursor up to the last
	    // open delimiter, if we are in mode 1.
	    offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
	}

	U_NAMESPACE_END