source/i18n/hextouni.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 1999, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   11/17/99    aliu        Creation.
 **********************************************************************
 */
 #include "unicode/hextouni.h"
 #include "unicode/rep.h"
 #include "unicode/unifilt.h"
 #include "unicode/uchar.h"


 U_NAMESPACE_BEGIN

 /**
  * ID for this transliterator.  Every constructor in this file passes
  * it to the Transliterator base-class constructor.
  */
 const char HexToUnicodeTransliterator::_ID[] = "Hex-Any";

 /**
  * This pattern encodes the following specs for the default constructor:
  *   \\u0000
  *   \\U0000
  *   u+0000
  *   U+0000
  * The multiple backslashes resolve to a single backslash
  * in the effective prefix.
  *
  * Specs are separated by ';' (0x3B).  The last spec has no trailing
  * ';' because applyPattern() supplies a virtual one at the end of the
  * pattern.  The final 0 element ends the array (presumably read as a
  * NUL terminator when the array is converted to a UnicodeString --
  * confirm against the UnicodeString constructor).
  */
 const UChar HexToUnicodeTransliterator::DEFAULT_PATTERN[] = {
     0x5C, 0x5C, 0x75, 0x30, 0x30, 0x30, 0x30, 0x3B,  /* "\\u0000;" */
     0x5C, 0x5C, 0x55, 0x30, 0x30, 0x30, 0x30, 0x3B,  /* "\\U0000;" */
     0x75, 0x2B, 0x30, 0x30, 0x30, 0x30, 0x3B,        /* "u+0000;" */
     0x55, 0x2B, 0x30, 0x30, 0x30, 0x30, 0           /* "U+0000" */
 };  /* "\\u0000;\\U0000;u+0000;U+0000" */

 /**
  * Four placeholder characters.  applyPattern() appends them to
  * affixes to reserve room for a spec's four-character header and
  * overwrites them with the real header values when the spec's
  * terminating ';' is parsed, so the actual characters do not matter.
  */
 static const UChar gQuadA[] = {
     0x41, 0x41, 0x41, 0x41, 0
 };  /* "AAAA" */

 /**
  * Constructs a transliterator that recognizes the specs encoded in
  * DEFAULT_PATTERN.
  *
  * @param adoptedFilter  filter forwarded unchanged to the
  *                       Transliterator base-class constructor
  */
 HexToUnicodeTransliterator::HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
     Transliterator(_ID, adoptedFilter) {
     // DEFAULT_PATTERN is a fixed constant that is known to parse, so
     // the error code stays local and is never reported to the caller.
     UErrorCode ignoredStatus = U_ZERO_ERROR;
     applyPattern(DEFAULT_PATTERN, ignoredStatus);
 }

 /**
  * Constructs a transliterator from a pattern, passing 0 as the filter
  * to the Transliterator base-class constructor.
  *
  * @param thePattern  the spec pattern handed to applyPattern()
  * @param status      in/out error code, passed through to
  *                    applyPattern(); see that function for the
  *                    errors it can report
  */
 HexToUnicodeTransliterator::HexToUnicodeTransliterator(const UnicodeString& thePattern,
                                                        UErrorCode& status) :
     Transliterator(_ID, 0),
     // applyPattern() returns without touching any member when status
     // already indicates a failure on entry.  affixCount is the loop
     // bound used by handleTransliterate(), so it must never be left
     // without a value.
     affixCount(0) {
     applyPattern(thePattern, status);
 }

 /**
  * Constructs a transliterator from a pattern and a filter.
  *
  * @param thePattern     the spec pattern handed to applyPattern()
  * @param adoptedFilter  filter forwarded unchanged to the
  *                       Transliterator base-class constructor
  * @param status         in/out error code, passed through to
  *                       applyPattern(); see that function for the
  *                       errors it can report
  */
 HexToUnicodeTransliterator::HexToUnicodeTransliterator(const UnicodeString& thePattern,
                                                        UnicodeFilter* adoptedFilter,
                                                        UErrorCode& status) :
     Transliterator(_ID, adoptedFilter),
     // applyPattern() returns without touching any member when status
     // already indicates a failure on entry.  affixCount is the loop
     // bound used by handleTransliterate(), so it must never be left
     // without a value.
     affixCount(0) {
     applyPattern(thePattern, status);
 }

 /**
  * Copy constructor.  Copies the base-class state, then the source
  * pattern together with its compiled form (affixes and affixCount),
  * so the pattern is not parsed again.
  */
 HexToUnicodeTransliterator::HexToUnicodeTransliterator(
                                     const HexToUnicodeTransliterator& other) :
     Transliterator(other),
     pattern(other.pattern),
     affixes(other.affixes),
     affixCount(other.affixCount)
 {
 }

 /**
  * Assignment operator.  Assigns the base-class part first, then
  * copies the pattern together with its compiled form.
  *
  * @return a reference to this object
  */
 HexToUnicodeTransliterator& HexToUnicodeTransliterator::operator=(
                                              const HexToUnicodeTransliterator& other) {
     Transliterator::operator=(other);

     // The three members are independent of one another; copy the
     // compiled table first and the source pattern last.
     affixCount = other.affixCount;
     affixes    = other.affixes;
     pattern    = other.pattern;

     return *this;
 }

 /**
  * Transliterator API.  Returns a copy of this object created with
  * new through the copy constructor.
  */
 Transliterator* HexToUnicodeTransliterator::clone(void) const {
     HexToUnicodeTransliterator* duplicate = new HexToUnicodeTransliterator(*this);
     return duplicate;
 }

 /**
  * Parses a pattern and compiles it into the affixes spec table.
  *
  * The pattern is a list of specs separated by unescaped ';'.  Within
  * a spec, an unescaped '#' is an optional digit position, an
  * unescaped '0' is a required digit position, and every other
  * character (or any character preceded by a backslash) is a literal.
  * Literals before the digit positions form the prefix; literals after
  * them form the suffix.
  *
  * The previous pattern, affixes and affixCount are discarded before
  * parsing starts and are NOT restored if parsing fails.  After a
  * failure, pattern holds thePattern, affixCount counts only the specs
  * completed before the error, and affixes may end with a partially
  * written spec.
  *
  * @param thePattern  the pattern to parse
  * @param status      in/out error code.  If it already indicates a
  *                    failure, this function returns without doing
  *                    anything.  It is set to U_ILLEGAL_ARGUMENT_ERROR
  *                    for a trailing backslash, a '#' or '0' in the
  *                    wrong position, a spec with no required digit
  *                    or with more than four digit positions, or a
  *                    prefix or suffix longer than 0xFFFF characters.
  */
 void HexToUnicodeTransliterator::applyPattern(const UnicodeString& thePattern,
                                               UErrorCode& status) {
     if (U_FAILURE(status)) {
         return;
     }

     /* The pattern is processed and stored in affixes.  The pattern
      * consists of zero or more affixes.  Each affix is parsed to
      * determine the prefix, suffix, minimum digit count, and maximum
      * digit count.  These values are then stored as a four character
      * header.  That is, their numeric values are cast to UChars and
      * stored in the string.  Following these four characters, the prefix
      * characters, then suffix characters are stored.  Each spec takes
      * n+4 characters, where n is the total length of the prefix and
      * suffix.
      */

     // POSSIBLE FUTURE MODIFICATION
     // Parse thePattern, and if this succeeds, set pattern to thePattern.
     // If it fails, call applyPattern(pattern) to restore the original
     // conditions.

     pattern = thePattern;
     affixes.truncate(0);
     affixCount = 0;

     /* The mode specifies where we are in each spec.
      * mode 0 = in prefix
      * mode 1 = in optional digits (#)
      * mode 2 = in required digits (0)
      * mode 3 = in suffix
      */
     int32_t mode = 0;

     int32_t prefixLen = 0, suffixLen = 0, minDigits = 0, maxDigits = 0;
     // Index in affixes at which the current spec's header begins.
     int32_t start = 0;

     /* To make parsing easier, we append a virtual ';' at the end of
      * the pattern string, if there isn't one already.  When we get to
      * the index pattern.length() (that is, one past the end), we
      * create a virtual ';' if necessary.
      */
     UChar c = 0; // These are outside the loop so we can see the
     UBool isLiteral = FALSE; // previous character...
     for (int32_t i=0; i<=pattern.length(); ++i) {
         // Create the virtual trailing ';' if necessary
         if (i == pattern.length()) {
             // If the last character was not a non-literal ';'...
             if (i > 0 && !(c == SEMICOLON && !isLiteral)) {
                 c = SEMICOLON;
                 isLiteral = FALSE;
             } else {
                 // Empty pattern, or the pattern already ended with a
                 // real ';': nothing is left to terminate.
                 break;
             }
         } else {
             c = pattern.charAt(i);
             isLiteral = FALSE;
         }

         // A backslash makes the following character a literal and is
         // itself dropped; note that ++i consumes that character.
         if (c == BACKSLASH) {
             if ((i+1)<pattern.length()) {
                 isLiteral = TRUE;
                 c = pattern.charAt(++i);
             } else {
                 // Trailing '\\'
                 status = U_ILLEGAL_ARGUMENT_ERROR;
                 return;
             }
         }

         if (!isLiteral) {
             switch (c) {
             case POUND:
                 // Seeing a '#' moves us from mode 0 (prefix) to mode 1
                 // (optional digits).
                 if (mode == 0) {
                     ++mode;
                 } else if (mode != 1) {
                     // Unquoted '#'
                     status = U_ILLEGAL_ARGUMENT_ERROR;
                     return;
                 }
                 ++maxDigits;
                 break;
             case ZERO:
                 // Seeing a '0' moves us to mode 2 (required digits)
                 if (mode < 2) {
                     mode = 2;
                 } else if (mode != 2) {
                     // Unquoted '0'
                     status = U_ILLEGAL_ARGUMENT_ERROR;
                     return;
                 }
                 ++minDigits;
                 ++maxDigits;
                 break;
             case SEMICOLON:
                 // End of a spec: validate it, then write its header.
                 if (minDigits < 1 || maxDigits > 4
                     // (above) invalid min/max digit count
                     || prefixLen > 0xFFFF || suffixLen > 0xFFFF) {
                     // (above) suffix or prefix too long for a UChar
                     status = U_ILLEGAL_ARGUMENT_ERROR;
                     return;
                 }
                 // If there was no prefix and no suffix, then the
                 // header will not have been allocated yet.  We need
                 // to allocate the header now.
                 if (start == affixes.length()) {
                     affixes.append(gQuadA);
                 }
                 // Fill in 4-character header
                 affixes.setCharAt(start++, (UChar) prefixLen);
                 affixes.setCharAt(start++, (UChar) suffixLen);
                 affixes.setCharAt(start++, (UChar) minDigits);
                 affixes.setCharAt(start++, (UChar) maxDigits);
                 // The next spec's header begins at the current end.
                 start = affixes.length();
                 ++affixCount;
                 prefixLen = suffixLen = minDigits = maxDigits = mode = 0;
                 break;
             default:
                 // Any other unescaped character is an ordinary literal.
                 isLiteral = TRUE;
                 break;
             }
         }

         if (isLiteral) {
             if (start == affixes.length()) {
                 // Make space for the header.  Append any four
                 // characters as place holders for the header values.
                 // We fill these in when we parse the ';'.
                 affixes.append(gQuadA);
             }
             affixes.append(c);
             if (mode == 0) {
                 ++prefixLen;
             } else {
                 // Any literal outside the prefix moves us into mode 3
                 // (suffix)
                 mode = 3;
                 ++suffixLen;
             }
         }
     }
 }

 /**
  * Returns a reference to this object's stored pattern member, which
  * applyPattern() sets to the pattern it was given.
  */
 const UnicodeString& HexToUnicodeTransliterator::toPattern(void) const {
     return this->pattern;
 }

 /**
  * Transliterator API.  Scans text from offsets.start up to
  * offsets.limit and replaces each run that matches one of the
  * compiled specs (prefix, hex digits, suffix) with the single UChar
  * whose value the hex digits spell out.
  *
  * Specs are tried in the order they were compiled.  The first one
  * that matches at the cursor wins, and scanning continues one
  * position past the cursor.
  *
  * @param text           the text to modify in place
  * @param offsets        supplies start and limit of the range to
  *                       process.  On return, start is the position
  *                       where scanning stopped, and limit and
  *                       contextLimit have been adjusted by the
  *                       change in text length.
  * @param isIncremental  if true, return as soon as a spec runs into
  *                       limit after consuming at least one character,
  *                       leaving start at the beginning of that
  *                       potential match
  */
 void HexToUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
     int32_t cursor = offsets.start;
     int32_t limit = offsets.limit;
     // i indexes characters within a prefix or suffix, j counts specs,
     // and ipat is the read position inside affixes.
     int32_t i, j, ipat;

     while (cursor < limit) {
         // Loop over the specs in affixes.  If affixCount is zero (an
         // empty pattern), then we do nothing.  We exit this loop when
         // we match one of the specs.  We exit this function (by
         // jumping to exit: below) if a partial match is detected and
         // isIncremental is true.
         for (j=0, ipat=0; j<affixCount; ++j) {

             // Read the header
             int32_t prefixLen = affixes.charAt(ipat++);
             int32_t suffixLen = affixes.charAt(ipat++);
             int32_t minDigits = affixes.charAt(ipat++);
             int32_t maxDigits = affixes.charAt(ipat++);
             // ipat now points at the first prefix character; the
             // suffix characters follow the prefix.

             // curs is a copy of cursor that is advanced over the
             // characters as we parse them.
             int32_t curs = cursor;
             UBool match = TRUE;

             for (i=0; i<prefixLen; ++i) {
                 // For i == 0, curs equals cursor, which the enclosing
                 // while loop keeps below limit, so this test can only
                 // succeed once a character has been consumed.
                 if (curs >= limit) {
                     if (i > 0) {
                         // We've already matched a character.  This is
                         // a partial match, so we return if in
                         // incremental mode.  In non-incremental mode,
                         // go to the next spec.
                         if (isIncremental) {
                             goto exit;
                         }
                         match = FALSE;
                         break;
                     }
                 }
                 UChar c = text.charAt(curs++);
                 if (c != affixes.charAt(ipat + i)) {
                     match = FALSE;
                     break;
                 }
             }

             if (match) {
                 // Accumulate up to maxDigits hex digits into u.
                 UChar u = 0;
                 int32_t digitCount = 0;
                 for (;;) {
                     if (curs >= limit) {
                         // Check for partial match in incremental mode.
                         if (curs > cursor && isIncremental) {
                             goto exit;
                         }
                         break;
                     }
                     int32_t digit = u_digit(text.charAt(curs), 16);
                     if (digit < 0) {
                         // Not a hex digit: the digit run ends here.
                         break;
                     }
                     ++curs;
                     u <<= 4;
                     u |= digit;
                     if (++digitCount == maxDigits) {
                         break;
                     }
                 }

                 match = (digitCount >= minDigits);

                 if (match) {
                     for (i=0; i<suffixLen; ++i) {
                         if (curs >= limit) {
                             // Check for partial match in incremental mode.
                             if (curs > cursor && isIncremental) {
                                 goto exit;
                             }
                             match = FALSE;
                             break;
                         }
                         UChar c = text.charAt(curs++);
                         if (c != affixes.charAt(ipat + prefixLen + i)) {
                             match = FALSE;
                             break;
                         }
                     }

                     if (match) {
                         // This is a temporary one-character string
                         UnicodeString str(u);

                         // At this point, we have a match
                         text.handleReplaceBetween(cursor, curs, str);
                         // (curs - cursor) characters became one, so
                         // the text shrank by (curs - cursor - 1).
                         limit -= curs - cursor - 1;
                         // The following break statement leaves the
                         // loop that is traversing the specs in
                         // affixes.  We then parse the next input
                         // character.
                         break;
                     }
                 }
             }

             // No match for this spec: skip its prefix and suffix
             // characters to reach the next spec's header.
             ipat += prefixLen + suffixLen;
         }

         ++cursor;
     }

   exit:
     // Report the change in length and the position where we stopped.
     offsets.contextLimit += limit - offsets.limit;
     offsets.limit = limit;
     offsets.start = cursor;
 }

 U_NAMESPACE_END
	/*
	**********************************************************************
	* Copyright (C) 1999, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 11/17/99 aliu Creation.
	**********************************************************************
	*/
	#include "unicode/hextouni.h"
	#include "unicode/rep.h"
	#include "unicode/unifilt.h"
	#include "unicode/uchar.h"


	U_NAMESPACE_BEGIN

	/**
	* ID for this transliterator.  Every constructor in this file passes
	* it to the Transliterator base-class constructor.
	*/
	const char HexToUnicodeTransliterator::_ID[] = "Hex-Any";

	/**
	* This pattern encodes the following specs for the default constructor:
	* \\u0000
	* \\U0000
	* u+0000
	* U+0000
	* The multiple backslashes resolve to a single backslash
	* in the effective prefix.
	*
	* Specs are separated by ';' (0x3B).  The last spec has no trailing
	* ';' because applyPattern() supplies a virtual one at the end of the
	* pattern.  The final 0 element ends the array (presumably read as a
	* NUL terminator when the array is converted to a UnicodeString --
	* confirm against the UnicodeString constructor).
	*/
	const UChar HexToUnicodeTransliterator::DEFAULT_PATTERN[] = {
	0x5C, 0x5C, 0x75, 0x30, 0x30, 0x30, 0x30, 0x3B, /* "\\u0000;" */
	0x5C, 0x5C, 0x55, 0x30, 0x30, 0x30, 0x30, 0x3B, /* "\\U0000;" */
	0x75, 0x2B, 0x30, 0x30, 0x30, 0x30, 0x3B, /* "u+0000;" */
	0x55, 0x2B, 0x30, 0x30, 0x30, 0x30, 0 /* "U+0000" */
	}; /* "\\u0000;\\U0000;u+0000;U+0000" */

	/**
	* Four placeholder characters.  applyPattern() appends them to
	* affixes to reserve room for a spec's four-character header and
	* overwrites them with the real header values when the spec's
	* terminating ';' is parsed, so the actual characters do not matter.
	*/
	static const UChar gQuadA[] = {
	0x41, 0x41, 0x41, 0x41, 0
	}; /* "AAAA" */

	/**
	 * Constructs a transliterator that recognizes the specs encoded in
	 * DEFAULT_PATTERN.
	 *
	 * @param adoptedFilter  filter forwarded unchanged to the
	 *                       Transliterator base-class constructor
	 */
	HexToUnicodeTransliterator::HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
	    Transliterator(_ID, adoptedFilter) {
	    // DEFAULT_PATTERN is a fixed constant that is known to parse, so
	    // the error code stays local and is never reported to the caller.
	    UErrorCode ignoredStatus = U_ZERO_ERROR;
	    applyPattern(DEFAULT_PATTERN, ignoredStatus);
	}

	/**
	 * Constructs a transliterator from a pattern, passing 0 as the filter
	 * to the Transliterator base-class constructor.
	 *
	 * @param thePattern  the spec pattern handed to applyPattern()
	 * @param status      in/out error code, passed through to
	 *                    applyPattern(); see that function for the
	 *                    errors it can report
	 */
	HexToUnicodeTransliterator::HexToUnicodeTransliterator(const UnicodeString& thePattern,
	                                                       UErrorCode& status) :
	    Transliterator(_ID, 0),
	    // applyPattern() returns without touching any member when status
	    // already indicates a failure on entry.  affixCount is the loop
	    // bound used by handleTransliterate(), so it must never be left
	    // without a value.
	    affixCount(0) {
	    applyPattern(thePattern, status);
	}

	/**
	 * Constructs a transliterator from a pattern and a filter.
	 *
	 * @param thePattern     the spec pattern handed to applyPattern()
	 * @param adoptedFilter  filter forwarded unchanged to the
	 *                       Transliterator base-class constructor
	 * @param status         in/out error code, passed through to
	 *                       applyPattern(); see that function for the
	 *                       errors it can report
	 */
	HexToUnicodeTransliterator::HexToUnicodeTransliterator(const UnicodeString& thePattern,
	                                                       UnicodeFilter* adoptedFilter,
	                                                       UErrorCode& status) :
	    Transliterator(_ID, adoptedFilter),
	    // applyPattern() returns without touching any member when status
	    // already indicates a failure on entry.  affixCount is the loop
	    // bound used by handleTransliterate(), so it must never be left
	    // without a value.
	    affixCount(0) {
	    applyPattern(thePattern, status);
	}

	/**
	 * Copy constructor.  Copies the base-class state, then the source
	 * pattern together with its compiled form (affixes and affixCount),
	 * so the pattern is not parsed again.
	 */
	HexToUnicodeTransliterator::HexToUnicodeTransliterator(
	                                    const HexToUnicodeTransliterator& other) :
	    Transliterator(other),
	    pattern(other.pattern),
	    affixes(other.affixes),
	    affixCount(other.affixCount)
	{
	}

	/**
	 * Assignment operator.  Assigns the base-class part first, then
	 * copies the pattern together with its compiled form.
	 *
	 * @return a reference to this object
	 */
	HexToUnicodeTransliterator& HexToUnicodeTransliterator::operator=(
	                                             const HexToUnicodeTransliterator& other) {
	    Transliterator::operator=(other);

	    // The three members are independent of one another; copy the
	    // compiled table first and the source pattern last.
	    affixCount = other.affixCount;
	    affixes    = other.affixes;
	    pattern    = other.pattern;

	    return *this;
	}

	/**
	 * Transliterator API.  Returns a copy of this object created with
	 * new through the copy constructor.
	 */
	Transliterator* HexToUnicodeTransliterator::clone(void) const {
	    HexToUnicodeTransliterator* duplicate = new HexToUnicodeTransliterator(*this);
	    return duplicate;
	}

	void HexToUnicodeTransliterator::applyPattern(const UnicodeString& thePattern,
	UErrorCode& status) {
	if (U_FAILURE(status)) {
	return;
	}

	/* The pattern is processed and stored in affixes. The pattern
	* consists of zero or more affixes. Each affix is parsed to
	* determine the prefix, suffix, minimum digit count, and maximum
	* digit count. These values are then stored as a four character
	* header. That is, their numeric values are cast to UChars and
	* stored in the string. Following these four characters, the prefix
	* characters, then suffix characters are stored. Each spec takes
	* n+4 characters, where n is the total length of the prefix and
	* suffix.
	*/

	// POSSIBILE FUTURE MODIFICATION
	// Parse thePattern, and if this succeeds, set pattern to thePattern.
	// If it fails, call applyPattern(pattern) to restore the original
	// conditions.

	pattern = thePattern;
	affixes.truncate(0);
	affixCount = 0;

	/* The mode specifies where we are in each spec.
	* mode 0 = in prefix
	* mode 1 = in optional digits (#)
	* mode 2 = in required digits (0)
	* mode 3 = in suffix
	*/
	int32_t mode = 0;

	int32_t prefixLen = 0, suffixLen = 0, minDigits = 0, maxDigits = 0;
	int32_t start = 0;

	/* To make parsing easier, we append a virtual ';' at the end of
	* the pattern string, if there isn't one already. When we get to
	* the index pattern.length() (that is, one past the end), we
	* create a virtual ';' if necessary.
	*/
	UChar c = 0; // These are outside the loop so we can see the
	UBool isLiteral = FALSE; // previous character...
	for (int32_t i=0; i<=pattern.length(); ++i) {
	// Create the virtual trailing ';' if necessary
	if (i == pattern.length()) {
	// If the last character was not a non-literal ';'...
	if (i > 0 && !(c == SEMICOLON && !isLiteral)) {
	c = SEMICOLON;
	isLiteral = FALSE;
	} else {
	break;
	}
	} else {
	c = pattern.charAt(i);
	isLiteral = FALSE;
	}

	if (c == BACKSLASH) {
	if ((i+1)<pattern.length()) {
	isLiteral = TRUE;
	c = pattern.charAt(++i);
	} else {
	// Trailing '\\'
	status = U_ILLEGAL_ARGUMENT_ERROR;
	return;
	}
	}

	if (!isLiteral) {
	switch (c) {
	case POUND:
	// Seeing a '#' moves us from mode 0 (prefix) to mode 1
	// (optional digits).
	if (mode == 0) {
	++mode;
	} else if (mode != 1) {
	// Unquoted '#'
	status = U_ILLEGAL_ARGUMENT_ERROR;
	return;
	}
	++maxDigits;
	break;
	case ZERO:
	// Seeing a '0' moves us to mode 2 (required digits)
	if (mode < 2) {
	mode = 2;
	} else if (mode != 2) {
	// Unquoted '0'
	status = U_ILLEGAL_ARGUMENT_ERROR;
	return;
	}
	++minDigits;
	++maxDigits;
	break;
	case SEMICOLON:
	if (minDigits < 1 \|\| maxDigits > 4
	// Invalid min/max digit count
	\|\| prefixLen > 0xFFFF \|\| suffixLen > 0xFFFF) {
	// Suffix or prefix too long
	status = U_ILLEGAL_ARGUMENT_ERROR;
	return;
	}
	// If there was no prefix and no suffix, then the
	// header will not have been allocated yet. We need
	// allocate the header now.
	if (start == affixes.length()) {
	affixes.append(gQuadA);
	}
	// Fill in 4-character header
	affixes.setCharAt(start++, (UChar) prefixLen);
	affixes.setCharAt(start++, (UChar) suffixLen);
	affixes.setCharAt(start++, (UChar) minDigits);
	affixes.setCharAt(start++, (UChar) maxDigits);
	start = affixes.length();
	++affixCount;
	prefixLen = suffixLen = minDigits = maxDigits = mode = 0;
	break;
	default:
	isLiteral = TRUE;
	break;
	}
	}

	if (isLiteral) {
	if (start == affixes.length()) {
	// Make space for the header. Append any four
	// characters as place holders for the header values.
	// We fill these in when we parse the ';'.
	affixes.append(gQuadA);
	}
	affixes.append(c);
	if (mode == 0) {
	++prefixLen;
	} else {
	// Any literal outside the prefix moves us into mode 3
	// (suffix)
	mode = 3;
	++suffixLen;
	}
	}
	}
	}

	/**
	 * Returns a reference to this object's stored pattern member, which
	 * applyPattern() sets to the pattern it was given.
	 */
	const UnicodeString& HexToUnicodeTransliterator::toPattern(void) const {
	    return this->pattern;
	}

	/**
	 * Transliterator API.  Scans text from offsets.start up to
	 * offsets.limit and replaces each run that matches one of the
	 * compiled specs (prefix, hex digits, suffix) with the single UChar
	 * whose value the hex digits spell out.
	 *
	 * Specs are tried in the order they were compiled.  The first one
	 * that matches at the cursor wins, and scanning continues one
	 * position past the cursor.
	 *
	 * @param text           the text to modify in place
	 * @param offsets        supplies start and limit of the range to
	 *                       process.  On return, start is the position
	 *                       where scanning stopped, and limit and
	 *                       contextLimit have been adjusted by the
	 *                       change in text length.
	 * @param isIncremental  if true, return as soon as a spec runs into
	 *                       limit after consuming at least one character,
	 *                       leaving start at the beginning of that
	 *                       potential match
	 */
	void HexToUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	                                                     UBool isIncremental) const {
	    int32_t cursor = offsets.start;
	    int32_t limit = offsets.limit;
	    // i indexes characters within a prefix or suffix, j counts specs,
	    // and ipat is the read position inside affixes.
	    int32_t i, j, ipat;

	    while (cursor < limit) {
	        // Loop over the specs in affixes.  If affixCount is zero (an
	        // empty pattern), then we do nothing.  We exit this loop when
	        // we match one of the specs.  We exit this function (by
	        // jumping to exit: below) if a partial match is detected and
	        // isIncremental is true.
	        for (j=0, ipat=0; j<affixCount; ++j) {

	            // Read the header
	            int32_t prefixLen = affixes.charAt(ipat++);
	            int32_t suffixLen = affixes.charAt(ipat++);
	            int32_t minDigits = affixes.charAt(ipat++);
	            int32_t maxDigits = affixes.charAt(ipat++);
	            // ipat now points at the first prefix character; the
	            // suffix characters follow the prefix.

	            // curs is a copy of cursor that is advanced over the
	            // characters as we parse them.
	            int32_t curs = cursor;
	            UBool match = TRUE;

	            for (i=0; i<prefixLen; ++i) {
	                // For i == 0, curs equals cursor, which the enclosing
	                // while loop keeps below limit, so this test can only
	                // succeed once a character has been consumed.
	                if (curs >= limit) {
	                    if (i > 0) {
	                        // We've already matched a character.  This is
	                        // a partial match, so we return if in
	                        // incremental mode.  In non-incremental mode,
	                        // go to the next spec.
	                        if (isIncremental) {
	                            goto exit;
	                        }
	                        match = FALSE;
	                        break;
	                    }
	                }
	                UChar c = text.charAt(curs++);
	                if (c != affixes.charAt(ipat + i)) {
	                    match = FALSE;
	                    break;
	                }
	            }

	            if (match) {
	                // Accumulate up to maxDigits hex digits into u.
	                UChar u = 0;
	                int32_t digitCount = 0;
	                for (;;) {
	                    if (curs >= limit) {
	                        // Check for partial match in incremental mode.
	                        if (curs > cursor && isIncremental) {
	                            goto exit;
	                        }
	                        break;
	                    }
	                    int32_t digit = u_digit(text.charAt(curs), 16);
	                    if (digit < 0) {
	                        // Not a hex digit: the digit run ends here.
	                        break;
	                    }
	                    ++curs;
	                    u <<= 4;
	                    u |= digit;
	                    if (++digitCount == maxDigits) {
	                        break;
	                    }
	                }

	                match = (digitCount >= minDigits);

	                if (match) {
	                    for (i=0; i<suffixLen; ++i) {
	                        if (curs >= limit) {
	                            // Check for partial match in incremental mode.
	                            if (curs > cursor && isIncremental) {
	                                goto exit;
	                            }
	                            match = FALSE;
	                            break;
	                        }
	                        UChar c = text.charAt(curs++);
	                        if (c != affixes.charAt(ipat + prefixLen + i)) {
	                            match = FALSE;
	                            break;
	                        }
	                    }

	                    if (match) {
	                        // This is a temporary one-character string
	                        UnicodeString str(u);

	                        // At this point, we have a match
	                        text.handleReplaceBetween(cursor, curs, str);
	                        // (curs - cursor) characters became one, so
	                        // the text shrank by (curs - cursor - 1).
	                        limit -= curs - cursor - 1;
	                        // The following break statement leaves the
	                        // loop that is traversing the specs in
	                        // affixes.  We then parse the next input
	                        // character.
	                        break;
	                    }
	                }
	            }

	            // No match for this spec: skip its prefix and suffix
	            // characters to reach the next spec's header.
	            ipat += prefixLen + suffixLen;
	        }

	        ++cursor;
	    }

	  exit:
	    // Report the change in length and the position where we stopped.
	    offsets.contextLimit += limit - offsets.limit;
	    offsets.limit = limit;
	    offsets.start = cursor;
	}

	U_NAMESPACE_END