source/i18n/unesctrn.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
  **********************************************************************
  *   Copyright (c) 2001-2008, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  *   Date        Name        Description
  *   11/19/2001  aliu        Creation.
  **********************************************************************
  */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_TRANSLITERATION

 #include "unicode/uchar.h"
 #include "unesctrn.h"
 #include "util.h"

 #include "cmemory.h"

 U_NAMESPACE_BEGIN

 /**
  * Special character marking the end of the spec[] array.
  */
 static const UChar END = 0xFFFF;

 // Unicode: "U+10FFFF" hex, min=4, max=6
 static const UChar SPEC_Unicode[] = {
     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
     END
 };

 // Java: "\\uFFFF" hex, min=4, max=4
 static const UChar SPEC_Java[] = {
     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
     END
 };

 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
 static const UChar SPEC_C[] = {
     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
     END
 };

 // XML: "&#x10FFFF;" hex, min=1, max=6
 static const UChar SPEC_XML[] = {
     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
     END
 };

 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
 static const UChar SPEC_XML10[] = {
     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
     END
 };

 // Perl: "\\x{263A}" hex, min=1, max=6
 static const UChar SPEC_Perl[] = {
     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
     END
 };

 // All: Java, C, Perl, XML, XML10, Unicode
 static const UChar SPEC_Any[] = {
     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
     END
 };

 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)

 static UChar* copySpec(const UChar* spec) {
     int32_t len = 0;
     while (spec[len] != END) {
         ++len;
     }
     ++len;
     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
     // Check for memory allocation error.
     if (result != NULL) {
     	uprv_memcpy(result, spec, len*sizeof(result[0]));
     }
     return result;
 }

 /**
  * Factory methods.  Ignore the context.
  */
 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
     return new UnescapeTransliterator(ID, SPEC_Unicode);
 }
 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
     return new UnescapeTransliterator(ID, SPEC_Java);
 }
 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
     return new UnescapeTransliterator(ID, SPEC_C);
 }
 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
     return new UnescapeTransliterator(ID, SPEC_XML);
 }
 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
     return new UnescapeTransliterator(ID, SPEC_XML10);
 }
 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
     return new UnescapeTransliterator(ID, SPEC_Perl);
 }
 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
     return new UnescapeTransliterator(ID, SPEC_Any);
 }

 /**
  * Registers standard variants with the system.  Called by
  * Transliterator during initialization.
  */
 void UnescapeTransliterator::registerIDs() {
     Token t = integerToken(0);

     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);

     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);

     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);

     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);

     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);

     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);

     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
 }

 /**
  * Constructor.  Takes the encoded spec array.
  */
 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
                                                const UChar *newSpec) :
     Transliterator(newID, NULL)
 {
     this->spec = copySpec(newSpec);
 }

 /**
  * Copy constructor.
  */
 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
     Transliterator(o) {
     this->spec = copySpec(o.spec);
 }

 UnescapeTransliterator::~UnescapeTransliterator() {
     uprv_free(spec);
 }

 /**
  * Transliterator API.
  */
 Transliterator* UnescapeTransliterator::clone() const {
     return new UnescapeTransliterator(*this);
 }

 /**
  * Implements {@link Transliterator#handleTransliterate}.
  */
 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                                  UBool isIncremental) const {
     int32_t start = pos.start;
     int32_t limit = pos.limit;
     int32_t i, j, ipat;

     while (start < limit) {
         // Loop over the forms in spec[].  Exit this loop when we
         // match one of the specs.  Exit the outer loop if a
         // partial match is detected and isIncremental is true.
         for (j=0, ipat=0; spec[ipat] != END; ++j) {

             // Read the header
             int32_t prefixLen = spec[ipat++];
             int32_t suffixLen = spec[ipat++];
             int8_t  radix     = (int8_t) spec[ipat++];
             int32_t minDigits = spec[ipat++];
             int32_t maxDigits = spec[ipat++];

             // s is a copy of start that is advanced over the
             // characters as we parse them.
             int32_t s = start;
             UBool match = TRUE;

             for (i=0; i<prefixLen; ++i) {
                 if (s >= limit) {
                     if (i > 0) {
                         // We've already matched a character.  This is
                         // a partial match, so we return if in
                         // incremental mode.  In non-incremental mode,
                         // go to the next spec.
                         if (isIncremental) {
                             goto exit;
                         }
                         match = FALSE;
                         break;
                     }
                 }
                 UChar c = text.charAt(s++);
                 if (c != spec[ipat + i]) {
                     match = FALSE;
                     break;
                 }
             }

             if (match) {
                 UChar32 u = 0;
                 int32_t digitCount = 0;
                 for (;;) {
                     if (s >= limit) {
                         // Check for partial match in incremental mode.
                         if (s > start && isIncremental) {
                             goto exit;
                         }
                         break;
                     }
                     UChar32 ch = text.char32At(s);
                     int32_t digit = u_digit(ch, radix);
                     if (digit < 0) {
                         break;
                     }
                     s += UTF_CHAR_LENGTH(ch);
                     u = (u * radix) + digit;
                     if (++digitCount == maxDigits) {
                         break;
                     }
                 }

                 match = (digitCount >= minDigits);

                 if (match) {
                     for (i=0; i<suffixLen; ++i) {
                         if (s >= limit) {
                             // Check for partial match in incremental mode.
                             if (s > start && isIncremental) {
                                 goto exit;
                             }
                             match = FALSE;
                             break;
                         }
                         UChar c = text.charAt(s++);
                         if (c != spec[ipat + prefixLen + i]) {
                             match = FALSE;
                             break;
                         }
                     }

                     if (match) {
                         // At this point, we have a match
                         UnicodeString str(u);
                         text.handleReplaceBetween(start, s, str);
                         limit -= s - start - str.length();
                         // The following break statement leaves the
                         // loop that is traversing the forms in
                         // spec[].  We then parse the next input
                         // character.
                         break;
                     }
                 }
             }

             ipat += prefixLen + suffixLen;
         }

         if (start < limit) {
             start += UTF_CHAR_LENGTH(text.char32At(start));
         }
     }

   exit:
     pos.contextLimit += limit - pos.limit;
     pos.limit = limit;
     pos.start = start;
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

 //eof
	/*
	**********************************************************************
	* Copyright (c) 2001-2008, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 11/19/2001 aliu Creation.
	**********************************************************************
	*/

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_TRANSLITERATION

	#include "unicode/uchar.h"
	#include "unesctrn.h"
	#include "util.h"

	#include "cmemory.h"

	U_NAMESPACE_BEGIN

	/**
	* Special character marking the end of the spec[] array.
	*/
	static const UChar END = 0xFFFF;

	// Unicode: "U+10FFFF" hex, min=4, max=6
	static const UChar SPEC_Unicode[] = {
	2, 0, 16, 4, 6, 85/U/, 43/+/,
	END
	};

	// Java: "\\uFFFF" hex, min=4, max=4
	static const UChar SPEC_Java[] = {
	2, 0, 16, 4, 4, 92/\/, 117/u/,
	END
	};

	// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
	static const UChar SPEC_C[] = {
	2, 0, 16, 4, 4, 92/\/, 117/u/,
	2, 0, 16, 8, 8, 92/\/, 85/U/,
	END
	};

	// XML: "􏿿" hex, min=1, max=6
	static const UChar SPEC_XML[] = {
	3, 1, 16, 1, 6, 38/&/, 35/#/, 120/x/, 59/;/,
	END
	};

	// XML10: "􏿿" dec, min=1, max=7 (not really "Hex-Any")
	static const UChar SPEC_XML10[] = {
	2, 1, 10, 1, 7, 38/&/, 35/#/, 59/;/,
	END
	};

	// Perl: "\\x{263A}" hex, min=1, max=6
	static const UChar SPEC_Perl[] = {
	3, 1, 16, 1, 6, 92/\/, 120/x/, 123/{/, 125/}/,
	END
	};

	// All: Java, C, Perl, XML, XML10, Unicode
	static const UChar SPEC_Any[] = {
	2, 0, 16, 4, 6, 85/U/, 43/+/, // Unicode
	2, 0, 16, 4, 4, 92/\/, 117/u/, // Java
	2, 0, 16, 8, 8, 92/\/, 85/U/, // C (surrogates)
	3, 1, 16, 1, 6, 38/&/, 35/#/, 120/x/, 59/;/, // XML
	2, 1, 10, 1, 7, 38/&/, 35/#/, 59/;/, // XML10
	3, 1, 16, 1, 6, 92/\/, 120/x/, 123/{/, 125/}/, // Perl
	END
	};

	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)

	static UChar* copySpec(const UChar* spec) {
	int32_t len = 0;
	while (spec[len] != END) {
	++len;
	}
	++len;
	UChar result = (UChar )uprv_malloc(len*sizeof(UChar));
	// Check for memory allocation error.
	if (result != NULL) {
	uprv_memcpy(result, spec, len*sizeof(result[0]));
	}
	return result;
	}

	/**
	* Factory methods. Ignore the context.
	*/
	static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /context/) {
	return new UnescapeTransliterator(ID, SPEC_Unicode);
	}
	static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /context/) {
	return new UnescapeTransliterator(ID, SPEC_Java);
	}
	static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /context/) {
	return new UnescapeTransliterator(ID, SPEC_C);
	}
	static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /context/) {
	return new UnescapeTransliterator(ID, SPEC_XML);
	}
	static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /context/) {
	return new UnescapeTransliterator(ID, SPEC_XML10);
	}
	static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /context/) {
	return new UnescapeTransliterator(ID, SPEC_Perl);
	}
	static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /context/) {
	return new UnescapeTransliterator(ID, SPEC_Any);
	}

	/**
	* Registers standard variants with the system. Called by
	* Transliterator during initialization.
	*/
	void UnescapeTransliterator::registerIDs() {
	Token t = integerToken(0);

	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);

	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);

	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);

	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);

	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);

	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);

	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
	}

	/**
	* Constructor. Takes the encoded spec array.
	*/
	UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
	const UChar *newSpec) :
	Transliterator(newID, NULL)
	{
	this->spec = copySpec(newSpec);
	}

	/**
	* Copy constructor.
	*/
	UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
	Transliterator(o) {
	this->spec = copySpec(o.spec);
	}

	UnescapeTransliterator::~UnescapeTransliterator() {
	uprv_free(spec);
	}

	/**
	* Transliterator API.
	*/
	Transliterator* UnescapeTransliterator::clone() const {
	return new UnescapeTransliterator(*this);
	}

	/**
	* Implements {@link Transliterator#handleTransliterate}.
	*/
	void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
	UBool isIncremental) const {
	int32_t start = pos.start;
	int32_t limit = pos.limit;
	int32_t i, j, ipat;

	while (start < limit) {
	// Loop over the forms in spec[]. Exit this loop when we
	// match one of the specs. Exit the outer loop if a
	// partial match is detected and isIncremental is true.
	for (j=0, ipat=0; spec[ipat] != END; ++j) {

	// Read the header
	int32_t prefixLen = spec[ipat++];
	int32_t suffixLen = spec[ipat++];
	int8_t radix = (int8_t) spec[ipat++];
	int32_t minDigits = spec[ipat++];
	int32_t maxDigits = spec[ipat++];

	// s is a copy of start that is advanced over the
	// characters as we parse them.
	int32_t s = start;
	UBool match = TRUE;

	for (i=0; i<prefixLen; ++i) {
	if (s >= limit) {
	if (i > 0) {
	// We've already matched a character. This is
	// a partial match, so we return if in
	// incremental mode. In non-incremental mode,
	// go to the next spec.
	if (isIncremental) {
	goto exit;
	}
	match = FALSE;
	break;
	}
	}
	UChar c = text.charAt(s++);
	if (c != spec[ipat + i]) {
	match = FALSE;
	break;
	}
	}

	if (match) {
	UChar32 u = 0;
	int32_t digitCount = 0;
	for (;;) {
	if (s >= limit) {
	// Check for partial match in incremental mode.
	if (s > start && isIncremental) {
	goto exit;
	}
	break;
	}
	UChar32 ch = text.char32At(s);
	int32_t digit = u_digit(ch, radix);
	if (digit < 0) {
	break;
	}
	s += UTF_CHAR_LENGTH(ch);
	u = (u * radix) + digit;
	if (++digitCount == maxDigits) {
	break;
	}
	}

	match = (digitCount >= minDigits);

	if (match) {
	for (i=0; i<suffixLen; ++i) {
	if (s >= limit) {
	// Check for partial match in incremental mode.
	if (s > start && isIncremental) {
	goto exit;
	}
	match = FALSE;
	break;
	}
	UChar c = text.charAt(s++);
	if (c != spec[ipat + prefixLen + i]) {
	match = FALSE;
	break;
	}
	}

	if (match) {
	// At this point, we have a match
	UnicodeString str(u);
	text.handleReplaceBetween(start, s, str);
	limit -= s - start - str.length();
	// The following break statement leaves the
	// loop that is traversing the forms in
	// spec[]. We then parse the next input
	// character.
	break;
	}
	}
	}

	ipat += prefixLen + suffixLen;
	}

	if (start < limit) {
	start += UTF_CHAR_LENGTH(text.char32At(start));
	}
	}

	exit:
	pos.contextLimit += limit - pos.limit;
	pos.limit = limit;
	pos.start = start;
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_TRANSLITERATION */

	//eof