// src/com/ibm/icu/text/HexToUnicodeTransliterator.java - external/github.com/unicode-org/icu - Git at Google

 /*
  *******************************************************************************
  * Copyright (C) 1996-2000, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/HexToUnicodeTransliterator.java,v $
  * $Date: 2002/02/25 22:43:58 $
  * $Revision: 1.13 $
  *
  *****************************************************************************************
  */
 package com.ibm.icu.text;
 import java.util.*;
 import com.ibm.icu.impl.Utility;

 /**
  * A transliterator that converts from hexadecimal Unicode escape
  * sequences to the characters they represent.  For example, "U+0040"
  * and '\u0040'.  A default HexToUnicodeTransliterator recognizes the
  * prefixes "U+", "u+", "&#92;U", and "&#92;u".  Hex values may be
  * upper- or lowercase.  By calling the applyPattern() method, one
  * or more custom prefix/suffix pairs may be specified.  See
  * applyPattern() for details.
  *
  * @author Alan Liu
  * @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.13 $ $Date: 2002/02/25 22:43:58 $
  */
 public class HexToUnicodeTransliterator extends Transliterator {
     private static final String COPYRIGHT =
         "\u00A9 IBM Corporation 1999. All rights reserved.";

     /**
      * Package accessible ID for this transliterator.
      */
     static final String _ID = "Hex-Any";

     /**
      * This pattern encodes the following specs for the default constructor:
      *   \\u0000
      *   \\U0000
      *   u+0000
      *   U+0000
      * The multiple backslashes resolve to a single backslash
      * in the effective prefix.
      */
     private static final String DEFAULT_PATTERN = "\\\\u0000;\\\\U0000;u+0000;U+0000";

     // Character constants for special pattern characters
     private static final char SEMICOLON = ';';
     private static final char ZERO      = '0';
     private static final char POUND     = '#';
     private static final char BACKSLASH = '\\';

     /**
      * Four arbitrary characters appended to reserve room for a spec's
      * header.  They are overwritten with the real header values when
      * the terminating ';' of the spec is parsed.
      */
     private static final String HEADER_PLACEHOLDER = "AAAA";

     /**
      * The pattern for this transliterator, exactly as last accepted by
      * applyPattern().
      */
     private String pattern;

     /**
      * The processed pattern specification.  Each spec occupies a four
      * character header (prefix length, suffix length, minimum digit
      * count, maximum digit count, each stored as a char value)
      * followed by the prefix characters and then the suffix
      * characters.  See applyPattern() for details.
      */
     private char[] affixes;

     /**
      * The number of different affix sets (specs) in affixes.
      */
     private int affixCount;

     /**
      * Constructs a transliterator that recognizes the default
      * pattern (see DEFAULT_PATTERN) and has no filter.
      */
     public HexToUnicodeTransliterator() {
         super(_ID, null);
         applyPattern(DEFAULT_PATTERN);
     }

     /**
      * Constructs a transliterator that recognizes the given pattern
      * and has no filter.
      *
      * @param thePattern the pattern, in the syntax described by
      * applyPattern()
      * @throws IllegalArgumentException if the pattern is malformed
      */
     public HexToUnicodeTransliterator(String thePattern) {
         this(thePattern, null);
     }

     /**
      * Constructs a transliterator that recognizes the given pattern
      * and passes the given filter to the superclass constructor.
      *
      * @param thePattern the pattern, in the syntax described by
      * applyPattern()
      * @param theFilter the filter handed to the superclass; may be null
      * @throws IllegalArgumentException if the pattern is malformed
      */
     public HexToUnicodeTransliterator(String thePattern,
                                       UnicodeFilter theFilter) {
         super(_ID, theFilter);
         applyPattern(thePattern);
     }

     /**
      * Set the patterns recognized by this transliterator.  One or
      * more patterns may be specified, separated by semicolons (';').
      * Each pattern contains zero or more prefix characters, one or
      * more digit characters, and zero or more suffix characters.  The
      * digit characters indicates optional digits ('#') followed by
      * required digits ('0').  The total number of digits cannot
      * exceed 4, and must be at least 1 required digit.  Use a
      * backslash ('\\') to escape any of the special characters.  An
      * empty pattern is allowed; it specifies a transliterator that
      * does nothing.
      *
      * <p>Example: "U+0000;<###0>" specifies two patterns.  The first
      * has a prefix of "U+", exactly four digits, and no suffix.  The
      * second has a prefix of "<", between one and four digits, and a
      * suffix of ">".
      *
      * <p><pre>
      * pattern := spec | ( pattern ';' spec )
      * spec := prefix-char* digit-spec suffix-char*
      * digit-spec := '#'* '0'+
      * prefix-char := [^special-char] | '\\' special-char
      * suffix-char := [^special-char] | '\\' special-char
      * special-char := ';' | '0' | '#' | '\\'
      * </pre>
      *
      * <p>If the pattern is rejected, this object is left exactly as it
      * was before the call: the previous pattern remains in effect.
      *
      * @param pattern the pattern to parse and install
      * @throws IllegalArgumentException if the pattern ends in an
      * unescaped backslash, contains an unescaped '#' or '0' in an
      * illegal position, has a spec with fewer than one required digit
      * or more than four digits in total, or has a prefix or suffix
      * longer than 0xFFFF characters
      */
     public void applyPattern(String pattern) {

         /* The pattern is processed and stored in compiled form.  The
          * pattern consists of zero or more specs.  Each spec is parsed
          * to determine the prefix, suffix, minimum digit count, and
          * maximum digit count.  These values are then stored as a four
          * character header.  That is, their numeric values are cast to
          * chars and stored in the buffer.  Following these four
          * characters, the prefix characters, then suffix characters
          * are stored.  Each spec takes n+4 characters, where n is the
          * total length of the prefix and suffix.
          */
         StringBuffer compiled = new StringBuffer();

         // The spec count is accumulated in a local, not in the
         // affixCount field, so that a parse failure part way through
         // cannot leave affixCount out of step with the affixes array
         // that handleTransliterate() indexes with it.
         int specCount = 0;

         /* The mode specifies where we are in each spec.
          * mode 0 = in prefix
          * mode 1 = in optional digits (#)
          * mode 2 = in required digits (0)
          * mode 3 = in suffix
          */
         int mode = 0;

         int prefixLen = 0, suffixLen = 0, minDigits = 0, maxDigits = 0;

         // Index in compiled at which the current spec's header starts.
         int start = 0;

         /* To make parsing easier, we append a virtual ';' at the end of
          * the pattern string, if there isn't one already.  When we get to
          * the index pattern.length() (that is, one past the end), we
          * create a virtual ';' if necessary.
          */
         char c = 0;                // These are outside the loop so we can
         boolean isLiteral = false; // see the previous character...
         for (int i=0; i<=pattern.length(); ++i) {
             // Create the virtual trailing ';' if necessary
             if (i == pattern.length()) {
                 // If the last character was not a non-literal ';'...
                 if (i > 0 && !(c == SEMICOLON && !isLiteral)) {
                     c = SEMICOLON;
                     isLiteral = false;
                 } else {
                     // Empty pattern, or the pattern already ended
                     // with a real separator: nothing left to close.
                     break;
                 }
             } else {
                 c = pattern.charAt(i);
                 isLiteral = false;
             }

             if (c == BACKSLASH) {
                 // A backslash makes the following character a literal,
                 // whatever it is, and is itself dropped.
                 if ((i+1)<pattern.length()) {
                     isLiteral = true;
                     c = pattern.charAt(++i);
                 } else {
                     // Trailing '\\'
                     throw new IllegalArgumentException("Trailing '\\'");
                 }
             }

             if (!isLiteral) {
                 switch (c) {
                 case POUND:
                     // Seeing a '#' moves us from mode 0 (prefix) to mode 1
                     // (optional digits).
                     if (mode == 0) {
                         ++mode;
                     } else if (mode != 1) {
                         // Unquoted '#'
                         throw new IllegalArgumentException("Unquoted '#'");
                     }
                     ++maxDigits;
                     break;
                 case ZERO:
                     // Seeing a '0' moves us to mode 2 (required digits)
                     if (mode < 2) {
                         mode = 2;
                     } else if (mode != 2) {
                         // Unquoted '0'
                         throw new IllegalArgumentException("Unquoted '0'");
                     }
                     ++minDigits;
                     ++maxDigits;
                     break;
                 case SEMICOLON:
                     // End of a spec: validate it, then write its header.
                     if (minDigits < 1 || maxDigits > 4) {
                         throw new IllegalArgumentException(
                             "Invalid min/max digit count");
                     }
                     // Lengths are stored in a single char each, so
                     // they must fit in 16 bits.
                     if (prefixLen > 0xFFFF || suffixLen > 0xFFFF) {
                         throw new IllegalArgumentException("Suffix or prefix too long");
                     }
                     // If there was no prefix and no suffix, then the
                     // header will not have been allocated yet.  We need
                     // to allocate the header now.
                     if (start == compiled.length()) {
                         compiled.append(HEADER_PLACEHOLDER);
                     }
                     // Fill in 4-character header
                     compiled.setCharAt(start++, (char) prefixLen);
                     compiled.setCharAt(start++, (char) suffixLen);
                     compiled.setCharAt(start++, (char) minDigits);
                     compiled.setCharAt(start,   (char) maxDigits);
                     // The next spec, if any, begins at the current end.
                     start = compiled.length();
                     ++specCount;
                     prefixLen = suffixLen = minDigits = maxDigits = mode = 0;
                     break;
                 default:
                     // Any other unescaped character is an ordinary
                     // prefix or suffix character.
                     isLiteral = true;
                     break;
                 }
             }

             if (isLiteral) {
                 if (start == compiled.length()) {
                     // Make space for the header.  Append any four
                     // characters as place holders for the header values.
                     // We fill these in when we parse the ';'.
                     compiled.append(HEADER_PLACEHOLDER);
                 }
                 compiled.append(c);
                 if (mode == 0) {
                     ++prefixLen;
                 } else {
                     // Any literal outside the prefix moves us into mode 3
                     // (suffix)
                     mode = 3;
                     ++suffixLen;
                 }
             }
         }

         // We only modify the member variables if we get to this
         // point, that is, if the parse succeeds.  All three are
         // assigned together so they always describe the same pattern.
         int len = compiled.length();
         char[] newAffixes = new char[len];
         Utility.getChars(compiled, 0, len, newAffixes, 0);
         this.pattern = pattern;
         this.affixes = newAffixes;
         this.affixCount = specCount;
     }

     /**
      * Return this transliterator's pattern.
      *
      * @return the pattern string most recently accepted by
      * applyPattern()
      */
     public String toPattern() {
         return pattern;
     }

     /**
      * Implements {@link Transliterator#handleTransliterate}.
      *
      * <p>Scans the text from offsets.start up to offsets.limit.  At
      * each position every spec is tried in the order it was given in
      * the pattern; the first spec that matches has its whole escape
      * sequence replaced by the single character it denotes.  On
      * return, offsets.limit and offsets.contextLimit have been adjusted
      * by the net change in text length, and offsets.start is the
      * position at which scanning stopped.
      *
      * @param text the text to modify in place
      * @param offsets the range to process; updated on return
      * @param isIncremental if true, scanning stops at the start of an
      * escape sequence that is cut off by offsets.limit, leaving
      * offsets.start pointing at it
      */
     protected void handleTransliterate(Replaceable text,
                                        Position offsets, boolean isIncremental) {
         int cursor = offsets.start;
         int limit = offsets.limit;
         int i, j, ipat;

       loop:
         while (cursor < limit) {
             // Loop over the specs in affixes.  If affixCount is zero (an
             // empty pattern), then we do nothing.  We exit this loop when
             // we match one of the specs.  We leave the outer loop (via
             // "break loop") if a partial match is detected and
             // isIncremental is true.
             for (j=0, ipat=0; j<affixCount; ++j) {

                 // Read the header
                 int prefixLen = affixes[ipat++];
                 int suffixLen = affixes[ipat++];
                 int minDigits = affixes[ipat++];
                 int maxDigits = affixes[ipat++];

                 // curs is a copy of cursor that is advanced over the
                 // characters as we parse them.
                 int curs = cursor;
                 boolean match = true;

                 for (i=0; i<prefixLen; ++i) {
                     if (curs >= limit) {
                         // The outer loop guarantees cursor < limit, so
                         // we can only get here after matching at least
                         // one prefix character.  This is a partial
                         // match, so we stop if in incremental mode.  In
                         // non-incremental mode, go to the next spec.
                         if (isIncremental) {
                             break loop;
                         }
                         match = false;
                         break;
                     }
                     char c = text.charAt(curs++);
                     if (c != affixes[ipat + i]) {
                         match = false;
                         break;
                     }
                 }

                 if (match) {
                     // Accumulate up to maxDigits hex digits into u.
                     char u = 0;
                     int digitCount = 0;
                     for (;;) {
                         if (curs >= limit) {
                             // Check for partial match in incremental mode.
                             if (curs > cursor && isIncremental) {
                                 break loop;
                             }
                             break;
                         }
                         int digit = Character.digit(text.charAt(curs), 16);
                         if (digit < 0) {
                             break;
                         }
                         ++curs;
                         u <<= 4;
                         u |= (char) digit;
                         if (++digitCount == maxDigits) {
                             break;
                         }
                     }

                     match = (digitCount >= minDigits);

                     if (match) {
                         for (i=0; i<suffixLen; ++i) {
                             if (curs >= limit) {
                                 // Check for partial match in incremental mode.
                                 if (curs > cursor && isIncremental) {
                                     break loop;
                                 }
                                 match = false;
                                 break;
                             }
                             char c = text.charAt(curs++);
                             // Suffix characters are stored right after
                             // the prefix characters.
                             if (c != affixes[ipat + prefixLen + i]) {
                                 match = false;
                                 break;
                             }
                         }

                         if (match) {
                             // At this point, we have a match
                             text.replace(cursor, curs, String.valueOf(u));
                             // (curs - cursor) characters became one.
                             limit -= curs - cursor - 1;
                             // The following break statement leaves the
                             // loop that is traversing the specs in
                             // affixes.  We then parse the next input
                             // character.
                             break;
                         }
                     }
                 }

                 // Skip this spec's prefix and suffix characters to
                 // reach the header of the next spec.
                 ipat += prefixLen + suffixLen;
             }

             // Step past either the replacement character or the
             // character at which no spec matched.
             ++cursor;
         }

         offsets.contextLimit += limit - offsets.limit;
         offsets.limit = limit;
         offsets.start = cursor;
     }
 }
	/*
	*******************************************************************************
	* Copyright (C) 1996-2000, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*
	* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/HexToUnicodeTransliterator.java,v $
	* $Date: 2002/02/25 22:43:58 $
	* $Revision: 1.13 $
	*
	*****************************************************************************************
	*/
	package com.ibm.icu.text;
	import java.util.*;
	import com.ibm.icu.impl.Utility;

	/**
	* A transliterator that converts from hexadecimal Unicode escape
	* sequences to the characters they represent. For example, "U+0040"
	* and '\u0040'. A default HexToUnicodeTransliterator recognizes the
	* prefixes "U+", "u+", "&#92;U", and "&#92;u". Hex values may be
	* upper- or lowercase. By calling the applyPattern() method, one
	* or more custom prefix/suffix pairs may be specified. See
	* applyPattern() for details.
	*
	* @author Alan Liu
	* @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.13 $ $Date: 2002/02/25 22:43:58 $
	*/
	public class HexToUnicodeTransliterator extends Transliterator {
	private static final String COPYRIGHT =
	"\u00A9 IBM Corporation 1999. All rights reserved.";

	/**
	* Package accessible ID for this transliterator.
	*/
	static final String _ID = "Hex-Any";

	/**
	* This pattern encodes the following specs for the default constructor:
	* \\u0000
	* \\U0000
	* u+0000
	* U+0000
	* The multiple backslashes resolve to a single backslash
	* in the effective prefix.
	*/
	private static final String DEFAULT_PATTERN = "\\\\u0000;\\\\U0000;u+0000;U+0000";

	// Character constants for special pattern characters
	private static final char SEMICOLON = ';';
	private static final char ZERO = '0';
	private static final char POUND = '#';
	private static final char BACKSLASH = '\\';

	/**
	* The pattern for this transliterator
	*/
	private String pattern;

	/**
	* The processed pattern specification. See applyPattern() for
	* details.
	*/
	private char[] affixes;

	/**
	* The number of different affix sets in affixes.
	*/
	private int affixCount;

	/**
	* Constructs a transliterator.
	*/
	public HexToUnicodeTransliterator() {
	super(_ID, null);
	applyPattern(DEFAULT_PATTERN);
	}

	/**
	* Constructs a transliterator.
	*/
	public HexToUnicodeTransliterator(String thePattern) {
	this(thePattern, null);
	}

	/**
	* Constructs a transliterator.
	*/
	public HexToUnicodeTransliterator(String thePattern,
	UnicodeFilter theFilter) {
	super(_ID, theFilter);
	applyPattern(thePattern);
	}

	/**
	* Set the patterns recognized by this transliterator. One or
	* more patterns may be specified, separated by semicolons (';').
	* Each pattern contains zero or more prefix characters, one or
	* more digit characters, and zero or more suffix characters. The
	* digit characters indicates optional digits ('#') followed by
	* required digits ('0'). The total number of digits cannot
	* exceed 4, and must be at least 1 required digit. Use a
	* backslash ('\\') to escape any of the special characters. An
	* empty pattern is allowed; it specifies a transliterator that
	* does nothing.
	*
	* <p>Example: "U+0000;<###0>" specifies two patterns. The first
	* has a prefix of "U+", exactly four digits, and no suffix. The
	* second has a prefix of "<", between one and four digits, and a
	* suffix of ">".
	*
	* <p><pre>
	* pattern := spec \| ( pattern ';' spec )
	* spec := prefix-char* digit-spec suffix-char*
	* digit-spec := '#'* '0'+
	* prefix-char := [^special-char] \| '\\' special-char
	* suffix-char := [^special-char] \| '\\' special-char
	* special-char := ';' \| '0' \| '#' \| '\\'
	* </pre>
	*/
	public void applyPattern(String pattern) {

	/* The pattern is processed and stored in affixes. The pattern
	* consists of zero or more affixes. Each affix is parsed to
	* determine the prefix, suffix, minimum digit count, and maximum
	* digit count. These values are then stored as a four character
	* header. That is, their numeric values are cast to UChars and
	* stored in the string. Following these four characters, the prefix
	* characters, then suffix characters are stored. Each spec takes
	* n+4 characters, where n is the total length of the prefix and
	* suffix.
	*/

	StringBuffer affixes = new StringBuffer();
	affixCount = 0;

	/* The mode specifies where we are in each spec.
	* mode 0 = in prefix
	* mode 1 = in optional digits (#)
	* mode 2 = in required digits (0)
	* mode 3 = in suffix
	*/
	int mode = 0;

	int prefixLen = 0, suffixLen = 0, minDigits = 0, maxDigits = 0;
	int start = 0;

	/* To make parsing easier, we append a virtual ';' at the end of
	* the pattern string, if there isn't one already. When we get to
	* the index pattern.length() (that is, one past the end), we
	* create a virtual ';' if necessary.
	*/
	char c = 0; // These are outside the loop so we can
	boolean isLiteral = false; // see the previous character...
	for (int i=0; i<=pattern.length(); ++i) {
	// Create the virtual trailing ';' if necessary
	if (i == pattern.length()) {
	// If the last character was not a non-literal ';'...
	if (i > 0 && !(c == SEMICOLON && !isLiteral)) {
	c = SEMICOLON;
	isLiteral = false;
	} else {
	break;
	}
	} else {
	c = pattern.charAt(i);
	isLiteral = false;
	}

	if (c == BACKSLASH) {
	if ((i+1)<pattern.length()) {
	isLiteral = true;
	c = pattern.charAt(++i);
	} else {
	// Trailing '\\'
	throw new IllegalArgumentException("Trailing '\\'");
	}
	}

	if (!isLiteral) {
	switch (c) {
	case POUND:
	// Seeing a '#' moves us from mode 0 (prefix) to mode 1
	// (optional digits).
	if (mode == 0) {
	++mode;
	} else if (mode != 1) {
	// Unquoted '#'
	throw new IllegalArgumentException("Unquoted '#'");
	}
	++maxDigits;
	break;
	case ZERO:
	// Seeing a '0' moves us to mode 2 (required digits)
	if (mode < 2) {
	mode = 2;
	} else if (mode != 2) {
	// Unquoted '0'
	throw new IllegalArgumentException("Unquoted '0'");
	}
	++minDigits;
	++maxDigits;
	break;
	case SEMICOLON:
	if (minDigits < 1 \|\| maxDigits > 4
	// Invalid min/max digit count
	\|\| prefixLen > 0xFFFF \|\| suffixLen > 0xFFFF) {
	// Suffix or prefix too long
	throw new IllegalArgumentException("Suffix or prefix too long");
	}
	// If there was no prefix and no suffix, then the
	// header will not have been allocated yet. We need
	// allocate the header now.
	if (start == affixes.length()) {
	affixes.append("AAAA");
	}
	// Fill in 4-character header
	affixes.setCharAt(start++, (char) prefixLen);
	affixes.setCharAt(start++, (char) suffixLen);
	affixes.setCharAt(start++, (char) minDigits);
	affixes.setCharAt(start, (char) maxDigits);
	start = affixes.length();
	++affixCount;
	prefixLen = suffixLen = minDigits = maxDigits = mode = 0;
	break;
	default:
	isLiteral = true;
	break;
	}
	}

	if (isLiteral) {
	if (start == affixes.length()) {
	// Make space for the header. Append any four
	// characters as place holders for the header values.
	// We fill these in when we parse the ';'.
	affixes.append("AAAA");
	}
	affixes.append(c);
	if (mode == 0) {
	++prefixLen;
	} else {
	// Any literal outside the prefix moves us into mode 3
	// (suffix)
	mode = 3;
	++suffixLen;
	}
	}
	}

	// We only modify the pattern and affixes member variables if
	// we get to this point, that is, if the parse succeeds.
	this.pattern = pattern;
	int len = affixes.length();
	this.affixes = new char[len];
	Utility.getChars(affixes, 0, len, this.affixes, 0);
	}

	/**
	* Return this transliterator's pattern.
	*/
	public String toPattern() {
	return pattern;
	}

	/**
	* Implements {@link Transliterator#handleTransliterate}.
	*/
	protected void handleTransliterate(Replaceable text,
	Position offsets, boolean isIncremental) {
	int cursor = offsets.start;
	int limit = offsets.limit;
	int i, j, ipat;

	loop:
	while (cursor < limit) {
	// Loop over the specs in affixes. If affixCount is zero (an
	// empty pattern), then we do nothing. We exit this loop when
	// we match one of the specs. We exit this function (by
	// jumping to exit: below) if a partial match is detected and
	// isIncremental is true.
	for (j=0, ipat=0; j<affixCount; ++j) {

	// Read the header
	int prefixLen = affixes[ipat++];
	int suffixLen = affixes[ipat++];
	int minDigits = affixes[ipat++];
	int maxDigits = affixes[ipat++];

	// curs is a copy of cursor that is advanced over the
	// characters as we parse them.
	int curs = cursor;
	boolean match = true;

	for (i=0; i<prefixLen; ++i) {
	if (curs >= limit) {
	if (i > 0) {
	// We've already matched a character. This is
	// a partial match, so we return if in
	// incremental mode. In non-incremental mode,
	// go to the next spec.
	if (isIncremental) {
	break loop;
	}
	match = false;
	break;
	}
	}
	char c = text.charAt(curs++);
	if (c != affixes[ipat + i]) {
	match = false;
	break;
	}
	}

	if (match) {
	char u = 0;
	int digitCount = 0;
	for (;;) {
	if (curs >= limit) {
	// Check for partial match in incremental mode.
	if (curs > cursor && isIncremental) {
	break loop;
	}
	break;
	}
	int digit = Character.digit(text.charAt(curs), 16);
	if (digit < 0) {
	break;
	}
	++curs;
	u <<= 4;
	u \|= (char) digit;
	if (++digitCount == maxDigits) {
	break;
	}
	}

	match = (digitCount >= minDigits);

	if (match) {
	for (i=0; i<suffixLen; ++i) {
	if (curs >= limit) {
	// Check for partial match in incremental mode.
	if (curs > cursor && isIncremental) {
	break loop;
	}
	match = false;
	break;
	}
	char c = text.charAt(curs++);
	if (c != affixes[ipat + prefixLen + i]) {
	match = false;
	break;
	}
	}

	if (match) {
	// At this point, we have a match
	text.replace(cursor, curs, String.valueOf(u));
	limit -= curs - cursor - 1;
	// The following break statement leaves the
	// loop that is traversing the specs in
	// affixes. We then parse the next input
	// character.
	break;
	}
	}
	}

	ipat += prefixLen + suffixLen;
	}

	++cursor;
	}

	offsets.contextLimit += limit - offsets.limit;
	offsets.limit = limit;
	offsets.start = cursor;
	}
	}