| /* |
| ********************************************************************** |
| * Copyright (c) 2001-2011, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 11/19/2001 aliu Creation. |
| ********************************************************************** |
| */ |
| package com.ibm.icu.text; |
| import com.ibm.icu.impl.Utility; |
| import com.ibm.icu.lang.UCharacter; |
| |
| /** |
| * A transliterator that converts Unicode escape forms to the |
| * characters they represent. Escape forms have a prefix, a suffix, a |
| * radix, and minimum and maximum digit counts. |
| * |
| * <p>This class is package private. It registers several standard |
| * variants with the system which are then accessed via their IDs. |
| * |
| * @author Alan Liu |
| */ |
| class UnescapeTransliterator extends Transliterator { |
| |
| /** |
| * The encoded pattern specification. The pattern consists of |
| * zero or more forms. Each form consists of a prefix, suffix, |
| * radix, minimum digit count, and maximum digit count. These |
| * values are stored as a five character header. That is, their |
| * numeric values are cast to 16-bit characters and stored in the |
| * string. Following these five characters, the prefix |
| * characters, then suffix characters are stored. Each form thus |
| * takes n+5 characters, where n is the total length of the prefix |
| * and suffix. The end is marked by a header of length one |
| * consisting of the character END. |
| */ |
| private char spec[]; |
| |
| /** |
| * Special character marking the end of the spec[] array. |
| */ |
| private static final char END = 0xFFFF; |
| |
| /** |
| * Registers standard variants with the system. Called by |
| * Transliterator during initialization. |
| */ |
| static void register() { |
| // Unicode: "U+10FFFF" hex, min=4, max=6 |
| Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() { |
| public Transliterator getInstance(String ID) { |
| return new UnescapeTransliterator("Hex-Any/Unicode", new char[] { |
| 2, 0, 16, 4, 6, 'U', '+', |
| END |
| }); |
| } |
| }); |
| |
| // Java: "\\uFFFF" hex, min=4, max=4 |
| Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() { |
| public Transliterator getInstance(String ID) { |
| return new UnescapeTransliterator("Hex-Any/Java", new char[] { |
| 2, 0, 16, 4, 4, '\\', 'u', |
| END |
| }); |
| } |
| }); |
| |
| // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 |
| Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() { |
| public Transliterator getInstance(String ID) { |
| return new UnescapeTransliterator("Hex-Any/C", new char[] { |
| 2, 0, 16, 4, 4, '\\', 'u', |
| 2, 0, 16, 8, 8, '\\', 'U', |
| END |
| }); |
| } |
| }); |
| |
| // XML: "" hex, min=1, max=6 |
| Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() { |
| public Transliterator getInstance(String ID) { |
| return new UnescapeTransliterator("Hex-Any/XML", new char[] { |
| 3, 1, 16, 1, 6, '&', '#', 'x', ';', |
| END |
| }); |
| } |
| }); |
| |
| // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any") |
| Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() { |
| public Transliterator getInstance(String ID) { |
| return new UnescapeTransliterator("Hex-Any/XML10", new char[] { |
| 2, 1, 10, 1, 7, '&', '#', ';', |
| END |
| }); |
| } |
| }); |
| |
| // Perl: "\\x{263A}" hex, min=1, max=6 |
| Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() { |
| public Transliterator getInstance(String ID) { |
| return new UnescapeTransliterator("Hex-Any/Perl", new char[] { |
| 3, 1, 16, 1, 6, '\\', 'x', '{', '}', |
| END |
| }); |
| } |
| }); |
| |
| // All: Java, C, Perl, XML, XML10, Unicode |
| Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() { |
| public Transliterator getInstance(String ID) { |
| return new UnescapeTransliterator("Hex-Any", new char[] { |
| 2, 0, 16, 4, 6, 'U', '+', // Unicode |
| 2, 0, 16, 4, 4, '\\', 'u', // Java |
| 2, 0, 16, 8, 8, '\\', 'U', // C (surrogates) |
| 3, 1, 16, 1, 6, '&', '#', 'x', ';', // XML |
| 2, 1, 10, 1, 7, '&', '#', ';', // XML10 |
| 3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl |
| END |
| }); |
| } |
| }); |
| } |
| |
| /** |
| * Package private constructor. Takes the encoded spec array. |
| */ |
| UnescapeTransliterator(String ID, char spec[]) { |
| super(ID, null); |
| this.spec = spec; |
| } |
| |
| /** |
| * Implements {@link Transliterator#handleTransliterate}. |
| */ |
| protected void handleTransliterate(Replaceable text, |
| Position pos, boolean isIncremental) { |
| int start = pos.start; |
| int limit = pos.limit; |
| int i, ipat; |
| |
| loop: |
| while (start < limit) { |
| // Loop over the forms in spec[]. Exit this loop when we |
| // match one of the specs. Exit the outer loop if a |
| // partial match is detected and isIncremental is true. |
| for (ipat = 0; spec[ipat] != END;) { |
| |
| // Read the header |
| int prefixLen = spec[ipat++]; |
| int suffixLen = spec[ipat++]; |
| int radix = spec[ipat++]; |
| int minDigits = spec[ipat++]; |
| int maxDigits = spec[ipat++]; |
| |
| // s is a copy of start that is advanced over the |
| // characters as we parse them. |
| int s = start; |
| boolean match = true; |
| |
| for (i=0; i<prefixLen; ++i) { |
| if (s >= limit) { |
| if (i > 0) { |
| // We've already matched a character. This is |
| // a partial match, so we return if in |
| // incremental mode. In non-incremental mode, |
| // go to the next spec. |
| if (isIncremental) { |
| break loop; |
| } |
| match = false; |
| break; |
| } |
| } |
| char c = text.charAt(s++); |
| if (c != spec[ipat + i]) { |
| match = false; |
| break; |
| } |
| } |
| |
| if (match) { |
| int u = 0; |
| int digitCount = 0; |
| for (;;) { |
| if (s >= limit) { |
| // Check for partial match in incremental mode. |
| if (s > start && isIncremental) { |
| break loop; |
| } |
| break; |
| } |
| int ch = text.char32At(s); |
| int digit = UCharacter.digit(ch, radix); |
| if (digit < 0) { |
| break; |
| } |
| s += UTF16.getCharCount(ch); |
| u = (u * radix) + digit; |
| if (++digitCount == maxDigits) { |
| break; |
| } |
| } |
| |
| match = (digitCount >= minDigits); |
| |
| if (match) { |
| for (i=0; i<suffixLen; ++i) { |
| if (s >= limit) { |
| // Check for partial match in incremental mode. |
| if (s > start && isIncremental) { |
| break loop; |
| } |
| match = false; |
| break; |
| } |
| char c = text.charAt(s++); |
| if (c != spec[ipat + prefixLen + i]) { |
| match = false; |
| break; |
| } |
| } |
| |
| if (match) { |
| // At this point, we have a match |
| String str = UTF16.valueOf(u); |
| text.replace(start, s, str); |
| limit -= s - start - str.length(); |
| // The following break statement leaves the |
| // loop that is traversing the forms in |
| // spec[]. We then parse the next input |
| // character. |
| break; |
| } |
| } |
| } |
| |
| ipat += prefixLen + suffixLen; |
| } |
| |
| if (start < limit) { |
| start += UTF16.getCharCount(text.char32At(start)); |
| } |
| } |
| |
| pos.contextLimit += limit - pos.limit; |
| pos.limit = limit; |
| pos.start = start; |
| } |
| |
| /* (non-Javadoc) |
| * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) |
| */ |
| @Override |
| public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { |
| // Each form consists of a prefix, suffix, |
| // * radix, minimum digit count, and maximum digit count. These |
| // * values are stored as a five character header. ... |
| UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); |
| UnicodeSet items = new UnicodeSet(); |
| StringBuilder buffer = new StringBuilder(); |
| for (int i = 0; spec[i] != END;) { |
| // first 5 items are header |
| int end = i + spec[i] + spec[i+1] + 5; |
| int radix = spec[i+2]; |
| for (int j = 0; j < radix; ++j) { |
| Utility.appendNumber(buffer, j, radix, 0); |
| } |
| // then add the characters |
| for (int j = i + 5; j < end; ++j) { |
| items.add(spec[j]); |
| } |
| // and go to next block |
| i = end; |
| } |
| items.addAll(buffer.toString()); |
| items.retainAll(myFilter); |
| |
| if (items.size() > 0) { |
| sourceSet.addAll(items); |
| targetSet.addAll(0,0x10FFFF); // assume we can produce any character |
| } |
| } |
| } |