| /* |
| ********************************************************************** |
| * Copyright (C) 1999, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 11/17/99 aliu Creation. |
| ********************************************************************** |
| */ |
| #include "unicode/hextouni.h" |
| #include "unicode/rep.h" |
| #include "unicode/unifilt.h" |
| #include "unicode/unicode.h" |
| |
| /** |
| * ID for this transliterator. |
| */ |
| const char* HexToUnicodeTransliterator::_ID = "Hex-Unicode"; |
| |
| /** |
| * This pattern encodes the following specs for the default constructor: |
| * \\u0000 |
| * \\U0000 |
| * u+0000 |
| * U+0000 |
| * The multiple backslashes resolve to a single backslash |
| * in the effective prefix. |
| */ |
| const UnicodeString HexToUnicodeTransliterator::DEFAULT_PATTERN = |
| UNICODE_STRING("\\\\u0000;\\\\U0000;u+0000;U+0000", 29); |
| |
| /** |
| * Constructs a transliterator. |
| */ |
| HexToUnicodeTransliterator::HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter) : |
| Transliterator(_ID, adoptedFilter) { |
| // We don't need to pass the status back to the caller because |
| // we know that the DEFAULT_PATTERN parses. |
| UErrorCode status = U_ZERO_ERROR; |
| applyPattern(DEFAULT_PATTERN, status); |
| } |
| |
| /** |
| * Constructs a transliterator. |
| */ |
| HexToUnicodeTransliterator::HexToUnicodeTransliterator(const UnicodeString& thePattern, |
| UErrorCode& status) : |
| Transliterator(_ID, 0) { |
| applyPattern(thePattern, status); |
| } |
| |
| /** |
| * Constructs a transliterator. |
| */ |
| HexToUnicodeTransliterator::HexToUnicodeTransliterator(const UnicodeString& thePattern, |
| UnicodeFilter* adoptedFilter, |
| UErrorCode& status) : |
| Transliterator(_ID, adoptedFilter) { |
| applyPattern(thePattern, status); |
| } |
| |
| /** |
| * Copy constructor. |
| */ |
| HexToUnicodeTransliterator::HexToUnicodeTransliterator(const HexToUnicodeTransliterator& o) : |
| Transliterator(o), |
| pattern(o.pattern), |
| affixes(o.affixes), |
| affixCount(o.affixCount) { |
| } |
| |
| /** |
| * Assignment operator. |
| */ |
| HexToUnicodeTransliterator& HexToUnicodeTransliterator::operator=( |
| const HexToUnicodeTransliterator& o) { |
| Transliterator::operator=(o); |
| pattern = o.pattern; |
| affixes = o.affixes; |
| affixCount = o.affixCount; |
| return *this; |
| } |
| |
| /** |
| * Transliterator API. |
| */ |
| Transliterator* HexToUnicodeTransliterator::clone(void) const { |
| return new HexToUnicodeTransliterator(*this); |
| } |
| |
| void HexToUnicodeTransliterator::applyPattern(const UnicodeString& thePattern, |
| UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| /* The pattern is processed and stored in affixes. The pattern |
| * consists of zero or more affixes. Each affix is parsed to |
| * determine the prefix, suffix, minimum digit count, and maximum |
| * digit count. These values are then stored as a four character |
| * header. That is, their numeric values are cast to UChars and |
| * stored in the string. Following these four characters, the prefix |
| * characters, then suffix characters are stored. Each spec takes |
| * n+4 characters, where n is the total length of the prefix and |
| * suffix. |
| */ |
| |
| // POSSIBILE FUTURE MODIFICATION |
| // Parse thePattern, and if this succeeds, set pattern to thePattern. |
| // If it fails, call applyPattern(pattern) to restore the original |
| // conditions. |
| |
| pattern = thePattern; |
| affixes.truncate(0); |
| affixCount = 0; |
| |
| /* The mode specifies where we are in each spec. |
| * mode 0 = in prefix |
| * mode 1 = in optional digits (#) |
| * mode 2 = in required digits (0) |
| * mode 3 = in suffix |
| */ |
| int32_t mode = 0; |
| |
| int32_t prefixLen = 0, suffixLen = 0, minDigits = 0, maxDigits = 0; |
| int32_t start = 0; |
| |
| /* To make parsing easier, we append a virtual ';' at the end of |
| * the pattern string, if there isn't one already. When we get to |
| * the index pattern.length() (that is, one past the end), we |
| * create a virtual ';' if necessary. |
| */ |
| UChar c; // These are outside the loop so we can see the |
| bool_t isLiteral; // previous character... |
| for (int32_t i=0; i<=pattern.length(); ++i) { |
| // Create the virtual trailing ';' if necessary |
| if (i == pattern.length()) { |
| // If the last character was not a non-literal ';'... |
| if (i > 0 && !(c == SEMICOLON && !isLiteral)) { |
| c = SEMICOLON; |
| isLiteral = FALSE; |
| } else { |
| break; |
| } |
| } else { |
| c = pattern.charAt(i); |
| isLiteral = FALSE; |
| } |
| |
| if (c == BACKSLASH) { |
| if ((i+1)<pattern.length()) { |
| isLiteral = TRUE; |
| c = pattern.charAt(++i); |
| } else { |
| // Trailing '\\' |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| } |
| |
| if (!isLiteral) { |
| switch (c) { |
| case POUND: |
| // Seeing a '#' moves us from mode 0 (prefix) to mode 1 |
| // (optional digits). |
| if (mode == 0) { |
| ++mode; |
| } else if (mode != 1) { |
| // Unquoted '#' |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| ++maxDigits; |
| break; |
| case ZERO: |
| // Seeing a '0' moves us to mode 2 (required digits) |
| if (mode < 2) { |
| mode = 2; |
| } else if (mode != 2) { |
| // Unquoted '0' |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| ++minDigits; |
| ++maxDigits; |
| break; |
| case SEMICOLON: |
| if (minDigits < 1 || maxDigits > 4 |
| // Invalid min/max digit count |
| || prefixLen > 0xFFFF || suffixLen > 0xFFFF) { |
| // Suffix or prefix too long |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| // If there was no prefix and no suffix, then the |
| // header will not have been allocated yet. We need |
| // allocate the header now. |
| if (start == affixes.length()) { |
| affixes.append(UNICODE_STRING("AAAA", 4)); |
| } |
| // Fill in 4-character header |
| affixes.setCharAt(start++, (UChar) prefixLen); |
| affixes.setCharAt(start++, (UChar) suffixLen); |
| affixes.setCharAt(start++, (UChar) minDigits); |
| affixes.setCharAt(start++, (UChar) maxDigits); |
| start = affixes.length(); |
| ++affixCount; |
| prefixLen = suffixLen = minDigits = maxDigits = mode = 0; |
| break; |
| default: |
| isLiteral = TRUE; |
| break; |
| } |
| } |
| |
| if (isLiteral) { |
| if (start == affixes.length()) { |
| // Make space for the header. Append any four |
| // characters as place holders for the header values. |
| // We fill these in when we parse the ';'. |
| affixes.append(UNICODE_STRING("AAAA", 4)); |
| } |
| affixes.append(c); |
| if (mode == 0) { |
| ++prefixLen; |
| } else { |
| // Any literal outside the prefix moves us into mode 3 |
| // (suffix) |
| mode = 3; |
| ++suffixLen; |
| } |
| } |
| } |
| } |
| |
| const UnicodeString& HexToUnicodeTransliterator::toPattern(void) const { |
| return pattern; |
| } |
| |
| void HexToUnicodeTransliterator::handleTransliterate(Replaceable& text, Position& offsets, |
| bool_t isIncremental) const { |
| int32_t cursor = offsets.cursor; |
| int32_t limit = offsets.limit; |
| int32_t i, j, ipat; |
| |
| // This is a temporary one-character string |
| UnicodeString str = UNICODE_STRING("A", 1); |
| |
| while (cursor < limit) { |
| // Loop over the specs in affixes. If affixCount is zero (an |
| // empty pattern), then we do nothing. We exit this loop when |
| // we match one of the specs. We exit this function (by |
| // jumping to exit: below) if a partial match is detected and |
| // isIncremental is true. |
| for (j=0, ipat=0; j<affixCount; ++j) { |
| |
| // Read the header |
| int32_t prefixLen = affixes.charAt(ipat++); |
| int32_t suffixLen = affixes.charAt(ipat++); |
| int32_t minDigits = affixes.charAt(ipat++); |
| int32_t maxDigits = affixes.charAt(ipat++); |
| |
| // curs is a copy of cursor that is advanced over the |
| // characters as we parse them. |
| int32_t curs = cursor; |
| bool_t match = TRUE; |
| |
| for (i=0; i<prefixLen; ++i) { |
| if (curs >= limit) { |
| if (i > 0) { |
| // We've already matched a character. This is |
| // a partial match, so we return if in |
| // incremental mode. In non-incremental mode, |
| // go to the next spec. |
| if (isIncremental) { |
| goto exit; |
| } |
| match = FALSE; |
| break; |
| } |
| } |
| UChar c = filteredCharAt(text, curs++); |
| if (c != affixes.charAt(ipat + i)) { |
| match = FALSE; |
| break; |
| } |
| } |
| |
| if (match) { |
| UChar u = 0; |
| int32_t digitCount = 0; |
| for (;;) { |
| if (curs >= limit) { |
| // Check for partial match in incremental mode. |
| if (curs > cursor && isIncremental) { |
| goto exit; |
| } |
| break; |
| } |
| int8_t digit = Unicode::digit(filteredCharAt(text, curs), 16); |
| if (digit < 0) { |
| break; |
| } |
| ++curs; |
| u <<= 4; |
| u |= digit; |
| if (++digitCount == maxDigits) { |
| break; |
| } |
| } |
| |
| match = (digitCount >= minDigits); |
| |
| if (match) { |
| for (i=0; i<suffixLen; ++i) { |
| if (curs >= limit) { |
| // Check for partial match in incremental mode. |
| if (curs > cursor && isIncremental) { |
| goto exit; |
| } |
| match = FALSE; |
| break; |
| } |
| UChar c = filteredCharAt(text, curs++); |
| if (c != affixes.charAt(ipat + prefixLen + i)) { |
| match = FALSE; |
| break; |
| } |
| } |
| |
| if (match) { |
| // At this point, we have a match |
| str.setCharAt(0, u); |
| text.handleReplaceBetween(cursor, curs, str); |
| limit -= curs - cursor - 1; |
| // The following break statement leaves the |
| // loop that is traversing the specs in |
| // affixes. We then parse the next input |
| // character. |
| break; |
| } |
| } |
| } |
| |
| ipat += prefixLen + suffixLen; |
| } |
| |
| ++cursor; |
| } |
| |
| exit: |
| offsets.limit = limit; |
| offsets.cursor = cursor; |
| } |