| /* |
| ********************************************************************** |
| * Copyright (C) 1999, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 11/17/99 aliu Creation. |
| ********************************************************************** |
| */ |
| #ifndef RBT_H |
| #define RBT_H |
| |
| #include "unicode/translit.h" |
| #include "unicode/utypes.h" |
| #include "unicode/parseerr.h" |
| |
| class TransliterationRuleData; |
| |
| /** |
| * A transliterator that reads a set of rules in order to determine how to perform |
| * translations. Rules are stored in resource bundles indexed by name. Rules are separated by |
| * semicolons (';'). To include a literal semicolon, prefix it with a backslash ('\;'). |
| * Whitespace, as defined by <code>Character.isWhitespace()</code>, is ignored. If the first |
| * non-blank character on a line is '#', the entire line is ignored as a comment. </p> |
| * |
| * <p>Each set of rules consists of two groups, one forward, and one reverse. This is a |
| * convention that is not enforced; rules for one direction may be omitted, with the result |
| * that translations in that direction will not modify the source text. </p> |
| * |
| * <p><b>Rule syntax</b> </p> |
| * |
| * <p>Rule statements take one of the following forms: |
| * |
| * <dl> |
| * <dt><code>alefmadda=\u0622</code></dt> |
| * <dd><strong>Variable definition.</strong> The name on the left is assigned the character or |
| * expression on the right. Names may not contain any special characters (see list below). |
| * Duplicate names (including duplicates of simple variables or category names) cause an |
| * exception to be thrown. If the right hand side consists of one character, then the |
| * variable stands for that character. In this example, after this statement, instances of |
| * the left hand name surrounded by braces, "<code>{alefmadda}</code>", will be |
| * replaced by the Unicode character U+0622. If the right hand side is longer than one |
| * character, then it is interpreted as a character category expression; see below for |
| * details.</dd> |
| * <dt> </dt> |
| * <dt><code>softvowel=[eiyEIY]</code></dt> |
| * <dd><strong>Category definition.</strong> The name on the left is assigned to stand for a |
| * set of characters. The same rules for names of simple variables apply. After this |
| * statement, the left hand variable will be interpreted as indicating a set of characters in |
| * appropriate contexts. The pattern syntax defining sets of characters is defined by {@link |
| * UnicodeSet}. Examples of valid patterns are:<table> |
| * <tr valign="top"> |
| * <td nowrap><code>[abc]</code></td> |
| * <td>The set containing the characters 'a', 'b', and 'c'.</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>[^abc]</code></td> |
| * <td>The set of all characters <em>except</em> 'a', 'b', and 'c'.</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>[A-Z]</code></td> |
| * <td>The set of all characters from 'A' to 'Z' in Unicode order.</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>[:Lu:]</code></td> |
| * <td>The set of Unicode uppercase letters. See <a href="http://www.unicode.org">www.unicode.org</a> |
| * for a complete list of categories and their two-letter codes.</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td> |
| * <td>The set of all characters <em>except</em> 'a' through 'z' and uppercase or lowercase |
| * letters.</td> |
| * </tr> |
| * </table> |
| * <p>See {@link UnicodeSet} for more documentation and examples. </p> |
| * </dd> |
| * <dt><code>ai>{alefmadda}</code></dt> |
| * <dd><strong>Forward translation rule.</strong> This rule states that the string on the left |
| * will be changed to the string on the right when performing forward transliteration.</dd> |
| * <dt> </dt> |
| * <dt><code>ai<{alefmadda}</code></dt> |
| * <dd><strong>Reverse translation rule.</strong> This rule states that the string on the right |
| * will be changed to the string on the left when performing reverse transliteration.</dd> |
| * </dl> |
| * |
| * <dl> |
| * <dt><code>ai<>{alefmadda}</code></dt> |
| * <dd><strong>Bidirectional translation rule.</strong> This rule states that the string on the |
| * right will be changed to the string on the left when performing forward transliteration, |
| * and vice versa when performing reverse transliteration.</dd> |
| * </dl> |
| * |
| * <p>Forward and reverse translation rules consist of a <em>match pattern</em> and an <em>output |
| * string</em>. The match pattern consists of literal characters, optionally preceded by |
| * context, and optionally followed by context. Context characters, like literal pattern |
| * characters, must be matched in the text being transliterated. However, unlike literal |
| * pattern characters, they are not replaced by the output text. For example, the pattern |
| * "<code>(abc)def</code>" indicates the characters "<code>def</code>" |
| * must be preceded by "<code>abc</code>" for a successful match. If there is a |
| * successful match, "<code>def</code>" will be replaced, but not "<code>abc</code>". |
| * The initial '<code>(</code>' is optional, so "<code>abc)def</code>" is |
| * equivalent to "<code>(abc)def</code>". Another example is "<code>123(456)</code>" |
| * (or "<code>123(456</code>") in which the literal pattern "<code>123</code>" |
| * must be followed by "<code>456</code>". </p> |
| * |
| * <p>The output string of a forward or reverse rule consists of characters to replace the |
| * literal pattern characters. If the output string contains the character '<code>|</code>', |
| * this is taken to indicate the location of the <em>cursor</em> after replacement. The |
| * cursor is the point in the text at which the next replacement, if any, will be applied. </p> |
| * |
| * <p>In addition to being defined in variables, <code>UnicodeSet</code> patterns may be |
| * embedded directly into rule strings. Thus, the following two rules are equivalent:</p> |
| * |
| * <blockquote> |
| * <p><code>vowel=[aeiou]; {vowel}>*; # One way to do this<br> |
| * [aeiou]>*; |
| * # |
| * Another way</code></p> |
| * </blockquote> |
| * |
| * <p><b>Example</b> </p> |
| * |
| * <p>The following example rules illustrate many of the features of the rule language. </p> |
| * |
| * <table cellpadding="4"> |
| * <tr valign="top"> |
| * <td>Rule 1.</td> |
| * <td nowrap><code>(abc)def>x|y</code></td> |
| * </tr> |
| * <tr valign="top"> |
| * <td>Rule 2.</td> |
| * <td nowrap><code>xyz>r</code></td> |
| * </tr> |
| * <tr valign="top"> |
| * <td>Rule 3.</td> |
| * <td nowrap><code>yz>q</code></td> |
| * </tr> |
| * </table> |
| * |
| * <p>Applying these rules to the string "<code>adefabcdefz</code>" yields the |
| * following results: </p> |
| * |
| * <table cellpadding="4"> |
| * <tr valign="top"> |
| * <td nowrap><code>|adefabcdefz</code></td> |
| * <td>Initial state, no rules match. Advance cursor.</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>a|defabcdefz</code></td> |
| * <td>Still no match. Rule 1 does not match because the preceding context is not present.</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>ad|efabcdefz</code></td> |
| * <td>Still no match. Keep advancing until there is a match...</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>ade|fabcdefz</code></td> |
| * <td>...</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>adef|abcdefz</code></td> |
| * <td>...</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>adefa|bcdefz</code></td> |
| * <td>...</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>adefab|cdefz</code></td> |
| * <td>...</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>adefabc|defz</code></td> |
| * <td>Rule 1 matches; replace "<code>def</code>" with "<code>xy</code>" |
| * and back up the cursor to before the '<code>y</code>'.</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>adefabcx|yz</code></td> |
| * <td>Although "<code>xyz</code>" is present, rule 2 does not match because the |
| * cursor is before the '<code>y</code>', not before the '<code>x</code>'. Rule 3 does match. |
| * Replace "<code>yz</code>" with "<code>q</code>".</td> |
| * </tr> |
| * <tr valign="top"> |
| * <td nowrap><code>adefabcxq|</code></td> |
| * <td>The cursor is at the end; transliteration is complete.</td> |
| * </tr> |
| * </table> |
| * |
| * <p>The order of rules is significant. If multiple rules may match at some point, the first |
| * matching rule is applied. </p> |
| * |
| * <p>Forward and reverse rules may have an empty output string. Otherwise, an empty left or |
| * right hand side of any statement is a syntax error. </p> |
| * |
| * <p>Single quotes are used to quote the special characters <code>=><{}[]()|</code>. |
| * To specify a single quote itself, inside or outside of quotes, use two single quotes in a |
| * row. For example, the rule "<code>'>'>o''clock</code>" changes the string |
| * "<code>></code>" to the string "<code>o'clock</code>". </p> |
| * |
| * <p><b>Notes</b> </p> |
| * |
| * <p>While a RuleBasedTransliterator is being built, it checks that the rules are added in |
| * proper order. For example, if the rule "a>x" is followed by the rule |
| * "ab>y", then the second rule will throw an exception. The reason is that the |
| * second rule can never be triggered, since the first rule always matches anything it |
| * matches. In other words, the first rule <em>masks</em> the second rule. </p> |
| * |
| * @author Alan Liu |
| * @draft |
| */ |
| class U_I18N_API RuleBasedTransliterator : public Transliterator { |
| |
| /** |
| * The data object is immutable, so we can freely share it with |
| * other instances of RBT, as long as we do NOT own this object. |
| */ |
| TransliterationRuleData* data; |
| |
| /** |
| * If true, we own the data object and must delete it. |
| */ |
| bool_t isDataOwned; |
| |
| public: |
| |
| /** |
| * Constructs a new transliterator from the given rules. |
| * @param rules rules, separated by ';' |
| * @param direction either FORWARD or REVERSE. |
| * @exception IllegalArgumentException if rules are malformed |
| * or direction is invalid. |
| * @draft |
| */ |
| RuleBasedTransliterator(const UnicodeString& ID, |
| const UnicodeString& rules, |
| Direction direction, |
| UnicodeFilter* adoptedFilter, |
| ParseError& parseError, |
| UErrorCode& status); |
| |
| /** |
| * Constructs a new transliterator from the given rules. |
| * @param rules rules, separated by ';' |
| * @param direction either FORWARD or REVERSE. |
| * @exception IllegalArgumentException if rules are malformed |
| * or direction is invalid. |
| */ |
| RuleBasedTransliterator(const UnicodeString& ID, |
| const UnicodeString& rules, |
| Direction direction, |
| UnicodeFilter* adoptedFilter, |
| UErrorCode& status); |
| |
| /** |
| * Covenience constructor with no filter. |
| * @draft |
| */ |
| RuleBasedTransliterator(const UnicodeString& ID, |
| const UnicodeString& rules, |
| Direction direction, |
| UErrorCode& status); |
| |
| /** |
| * Covenience constructor with no filter and FORWARD direction. |
| * @draft |
| */ |
| RuleBasedTransliterator(const UnicodeString& ID, |
| const UnicodeString& rules, |
| UErrorCode& status); |
| |
| /** |
| * Covenience constructor with FORWARD direction. |
| * @draft |
| */ |
| RuleBasedTransliterator(const UnicodeString& ID, |
| const UnicodeString& rules, |
| UnicodeFilter* adoptedFilter, |
| UErrorCode& status); |
| |
| /** |
| * Covenience constructor. |
| * @draft |
| */ |
| RuleBasedTransliterator(const UnicodeString& ID, |
| const TransliterationRuleData* theData, |
| UnicodeFilter* adoptedFilter = 0); |
| |
| /** |
| * Copy constructor. |
| * @draft |
| */ |
| RuleBasedTransliterator(const RuleBasedTransliterator&); |
| |
| virtual ~RuleBasedTransliterator(); |
| |
| /** |
| * Implement Transliterator API. |
| * @draft |
| */ |
| Transliterator* clone(void) const; |
| |
| /** |
| * Implements {@link Transliterator#handleTransliterate}. |
| * @draft |
| */ |
| virtual void handleTransliterate(Replaceable& text, Position& offsets, |
| bool_t isIncremental) const; |
| |
| /** |
| * Parse error codes generated by RuleBasedTransliterator. |
| * See parseerr.h. |
| */ |
| enum { |
| PARSE_ERROR_BASE = 0x10000, |
| DUPLICATE_VARIABLE_DEFINITION, |
| MALFORMED_RHS, |
| MALFORMED_RULE, |
| MALFORMED_SET, |
| MALFORMED_UNICODE_ESCAPE, |
| MALFORMED_VARIABLE_REFERENCE, |
| MISSING_OPERATOR, |
| MULTIPLE_ANTE_CONTEXTS, |
| MULTIPLE_CURSORS, |
| MULTIPLE_POST_CONTEXTS, |
| TEXT_AFTER_CLOSE_CONTEXT, |
| TRAILING_BACKSLASH, |
| UNDEFINED_VARIABLE, |
| UNEXPECTED_CLOSE_CONTEXT, |
| UNQUOTED_SPECIAL, |
| UNTERMINATED_QUOTE |
| }; |
| |
| private: |
| |
| void _construct(const UnicodeString& rules, |
| Direction direction, |
| UErrorCode& status, |
| ParseError* parseError = 0); |
| }; |
| |
| /** |
| * Constructs a new transliterator from the given rules. |
| * @param rules rules, separated by ';' |
| * @param direction either FORWARD or REVERSE. |
| * @exception IllegalArgumentException if rules are malformed |
| * or direction is invalid. |
| */ |
| inline RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& ID, |
| const UnicodeString& rules, |
| Direction direction, |
| UnicodeFilter* adoptedFilter, |
| ParseError& parseError, |
| UErrorCode& status) : |
| Transliterator(ID, adoptedFilter) { |
| _construct(rules, direction, status, &parseError); |
| } |
| |
| /** |
| * Constructs a new transliterator from the given rules. |
| * @param rules rules, separated by ';' |
| * @param direction either FORWARD or REVERSE. |
| * @exception IllegalArgumentException if rules are malformed |
| * or direction is invalid. |
| */ |
| inline RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& ID, |
| const UnicodeString& rules, |
| Direction direction, |
| UnicodeFilter* adoptedFilter, |
| UErrorCode& status) : |
| Transliterator(ID, adoptedFilter) { |
| _construct(rules, direction, status); |
| } |
| |
| /** |
| * Covenience constructor with no filter. |
| */ |
| inline RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& ID, |
| const UnicodeString& rules, |
| Direction direction, |
| UErrorCode& status) : |
| Transliterator(ID, 0) { |
| _construct(rules, direction, status); |
| } |
| |
| /** |
| * Covenience constructor with no filter and FORWARD direction. |
| */ |
| inline RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& ID, |
| const UnicodeString& rules, |
| UErrorCode& status) : |
| Transliterator(ID, 0) { |
| _construct(rules, FORWARD, status); |
| } |
| |
| /** |
| * Covenience constructor with FORWARD direction. |
| */ |
| inline RuleBasedTransliterator::RuleBasedTransliterator( |
| const UnicodeString& ID, |
| const UnicodeString& rules, |
| UnicodeFilter* adoptedFilter, |
| UErrorCode& status) : |
| Transliterator(ID, adoptedFilter) { |
| _construct(rules, FORWARD, status); |
| } |
| |
| #endif |