package com.ibm.text;

import java.util.Dictionary;

/**
 * A transliteration rule used by
 * <code>RuleBasedTransliterator</code>.
 * <code>TransliterationRule</code> is an immutable object.
 *
 * <p>A rule consists of an input pattern and an output string.  When
 * the input pattern is matched, the output string is emitted.  The
 * input pattern consists of zero or more characters which are matched
 * exactly (the key) and optional context.  Context must match if it
 * is specified.  Context may be specified before the key, after the
 * key, or both.  The key, preceding context, and following context
 * may contain variables.  Variables represent a set of Unicode
 * characters, such as the letters <i>a</i> through <i>z</i>.
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined. 
 *
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
 * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.7 $ $Date: 2000/01/11 04:12:06 $
 *
 * $Log: TransliterationRule.java,v $
 * Revision 1.7  2000/01/11 04:12:06  Alan
 * Cleanup, embellish comments
 *
 * Revision 1.6  2000/01/11 02:25:03  Alan
 * Rewrite UnicodeSet and RBT parsers for better performance and new syntax
 *
 * Revision 1.5  2000/01/04 21:43:57  Alan
 * Add rule indexing, and move masking check to TransliterationRuleSet.
 *
 * Revision 1.4  1999/12/22 01:40:54  Alan
 * Consolidate rule pattern anteContext, key, and postContext into one string.
 *
 * Revision 1.3  1999/12/22 01:05:54  Alan
 * Improve masking checking; turn it off by default, for better performance
 *
 * Revision 1.2  1999/12/21 23:58:44  Alan
 * Detect a>x masking a>y
 *
 */
class TransliterationRule {
    /**
     * Constant returned by <code>getMatchDegree()</code> indicating a mismatch
     * between the text and this rule.  One or more characters of the context or
     * key do not match the text.
     * @see #getMatchDegree
     */
    public static final int MISMATCH      = 0;

    /**
     * Constant returned by <code>getMatchDegree()</code> indicating a partial
     * match between the text and this rule.  All characters of the text match
     * the corresponding context or key, but more characters are required for a
     * complete match.  There are some key or context characters at the end of
     * the pattern that remain unmatched because the text isn't long enough.
     * @see #getMatchDegree
     */
    public static final int PARTIAL_MATCH = 1;

    /**
     * Constant returned by <code>getMatchDegree()</code> indicating a complete
     * match between the text and this rule.  The text matches all context and
     * key characters.
     * @see #getMatchDegree
     */
    public static final int FULL_MATCH    = 2;

    /**
     * The string that must be matched, consisting of the anteContext, key,
     * and postContext, concatenated together, in that order.  Some components
     * may be empty (zero length).
     * @see anteContextLength
     * @see keyLength
     */
    private String pattern;

    /**
     * The string that is emitted if the key, anteContext, and postContext
     * are matched.
     */
    private String output;

    /**
     * The length of the string that must match before the key.  If
     * zero, then there is no matching requirement before the key.
     * Substring [0,anteContextLength) of pattern is the anteContext.
     */
    private int anteContextLength;

    /**
     * The length of the key.  Substring [anteContextLength,
     * anteContextLength + keyLength) is the key.
     */
    private int keyLength;

    /**
     * The position of the cursor after emitting the output string, from 0 to
     * output.length().  For most rules with no special cursor specification,
     * the cursorPos is output.length().
     */
    private int cursorPos;

    private static final String COPYRIGHT =
        "\u00A9 IBM Corporation 1999. All rights reserved.";

    /**
     * Construct a new rule with the given input, output text, and other
     * attributes.  A cursor position may be specified for the output text.
     * @param input input string, including key and optional ante and
     * post context
     * @param anteContextPos offset into input to end of ante context, or -1 if
     * none.  Must be <= input.length() if not -1.
     * @param postContextPos offset into input to start of post context, or -1
     * if none.  Must be <= input.length() if not -1, and must be >=
     * anteContextPos.
     * @param output output string
     * @param cursorPos offset into output at which cursor is located, or -1 if
     * none.  If less than zero, then the cursor is placed after the
     * <code>output</code>; that is, -1 is equivalent to
     * <code>output.length()</code>.  If greater than
     * <code>output.length()</code> then an exception is thrown.
     */
    public TransliterationRule(String input,
                               int anteContextPos, int postContextPos,
                               String output,
                               int cursorPos) {
        // Do range checks only when warranted to save time
        if (anteContextPos < 0) {
            anteContextLength = 0;
        } else {
            if (anteContextPos > input.length()) {
                throw new IllegalArgumentException("Invalid ante context");
            }
            anteContextLength = anteContextPos;
        }
        if (postContextPos < 0) {
            keyLength = input.length() - anteContextLength;
        } else {
            if (postContextPos < anteContextLength ||
                postContextPos > input.length()) {
                throw new IllegalArgumentException("Invalid post context");
            }
            keyLength = postContextPos - anteContextLength;
        }
        if (cursorPos < 0) {
            this.cursorPos = output.length();
        } else {
            if (cursorPos > input.length()) {
                throw new IllegalArgumentException("Invalid cursor position");
            }
            this.cursorPos = cursorPos;
        }
        pattern = input;
        this.output = output;
    }

    /**
     * Return the length of the key.  Equivalent to <code>getKey().length()</code>.
     * @return the length of the match key.
     */
    public int getKeyLength() {
        return keyLength;
    }

    /**
     * Return the output string.
     * @return the output string.
     */
    public String getOutput() {
        return output;
    }

    /**
     * Return the position of the cursor within the output string.
     * @return a value from 0 to <code>getOutput().length()</code>, inclusive.
     */
    public int getCursorPos() {
        return cursorPos;
    }

    /**
     * Return the preceding context length.  This method is needed to
     * support the <code>Transliterator</code> method
     * <code>getMaximumContextLength()</code>.
     */
    public int getAnteContextLength() {
        return anteContextLength;
    }

    /**
     * Internal method.  Returns 8-bit index value for this rule.
     * This is the low byte of the first character of the key,
     * unless the first character of the key is a set.  If it's a
     * set, or otherwise can match multiple keys, the index value is -1.
     */
    final int getIndexValue(Dictionary variables) {
        if (anteContextLength == pattern.length()) {
            // A pattern with just ante context {such as foo)>bar} can
            // match any key.
            return -1;
        }
        char c = pattern.charAt(anteContextLength);
        return variables.get(new Character(c)) == null ? (c & 0xFF) : -1;
    }

    /**
     * Internal method.  Returns true if this rule matches the given
     * index value.  The index value is an 8-bit integer, 0..255,
     * representing the low byte of the first character of the key.
     * It matches this rule if it matches the first character of the
     * key, or if the first character of the key is a set, and the set
     * contains any character with a low byte equal to the index
     * value.  If the rule contains only ante context, as in foo)>bar,
     * then it will match any key.
     */
    final boolean matchesIndexValue(int v, Dictionary variables) {
        if (anteContextLength == pattern.length()) {
            // A pattern with just ante context {such as foo)>bar} can
            // match any key.
            return true;
        }
        char c = pattern.charAt(anteContextLength);
        UnicodeSet set = (UnicodeSet) variables.get(new Character(c));
        return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
    }

    /**
     * Return true if this rule masks another rule.  If r1 masks r2 then
     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
     * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
     * "[c]a>x" masks "[dc]a>y".
     */
    public boolean masks(TransliterationRule r2) {
        /* Rule r1 masks rule r2 if the string formed of the
         * antecontext, key, and postcontext overlaps in the following
         * way:
         *
         * r1:      aakkkpppp
         * r2:     aaakkkkkpppp
         *            ^
         * 
         * The strings must be aligned at the first character of the
         * key.  The length of r1 to the left of the alignment point
         * must be <= the length of r2 to the left; ditto for the
         * right.  The characters of r1 must equal (or be a superset
         * of) the corresponding characters of r2.  The superset
         * operation should be performed to check for UnicodeSet
         * masking.
         */

        /* LIMITATION of the current mask algorithm: Some rule
         * maskings are currently not detected.  For example,
         * "{Lu}]a>x" masks "A]a>y".  To detect these sorts of masking,
         * we need a subset operator on UnicodeSet objects, which we
         * currently do not have.  This can be added later.
         */

        int left = anteContextLength;
        int left2 = r2.anteContextLength;
        int right = pattern.length() - left;
        int right2 = r2.pattern.length() - left2;
        return left <= left2 && right <= right2 &&
            r2.pattern.substring(left2 - left).startsWith(pattern);
    }

    /**
     * Return a string representation of this object.
     * @return string representation of this object
     */
    public String toString() {
        return getClass().getName() + '{'
            + escape((anteContextLength > 0 ? ("(" + pattern.substring(0, anteContextLength) +
                                              ") ") : "")
                     + pattern.substring(anteContextLength, anteContextLength + keyLength)
                     + (anteContextLength + keyLength < pattern.length() ?
                        (" (" + pattern.substring(anteContextLength + keyLength) + ")") : "")
                     + " > "
                     + (cursorPos < output.length()
                        ? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos))
                        : output))
            + '}';
    }

    /**
     * Return true if this rule matches the given text.  The text being matched
     * occupies a virtual buffer consisting of the contents of
     * <code>result</code> concatenated to a substring of <code>text</code>.
     * The substring is specified by <code>start</code> and <code>limit</code>.
     * The value of <code>cursor</code> is an index into this virtual buffer,
     * from 0 to the length of the buffer.  In terms of the parameters,
     * <code>cursor</code> must be between 0 and <code>result.length() + limit -
     * start</code>.
     * @param text the untranslated text
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param result translated text so far
     * @param cursor position at which to translate next, an offset into result.
     * If greater than or equal to result.length(), represents offset start +
     * cursor - result.length() into text.
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     */
    public final boolean matches(String text, int start, int limit,
                                 StringBuffer result, int cursor,
                                 Dictionary variables,
                                 UnicodeFilter filter) {
        // Match anteContext, key, and postContext
        return regionMatches(text, start, limit, result,
                             cursor - anteContextLength,
                             pattern, variables, filter);
    }

    /**
     * Return true if this rule matches the given text.
     * @param text the text, both translated and untranslated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param cursor position at which to translate next, representing offset
     * into text.  This value must be between <code>start</code> and
     * <code>limit</code>.
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     */
    public final boolean matches(Replaceable text, int start, int limit,
                                 int cursor, Dictionary variables,
                                 UnicodeFilter filter) {
        // Match anteContext, key, and postContext
        return regionMatches(text, start, limit,
                             cursor - anteContextLength,
                             pattern, variables, filter);
    }

    /**
     * Return the degree of match between this rule and the given text.  The
     * degree of match may be mismatch, a partial match, or a full match.  A
     * mismatch means at least one character of the text does not match the
     * context or key.  A partial match means some context and key characters
     * match, but the text is not long enough to match all of them.  A full
     * match means all context and key characters match.
     * @param text the text, both translated and untranslated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param cursor position at which to translate next, representing offset
     * into text.  This value must be between <code>start</code> and
     * <code>limit</code>.
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
     * <code>FULL_MATCH</code>.
     * @see #MISMATCH
     * @see #PARTIAL_MATCH
     * @see #FULL_MATCH
     */
    public int getMatchDegree(Replaceable text, int start, int limit,
                              int cursor, Dictionary variables,
                              UnicodeFilter filter) {
        int len = getRegionMatchLength(text, start, limit, cursor - anteContextLength,
                                       pattern, variables, filter);
        return len < anteContextLength ? MISMATCH :
            (len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH);
    }

    /**
     * Return true if a template matches the text.  The entire length of the
     * template is compared to the text at the cursor.  As in
     * <code>matches()</code>, the text being matched occupies a virtual buffer
     * consisting of the contents of <code>result</code> concatenated to a
     * substring of <code>text</code>.  See <code>matches()</code> for details.
     * @param text the untranslated text
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param result translated text so far
     * @param cursor position at which to translate next, an offset into result.
     * If greater than or equal to result.length(), represents offset start +
     * cursor - result.length() into text.
     * @param template the text to match against.  All characters must match.
     * @param variables a dictionary of variables mapping <code>Character</code>
     * to <code>UnicodeSet</code>
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return true if there is a match
     */
    protected static boolean regionMatches(String text, int start, int limit,
                                           StringBuffer result, int cursor,
                                           String template,
                                           Dictionary variables,
                                           UnicodeFilter filter) {
        int rlen = result.length();
        if (cursor < 0
            || (cursor + template.length()) > (rlen + limit - start)) {
            return false;
        }
        for (int i=0; i<template.length(); ++i, ++cursor) {
            if (!charMatches(template.charAt(i),
                             cursor < rlen ? result.charAt(cursor)
                                           : text.charAt(cursor - rlen + start),
                             variables, filter)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Return true if a template matches the text.  The entire length of the
     * template is compared to the text at the cursor.
     * @param text the text, both translated and untranslated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param cursor position at which to translate next, representing offset
     * into text.  This value must be between <code>start</code> and
     * <code>limit</code>.
     * @param template the text to match against.  All characters must match.
     * @param variables a dictionary of variables mapping <code>Character</code>
     * to <code>UnicodeSet</code>
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return true if there is a match
     */
    protected static boolean regionMatches(Replaceable text, int start, int limit,
                                           int cursor,
                                           String template, Dictionary variables,
                                           UnicodeFilter filter) {
        if (cursor < start
            || (cursor + template.length()) > limit) {
            return false;
        }
        for (int i=0; i<template.length(); ++i, ++cursor) {
            if (!charMatches(template.charAt(i), text.charAt(cursor),
                             variables, filter)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Return the number of characters of the text that match this rule.  If
     * there is a mismatch, return -1.  If the text is not long enough to match
     * any characters, return 0.
     * @param text the text, both translated and untranslated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param cursor position at which to translate next, representing offset
     * into text.  This value must be between <code>start</code> and
     * <code>limit</code>.
     * @param template the text to match against.  All characters must match.
     * @param variables a dictionary of variables mapping <code>Character</code>
     * to <code>UnicodeSet</code>
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return -1 if there is a mismatch, 0 if the text is not long enough to
     * match any characters, otherwise the number of characters of text that
     * match this rule.
     */
    protected static int getRegionMatchLength(Replaceable text, int start,
                                              int limit, int cursor,
                                              String template,
                                              Dictionary variables,
                                              UnicodeFilter filter) {
        if (cursor < start) {
            return -1;
        }
        int i;
        for (i=0; i<template.length() && cursor<limit; ++i, ++cursor) {
            if (!charMatches(template.charAt(i), text.charAt(cursor),
                             variables, filter)) {
                return -1;
            }
        }
        return i;
    }

    /**
     * Return true if the given key matches the given text.  This method
     * accounts for the fact that the key character may represent a character
     * set.  Note that the key and text characters may not be interchanged
     * without altering the results.
     * @param keyChar a character in the match key
     * @param textChar a character in the text being transliterated
     * @param variables a dictionary of variables mapping <code>Character</code>
     * to <code>UnicodeSet</code>
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     */
    protected static final boolean charMatches(char keyChar, char textChar,
                                               Dictionary variables, UnicodeFilter filter) {
        UnicodeSet set = null;
        return (filter == null || filter.isIn(textChar)) &&
            ((set = (UnicodeSet) variables.get(new Character(keyChar)))
             == null) ?
            keyChar == textChar : set.contains(textChar);
    }

    /**
     * Escape non-ASCII characters as Unicode.
     */
    public static final String escape(String s) {
        StringBuffer buf = new StringBuffer();
        for (int i=0; i<s.length(); ++i) {
            char c = s.charAt(i);
            if (c >= ' ' && c <= 0x007F) {
                buf.append(c);
            } else {
                buf.append("\\u");
                if (c < 0x1000) {
                    buf.append('0');
                    if (c < 0x100) {
                        buf.append('0');
                        if (c < 0x10) {
                            buf.append('0');
                        }
                    }
                }
                buf.append(Integer.toHexString(c));
            }
        }
        return buf.toString();
    }
}
