main/classes/translit/src/com/ibm/icu/text/StringMatcher.java - external/github.com/unicode-org/icu - Git at Google

 /*
  *******************************************************************************
  * Copyright (C) 2001-2004, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
 package com.ibm.icu.text;
 import com.ibm.icu.impl.Utility;

 /**
  * An object that matches a fixed input string, implementing the
  * UnicodeMatcher API.  This object also implements the
  * UnicodeReplacer API, allowing it to emit the matched text as
  * output.  Since the match text may contain flexible match elements,
  * such as UnicodeSets, the emitted text is not the match pattern, but
  * instead a substring of the actual matched text.  Following
  * convention, the output text is the rightmost match seen up to this
  * point.
  *
  * A StringMatcher may represent a segment, in which case it has a
  * positive segment number.  This affects how the matcher converts
  * itself to a pattern but does not otherwise affect its function.
  *
  * A StringMatcher that is not a segment should not be used as a
  * UnicodeReplacer.
  */
 class StringMatcher implements UnicodeMatcher, UnicodeReplacer {

     /**
      * The text to be matched.  It may contain stand-in characters
      * that <code>data</code> resolves to nested UnicodeMatcher
      * objects.  Assigned once, in the constructor.
      */
     private final String pattern;

     /**
      * Start offset, in the match text, of the <em>rightmost</em>
      * match, or -1 if no match is currently recorded.
      */
     private int matchStart;

     /**
      * Limit offset, in the match text, of the <em>rightmost</em>
      * match, or -1 if no match is currently recorded.
      */
     private int matchLimit;

     /**
      * The segment number, 1-based, or 0 if not a segment.  Assigned
      * once, in the constructor.
      */
     private final int segmentNumber;

     /**
      * Context object that maps stand-ins to matcher and replacer
      * objects.
      */
     private final RuleBasedTransliterator.Data data;

     /**
      * Construct a matcher that matches the given pattern string.
      * The new matcher starts out with no recorded match.
      * @param theString the pattern to be matched, possibly containing
      * stand-ins that represent nested UnicodeMatcher objects.
      * @param segmentNum the segment number from 1..n, or 0 if this is
      * not a segment.
      * @param theData context object mapping stand-ins to
      * UnicodeMatcher objects.
      */
     public StringMatcher(String theString,
                          int segmentNum,
                          RuleBasedTransliterator.Data theData) {
         data = theData;
         pattern = theString;
         matchStart = matchLimit = -1;
         segmentNumber = segmentNum;
     }

     /**
      * Construct a matcher that matches a substring of the given
      * pattern string.  Delegates to the three-argument constructor
      * with <code>theString.substring(start, limit)</code>.
      * @param theString the pattern to be matched, possibly containing
      * stand-ins that represent nested UnicodeMatcher objects.
      * @param start first character of theString to be matched
      * @param limit index after the last character of theString to be
      * matched.
      * @param segmentNum the segment number from 1..n, or 0 if this is
      * not a segment.
      * @param theData context object mapping stand-ins to
      * UnicodeMatcher objects.
      */
     public StringMatcher(String theString,
                          int start,
                          int limit,
                          int segmentNum,
                          RuleBasedTransliterator.Data theData) {
         this(theString.substring(start, limit), segmentNum, theData);
     }

     /**
      * Implement UnicodeMatcher.  Tries to match the complete pattern
      * against <code>text</code>, beginning at <code>offset[0]</code>.
      * Each pattern character is either a stand-in, in which case the
      * nested matcher returned by <code>data.lookupMatcher</code> is
      * asked to match, or a literal that is compared with the text.
      *
      * @param text the text to match against
      * @param offset single-element in/out array.  On entry,
      * <code>offset[0]</code> is the position at which matching
      * starts; it is advanced past the matched text only when
      * U_MATCH is returned and is left untouched otherwise.
      * @param limit the boundary of the text that may be examined.  If
      * <code>limit &lt; offset[0]</code> the pattern is matched in the
      * reverse direction (last pattern character first, moving
      * towards lower offsets); otherwise it is matched forward.
      * @param incremental if true, a forward match that reaches
      * <code>limit</code> before the pattern is exhausted yields
      * U_PARTIAL_MATCH.  The flag is also handed to nested matchers.
      * @return U_MATCH if the whole pattern matched, U_MISMATCH if a
      * literal failed to match, U_PARTIAL_MATCH as described above, or
      * the non-U_MATCH result of a nested matcher.
      */
     public int matches(Replaceable text,
                        int[] offset,
                        int limit,
                        boolean incremental) {
         // Note (1): We process text in 16-bit code units, rather than
         // 32-bit code points.  This works because stand-ins are
         // always in the BMP and because we are doing a literal match
         // operation, which can be done 16-bits at a time.

         // Work on a private copy of the offset so that the caller's
         // value only changes when the whole pattern has matched.
         int[] cursor = new int[] { offset[0] };
         if (limit < cursor[0]) {
             // Match in the reverse direction
             for (int i=pattern.length()-1; i>=0; --i) {
                 char keyChar = pattern.charAt(i); // OK; see note (1) above
                 UnicodeMatcher subm = data.lookupMatcher(keyChar);
                 if (subm == null) {
                     if (cursor[0] > limit &&
                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
                         --cursor[0];
                     } else {
                         return U_MISMATCH;
                     }
                 } else {
                     int m =
                         subm.matches(text, cursor, limit, incremental);
                     if (m != U_MATCH) {
                         return m;
                     }
                 }
             }
             // Record the match position, but adjust for a normal
             // forward start, limit, and only if a prior match does not
             // exist -- we want the rightmost match.
             if (matchStart < 0) {
                 matchStart = cursor[0]+1;
                 matchLimit = offset[0]+1;
             }
         } else {
             for (int i=0; i<pattern.length(); ++i) {
                 if (incremental && cursor[0] == limit) {
                     // We've reached the context limit without a mismatch and
                     // without completing our match.
                     return U_PARTIAL_MATCH;
                 }
                 char keyChar = pattern.charAt(i); // OK; see note (1) above
                 UnicodeMatcher subm = data.lookupMatcher(keyChar);
                 if (subm == null) {
                     // Don't need the cursor < limit check if
                     // incremental is true (because it's done above); do need
                     // it otherwise.
                     if (cursor[0] < limit &&
                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
                         ++cursor[0];
                     } else {
                         return U_MISMATCH;
                     }
                 } else {
                     int m =
                         subm.matches(text, cursor, limit, incremental);
                     if (m != U_MATCH) {
                         return m;
                     }
                 }
             }
             // Record the match position.  A forward match always
             // overwrites the previously recorded one.
             matchStart = offset[0];
             matchLimit = cursor[0];
         }

         offset[0] = cursor[0];
         return U_MATCH;
     }

     /**
      * Implement UnicodeMatcher.  Builds the rule-source form of this
      * matcher: literal pattern characters are appended through
      * <code>Utility.appendToRule</code>, stand-ins are replaced by the
      * pattern of the nested matcher, and the whole is wrapped in
      * parentheses if this matcher is a segment.
      * @param escapeUnprintable passed through to
      * <code>Utility.appendToRule</code> and to nested matchers
      * @return the pattern string
      */
     public String toPattern(boolean escapeUnprintable) {
         StringBuffer result = new StringBuffer();
         StringBuffer quoteBuf = new StringBuffer();
         boolean isSegment = segmentNumber > 0;
         if (isSegment) {
             result.append('(');
         }
         for (int i=0; i<pattern.length(); ++i) {
             char keyChar = pattern.charAt(i); // OK; see note (1) above
             UnicodeMatcher m = data.lookupMatcher(keyChar);
             if (m == null) {
                 Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
             } else {
                 Utility.appendToRule(result, m.toPattern(escapeUnprintable),
                                      true, escapeUnprintable, quoteBuf);
             }
         }
         // Flush quoteBuf out to result.  This must happen before the
         // closing parenthesis is appended: text still pending in
         // quoteBuf would otherwise be emitted after the ')' and end up
         // outside the segment.
         Utility.appendToRule(result, -1,
                              true, escapeUnprintable, quoteBuf);
         if (isSegment) {
             result.append(')');
         }
         return result.toString();
     }

     /**
      * Implement UnicodeMatcher.  An empty pattern matches every index
      * value.  Otherwise only the first code point of the pattern is
      * examined: if it is a stand-in the question is delegated to the
      * nested matcher, else its low 8 bits are compared with
      * <code>v</code>.
      * @param v the index value to test
      * @return true if this matcher may match text with index value v
      */
     public boolean matchesIndexValue(int v) {
         if (pattern.length() == 0) {
             return true;
         }
         int c = UTF16.charAt(pattern, 0);
         UnicodeMatcher m = data.lookupMatcher(c);
         return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
     }

     /**
      * Implementation of UnicodeMatcher API.  Union the set of all
      * characters that may be matched by this object into the given
      * set.  The pattern is walked by code point; literals are added
      * directly and stand-ins contribute the match set of their nested
      * matcher.
      * @param toUnionTo the set into which to union the source characters
      */
     public void addMatchSetTo(UnicodeSet toUnionTo) {
         int ch;
         for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {
             ch = UTF16.charAt(pattern, i);
             UnicodeMatcher matcher = data.lookupMatcher(ch);
             if (matcher == null) {
                 toUnionTo.add(ch);
             } else {
                 matcher.addMatchSetTo(toUnionTo);
             }
         }
     }

     /**
      * UnicodeReplacer API.  Replaces the range [start, limit) of
      * <code>text</code> with the text of the recorded match, if any.
      * @param text the text being modified
      * @param start start of the range to be replaced
      * @param limit limit of the range to be replaced
      * @param cursor not used by this implementation
      * @return the length of the text copied from the recorded match,
      * or 0 if no match is recorded or the match is empty
      */
     public int replace(Replaceable text,
                        int start,
                        int limit,
                        int[] cursor) {

         int outLen = 0;

         // Copy segment with out-of-band data.  The copy is placed at
         // limit, i.e. directly after the range that is deleted below.
         // If there was no match, that means that a quantifier
         // matched zero-length.  E.g., x (a)* y matched "xy".
         if (matchStart >= 0 && matchStart != matchLimit) {
             text.copy(matchStart, matchLimit, limit);
             outLen = matchLimit - matchStart;
         }

         text.replace(start, limit, ""); // delete original text

         return outLen;
     }

     /**
      * UnicodeReplacer API.  Returns a reference to this segment, that
      * is, "$" followed by the segment number as formatted by
      * <code>Utility.appendNumber</code>.  Only meaningful when this
      * matcher is a segment (segmentNumber &gt; 0); this is not
      * enforced.
      * @param escapeUnprintable not used by this implementation
      * @return the replacer pattern
      */
     public String toReplacerPattern(boolean escapeUnprintable) {
         // assert(segmentNumber > 0);
         StringBuffer rule = new StringBuffer("$");
         Utility.appendNumber(rule, segmentNumber, 10, 1);
         return rule.toString();
     }

     /**
      * Remove any match data.  This must be called before performing a
      * set of matches with this segment.
      */
     public void resetMatch() {
         matchStart = matchLimit = -1;
     }

     /**
      * Union the set of all characters that may be output by this
      * object into the given set.  This implementation intentionally
      * adds nothing.
      * @param toUnionTo the set into which to union the output characters
      */
     public void addReplacementSetTo(UnicodeSet toUnionTo) {
         // The output of this replacer varies; it is the source text between
         // matchStart and matchLimit.  Since this varies depending on the
         // input text, we can't compute it here.  We can either do nothing
         // or we can add ALL characters to the set.  It's probably more useful
         // to do nothing.
     }
 }

 //eof
	/*
	*******************************************************************************
	* Copyright (C) 2001-2004, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*/
	package com.ibm.icu.text;
	import com.ibm.icu.impl.Utility;

	/**
	* An object that matches a fixed input string, implementing the
	* UnicodeMatcher API. This object also implements the
	* UnicodeReplacer API, allowing it to emit the matched text as
	* output. Since the match text may contain flexible match elements,
	* such as UnicodeSets, the emitted text is not the match pattern, but
	* instead a substring of the actual matched text. Following
	* convention, the output text is the rightmost match seen up to this
	* point.
	*
	* A StringMatcher may represent a segment, in which case it has a
	* positive segment number. This affects how the matcher converts
	* itself to a pattern but does not otherwise affect its function.
	*
	* A StringMatcher that is not a segment should not be used as a
	* UnicodeReplacer.
	*/
	class StringMatcher implements UnicodeMatcher, UnicodeReplacer {

	    /**
	     * The text to be matched.  It may contain stand-in characters
	     * that <code>data</code> resolves to nested UnicodeMatcher
	     * objects.  Assigned once, in the constructor.
	     */
	    private final String pattern;

	    /**
	     * Start offset, in the match text, of the <em>rightmost</em>
	     * match, or -1 if no match is currently recorded.
	     */
	    private int matchStart;

	    /**
	     * Limit offset, in the match text, of the <em>rightmost</em>
	     * match, or -1 if no match is currently recorded.
	     */
	    private int matchLimit;

	    /**
	     * The segment number, 1-based, or 0 if not a segment.  Assigned
	     * once, in the constructor.
	     */
	    private final int segmentNumber;

	    /**
	     * Context object that maps stand-ins to matcher and replacer
	     * objects.
	     */
	    private final RuleBasedTransliterator.Data data;

	    /**
	     * Construct a matcher that matches the given pattern string.
	     * The new matcher starts out with no recorded match.
	     * @param theString the pattern to be matched, possibly containing
	     * stand-ins that represent nested UnicodeMatcher objects.
	     * @param segmentNum the segment number from 1..n, or 0 if this is
	     * not a segment.
	     * @param theData context object mapping stand-ins to
	     * UnicodeMatcher objects.
	     */
	    public StringMatcher(String theString,
	                         int segmentNum,
	                         RuleBasedTransliterator.Data theData) {
	        data = theData;
	        pattern = theString;
	        matchStart = matchLimit = -1;
	        segmentNumber = segmentNum;
	    }

	    /**
	     * Construct a matcher that matches a substring of the given
	     * pattern string.  Delegates to the three-argument constructor
	     * with <code>theString.substring(start, limit)</code>.
	     * @param theString the pattern to be matched, possibly containing
	     * stand-ins that represent nested UnicodeMatcher objects.
	     * @param start first character of theString to be matched
	     * @param limit index after the last character of theString to be
	     * matched.
	     * @param segmentNum the segment number from 1..n, or 0 if this is
	     * not a segment.
	     * @param theData context object mapping stand-ins to
	     * UnicodeMatcher objects.
	     */
	    public StringMatcher(String theString,
	                         int start,
	                         int limit,
	                         int segmentNum,
	                         RuleBasedTransliterator.Data theData) {
	        this(theString.substring(start, limit), segmentNum, theData);
	    }

	    /**
	     * Implement UnicodeMatcher.  Tries to match the complete pattern
	     * against <code>text</code>, beginning at <code>offset[0]</code>.
	     * Each pattern character is either a stand-in, in which case the
	     * nested matcher returned by <code>data.lookupMatcher</code> is
	     * asked to match, or a literal that is compared with the text.
	     *
	     * @param text the text to match against
	     * @param offset single-element in/out array.  On entry,
	     * <code>offset[0]</code> is the position at which matching
	     * starts; it is advanced past the matched text only when
	     * U_MATCH is returned and is left untouched otherwise.
	     * @param limit the boundary of the text that may be examined.  If
	     * <code>limit &lt; offset[0]</code> the pattern is matched in the
	     * reverse direction (last pattern character first, moving
	     * towards lower offsets); otherwise it is matched forward.
	     * @param incremental if true, a forward match that reaches
	     * <code>limit</code> before the pattern is exhausted yields
	     * U_PARTIAL_MATCH.  The flag is also handed to nested matchers.
	     * @return U_MATCH if the whole pattern matched, U_MISMATCH if a
	     * literal failed to match, U_PARTIAL_MATCH as described above, or
	     * the non-U_MATCH result of a nested matcher.
	     */
	    public int matches(Replaceable text,
	                       int[] offset,
	                       int limit,
	                       boolean incremental) {
	        // Note (1): We process text in 16-bit code units, rather than
	        // 32-bit code points.  This works because stand-ins are
	        // always in the BMP and because we are doing a literal match
	        // operation, which can be done 16-bits at a time.

	        // Work on a private copy of the offset so that the caller's
	        // value only changes when the whole pattern has matched.
	        int[] cursor = new int[] { offset[0] };
	        if (limit < cursor[0]) {
	            // Match in the reverse direction
	            for (int i=pattern.length()-1; i>=0; --i) {
	                char keyChar = pattern.charAt(i); // OK; see note (1) above
	                UnicodeMatcher subm = data.lookupMatcher(keyChar);
	                if (subm == null) {
	                    if (cursor[0] > limit &&
	                        keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
	                        --cursor[0];
	                    } else {
	                        return U_MISMATCH;
	                    }
	                } else {
	                    int m =
	                        subm.matches(text, cursor, limit, incremental);
	                    if (m != U_MATCH) {
	                        return m;
	                    }
	                }
	            }
	            // Record the match position, but adjust for a normal
	            // forward start, limit, and only if a prior match does not
	            // exist -- we want the rightmost match.
	            if (matchStart < 0) {
	                matchStart = cursor[0]+1;
	                matchLimit = offset[0]+1;
	            }
	        } else {
	            for (int i=0; i<pattern.length(); ++i) {
	                if (incremental && cursor[0] == limit) {
	                    // We've reached the context limit without a mismatch and
	                    // without completing our match.
	                    return U_PARTIAL_MATCH;
	                }
	                char keyChar = pattern.charAt(i); // OK; see note (1) above
	                UnicodeMatcher subm = data.lookupMatcher(keyChar);
	                if (subm == null) {
	                    // Don't need the cursor < limit check if
	                    // incremental is true (because it's done above); do need
	                    // it otherwise.
	                    if (cursor[0] < limit &&
	                        keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
	                        ++cursor[0];
	                    } else {
	                        return U_MISMATCH;
	                    }
	                } else {
	                    int m =
	                        subm.matches(text, cursor, limit, incremental);
	                    if (m != U_MATCH) {
	                        return m;
	                    }
	                }
	            }
	            // Record the match position.  A forward match always
	            // overwrites the previously recorded one.
	            matchStart = offset[0];
	            matchLimit = cursor[0];
	        }

	        offset[0] = cursor[0];
	        return U_MATCH;
	    }

	    /**
	     * Implement UnicodeMatcher.  Builds the rule-source form of this
	     * matcher: literal pattern characters are appended through
	     * <code>Utility.appendToRule</code>, stand-ins are replaced by the
	     * pattern of the nested matcher, and the whole is wrapped in
	     * parentheses if this matcher is a segment.
	     * @param escapeUnprintable passed through to
	     * <code>Utility.appendToRule</code> and to nested matchers
	     * @return the pattern string
	     */
	    public String toPattern(boolean escapeUnprintable) {
	        StringBuffer result = new StringBuffer();
	        StringBuffer quoteBuf = new StringBuffer();
	        boolean isSegment = segmentNumber > 0;
	        if (isSegment) {
	            result.append('(');
	        }
	        for (int i=0; i<pattern.length(); ++i) {
	            char keyChar = pattern.charAt(i); // OK; see note (1) above
	            UnicodeMatcher m = data.lookupMatcher(keyChar);
	            if (m == null) {
	                Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
	            } else {
	                Utility.appendToRule(result, m.toPattern(escapeUnprintable),
	                                     true, escapeUnprintable, quoteBuf);
	            }
	        }
	        // Flush quoteBuf out to result.  This must happen before the
	        // closing parenthesis is appended: text still pending in
	        // quoteBuf would otherwise be emitted after the ')' and end up
	        // outside the segment.
	        Utility.appendToRule(result, -1,
	                             true, escapeUnprintable, quoteBuf);
	        if (isSegment) {
	            result.append(')');
	        }
	        return result.toString();
	    }

	    /**
	     * Implement UnicodeMatcher.  An empty pattern matches every index
	     * value.  Otherwise only the first code point of the pattern is
	     * examined: if it is a stand-in the question is delegated to the
	     * nested matcher, else its low 8 bits are compared with
	     * <code>v</code>.
	     * @param v the index value to test
	     * @return true if this matcher may match text with index value v
	     */
	    public boolean matchesIndexValue(int v) {
	        if (pattern.length() == 0) {
	            return true;
	        }
	        int c = UTF16.charAt(pattern, 0);
	        UnicodeMatcher m = data.lookupMatcher(c);
	        return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
	    }

	    /**
	     * Implementation of UnicodeMatcher API.  Union the set of all
	     * characters that may be matched by this object into the given
	     * set.  The pattern is walked by code point; literals are added
	     * directly and stand-ins contribute the match set of their nested
	     * matcher.
	     * @param toUnionTo the set into which to union the source characters
	     */
	    public void addMatchSetTo(UnicodeSet toUnionTo) {
	        int ch;
	        for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {
	            ch = UTF16.charAt(pattern, i);
	            UnicodeMatcher matcher = data.lookupMatcher(ch);
	            if (matcher == null) {
	                toUnionTo.add(ch);
	            } else {
	                matcher.addMatchSetTo(toUnionTo);
	            }
	        }
	    }

	    /**
	     * UnicodeReplacer API.  Replaces the range [start, limit) of
	     * <code>text</code> with the text of the recorded match, if any.
	     * @param text the text being modified
	     * @param start start of the range to be replaced
	     * @param limit limit of the range to be replaced
	     * @param cursor not used by this implementation
	     * @return the length of the text copied from the recorded match,
	     * or 0 if no match is recorded or the match is empty
	     */
	    public int replace(Replaceable text,
	                       int start,
	                       int limit,
	                       int[] cursor) {

	        int outLen = 0;

	        // Copy segment with out-of-band data.  The copy is placed at
	        // limit, i.e. directly after the range that is deleted below.
	        // If there was no match, that means that a quantifier
	        // matched zero-length.  E.g., x (a)* y matched "xy".
	        if (matchStart >= 0 && matchStart != matchLimit) {
	            text.copy(matchStart, matchLimit, limit);
	            outLen = matchLimit - matchStart;
	        }

	        text.replace(start, limit, ""); // delete original text

	        return outLen;
	    }

	    /**
	     * UnicodeReplacer API.  Returns a reference to this segment, that
	     * is, "$" followed by the segment number as formatted by
	     * <code>Utility.appendNumber</code>.  Only meaningful when this
	     * matcher is a segment (segmentNumber &gt; 0); this is not
	     * enforced.
	     * @param escapeUnprintable not used by this implementation
	     * @return the replacer pattern
	     */
	    public String toReplacerPattern(boolean escapeUnprintable) {
	        // assert(segmentNumber > 0);
	        StringBuffer rule = new StringBuffer("$");
	        Utility.appendNumber(rule, segmentNumber, 10, 1);
	        return rule.toString();
	    }

	    /**
	     * Remove any match data.  This must be called before performing a
	     * set of matches with this segment.
	     */
	    public void resetMatch() {
	        matchStart = matchLimit = -1;
	    }

	    /**
	     * Union the set of all characters that may be output by this
	     * object into the given set.  This implementation intentionally
	     * adds nothing.
	     * @param toUnionTo the set into which to union the output characters
	     */
	    public void addReplacementSetTo(UnicodeSet toUnionTo) {
	        // The output of this replacer varies; it is the source text between
	        // matchStart and matchLimit.  Since this varies depending on the
	        // input text, we can't compute it here.  We can either do nothing
	        // or we can add ALL characters to the set.  It's probably more useful
	        // to do nothing.
	    }
	}

	//eof