// blob: 5a3c05ab7e436197f411529f6fbc16ba90241f0a [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.impl.text;
import java.util.HashMap;
import java.util.Map;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RbnfLenientScanner;
import com.ibm.icu.text.RbnfLenientScannerProvider;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
/**
 * Returns RbnfLenientScanners that use the old RuleBasedNumberFormat
 * implementation behind setLenientParseMode, which is based on Collator.
 *
 * <p>Scanners are cached per locale/extras combination. Every access to the
 * cache is performed while synchronized on the cache map itself.
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public class RbnfScannerProviderImpl implements RbnfLenientScannerProvider {
    private static final boolean DEBUG = ICUDebug.enabled("rbnf");

    /** Scanners handed out so far, keyed by {@code locale + "/" + extras}. */
    private final Map<String, RbnfLenientScanner> cache;

    /**
     * Creates a provider with an empty scanner cache.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RbnfScannerProviderImpl() {
        cache = new HashMap<String, RbnfLenientScanner>();
    }

    /**
     * Returns a collation-based scanner.
     *
     * Only primary differences are treated as significant. This means that case
     * differences, accent differences, alternate spellings of the same letter
     * (e.g., ae and a-umlaut in German), ignorable characters, etc. are ignored in
     * matching the text. In many cases, numerals will be accepted in place of words
     * or phrases as well.
     *
     * For example, all of the following will correctly parse as 255 in English in
     * lenient-parse mode:
     * <br>"two hundred fifty-five"
     * <br>"two hundred fifty five"
     * <br>"TWO HUNDRED FIFTY-FIVE"
     * <br>"twohundredfiftyfive"
     * <br>"2 hundred fifty-5"
     *
     * The Collator used is determined by the <code>locale</code> argument. The
     * <code>extras</code> argument may supply additional collation rules that are
     * appended to the end of the default collator rules for the locale, enabling
     * additional equivalences (such as adding more ignorable characters or
     * permitting spelled-out version of symbols; see the demo program for examples).
     *
     * It's important to emphasize that even strict parsing is relatively lenient: it
     * will accept some text that it won't produce as output. In English, for example,
     * it will correctly parse "two hundred zero" and "fifteen hundred".
     *
     * The result is cached under the key <code>locale.toString() + "/" + extras</code>.
     * If several threads request the same key at the same time, each may build a
     * scanner, but only the first one stored in the cache is returned to all of them.
     *
     * @param locale the locale whose collator drives the matching; must not be null
     * @param extras additional collation rules, or null for none
     * @return the cached or newly created scanner for this locale/extras pair
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RbnfLenientScanner get(ULocale locale, String extras) {
        String key = locale.toString() + "/" + extras;
        synchronized (cache) {
            RbnfLenientScanner cached = cache.get(key);
            if (cached != null) {
                return cached;
            }
        }

        // Build outside the lock: createScanner is overridable and may be slow,
        // so it must not run while the cache is locked.
        RbnfLenientScanner created = createScanner(locale, extras);

        synchronized (cache) {
            // Another thread may have stored a scanner for this key while we were
            // building ours; keep the first one so every caller sees the same instance.
            RbnfLenientScanner winner = cache.get(key);
            if (winner != null) {
                return winner;
            }
            cache.put(key, created);
        }
        return created;
    }

    /**
     * Builds a new scanner for the given locale and extra collation rules.
     *
     * If the collator cannot be created (for example because the combined rules
     * are malformed), the failure is swallowed and the returned scanner wraps a
     * null collator; such a scanner throws NullPointerException as soon as one
     * of its methods is called.
     *
     * @param locale the locale whose default collator is used as the base
     * @param extras additional collation rules appended to the base rules, or null
     * @return a new scanner; never null from this implementation
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected RbnfLenientScanner createScanner(ULocale locale, String extras) {
        RuleBasedCollator collator = null;
        try {
            // create a default collator based on the locale,
            // then pull out that collator's rules, append any additional
            // rules specified in the description, and create a _new_
            // collator based on the combination of those rules
            collator = (RuleBasedCollator)Collator.getInstance(locale.toLocale());
            if (extras != null) {
                String rules = collator.getRules() + extras;
                collator = new RuleBasedCollator(rules);
            }
            collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
        }
        catch (Exception e) {
            // If we get here, it means we have a malformed set of
            // collation rules, which hopefully won't happen
            ///CLOVER:OFF
            if (DEBUG){ // debug hook
                e.printStackTrace(); System.out.println("++++");
            }
            collator = null;
            ///CLOVER:ON
        }
        return new RbnfLenientScannerImpl(collator);
    }

    /**
     * Scanner that compares text by primary collation strength only, using the
     * collator supplied at construction.
     */
    private static class RbnfLenientScannerImpl implements RbnfLenientScanner {
        private final RuleBasedCollator collator;

        private RbnfLenientScannerImpl(RuleBasedCollator rbc) {
            this.collator = rbc;
        }

        /**
         * Advances past collation elements whose primary order is zero.
         *
         * @param iter the iterator that produced <code>order</code>
         * @param order the most recently read collation element
         * @return the first element (starting with <code>order</code>) that has a
         *         non-zero primary order, or NULLORDER if the iterator ran out
         */
        private static int skipIgnorables(CollationElementIterator iter, int order) {
            while (order != CollationElementIterator.NULLORDER
                    && CollationElementIterator.primaryOrder(order) == 0) {
                order = iter.next();
            }
            return order;
        }

        /**
         * Returns true if every collation element of <code>s</code> has a primary
         * order of zero (this includes the case where there are no elements).
         */
        public boolean allIgnorable(String s) {
            CollationElementIterator iter = collator.getCollationElementIterator(s);
            return skipIgnorables(iter, iter.next()) == CollationElementIterator.NULLORDER;
        }

        /**
         * Searches <code>str</code>, beginning at index <code>startingAt</code>,
         * for the first position at which <code>key</code> matches as a prefix.
         *
         * @return a two-element array { position, matched length }, or { -1, 0 }
         *         when no position yields a non-zero prefix length
         */
        public int[] findText(String str, String key, int startingAt) {
            // basically just isolate smaller and smaller substrings of
            // the target string (each running to the end of the string,
            // and with the first one running from startingAt to the end)
            // and then use prefixLength() to see if the search key is at
            // the beginning of each substring. This is excruciatingly
            // slow, but it will locate the key and tell us how long the
            // matching text was.
            for (int p = startingAt; p < str.length(); ++p) {
                int keyLen = prefixLength(str.substring(p), key);
                if (keyLen != 0) {
                    return new int[] { p, keyLen };
                }
            }
            // if we make it to here, we didn't find it. Return -1 for the
            // location. The length should be ignored, but set it to 0,
            // which should be "safe"
            return new int[] { -1, 0 };
        }

        ///CLOVER:OFF
        // The following method has the same signature as findText (apart from
        // its name) and has never been used by anything.
        // NOTE(review): after keyIter.reset() below, oKey is not re-read from the
        // iterator, and keyStart is overwritten on every matching element rather
        // than only the first one. Both look unintended; confirm before ever
        // putting this method to use.
        @SuppressWarnings("unused")
        public int[] findText2(String str, String key, int startingAt) {
            CollationElementIterator strIter = collator.getCollationElementIterator(str);
            CollationElementIterator keyIter = collator.getCollationElementIterator(key);
            int keyStart = -1;
            strIter.setOffset(startingAt);
            int oStr = strIter.next();
            int oKey = keyIter.next();
            while (oKey != CollationElementIterator.NULLORDER) {
                // skip ignorable elements on both sides
                while (oStr != CollationElementIterator.NULLORDER &&
                        CollationElementIterator.primaryOrder(oStr) == 0)
                    oStr = strIter.next();
                while (oKey != CollationElementIterator.NULLORDER &&
                        CollationElementIterator.primaryOrder(oKey) == 0)
                    oKey = keyIter.next();
                // target exhausted before the key: no match
                if (oStr == CollationElementIterator.NULLORDER) {
                    return new int[] { -1, 0 };
                }
                // key exhausted: the whole key was matched
                if (oKey == CollationElementIterator.NULLORDER) {
                    break;
                }
                if (CollationElementIterator.primaryOrder(oStr) ==
                        CollationElementIterator.primaryOrder(oKey)) {
                    keyStart = strIter.getOffset();
                    oStr = strIter.next();
                    oKey = keyIter.next();
                } else {
                    if (keyStart != -1) {
                        keyStart = -1;
                        keyIter.reset();
                    } else {
                        oStr = strIter.next();
                    }
                }
            }
            if (oKey == CollationElementIterator.NULLORDER) {
                return new int[] { keyStart, strIter.getOffset() - keyStart };
            }
            return new int[] { -1, 0 };
        }
        ///CLOVER:ON

        /**
         * Checks whether <code>prefix</code> matches the beginning of
         * <code>str</code>, comparing primary collation orders only and skipping
         * elements whose primary order is zero on both sides.
         *
         * @return 0 if the prefix does not match; otherwise a value derived from
         *         the target iterator's offset once the prefix is exhausted
         */
        public int prefixLength(String str, String prefix) {
            // Create two collation element iterators, one over the target string
            // and another over the prefix.
            //
            // Previous code was matching "fifty-" against " fifty" and leaving
            // the number " fifty-7" to parse as 43 (50 - 7).
            // Also it seems that if we consume the entire prefix, that's ok even
            // if we've consumed the entire string, so I switched the logic to
            // reflect this.
            CollationElementIterator strIter = collator.getCollationElementIterator(str);
            CollationElementIterator prefixIter = collator.getCollationElementIterator(prefix);

            // match collation elements between the strings
            int oStr = strIter.next();
            int oPrefix = prefixIter.next();
            while (oPrefix != CollationElementIterator.NULLORDER) {
                // skip over ignorable characters in the target string, then in the prefix
                oStr = skipIgnorables(strIter, oStr);
                oPrefix = skipIgnorables(prefixIter, oPrefix);

                // if skipping over ignorables brought us to the end of
                // the prefix, we DID match: drop out of the loop
                if (oPrefix == CollationElementIterator.NULLORDER) {
                    break;
                }
                // if skipping over ignorables brought us to the end
                // of the target string, we didn't match and return 0
                if (oStr == CollationElementIterator.NULLORDER) {
                    return 0;
                }
                // match collation elements from the two strings
                // (considering only primary differences). If we
                // get a mismatch, dump out and return 0
                if (CollationElementIterator.primaryOrder(oStr) !=
                        CollationElementIterator.primaryOrder(oPrefix)) {
                    return 0;
                }
                // otherwise, advance to the next character in each string
                // and loop (we drop out of the loop when we exhaust
                // collation elements in the prefix)
                oStr = strIter.next();
                oPrefix = prefixIter.next();
            }

            // The target iterator has already read one element past the match
            // unless it hit the end of the string, so step the offset back by one
            // in that case.
            int result = strIter.getOffset();
            if (oStr != CollationElementIterator.NULLORDER) {
                --result;
            }
            return result;
        }
    }
}