blob: bb42de2334b53ee51059362c821ed469862c520b [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 1996-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.Set;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.coll.CollationData;
import com.ibm.icu.impl.coll.CollationRoot;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.Freezable;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.ULocale.Category;
import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.VersionInfo;
/**
* {@icuenhanced java.text.Collator}.{@icu _usage_}
*
* <p>Collator performs locale-sensitive string comparison. A concrete
* subclass, RuleBasedCollator, allows customization of the collation
* ordering by the use of rule sets.</p>
*
* <p>A Collator is thread-safe only when frozen. See {{@link #isFrozen()} and {@link Freezable}.
*
* <p>Following the <a href=http://www.unicode.org>Unicode
* Consortium</a>'s specifications for the
* <a href="http://www.unicode.org/unicode/reports/tr10/">Unicode Collation
* Algorithm (UCA)</a>, there are 5 different levels of strength used
* in comparisons:
*
* <ul>
* <li>PRIMARY strength: Typically, this is used to denote differences between
* base characters (for example, "a" &lt; "b").
* It is the strongest difference. For example, dictionaries are divided
* into different sections by base character.
* <li>SECONDARY strength: Accents in the characters are considered secondary
* differences (for example, "as" &lt; "&agrave;s" &lt; "at"). Other
* differences
* between letters can also be considered secondary differences, depending
* on the language. A secondary difference is ignored when there is a
* primary difference anywhere in the strings.
* <li>TERTIARY strength: Upper and lower case differences in characters are
* distinguished at tertiary strength (for example, "ao" &lt; "Ao" &lt;
* "a&ograve;"). In addition, a variant of a letter differs from the base
* form on the tertiary strength (such as "A" and "&#9398;"). Another
* example is the
* difference between large and small Kana. A tertiary difference is ignored
* when there is a primary or secondary difference anywhere in the strings.
* <li>QUATERNARY strength: When punctuation is ignored
* (see <a href="http://userguide.icu-project.org/collation/concepts#TOC-Ignoring-Punctuation">
* Ignoring Punctuations in the User Guide</a>) at PRIMARY to TERTIARY
* strength, an additional strength level can
* be used to distinguish words with and without punctuation (for example,
* "ab" &lt; "a-b" &lt; "aB").
* This difference is ignored when there is a PRIMARY, SECONDARY or TERTIARY
* difference. The QUATERNARY strength should only be used if ignoring
* punctuation is required.
* <li>IDENTICAL strength:
* When all other strengths are equal, the IDENTICAL strength is used as a
* tiebreaker. The Unicode code point values of the NFD form of each string
* are compared, just in case there is no difference.
* For example, Hebrew cantellation marks are only distinguished at this
* strength. This strength should be used sparingly, as only code point
* value differences between two strings is an extremely rare occurrence.
* Using this strength substantially decreases the performance for both
* comparison and collation key generation APIs. This strength also
* increases the size of the collation key.
* </ul>
*
* Unlike the JDK, ICU4J's Collator deals only with 2 decomposition modes,
* the canonical decomposition mode and one that does not use any decomposition.
* The compatibility decomposition mode, java.text.Collator.FULL_DECOMPOSITION
* is not supported here. If the canonical
* decomposition mode is set, the Collator handles un-normalized text properly,
* producing the same results as if the text were normalized in NFD. If
* canonical decomposition is turned off, it is the user's responsibility to
* ensure that all text is already in the appropriate form before performing
* a comparison or before getting a CollationKey.</p>
*
* <p>For more information about the collation service see the
* <a href="http://userguide.icu-project.org/collation">User Guide</a>.</p>
*
* <p>Examples of use
* <pre>
* // Get the Collator for US English and set its strength to PRIMARY
* Collator usCollator = Collator.getInstance(Locale.US);
* usCollator.setStrength(Collator.PRIMARY);
* if (usCollator.compare("abc", "ABC") == 0) {
* System.out.println("Strings are equivalent");
* }
*
* The following example shows how to compare two strings using the
* Collator for the default locale.
*
* // Compare two strings in the default locale
* Collator myCollator = Collator.getInstance();
* myCollator.setDecomposition(NO_DECOMPOSITION);
* if (myCollator.compare("&agrave;&#92;u0325", "a&#92;u0325&#768;") != 0) {
* System.out.println("&agrave;&#92;u0325 is not equals to a&#92;u0325&#768; without decomposition");
* myCollator.setDecomposition(CANONICAL_DECOMPOSITION);
* if (myCollator.compare("&agrave;&#92;u0325", "a&#92;u0325&#768;") != 0) {
* System.out.println("Error: &agrave;&#92;u0325 should be equals to a&#92;u0325&#768; with decomposition");
* }
* else {
* System.out.println("&agrave;&#92;u0325 is equals to a&#92;u0325&#768; with decomposition");
* }
* }
* else {
* System.out.println("Error: &agrave;&#92;u0325 should be not equals to a&#92;u0325&#768; without decomposition");
* }
* </pre>
* </p>
* @see RuleBasedCollator
* @see CollationKey
* @author Syn Wee Quek
* @stable ICU 2.8
*/
public abstract class Collator implements Comparator<Object>, Freezable<Collator>, Cloneable
{
// public data members ---------------------------------------------------
/**
* Strongest collator strength value. Typically used to denote differences
* between base characters. See class documentation for more explanation.
* @see #setStrength
* @see #getStrength
* @stable ICU 2.8
*/
public final static int PRIMARY = 0;
/**
* Second level collator strength value.
* Accents in the characters are considered secondary differences.
* Other differences between letters can also be considered secondary
* differences, depending on the language.
* See class documentation for more explanation.
* @see #setStrength
* @see #getStrength
* @stable ICU 2.8
*/
public final static int SECONDARY = 1;
/**
* Third level collator strength value.
* Upper and lower case differences in characters are distinguished at this
* strength level. In addition, a variant of a letter differs from the base
* form on the tertiary level.
* See class documentation for more explanation.
* @see #setStrength
* @see #getStrength
* @stable ICU 2.8
*/
public final static int TERTIARY = 2;
/**
* {@icu} Fourth level collator strength value.
* When punctuation is ignored
* (see <a href="http://userguide.icu-project.org/collation/concepts#TOC-Ignoring-Punctuation">
* Ignoring Punctuation in the User Guide</a>) at PRIMARY to TERTIARY
* strength, an additional strength level can
* be used to distinguish words with and without punctuation.
* See class documentation for more explanation.
* @see #setStrength
* @see #getStrength
* @stable ICU 2.8
*/
public final static int QUATERNARY = 3;
/**
* Smallest Collator strength value. When all other strengths are equal,
* the IDENTICAL strength is used as a tiebreaker. The Unicode code point
* values of the NFD form of each string are compared, just in case there
* is no difference.
* See class documentation for more explanation.
* </p>
* <p>
* Note this value is different from JDK's
* </p>
* @stable ICU 2.8
*/
public final static int IDENTICAL = 15;
/**
* {@icunote} This is for backwards compatibility with Java APIs only. It
* should not be used, IDENTICAL should be used instead. ICU's
* collation does not support Java's FULL_DECOMPOSITION mode.
* @stable ICU 3.4
*/
public final static int FULL_DECOMPOSITION = IDENTICAL;
/**
* Decomposition mode value. With NO_DECOMPOSITION set, Strings
* will not be decomposed for collation. This is the default
* decomposition setting unless otherwise specified by the locale
* used to create the Collator.</p>
*
* <p><strong>Note</strong> this value is different from the JDK's.</p>
* @see #CANONICAL_DECOMPOSITION
* @see #getDecomposition
* @see #setDecomposition
* @stable ICU 2.8
*/
public final static int NO_DECOMPOSITION = 16;
/**
* Decomposition mode value. With CANONICAL_DECOMPOSITION set,
* characters that are canonical variants according to the Unicode standard
* will be decomposed for collation.</p>
*
* <p>CANONICAL_DECOMPOSITION corresponds to Normalization Form D as
* described in <a href="http://www.unicode.org/unicode/reports/tr15/">
* Unicode Technical Report #15</a>.
* </p>
* @see #NO_DECOMPOSITION
* @see #getDecomposition
* @see #setDecomposition
* @stable ICU 2.8
*/
public final static int CANONICAL_DECOMPOSITION = 17;
/**
* Reordering codes for non-script groups that can be reordered under collation.
*
* @see #getReorderCodes
* @see #setReorderCodes
* @see #getEquivalentReorderCodes
* @stable ICU 4.8
*/
public static interface ReorderCodes {
/**
* A special reordering code that is used to specify the default reordering codes for a locale.
* @stable ICU 4.8
*/
public final static int DEFAULT = -1; // == UScript.INVALID_CODE
/**
* A special reordering code that is used to specify no reordering codes.
* @stable ICU 4.8
*/
public final static int NONE = UScript.UNKNOWN;
/**
* A special reordering code that is used to specify all other codes used for reordering except
* for the codes listed as ReorderingCodes and those listed explicitly in a reordering.
* @stable ICU 4.8
*/
public final static int OTHERS = UScript.UNKNOWN;
/**
* Characters with the space property.
* This is equivalent to the rule value "space".
* @stable ICU 4.8
*/
public final static int SPACE = 0x1000;
/**
* The first entry in the enumeration of reordering groups. This is intended for use in
* range checking and enumeration of the reorder codes.
* @stable ICU 4.8
*/
public final static int FIRST = SPACE;
/**
* Characters with the punctuation property.
* This is equivalent to the rule value "punct".
* @stable ICU 4.8
*/
public final static int PUNCTUATION = 0x1001;
/**
* Characters with the symbol property.
* This is equivalent to the rule value "symbol".
* @stable ICU 4.8
*/
public final static int SYMBOL = 0x1002;
/**
* Characters with the currency property.
* This is equivalent to the rule value "currency".
* @stable ICU 4.8
*/
public final static int CURRENCY = 0x1003;
/**
* Characters with the digit property.
* This is equivalent to the rule value "digit".
* @stable ICU 4.8
*/
public final static int DIGIT = 0x1004;
/**
* The limit of the reorder codes. This is intended for use in range checking
* and enumeration of the reorder codes.
* @stable ICU 4.8
*/
public final static int LIMIT = 0x1005;
}
// public methods --------------------------------------------------------
/**
* Compares the equality of two Collator objects. Collator objects are equal if they have the same
* collation (sorting & searching) behavior.
*
* <p>The base class checks for null and for equal types.
* Subclasses should override.
*
* @param obj the Collator to compare to.
* @return true if this Collator has exactly the same collation behavior as obj, false otherwise.
* @stable ICU 2.8
*/
@Override
public boolean equals(Object obj) {
// Subclasses: Call this method and then add more specific checks.
return this == obj || (obj != null && getClass() == obj.getClass());
}
// public setters --------------------------------------------------------
private void checkNotFrozen() {
if (isFrozen()) {
throw new UnsupportedOperationException("Attempt to modify frozen Collator");
}
}
/**
* Sets this Collator's strength attribute. The strength attribute
* determines the minimum level of difference considered significant
* during comparison.</p>
*
* <p>The base class method does nothing. Subclasses should override it if appropriate.
*
* <p>See the Collator class description for an example of use.</p>
* @param newStrength the new strength value.
* @see #getStrength
* @see #PRIMARY
* @see #SECONDARY
* @see #TERTIARY
* @see #QUATERNARY
* @see #IDENTICAL
* @throws IllegalArgumentException if the new strength value is not valid.
* @stable ICU 2.8
*/
public void setStrength(int newStrength)
{
checkNotFrozen();
}
/**
* @return this, for chaining
* @internal Used in UnicodeTools
* @deprecated This API is ICU internal only.
*/
@Deprecated
public Collator setStrength2(int newStrength)
{
setStrength(newStrength);
return this;
}
/**
* Sets the decomposition mode of this Collator. Setting this
* decomposition attribute with CANONICAL_DECOMPOSITION allows the
* Collator to handle un-normalized text properly, producing the
* same results as if the text were normalized. If
* NO_DECOMPOSITION is set, it is the user's responsibility to
* insure that all text is already in the appropriate form before
* a comparison or before getting a CollationKey. Adjusting
* decomposition mode allows the user to select between faster and
* more complete collation behavior.</p>
*
* <p>Since a great many of the world's languages do not require
* text normalization, most locales set NO_DECOMPOSITION as the
* default decomposition mode.</p>
*
* <p>The base class method does nothing. Subclasses should override it if appropriate.
*
* <p>See getDecomposition for a description of decomposition
* mode.</p>
*
* @param decomposition the new decomposition mode
* @see #getDecomposition
* @see #NO_DECOMPOSITION
* @see #CANONICAL_DECOMPOSITION
* @throws IllegalArgumentException If the given value is not a valid
* decomposition mode.
* @stable ICU 2.8
*/
public void setDecomposition(int decomposition)
{
checkNotFrozen();
}
/**
* Sets the reordering codes for this collator.
* Collation reordering allows scripts and some other groups of characters
* to be moved relative to each other. This reordering is done on top of
* the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
* at the start and/or the end of the collation order. These groups are specified using
* UScript codes and {@link Collator.ReorderCodes} entries.
*
* <p>By default, reordering codes specified for the start of the order are placed in the
* order given after several special non-script blocks. These special groups of characters
* are space, punctuation, symbol, currency, and digit. These special groups are represented with
* {@link Collator.ReorderCodes} entries. Script groups can be intermingled with
* these special non-script groups if those special groups are explicitly specified in the reordering.
*
* <p>The special code {@link Collator.ReorderCodes#OTHERS OTHERS}
* stands for any script that is not explicitly
* mentioned in the list of reordering codes given. Anything that is after OTHERS
* will go at the very end of the reordering in the order given.
*
* <p>The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT}
* will reset the reordering for this collator
* to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
* was specified when this collator was created from resource data or from rules. The
* DEFAULT code <b>must</b> be the sole code supplied when it is used.
* If not, then an {@link IllegalArgumentException} will be thrown.
*
* <p>The special reorder code {@link Collator.ReorderCodes#NONE NONE}
* will remove any reordering for this collator.
* The result of setting no reordering will be to have the DUCET/CLDR ordering used. The
* NONE code <b>must</b> be the sole code supplied when it is used.
*
* @param order the reordering codes to apply to this collator; if this is null or an empty array
* then this clears any existing reordering
* @see #getReorderCodes
* @see #getEquivalentReorderCodes
* @see Collator.ReorderCodes
* @see UScript
* @stable ICU 4.8
*/
public void setReorderCodes(int... order)
{
throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
}
// public getters --------------------------------------------------------
/**
* Returns the Collator for the current default locale.
* The default locale is determined by java.util.Locale.getDefault().
* @return the Collator for the default locale (for example, en_US) if it
* is created successfully. Otherwise if there is no Collator
* associated with the current locale, the root collator
* will be returned.
* @see java.util.Locale#getDefault()
* @see #getInstance(Locale)
* @stable ICU 2.8
*/
public static final Collator getInstance()
{
return getInstance(ULocale.getDefault());
}
/**
* Clones the collator.
* @stable ICU 2.6
* @return a clone of this collator.
*/
public Object clone() throws CloneNotSupportedException {
return super.clone();
}
// begin registry stuff
/**
* A factory used with registerFactory to register multiple collators and provide
* display names for them. If standard locale display names are sufficient,
* Collator instances may be registered instead.
* <p><b>Note:</b> as of ICU4J 3.2, the default API for CollatorFactory uses
* ULocale instead of Locale. Instead of overriding createCollator(Locale),
* new implementations should override createCollator(ULocale). Note that
* one of these two methods <b>MUST</b> be overridden or else an infinite
* loop will occur.
* @stable ICU 2.6
*/
public static abstract class CollatorFactory {
/**
* Return true if this factory will be visible. Default is true.
* If not visible, the locales supported by this factory will not
* be listed by getAvailableLocales.
*
* @return true if this factory is visible
* @stable ICU 2.6
*/
public boolean visible() {
return true;
}
/**
* Return an instance of the appropriate collator. If the locale
* is not supported, return null.
* <b>Note:</b> as of ICU4J 3.2, implementations should override
* this method instead of createCollator(Locale).
* @param loc the locale for which this collator is to be created.
* @return the newly created collator.
* @stable ICU 3.2
*/
public Collator createCollator(ULocale loc) {
return createCollator(loc.toLocale());
}
/**
* Return an instance of the appropriate collator. If the locale
* is not supported, return null.
* <p><b>Note:</b> as of ICU4J 3.2, implementations should override
* createCollator(ULocale) instead of this method, and inherit this
* method's implementation. This method is no longer abstract
* and instead delegates to createCollator(ULocale).
* @param loc the locale for which this collator is to be created.
* @return the newly created collator.
* @stable ICU 2.6
*/
public Collator createCollator(Locale loc) {
return createCollator(ULocale.forLocale(loc));
}
/**
* Return the name of the collator for the objectLocale, localized for the displayLocale.
* If objectLocale is not visible or not defined by the factory, return null.
* @param objectLocale the locale identifying the collator
* @param displayLocale the locale for which the display name of the collator should be localized
* @return the display name
* @stable ICU 2.6
*/
public String getDisplayName(Locale objectLocale, Locale displayLocale) {
return getDisplayName(ULocale.forLocale(objectLocale), ULocale.forLocale(displayLocale));
}
/**
* Return the name of the collator for the objectLocale, localized for the displayLocale.
* If objectLocale is not visible or not defined by the factory, return null.
* @param objectLocale the locale identifying the collator
* @param displayLocale the locale for which the display name of the collator should be localized
* @return the display name
* @stable ICU 3.2
*/
public String getDisplayName(ULocale objectLocale, ULocale displayLocale) {
if (visible()) {
Set<String> supported = getSupportedLocaleIDs();
String name = objectLocale.getBaseName();
if (supported.contains(name)) {
return objectLocale.getDisplayName(displayLocale);
}
}
return null;
}
/**
* Return an unmodifiable collection of the locale names directly
* supported by this factory.
*
* @return the set of supported locale IDs.
* @stable ICU 2.6
*/
public abstract Set<String> getSupportedLocaleIDs();
/**
* Empty default constructor.
* @stable ICU 2.6
*/
protected CollatorFactory() {
}
}
static abstract class ServiceShim {
abstract Collator getInstance(ULocale l);
abstract Object registerInstance(Collator c, ULocale l);
abstract Object registerFactory(CollatorFactory f);
abstract boolean unregister(Object k);
abstract Locale[] getAvailableLocales(); // TODO remove
abstract ULocale[] getAvailableULocales();
abstract String getDisplayName(ULocale ol, ULocale dl);
}
private static ServiceShim shim;
private static ServiceShim getShim() {
// Note: this instantiation is safe on loose-memory-model configurations
// despite lack of synchronization, since the shim instance has no state--
// it's all in the class init. The worst problem is we might instantiate
// two shim instances, but they'll share the same state so that's ok.
if (shim == null) {
try {
Class<?> cls = Class.forName("com.ibm.icu.text.CollatorServiceShim");
shim = (ServiceShim)cls.newInstance();
}
catch (MissingResourceException e)
{
///CLOVER:OFF
throw e;
///CLOVER:ON
}
catch (Exception e) {
///CLOVER:OFF
if(DEBUG){
e.printStackTrace();
}
throw new ICUException(e);
///CLOVER:ON
}
}
return shim;
}
/**
* Simpler/faster methods for ASCII than ones based on Unicode data.
* TODO: There should be code like this somewhere already??
*/
private static final class ASCII {
static boolean equalIgnoreCase(CharSequence left, CharSequence right) {
int length = left.length();
if (length != right.length()) { return false; }
for (int i = 0; i < length; ++i) {
char lc = left.charAt(i);
char rc = right.charAt(i);
if (lc == rc) { continue; }
if ('A' <= lc && lc <= 'Z') {
if ((lc + 0x20) == rc) { continue; }
} else if ('A' <= rc && rc <= 'Z') {
if ((rc + 0x20) == lc) { continue; }
}
return false;
}
return true;
}
}
private static final boolean getYesOrNo(String keyword, String s) {
if (ASCII.equalIgnoreCase(s, "yes")) {
return true;
}
if (ASCII.equalIgnoreCase(s, "no")) {
return false;
}
throw new IllegalArgumentException("illegal locale keyword=value: " + keyword + "=" + s);
}
private static final int getIntValue(String keyword, String s, String... values) {
for (int i = 0; i < values.length; ++i) {
if (ASCII.equalIgnoreCase(s, values[i])) {
return i;
}
}
throw new IllegalArgumentException("illegal locale keyword=value: " + keyword + "=" + s);
}
private static final int getReorderCode(String keyword, String s) {
return Collator.ReorderCodes.FIRST +
getIntValue(keyword, s, "space", "punct", "symbol", "currency", "digit");
// Not supporting "others" = UCOL_REORDER_CODE_OTHERS
// as a synonym for Zzzz = USCRIPT_UNKNOWN for now:
// Avoid introducing synonyms/aliases.
}
/**
* Sets collation attributes according to locale keywords. See
* http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Settings
*
* Using "alias" keywords and values where defined:
* http://www.unicode.org/reports/tr35/tr35.html#Old_Locale_Extension_Syntax
* http://unicode.org/repos/cldr/trunk/common/bcp47/collation.xml
*/
private static void setAttributesFromKeywords(ULocale loc, Collator coll, RuleBasedCollator rbc) {
// Check for collation keywords that were already deprecated
// before any were supported in createInstance() (except for "collation").
String value = loc.getKeywordValue("colHiraganaQuaternary");
if (value != null) {
throw new UnsupportedOperationException("locale keyword kh/colHiraganaQuaternary");
}
value = loc.getKeywordValue("variableTop");
if (value != null) {
throw new UnsupportedOperationException("locale keyword vt/variableTop");
}
// Parse known collation keywords, ignore others.
value = loc.getKeywordValue("colStrength");
if (value != null) {
// Note: Not supporting typo "quarternary" because it was never supported in locale IDs.
int strength = getIntValue("colStrength", value,
"primary", "secondary", "tertiary", "quaternary", "identical");
coll.setStrength(strength <= Collator.QUATERNARY ? strength : Collator.IDENTICAL);
}
value = loc.getKeywordValue("colBackwards");
if (value != null) {
if (rbc != null) {
rbc.setFrenchCollation(getYesOrNo("colBackwards", value));
} else {
throw new UnsupportedOperationException(
"locale keyword kb/colBackwards only settable for RuleBasedCollator");
}
}
value = loc.getKeywordValue("colCaseLevel");
if (value != null) {
if (rbc != null) {
rbc.setCaseLevel(getYesOrNo("colCaseLevel", value));
} else {
throw new UnsupportedOperationException(
"locale keyword kb/colBackwards only settable for RuleBasedCollator");
}
}
value = loc.getKeywordValue("colCaseFirst");
if (value != null) {
if (rbc != null) {
int cf = getIntValue("colCaseFirst", value, "no", "lower", "upper");
if (cf == 0) {
rbc.setLowerCaseFirst(false);
rbc.setUpperCaseFirst(false);
} else if (cf == 1) {
rbc.setLowerCaseFirst(true);
} else /* cf == 2 */ {
rbc.setUpperCaseFirst(true);
}
} else {
throw new UnsupportedOperationException(
"locale keyword kf/colCaseFirst only settable for RuleBasedCollator");
}
}
value = loc.getKeywordValue("colAlternate");
if (value != null) {
if (rbc != null) {
rbc.setAlternateHandlingShifted(
getIntValue("colAlternate", value, "non-ignorable", "shifted") != 0);
} else {
throw new UnsupportedOperationException(
"locale keyword ka/colAlternate only settable for RuleBasedCollator");
}
}
value = loc.getKeywordValue("colNormalization");
if (value != null) {
coll.setDecomposition(getYesOrNo("colNormalization", value) ?
Collator.CANONICAL_DECOMPOSITION : Collator.NO_DECOMPOSITION);
}
value = loc.getKeywordValue("colNumeric");
if (value != null) {
if (rbc != null) {
rbc.setNumericCollation(getYesOrNo("colNumeric", value));
} else {
throw new UnsupportedOperationException(
"locale keyword kn/colNumeric only settable for RuleBasedCollator");
}
}
value = loc.getKeywordValue("colReorder");
if (value != null) {
int[] codes = new int[UScript.CODE_LIMIT + Collator.ReorderCodes.LIMIT - Collator.ReorderCodes.FIRST];
int codesLength = 0;
int scriptNameStart = 0;
for (;;) {
if (codesLength == codes.length) {
throw new IllegalArgumentException(
"too many script codes for colReorder locale keyword: " + value);
}
int limit = scriptNameStart;
while (limit < value.length() && value.charAt(limit) != '-') { ++limit; }
String scriptName = value.substring(scriptNameStart, limit);
int code;
if (scriptName.length() == 4) {
// Strict parsing, accept only 4-letter script codes, not long names.
code = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptName);
} else {
code = getReorderCode("colReorder", scriptName);
}
codes[codesLength++] = code;
if (limit == value.length()) { break; }
scriptNameStart = limit + 1;
}
if (codesLength == 0) {
throw new IllegalArgumentException("no script codes for colReorder locale keyword");
}
int[] args = new int[codesLength];
System.arraycopy(codes, 0, args, 0, codesLength);
coll.setReorderCodes(args);
}
value = loc.getKeywordValue("kv");
if (value != null) {
coll.setMaxVariable(getReorderCode("kv", value));
}
}
/**
* {@icu} Returns the Collator for the desired locale.
*
* <p>For some languages, multiple collation types are available;
* for example, "de@collation=phonebook".
* Starting with ICU 54, collation attributes can be specified via locale keywords as well,
* in the old locale extension syntax ("el@colCaseFirst=upper")
* or in language tag syntax ("el-u-kf-upper").
* See <a href="http://userguide.icu-project.org/collation/api">User Guide: Collation API</a>.
*
* @param locale the desired locale.
* @return Collator for the desired locale if it is created successfully.
* Otherwise if there is no Collator
* associated with the current locale, the root collator will
* be returned.
* @see java.util.Locale
* @see java.util.ResourceBundle
* @see #getInstance(Locale)
* @see #getInstance()
* @stable ICU 3.0
*/
public static final Collator getInstance(ULocale locale) {
// fetching from service cache is faster than instantiation
if (locale == null) {
locale = ULocale.getDefault();
}
Collator coll = getShim().getInstance(locale);
if (!locale.getName().equals(locale.getBaseName())) { // any keywords?
setAttributesFromKeywords(locale, coll,
(coll instanceof RuleBasedCollator) ? (RuleBasedCollator)coll : null);
}
return coll;
}
/**
* Returns the Collator for the desired locale.
*
* <p>For some languages, multiple collation types are available;
* for example, "de-u-co-phonebk".
* Starting with ICU 54, collation attributes can be specified via locale keywords as well,
* in the old locale extension syntax ("el@colCaseFirst=upper", only with {@link ULocale})
* or in language tag syntax ("el-u-kf-upper").
* See <a href="http://userguide.icu-project.org/collation/api">User Guide: Collation API</a>.
*
* @param locale the desired locale.
* @return Collator for the desired locale if it is created successfully.
* Otherwise if there is no Collator
* associated with the current locale, the root collator will
* be returned.
* @see java.util.Locale
* @see java.util.ResourceBundle
* @see #getInstance(ULocale)
* @see #getInstance()
* @stable ICU 2.8
*/
public static final Collator getInstance(Locale locale) {
return getInstance(ULocale.forLocale(locale));
}
/**
* {@icu} Registers a collator as the default collator for the provided locale. The
* collator should not be modified after it is registered.
*
* <p>Because ICU may choose to cache Collator objects internally, this must
* be called at application startup, prior to any calls to
* Collator.getInstance to avoid undefined behavior.
*
* @param collator the collator to register
* @param locale the locale for which this is the default collator
* @return an object that can be used to unregister the registered collator.
*
* @stable ICU 3.2
*/
public static final Object registerInstance(Collator collator, ULocale locale) {
return getShim().registerInstance(collator, locale);
}
/**
* {@icu} Registers a collator factory.
*
* <p>Because ICU may choose to cache Collator objects internally, this must
* be called at application startup, prior to any calls to
* Collator.getInstance to avoid undefined behavior.
*
* @param factory the factory to register
* @return an object that can be used to unregister the registered factory.
*
* @stable ICU 2.6
*/
public static final Object registerFactory(CollatorFactory factory) {
return getShim().registerFactory(factory);
}
/**
* {@icu} Unregisters a collator previously registered using registerInstance.
* @param registryKey the object previously returned by registerInstance.
* @return true if the collator was successfully unregistered.
* @stable ICU 2.6
*/
public static final boolean unregister(Object registryKey) {
if (shim == null) {
return false;
}
return shim.unregister(registryKey);
}
/**
* Returns the set of locales, as Locale objects, for which collators
* are installed. Note that Locale objects do not support RFC 3066.
* @return the list of locales in which collators are installed.
* This list includes any that have been registered, in addition to
* those that are installed with ICU4J.
* @stable ICU 2.4
*/
public static Locale[] getAvailableLocales() {
// TODO make this wrap getAvailableULocales later
if (shim == null) {
return ICUResourceBundle.getAvailableLocales(
ICUResourceBundle.ICU_COLLATION_BASE_NAME, ICUResourceBundle.ICU_DATA_CLASS_LOADER);
}
return shim.getAvailableLocales();
}
/**
* {@icu} Returns the set of locales, as ULocale objects, for which collators
* are installed. ULocale objects support RFC 3066.
* @return the list of locales in which collators are installed.
* This list includes any that have been registered, in addition to
* those that are installed with ICU4J.
* @stable ICU 3.0
*/
public static final ULocale[] getAvailableULocales() {
if (shim == null) {
return ICUResourceBundle.getAvailableULocales(
ICUResourceBundle.ICU_COLLATION_BASE_NAME, ICUResourceBundle.ICU_DATA_CLASS_LOADER);
}
return shim.getAvailableULocales();
}
/**
* The list of keywords for this service. This must be kept in sync with
* the resource data.
* @since ICU 3.0
*/
private static final String[] KEYWORDS = { "collation" };
/**
* The resource name for this service. Note that this is not the same as
* the keyword for this service.
* @since ICU 3.0
*/
private static final String RESOURCE = "collations";
/**
* The resource bundle base name for this service.
* *since ICU 3.0
*/
private static final String BASE = ICUResourceBundle.ICU_COLLATION_BASE_NAME;
/**
* {@icu} Returns an array of all possible keywords that are relevant to
* collation. At this point, the only recognized keyword for this
* service is "collation".
* @return an array of valid collation keywords.
* @see #getKeywordValues
* @stable ICU 3.0
*/
public static final String[] getKeywords() {
return KEYWORDS;
}
/**
* {@icu} Given a keyword, returns an array of all values for
* that keyword that are currently in use.
* @param keyword one of the keywords returned by getKeywords.
* @see #getKeywords
* @stable ICU 3.0
*/
public static final String[] getKeywordValues(String keyword) {
if (!keyword.equals(KEYWORDS[0])) {
throw new IllegalArgumentException("Invalid keyword: " + keyword);
}
return ICUResourceBundle.getKeywordValues(BASE, RESOURCE);
}
/**
* {@icu} Given a key and a locale, returns an array of string values in a preferred
* order that would make a difference. These are all and only those values where
* the open (creation) of the service with the locale formed from the input locale
* plus input keyword and that value has different behavior than creation with the
* input locale alone.
* @param key one of the keys supported by this service. For now, only
* "collation" is supported.
* @param locale the locale
* @param commonlyUsed if set to true it will return only commonly used values
* with the given locale in preferred order. Otherwise,
* it will return all the available values for the locale.
* @return an array of string values for the given key and the locale.
* @stable ICU 4.2
*/
public static final String[] getKeywordValuesForLocale(String key, ULocale locale,
boolean commonlyUsed) {
// Note: The parameter commonlyUsed is actually not used.
// The switch is in the method signature for consistency
// with other locale services.
// Read available collation values from collation bundles
String baseLoc = locale.getBaseName();
LinkedList<String> values = new LinkedList<String>();
UResourceBundle bundle = UResourceBundle.getBundleInstance(
ICUResourceBundle.ICU_COLLATION_BASE_NAME, baseLoc);
String defcoll = null;
while (bundle != null) {
UResourceBundle collations = bundle.get("collations");
Enumeration<String> collEnum = collations.getKeys();
while (collEnum.hasMoreElements()) {
String collkey = collEnum.nextElement();
if (collkey.equals("default")) {
if (defcoll == null) {
// Keep the default
defcoll = collations.getString("default");
}
} else if (!collkey.startsWith("private-") && !values.contains(collkey)) {
values.add(collkey);
}
}
bundle = ((ICUResourceBundle)bundle).getParent();
}
// Reordering
Iterator<String> itr = values.iterator();
String[] result = new String[values.size()];
result[0] = defcoll;
int idx = 1;
while (itr.hasNext()) {
String collKey = itr.next();
if (!collKey.equals(defcoll)) {
result[idx++] = collKey;
}
}
return result;
}
/**
* {@icu} Returns the functionally equivalent locale for the given
* requested locale, with respect to given keyword, for the
* collation service. If two locales return the same result, then
* collators instantiated for these locales will behave
* equivalently. The converse is not always true; two collators
* may in fact be equivalent, but return different results, due to
* internal details. The return result has no other meaning than
* that stated above, and implies nothing as to the relationship
* between the two locales. This is intended for use by
* applications who wish to cache collators, or otherwise reuse
* collators when possible. The functional equivalent may change
* over time. For more information, please see the <a
* href="http://userguide.icu-project.org/locale#TOC-Locales-and-Services">
* Locales and Services</a> section of the ICU User Guide.
* @param keyword a particular keyword as enumerated by
* getKeywords.
* @param locID The requested locale
* @param isAvailable If non-null, isAvailable[0] will receive and
* output boolean that indicates whether the requested locale was
* 'available' to the collation service. If non-null, isAvailable
* must have length >= 1.
* @return the locale
* @stable ICU 3.0
*/
public static final ULocale getFunctionalEquivalent(String keyword,
ULocale locID,
boolean isAvailable[]) {
return ICUResourceBundle.getFunctionalEquivalent(BASE, ICUResourceBundle.ICU_DATA_CLASS_LOADER, RESOURCE,
keyword, locID, isAvailable, true);
}
/**
* {@icu} Returns the functionally equivalent locale for the given
* requested locale, with respect to given keyword, for the
* collation service.
* @param keyword a particular keyword as enumerated by
* getKeywords.
* @param locID The requested locale
* @return the locale
* @see #getFunctionalEquivalent(String,ULocale,boolean[])
* @stable ICU 3.0
*/
public static final ULocale getFunctionalEquivalent(String keyword,
ULocale locID) {
return getFunctionalEquivalent(keyword, locID, null);
}
/**
* {@icu} Returns the name of the collator for the objectLocale, localized for the
* displayLocale.
* @param objectLocale the locale of the collator
* @param displayLocale the locale for the collator's display name
* @return the display name
* @stable ICU 2.6
*/
static public String getDisplayName(Locale objectLocale, Locale displayLocale) {
return getShim().getDisplayName(ULocale.forLocale(objectLocale),
ULocale.forLocale(displayLocale));
}
/**
* {@icu} Returns the name of the collator for the objectLocale, localized for the
* displayLocale.
* @param objectLocale the locale of the collator
* @param displayLocale the locale for the collator's display name
* @return the display name
* @stable ICU 3.2
*/
static public String getDisplayName(ULocale objectLocale, ULocale displayLocale) {
return getShim().getDisplayName(objectLocale, displayLocale);
}
/**
* {@icu} Returns the name of the collator for the objectLocale, localized for the
* default <code>DISPLAY</code> locale.
* @param objectLocale the locale of the collator
* @return the display name
* @see com.ibm.icu.util.ULocale.Category#DISPLAY
* @stable ICU 2.6
*/
static public String getDisplayName(Locale objectLocale) {
return getShim().getDisplayName(ULocale.forLocale(objectLocale), ULocale.getDefault(Category.DISPLAY));
}
/**
* {@icu} Returns the name of the collator for the objectLocale, localized for the
* default <code>DISPLAY</code> locale.
* @param objectLocale the locale of the collator
* @return the display name
* @see com.ibm.icu.util.ULocale.Category#DISPLAY
* @stable ICU 3.2
*/
static public String getDisplayName(ULocale objectLocale) {
return getShim().getDisplayName(objectLocale, ULocale.getDefault(Category.DISPLAY));
}
/**
* Returns this Collator's strength attribute. The strength attribute
* determines the minimum level of difference considered significant.
* </p>
* {@icunote} This can return QUATERNARY strength, which is not supported by the
* JDK version.
* <p>
* See the Collator class description for more details.
* </p>
* <p>The base class method always returns {@link #TERTIARY}.
* Subclasses should override it if appropriate.
*
* @return this Collator's current strength attribute.
* @see #setStrength
* @see #PRIMARY
* @see #SECONDARY
* @see #TERTIARY
* @see #QUATERNARY
* @see #IDENTICAL
* @stable ICU 2.8
*/
public int getStrength()
{
return TERTIARY;
}
/**
* Returns the decomposition mode of this Collator. The decomposition mode
* determines how Unicode composed characters are handled.
* </p>
* <p>
* See the Collator class description for more details.
* </p>
* <p>The base class method always returns {@link #NO_DECOMPOSITION}.
* Subclasses should override it if appropriate.
*
* @return the decomposition mode
* @see #setDecomposition
* @see #NO_DECOMPOSITION
* @see #CANONICAL_DECOMPOSITION
* @stable ICU 2.8
*/
public int getDecomposition()
{
return NO_DECOMPOSITION;
}
// public other methods -------------------------------------------------
/**
* Compares the equality of two text Strings using
* this Collator's rules, strength and decomposition mode. Convenience method.
* @param source the source string to be compared.
* @param target the target string to be compared.
* @return true if the strings are equal according to the collation
* rules, otherwise false.
* @see #compare
* @throws NullPointerException thrown if either arguments is null.
* @stable ICU 2.8
*/
public boolean equals(String source, String target)
{
return (compare(source, target) == 0);
}
/**
* {@icu} Returns a UnicodeSet that contains all the characters and sequences tailored
* in this collator.
* @return a pointer to a UnicodeSet object containing all the
* code points and sequences that may sort differently than
* in the root collator.
* @stable ICU 2.4
*/
public UnicodeSet getTailoredSet()
{
return new UnicodeSet(0, 0x10FFFF);
}
/**
* Compares the source text String to the target text String according to
* this Collator's rules, strength and decomposition mode.
* Returns an integer less than,
* equal to or greater than zero depending on whether the source String is
* less than, equal to or greater than the target String. See the Collator
* class description for an example of use.
* </p>
* @param source the source String.
* @param target the target String.
* @return Returns an integer value. Value is less than zero if source is
* less than target, value is zero if source and target are equal,
* value is greater than zero if source is greater than target.
* @see CollationKey
* @see #getCollationKey
* @throws NullPointerException thrown if either argument is null.
* @stable ICU 2.8
*/
public abstract int compare(String source, String target);
/**
* Compares the source Object to the target Object.
* </p>
* @param source the source Object.
* @param target the target Object.
* @return Returns an integer value. Value is less than zero if source is
* less than target, value is zero if source and target are equal,
* value is greater than zero if source is greater than target.
* @throws ClassCastException thrown if either arguments cannot be cast to CharSequence.
* @stable ICU 4.2
*/
public int compare(Object source, Object target) {
return doCompare((CharSequence)source, (CharSequence)target);
}
/**
* Compares two CharSequences.
* The base class just calls compare(left.toString(), right.toString()).
* Subclasses should instead implement this method and have the String API call this method.
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int doCompare(CharSequence left, CharSequence right) {
return compare(left.toString(), right.toString());
}
/**
* <p>
* Transforms the String into a CollationKey suitable for efficient
* repeated comparison. The resulting key depends on the collator's
* rules, strength and decomposition mode.
*
* <p>Note that collation keys are often less efficient than simply doing comparison.
* For more details, see the ICU User Guide.
*
* <p>See the CollationKey class documentation for more information.</p>
* @param source the string to be transformed into a CollationKey.
* @return the CollationKey for the given String based on this Collator's
* collation rules. If the source String is null, a null
* CollationKey is returned.
* @see CollationKey
* @see #compare(String, String)
* @see #getRawCollationKey
* @stable ICU 2.8
*/
public abstract CollationKey getCollationKey(String source);
/**
* {@icu} Returns the simpler form of a CollationKey for the String source following
* the rules of this Collator and stores the result into the user provided argument
* key. If key has a internal byte array of length that's too small for the result,
* the internal byte array will be grown to the exact required size.
*
* <p>Note that collation keys are often less efficient than simply doing comparison.
* For more details, see the ICU User Guide.
*
* @param source the text String to be transformed into a RawCollationKey
* @return If key is null, a new instance of RawCollationKey will be
* created and returned, otherwise the user provided key will be
* returned.
* @see #compare(String, String)
* @see #getCollationKey
* @see RawCollationKey
* @stable ICU 2.8
*/
public abstract RawCollationKey getRawCollationKey(String source,
RawCollationKey key);
/**
* {@icu} Sets the variable top to the top of the specified reordering group.
* The variable top determines the highest-sorting character
* which is affected by the alternate handling behavior.
* If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
*
* <p>The base class implementation throws an UnsupportedOperationException.
* @param group one of Collator.ReorderCodes.SPACE, Collator.ReorderCodes.PUNCTUATION,
* Collator.ReorderCodes.SYMBOL, Collator.ReorderCodes.CURRENCY;
* or Collator.ReorderCodes.DEFAULT to restore the default max variable group
* @return this
* @see #getMaxVariable
* @stable ICU 53
*/
public Collator setMaxVariable(int group) {
throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
}
/**
* {@icu} Returns the maximum reordering group whose characters are affected by
* the alternate handling behavior.
*
* <p>The base class implementation returns Collator.ReorderCodes.PUNCTUATION.
* @return the maximum variable reordering group.
* @see #setMaxVariable
* @stable ICU 53
*/
public int getMaxVariable() {
return Collator.ReorderCodes.PUNCTUATION;
}
/**
* {@icu} Sets the variable top to the primary weight of the specified string.
*
* <p>Beginning with ICU 53, the variable top is pinned to
* the top of one of the supported reordering groups,
* and it must not be beyond the last of those groups.
* See {@link #setMaxVariable(int)}.
*
* @param varTop one or more (if contraction) characters to which the
* variable top should be set
* @return variable top primary weight
* @exception IllegalArgumentException
* is thrown if varTop argument is not a valid variable top element. A variable top element is
* invalid when
* <ul>
* <li>it is a contraction that does not exist in the Collation order
* <li>the variable top is beyond
* the last reordering group supported by setMaxVariable()
* <li>when the varTop argument is null or zero in length.
* </ul>
* @see #getVariableTop
* @see RuleBasedCollator#setAlternateHandlingShifted
* @deprecated ICU 53 Call {@link #setMaxVariable(int)} instead.
*/
@Deprecated
public abstract int setVariableTop(String varTop);
/**
* {@icu} Gets the variable top value of a Collator.
*
* @return the variable top primary weight
* @see #getMaxVariable
* @stable ICU 2.6
*/
public abstract int getVariableTop();
/**
* {@icu} Sets the variable top to the specified primary weight.
*
* <p>Beginning with ICU 53, the variable top is pinned to
* the top of one of the supported reordering groups,
* and it must not be beyond the last of those groups.
* See {@link #setMaxVariable(int)}.
*
* @param varTop primary weight, as returned by setVariableTop or getVariableTop
* @see #getVariableTop
* @see #setVariableTop(String)
* @deprecated ICU 53 Call setMaxVariable() instead.
*/
@Deprecated
public abstract void setVariableTop(int varTop);
/**
* {@icu} Returns the version of this collator object.
* @return the version object associated with this collator
* @stable ICU 2.8
*/
public abstract VersionInfo getVersion();
/**
* {@icu} Returns the UCA version of this collator object.
* @return the version object associated with this collator
* @stable ICU 2.8
*/
public abstract VersionInfo getUCAVersion();
/**
* Retrieves the reordering codes for this collator.
* These reordering codes are a combination of UScript codes and ReorderCodes.
* @return a copy of the reordering codes for this collator;
* if none are set then returns an empty array
* @see #setReorderCodes
* @see #getEquivalentReorderCodes
* @see Collator.ReorderCodes
* @see UScript
* @stable ICU 4.8
*/
public int[] getReorderCodes()
{
throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
}
/**
* Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder
* codes are grouped and must reorder together.
* Beginning with ICU 55, scripts only reorder together if they are primary-equal,
* for example Hiragana and Katakana.
*
* @param reorderCode The reorder code to determine equivalence for.
* @return the set of all reorder codes in the same group as the given reorder code.
* @see #setReorderCodes
* @see #getReorderCodes
* @see Collator.ReorderCodes
* @see UScript
* @stable ICU 4.8
*/
public static int[] getEquivalentReorderCodes(int reorderCode) {
CollationData baseData = CollationRoot.getData();
return baseData.getEquivalentScripts(reorderCode);
}
// Freezable interface implementation -------------------------------------------------
/**
* Determines whether the object has been frozen or not.
*
* <p>An unfrozen Collator is mutable and not thread-safe.
* A frozen Collator is immutable and thread-safe.
*
* @stable ICU 4.8
*/
public boolean isFrozen() {
return false;
}
/**
* Freezes the collator.
* @return the collator itself.
* @stable ICU 4.8
*/
public Collator freeze() {
throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
}
/**
* Provides for the clone operation. Any clone is initially unfrozen.
* @stable ICU 4.8
*/
public Collator cloneAsThawed() {
throw new UnsupportedOperationException("Needs to be implemented by the subclass.");
}
/**
* Empty default constructor to make javadocs happy
* @stable ICU 2.4
*/
protected Collator()
{
}
private static final boolean DEBUG = ICUDebug.enabled("collator");
// -------- BEGIN ULocale boilerplate --------
/**
* {@icu} Returns the locale that was used to create this object, or null.
* This may may differ from the locale requested at the time of
* this object's creation. For example, if an object is created
* for locale <tt>en_US_CALIFORNIA</tt>, the actual data may be
* drawn from <tt>en</tt> (the <i>actual</i> locale), and
* <tt>en_US</tt> may be the most specific locale that exists (the
* <i>valid</i> locale).
*
* <p>Note: This method will be implemented in ICU 3.0; ICU 2.8
* contains a partial preview implementation. The * <i>actual</i>
* locale is returned correctly, but the <i>valid</i> locale is
* not, in most cases.
*
* <p>The base class method always returns {@link ULocale#ROOT}.
* Subclasses should override it if appropriate.
*
* @param type type of information requested, either {@link
* com.ibm.icu.util.ULocale#VALID_LOCALE} or {@link
* com.ibm.icu.util.ULocale#ACTUAL_LOCALE}.
* @return the information specified by <i>type</i>, or null if
* this object was not constructed from locale data.
* @see com.ibm.icu.util.ULocale
* @see com.ibm.icu.util.ULocale#VALID_LOCALE
* @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE
* @draft ICU 2.8 (retain)
* @provisional This API might change or be removed in a future release.
*/
public ULocale getLocale(ULocale.Type type) {
return ULocale.ROOT;
}
/**
* Set information about the locales that were used to create this
* object. If the object was not constructed from locale data,
* both arguments should be set to null. Otherwise, neither
* should be null. The actual locale must be at the same level or
* less specific than the valid locale. This method is intended
* for use by factories or other entities that create objects of
* this class.
*
* <p>The base class method does nothing. Subclasses should override it if appropriate.
*
* @param valid the most specific locale containing any resource
* data, or null
* @param actual the locale containing data used to construct this
* object, or null
* @see com.ibm.icu.util.ULocale
* @see com.ibm.icu.util.ULocale#VALID_LOCALE
* @see com.ibm.icu.util.ULocale#ACTUAL_LOCALE
*/
void setLocale(ULocale valid, ULocale actual) {}
// -------- END ULocale boilerplate --------
}