blob: 68b12c8ba1501efd42555f76cbf763f4afb17ed8 [file] [log] [blame]
/*
***************************************************************************
* Copyright (C) 2008-2015, Google, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*/
package com.ibm.icu.text;
import java.util.BitSet;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.SpoofChecker.RestrictionLevel;
/**
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
* then setIdentifier. Available methods include:
* <ol>
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
* each of these.
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
* either Katakana or Hiragana.
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
* the identifier.
* <li>call getRestrictionLevel to see what the UTS36 restriction level is.
* </ol>
*
* @author markdavis
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public class IdentifierInfo {
// Code points U+0000..U+007F; used by getRestrictionLevel() to detect all-ASCII identifiers.
private static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
// The identifier most recently passed to setIdentifier(); null until that method has been called.
private String identifier;
// Scripts contributed by characters whose script extensions (minus Common/Inherited) name exactly one script.
private final BitSet requiredScripts = new BitSet();
// One BitSet per distinct group of alternate scripts, contributed by characters that have more than one
// script extension; pruned at the end of setIdentifier().
private final Set<BitSet> scriptSetSet = new HashSet<BitSet>();
// Intersection of the alternate sets examined by setIdentifier(); empty when there are no alternates.
private final BitSet commonAmongAlternates = new BitSet();
// For each decimal digit seen, the code point (cp - numeric value), i.e. the zero digit of its number system.
private final UnicodeSet numerics = new UnicodeSet();
// Characters allowed in identifiers; starts as the full range U+0000..U+10FFFF until setIdentifierProfile().
private final UnicodeSet identifierProfile = new UnicodeSet(0, 0x10FFFF);
/**
* Create an identifier info object. Subsequently, call {@link #setIdentifier(String)}, etc.
* {@link #setIdentifierProfile(UnicodeSet)}
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public IdentifierInfo() {
super();
}
/**
 * Empties every per-identifier result collection (scripts, alternates, their common subset and numerics).
 * The stored identifier and the identifier profile are left as they are.
 *
 * @return self
 */
private IdentifierInfo clear() {
    numerics.clear();
    commonAmongAlternates.clear();
    scriptSetSet.clear();
    requiredScripts.clear();
    return this;
}
/**
 * Replaces the identifier profile, i.e. the set of characters that are allowed in an identifier.
 * The profile is consulted by {@link #getRestrictionLevel()}.
 *
 * @param identifierProfile the characters that are to be allowed in the identifier
 * @return self
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public IdentifierInfo setIdentifierProfile(UnicodeSet identifierProfile) {
    // Update the private set in place; the field itself is final and is never re-pointed at the argument.
    final UnicodeSet ownProfile = this.identifierProfile;
    ownProfile.set(identifierProfile);
    return this;
}
/**
 * Returns the identifier profile: the characters that are allowed in the identifier.
 *
 * @return a new UnicodeSet built from the current profile, so the caller never receives the internal object.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public UnicodeSet getIdentifierProfile() {
    final UnicodeSet profileCopy = new UnicodeSet(identifierProfile);
    return profileCopy;
}
/**
 * Analyzes an identifier and stores the results for the query methods such as {@link #getScripts()},
 * {@link #getAlternates()}, {@link #getNumerics()} and {@link #getRestrictionLevel()}. Results from any
 * previously analyzed identifier are discarded.
 *
 * @param identifier the identifier to analyze; must not be null
 * @return self
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public IdentifierInfo setIdentifier(String identifier) {
    this.identifier = identifier;
    clear();

    // Scratch set refilled for every code point. It is only replaced by a fresh instance when
    // the current instance has been stored in scriptSetSet, which then owns it.
    BitSet scriptsForCP = new BitSet();
    int offset = 0;
    while (offset < identifier.length()) {
        final int cp = Character.codePointAt(identifier, offset);
        offset += Character.charCount(cp);

        // Store a representative character for each kind of decimal digit: the zero of its number
        // system. Unicode guarantees that this is cp - value.
        if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
            numerics.add(cp - UCharacter.getNumericValue(cp));
        }

        // Common and Inherited never count as scripts of the identifier.
        UScript.getScriptExtensions(cp, scriptsForCP);
        scriptsForCP.clear(UScript.COMMON);
        scriptsForCP.clear(UScript.INHERITED);

        final int scriptCount = scriptsForCP.cardinality();
        if (scriptCount == 1) {
            // Single script, record it.
            requiredScripts.or(scriptsForCP);
        } else if (scriptCount > 1) {
            // Several candidate scripts: keep them as a group of alternates unless one of them is
            // already required or the same group is already recorded.
            if (!requiredScripts.intersects(scriptsForCP) && scriptSetSet.add(scriptsForCP)) {
                scriptsForCP = new BitSet();
            }
        }
        // scriptCount == 0: only Common/Inherited, nothing to record.
    }

    // Final pass: drop alternates that are made redundant by scripts recorded after them,
    // e.g. [Kana], [Kana Hira] => [Kana], and compute what all remaining alternates have in common.
    // This is relatively infrequent, so it does not have to be optimized.
    if (scriptSetSet.size() > 0) {
        commonAmongAlternates.set(0, UScript.CODE_LIMIT);
        final Iterator<BitSet> alternates = scriptSetSet.iterator();
        while (alternates.hasNext()) {
            final BitSet candidate = alternates.next();
            if (requiredScripts.intersects(candidate)) {
                // [Kana], [Kana Hira] => [Kana]
                alternates.remove();
                continue;
            }
            commonAmongAlternates.and(candidate); // running intersection

            // A group that contains another recorded group adds nothing:
            // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
            boolean containsAnotherGroup = false;
            for (BitSet other : scriptSetSet) {
                if (other != candidate && contains(candidate, other)) {
                    containsAnotherGroup = true;
                    break;
                }
            }
            if (containsAnotherGroup) {
                alternates.remove();
            }
        }
    }
    // With no alternates left there is nothing they could have in common.
    if (scriptSetSet.size() == 0) {
        commonAmongAlternates.clear();
    }
    return this;
}
/**
 * Returns the identifier most recently passed to {@link #setIdentifier(String)}.
 *
 * @return the identifier that was analyzed, or null if none has been set yet.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public String getIdentifier() {
    return this.identifier;
}
/**
 * Returns the scripts that the identifier definitely uses: those contributed by characters that belong to
 * exactly one script. Common and Inherited are never included.
 *
 * @return a copy of the set of explicit scripts, as a BitSet of UScript values.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public BitSet getScripts() {
    final Object scriptsCopy = requiredScripts.clone();
    return (BitSet) scriptsCopy;
}
/**
 * Returns the groups of alternate scripts found in the identifier. When a character can belong to
 * several scripts (for example either Katakana or Hiragana), the set consisting of those scripts is
 * one element of the result.
 *
 * @return a new set holding a copy of each group of alternate scripts; modifying it does not affect this object.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public Set<BitSet> getAlternates() {
    final Set<BitSet> copies = new HashSet<BitSet>();
    final Iterator<BitSet> groups = scriptSetSet.iterator();
    while (groups.hasNext()) {
        final BitSet group = groups.next();
        copies.add((BitSet) group.clone());
    }
    return copies;
}
/**
 * Returns the representative characters (zeros) for the decimal number systems found in the identifier.
 *
 * @return a new UnicodeSet containing one zero digit per decimal number system used in the identifier.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public UnicodeSet getNumerics() {
    final UnicodeSet zeroDigits = new UnicodeSet(numerics);
    return zeroDigits;
}
/**
 * Returns the scripts shared by the groups of alternates, as computed by {@link #setIdentifier(String)}.
 *
 * @return a copy of the set of scripts that are in common among the alternates; empty when there are
 *         no alternates.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public BitSet getCommonAmongAlternates() {
    final Object commonCopy = commonAmongAlternates.clone();
    return (BitSet) commonCopy;
}
// Script sets consulted by getRestrictionLevel().
// JAPANESE, CHINESE and KOREAN are each passed as the "container" to containsWithAlternates();
// CONFUSABLE_WITH_LATIN is tested against the identifier's scripts with BitSet.intersects().
// java.util.BitSet has no "contains(...)" method, so containment is checked with the static
// contains(BitSet, BitSet) helper of this class.
// The constants are private because a BitSet cannot be made immutable in Java; they must never be modified.
private final static BitSet JAPANESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HIRAGANA,
UScript.KATAKANA);
private final static BitSet CHINESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.BOPOMOFO);
private final static BitSet KOREAN = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HANGUL);
private final static BitSet CONFUSABLE_WITH_LATIN = set(new BitSet(), UScript.CYRILLIC, UScript.GREEK,
UScript.CHEROKEE);
/**
 * Finds the "tightest" restriction level that the identifier satisfies. The checks are made in this order:
 * <ol>
 * <li>UNRESTRICTIVE if the identifier has a character outside the identifier profile, or digits from more
 * than one decimal number system.
 * <li>ASCII if every character is in U+0000..U+007F.
 * <li>SINGLE_SCRIPT_RESTRICTIVE if the script count (see below) is less than 2.
 * <li>HIGHLY_RESTRICTIVE if the scripts fit within Latin + Han + Hiragana + Katakana, Latin + Han + Bopomofo,
 * or Latin + Han + Hangul.
 * <li>MODERATELY_RESTRICTIVE if the script count is exactly 2, Latin is one of the scripts, and none of
 * Cyrillic, Greek or Cherokee is present.
 * <li>MINIMALLY_RESTRICTIVE otherwise.
 * </ol>
 *
 * @return the restriction level.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public RestrictionLevel getRestrictionLevel() {
    final boolean outsideProfile = !identifierProfile.containsAll(identifier);
    if (outsideProfile || getNumerics().size() > 1) {
        return RestrictionLevel.UNRESTRICTIVE;
    }
    if (ASCII.containsAll(identifier)) {
        return RestrictionLevel.ASCII;
    }

    // Script count: this is a bit tricky, several factors are combined.
    //  - the number of explicit scripts in the text (Common and Inherited were already removed
    //    from requiredScripts by setIdentifier());
    //  - plus 1 if the alternates have some script in common (eg [Arab Thaa]; [Arab Syrc]);
    //  - plus the number of alternates otherwise (good enough because only values up to 2 are tested).
    final int alternateCount;
    if (commonAmongAlternates.cardinality() == 0) {
        alternateCount = scriptSetSet.size();
    } else {
        alternateCount = 1;
    }
    final int scriptTotal = requiredScripts.cardinality() + alternateCount;
    if (scriptTotal < 2) {
        return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE;
    }

    final boolean fitsCjkCombination = containsWithAlternates(JAPANESE, requiredScripts)
            || containsWithAlternates(CHINESE, requiredScripts)
            || containsWithAlternates(KOREAN, requiredScripts);
    if (fitsCjkCombination) {
        return RestrictionLevel.HIGHLY_RESTRICTIVE;
    }

    final boolean latinPlusOneSafeScript = scriptTotal == 2
            && requiredScripts.get(UScript.LATIN)
            && !requiredScripts.intersects(CONFUSABLE_WITH_LATIN);
    return latinPlusOneSafeScript
            ? RestrictionLevel.MODERATELY_RESTRICTIVE
            : RestrictionLevel.MINIMALLY_RESTRICTIVE;
}
/**
 * Get the number of scripts appearing in the identifier.
 * Note: Common and Inherited scripts are omitted from the count.
 * Note: If the identifier contains characters with alternate scripts
 *       (the character is used with more than one script), the reported
 *       number of scripts is minimized by considering the character
 *       to be of a script that already appears elsewhere in the identifier
 *       when possible.
 *       The alternate script computation may not be perfect. The distinction
 *       between 0, 1 and &gt; 1 scripts will be valid, however.
 * @return the number of scripts.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public int getScriptCount() {
    // requiredScripts never holds Common or Inherited; setIdentifier() strips them.
    final int explicitScripts = requiredScripts.cardinality();
    // Alternates that share a script count once; otherwise each group of alternates counts separately.
    final int alternateScripts;
    if (commonAmongAlternates.cardinality() == 0) {
        alternateScripts = scriptSetSet.size();
    } else {
        alternateScripts = 1;
    }
    return explicitScripts + alternateScripts;
}
/**
 * Returns a comma-separated summary: the identifier, the profile pattern, the restriction level,
 * the explicit scripts, the alternates, and the numerics pattern.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
@Override
public String toString() {
    final StringBuilder summary = new StringBuilder();
    summary.append(identifier).append(", ");
    summary.append(identifierProfile.toPattern(false)).append(", ");
    summary.append(getRestrictionLevel()).append(", ");
    summary.append(displayScripts(requiredScripts)).append(", ");
    summary.append(displayAlternates(scriptSetSet)).append(", ");
    summary.append(numerics.toPattern(false));
    return summary.toString();
}
/**
 * Tests whether a script set covers the identifier: it must contain every script in containee and
 * share at least one script with each recorded group of alternates.
 *
 * @param container the candidate covering set
 * @param containee the scripts that must all be present in container
 * @return true if container contains containee and intersects every group in scriptSetSet
 */
private boolean containsWithAlternates(BitSet container, BitSet containee) {
    boolean covered = contains(container, containee);
    final Iterator<BitSet> groups = scriptSetSet.iterator();
    // Stop at the first group of alternates that the container cannot satisfy.
    while (covered && groups.hasNext()) {
        covered = container.intersects(groups.next());
    }
    return covered;
}
/**
 * Produces a readable string of alternates: each group is rendered with {@link #displayScripts(BitSet)}
 * and the groups are joined with "; ". Groups are ordered with {@link #BITSET_COMPARATOR} so that the
 * output does not depend on the iteration order of the argument.
 *
 * @param alternates a set of BitSets of script values.
 * @return display form; the empty string if there are no alternates
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static String displayAlternates(Set<BitSet> alternates) {
    if (alternates.size() == 0) {
        return "";
    }
    // Sort first, for consistent results.
    final TreeSet<BitSet> ordered = new TreeSet<BitSet>(BITSET_COMPARATOR);
    ordered.addAll(alternates);

    final StringBuilder text = new StringBuilder();
    final Iterator<BitSet> groups = ordered.iterator();
    while (groups.hasNext()) {
        final String groupText = displayScripts(groups.next());
        if (text.length() != 0) {
            text.append("; ");
        }
        text.append(groupText);
    }
    return text.toString();
}
/**
 * Order BitSets, first by cardinality (fewest set bits first), then by comparing the indices of
 * their set bits in ascending order; the first differing index decides. Two BitSets compare as 0
 * only if they have exactly the same bits set.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static final Comparator<BitSet> BITSET_COMPARATOR = new Comparator<BitSet>() {
    public int compare(BitSet arg0, BitSet arg1) {
        // Cardinalities are non-negative, so the subtraction cannot overflow.
        int diff = arg0.cardinality() - arg1.cardinality();
        if (diff != 0) {
            return diff;
        }
        int i0 = arg0.nextSetBit(0);
        int i1 = arg1.nextSetBit(0);
        // Walk both sets in parallel until the indices differ or both are exhausted (-1).
        // The guard must be ">= 0": index 0 is a valid bit, and stopping on it would make sets
        // such as {0,1} and {0,2} compare as equal.
        while ((diff = i0 - i1) == 0 && i0 >= 0) {
            i0 = arg0.nextSetBit(i0 + 1);
            i1 = arg1.nextSetBit(i1 + 1);
        }
        return diff;
    }
};
/**
 * Produces a readable string for a set of scripts: the short name of each script, in ascending order
 * of script code, separated by single spaces.
 *
 * @param scripts a BitSet of UScript values
 * @return a readable string of a set of scripts; the empty string for an empty set
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static String displayScripts(BitSet scripts) {
    final StringBuilder names = new StringBuilder();
    int code = scripts.nextSetBit(0);
    while (code >= 0) {
        if (names.length() != 0) {
            names.append(' ');
        }
        names.append(UScript.getShortName(code));
        code = scripts.nextSetBit(code + 1);
    }
    return names.toString();
}
/**
 * Parses a text list of script names into a BitSet. Names are separated by whitespace, optionally
 * preceded by a comma (for example "Latn Grek" or "Latn, Grek"); leading and trailing whitespace is ignored.
 * NOTE(review): a name that UScript.getCodeFromName does not recognize presumably yields a negative
 * code, which BitSet.set would reject with an exception; confirm before relying on lenient input.
 *
 * @param scriptsString the string to be parsed
 * @return BitSet of UScript values.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static BitSet parseScripts(String scriptsString) {
    final BitSet codes = new BitSet();
    final String[] names = scriptsString.trim().split(",?\\s+");
    for (int index = 0; index < names.length; index++) {
        final String name = names[index];
        if (name.length() == 0) {
            continue; // produced by splitting an empty input
        }
        codes.set(UScript.getCodeFromName(name));
    }
    return codes;
}
/**
 * Parses a list of alternates into a set of sets of UScript values. Groups are separated by ';'
 * (surrounding whitespace is ignored) and each group is parsed with {@link #parseScripts(String)}.
 *
 * @param scriptsSetString a list of alternates, separated by ;
 * @return a set of BitSets of UScript values
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static Set<BitSet> parseAlternates(String scriptsSetString) {
    final Set<BitSet> groups = new HashSet<BitSet>();
    final String[] groupTexts = scriptsSetString.trim().split("\\s*;\\s*");
    for (int index = 0; index < groupTexts.length; index++) {
        final String groupText = groupTexts[index];
        if (groupText.length() == 0) {
            continue; // skip empty segments
        }
        groups.add(parseScripts(groupText));
    }
    return groups;
}
/**
 * Tests containment: whether every bit set in containee is also set in container.
 * Should be a method on BitSet.
 *
 * @param container possible container to be tested
 * @param containee possible containee to be tested
 * @return true if container contains containee; always true when containee is empty
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static final boolean contains(BitSet container, BitSet containee) {
    int bit = containee.nextSetBit(0);
    while (bit >= 0) {
        if (!container.get(bit)) {
            return false; // found a bit of containee that container lacks
        }
        bit = containee.nextSetBit(bit + 1);
    }
    return true;
}
/**
 * Sets a number of bits at once, modifying the given bitset in place. Should be on BitSet.
 *
 * @param bitset bitset to be affected
 * @param values indices of the bits to be set in the bitset
 * @return the same bitset instance that was passed in, after modification.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static final BitSet set(BitSet bitset, int... values) {
    final int count = values.length;
    for (int index = 0; index < count; index++) {
        bitset.set(values[index]);
    }
    return bitset;
}
// public static final class FreezableBitSet extends BitSet implements Freezable<FreezableBitSet> {
// private boolean frozen;
//
// public FreezableBitSet() {
// super();
// }
// public FreezableBitSet(int nbits) {
// super(nbits);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#and(java.util.BitSet)
// */
// @Override
// public void and(BitSet set) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.and(set);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#andNot(java.util.BitSet)
// */
// @Override
// public void andNot(BitSet set) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.andNot(set);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#cardinality()
// */
//
// @Override
// public void clear() {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.clear();
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#clear(int)
// */
// @Override
// public void clear(int bitIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.clear(bitIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#clear(int, int)
// */
// @Override
// public void clear(int fromIndex, int toIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.clear(fromIndex, toIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#clone()
// */
// @Override
// public Object clone() {
// return super.clone();
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#equals(java.lang.Object)
// */
// @Override
// public boolean equals(Object obj) {
// if (obj == null || obj.getClass() != FreezableBitSet.class) {
// return false;
// }
// return super.equals((BitSet)obj);
// }
//
// /* (non-Javadoc)
// * @see java.util.BitSet#flip(int)
// */
// @Override
// public void flip(int bitIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.flip(bitIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#flip(int, int)
// */
// @Override
// public void flip(int fromIndex, int toIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.flip(fromIndex, toIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#or(java.util.BitSet)
// */
// @Override
// public void or(BitSet set) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.or(set);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#set(int)
// */
// @Override
// public void set(int bitIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.set(bitIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#set(int, boolean)
// */
// @Override
// public void set(int bitIndex, boolean value) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.set(bitIndex, value);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#set(int, int)
// */
// @Override
// public void set(int fromIndex, int toIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.set(fromIndex, toIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#set(int, int, boolean)
// */
// @Override
// public void set(int fromIndex, int toIndex, boolean value) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.set(fromIndex, toIndex, value);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#xor(java.util.BitSet)
// */
// @Override
// public void xor(BitSet set) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.xor(set);
// }
// /* (non-Javadoc)
// * @see com.ibm.icu.util.Freezable#isFrozen()
// */
// public boolean isFrozen() {
// return frozen;
// }
// /* (non-Javadoc)
// * @see com.ibm.icu.util.Freezable#freeze()
// */
// public FreezableBitSet freeze() {
// frozen = true;
// return this;
// }
// /* (non-Javadoc)
// * @see com.ibm.icu.util.Freezable#cloneAsThawed()
// */
// public FreezableBitSet cloneAsThawed() {
// FreezableBitSet result = new FreezableBitSet(size());
// result.or(this);
// return result;
// }
// }
}