// blob: b069b943851962d3f971aaef6844ec1ca6ddd257 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2015-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl.locale;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import com.ibm.icu.impl.ValidIdentifiers;
import com.ibm.icu.impl.ValidIdentifiers.Datasubtype;
import com.ibm.icu.impl.ValidIdentifiers.Datatype;
import com.ibm.icu.util.IllformedLocaleException;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
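/**
 * Checks whether the fields of a {@link ULocale} (language, script, region, variants, and the
 * -t-, -u- and -x- extensions) are valid according to {@link ValidIdentifiers}, restricted to a
 * configured set of {@link Datasubtype}s.
 *
 * @author markdavis
 */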
public class LocaleValidityChecker {
/** Data subtypes a code may belong to in order to be accepted; a private copy made by the constructors. */
private final Set<Datasubtype> datasubtypes;
/** True when the configured subtypes include {@code Datasubtype.deprecated}; when false, deprecated -u- keys and types are rejected. */
private final boolean allowsDeprecated;
/**
 * Mutable out-parameter that records the field and code that failed validation.
 * Both fields are {@code null} while no failure is recorded.
 */
public static class Where {
    /** Datatype of the field that failed, or {@code null} if no failure is recorded. */
    public Datatype fieldFailure;
    /** The code that failed, or {@code null} if no failure is recorded. */
    public String codeFailure;

    /**
     * Stores the given failure location, overwriting any previous one. Passing {@code null}
     * for both arguments clears the recorded failure.
     *
     * @param datatype datatype of the failing field
     * @param code the failing code
     * @return always {@code false}, so that callers can write {@code return where.set(...)}
     */
    public boolean set(Datatype datatype, String code) {
        this.fieldFailure = datatype;
        this.codeFailure = code;
        return false;
    }

    /**
     * @return {@code "OK"} when no failure is recorded, otherwise {@code "{field, code}"}
     */
    @Override
    public String toString() {
        if (fieldFailure == null) {
            return "OK";
        }
        return "{" + fieldFailure + ", " + codeFailure + "}";
    }
}
public LocaleValidityChecker(Set<Datasubtype> datasubtypes) {
this.datasubtypes = EnumSet.copyOf(datasubtypes);
allowsDeprecated = datasubtypes.contains(Datasubtype.deprecated);
}
public LocaleValidityChecker(Datasubtype... datasubtypes) {
this.datasubtypes = EnumSet.copyOf(Arrays.asList(datasubtypes));
allowsDeprecated = this.datasubtypes.contains(Datasubtype.deprecated);
}
/**
 * Returns the data subtypes this checker accepts.
 *
 * @return a new copy of the accepted subtypes; modifying it does not affect this checker
 */
public Set<Datasubtype> getDatasubtypes() {
    final Set<Datasubtype> copy = EnumSet.copyOf(datasubtypes);
    return copy;
}
/** Matches a single subtag separator, either '-' or '_'. Compiled once and never reassigned. */
static final Pattern SEPARATOR = Pattern.compile("[-_]");
/**
 * Checks the language, script, region, variants and extensions of a locale, in that order,
 * stopping at the first invalid field.
 *
 * @param locale the locale to check
 * @param where receives the datatype and code of the first failure; it is reset on entry and
 *        must not be {@code null}
 * @return {@code true} if no invalid field was found
 */
public boolean isValid(ULocale locale, Where where) {
    where.set(null, null);

    final String language = locale.getLanguage();
    if (!isValid(Datatype.language, language, where)) {
        if (!language.equals("x")) {
            return false;
        }
        // special case x: accept it and clear the failure recorded by the check above
        where.set(null, null);
        // TODO check syntax is ok, only alphanum{1,8}
        return true;
    }
    if (!isValid(Datatype.script, locale.getScript(), where)) {
        return false;
    }
    if (!isValid(Datatype.region, locale.getCountry(), where)) {
        return false;
    }

    final String variants = locale.getVariant();
    if (!variants.isEmpty()) {
        for (String variant : SEPARATOR.split(variants)) {
            if (!isValid(Datatype.variant, variant, where)) {
                return false;
            }
        }
    }

    for (Character extensionKey : locale.getExtensionKeys()) {
        try {
            // valueOf throws for any extension letter that is not a Datatype constant
            final Datatype datatype = Datatype.valueOf(extensionKey + "");
            if (datatype == Datatype.x) {
                // TODO : check that the rest is syntactic
                return true;
            } else if (datatype == Datatype.t) {
                if (!isValidT(locale.getExtension(extensionKey), where)) {
                    return false;
                }
            } else if (datatype == Datatype.u) {
                if (!isValidU(locale, locale.getExtension(extensionKey), where)) {
                    return false;
                }
            }
        } catch (Exception e) {
            // Any exception raised while handling this extension is reported against its key
            return where.set(Datatype.illegal, extensionKey + "");
        }
    }
    return true;
}
/**
 * Selects how the type subtags of a -u- key are validated.
 * <ul>
 * <li>{@code normal} - no special handling</li>
 * <li>{@code anything} - used for key "x0"</li>
 * <li>{@code reorder} - used for key "kr"</li>
 * <li>{@code codepoints} - used for key "vt"</li>
 * <li>{@code subdivision} - used for key "sd"</li>
 * </ul>
 */
enum SpecialCase {
    normal, anything, reorder, codepoints, subdivision;

    /**
     * Maps a key to its special case.
     *
     * @param key the key to look up; compared case-sensitively, must not be {@code null}
     * @return the special case for {@code key}, or {@code normal} if it has none
     */
    static SpecialCase get(String key) {
        if (key.equals("kr")) {
            return reorder;
        }
        if (key.equals("vt")) {
            return codepoints;
        }
        if (key.equals("sd")) {
            return subdivision;
        }
        if (key.equals("x0")) {
            return anything;
        }
        return normal;
    }
}
/**
 * Validates the contents of a -u- extension. The extension is split on '-' and '_'; every
 * two-character subtag starts a new key, and every other subtag is a type belonging to the
 * most recent key.
 *
 * @param locale the locale the extension belongs to; used for the subdivision ("sd") check
 * @param extensionString the extension contents, without the leading "u" singleton
 * @param where receives {@code Datatype.u} and the failing key, subtag or "key-type" on failure
 * @return {@code true} if every key and type is accepted
 */
private boolean isValidU(ULocale locale, String extensionString, Where where) {
    String key = "";
    int typeCount = 0;
    ValueType valueType = null;
    SpecialCase specialCase = null;
    StringBuilder prefix = new StringBuilder();
    Set<String> seen = new HashSet<String>();
    // TODO: is empty -u- valid?
    for (String subtag : SEPARATOR.split(extensionString)) {
        if (subtag.length() == 2) {
            // A two-character subtag is a key: canonicalize it and reset the per-key state.
            key = KeyTypeData.toBcpKey(subtag);
            if (key == null) {
                return where.set(Datatype.u, subtag);
            }
            if (!allowsDeprecated && KeyTypeData.isDeprecated(key)) {
                return where.set(Datatype.u, key);
            }
            valueType = ValueType.get(key);
            specialCase = SpecialCase.get(key);
            typeCount = 0;
        } else {
            if (valueType == null) {
                // A type subtag (or an empty extension) appeared before any key. Report the
                // subtag explicitly instead of hitting a NullPointerException in the
                // switches below.
                return where.set(Datatype.u, subtag);
            }
            ++typeCount;
            switch (valueType) {
            case single:
                // Only one type subtag is allowed for this key.
                if (typeCount > 1) {
                    return where.set(Datatype.u, key+"-"+subtag);
                }
                break;
            case incremental:
                // Types are built up from successive subtags (e.g. "islamic", then
                // "islamic-umalqura"); each accumulated prefix is validated in turn.
                if (typeCount == 1) {
                    prefix.setLength(0);
                    prefix.append(subtag);
                } else {
                    prefix.append('-').append(subtag);
                    subtag = prefix.toString();
                }
                break;
            case multiple:
                // Start tracking duplicates afresh for each key.
                if (typeCount == 1) {
                    seen.clear();
                }
                break;
            }
            switch (specialCase) {
            case anything:
                continue;
            case codepoints:
                // Must parse as a hexadecimal number no greater than 0x10FFFF.
                try {
                    if (Integer.parseInt(subtag,16) > 0x10FFFF) {
                        return where.set(Datatype.u, key+"-"+subtag);
                    }
                } catch (NumberFormatException e) {
                    return where.set(Datatype.u, key+"-"+subtag);
                }
                continue;
            case reorder:
                // "zzzz" is recorded as "others" so the two count as duplicates of each other.
                boolean newlyAdded = seen.add(subtag.equals("zzzz") ? "others" : subtag);
                if (!newlyAdded || !isScriptReorder(subtag)) {
                    return where.set(Datatype.u, key+"-"+subtag);
                }
                continue;
            case subdivision:
                if (!isSubdivision(locale, subtag)) {
                    return where.set(Datatype.u, key+"-"+subtag);
                }
                continue;
            }
            // en-u-sd-usca
            // en-US-u-sd-usca
            Output<Boolean> isKnownKey = new Output<Boolean>();
            Output<Boolean> isSpecialType = new Output<Boolean>();
            String type = KeyTypeData.toBcpType(key, subtag, isKnownKey, isSpecialType);
            if (type == null) {
                return where.set(Datatype.u, key+"-"+subtag);
            }
            if (!allowsDeprecated && KeyTypeData.isDeprecated(key, subtag)) {
                return where.set(Datatype.u, key+"-"+subtag);
            }
        }
    }
    return true;
}
/**
 * Checks a subdivision type such as "usca": it must name a valid subdivision, and its region
 * prefix must match the region of the locale.
 *
 * @param locale the locale whose region the subdivision must belong to; if it has no region,
 *        the region produced by {@link ULocale#addLikelySubtags(ULocale)} is used
 * @param subtag the subdivision subtag: a region prefix followed by the subdivision code
 * @return {@code true} if the subtag is a valid subdivision of the locale's region
 */
private boolean isSubdivision(ULocale locale, String subtag) {
    // First check if the subtag is valid
    if (subtag.length() < 3) {
        return false;
    }
    // The region prefix is three characters when the subtag starts with a character <= '9'
    // (a digit), otherwise two.
    final int regionLength = subtag.charAt(0) <= '9' ? 3 : 2;
    final String region = subtag.substring(0, regionLength);
    final String subdivision = subtag.substring(regionLength);
    if (ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, region, subdivision) == null) {
        return false;
    }

    // Then check for consistency with the locale's region
    String localeRegion = locale.getCountry();
    if (localeRegion.isEmpty()) {
        localeRegion = ULocale.addLikelySubtags(locale).getCountry();
    }
    return region.equalsIgnoreCase(localeRegion);
}
/** Reorder codes that are accepted without consulting the script data. */
static final Set<String> REORDERING_INCLUDE = new HashSet<String>(Arrays.asList(
        "space", "punct", "symbol", "currency", "digit", "others", "zzzz"));
/** Script codes that are never accepted as reorder codes. */
static final Set<String> REORDERING_EXCLUDE = new HashSet<String>(Arrays.asList(
        "zinh", "zyyy"));
/** Subtype filter used when a reorder code is looked up as a script. */
static final Set<Datasubtype> REGULAR_ONLY = EnumSet.of(Datasubtype.regular);

/**
 * Checks whether a subtag is an acceptable collation reorder code. The subtag is lowercased
 * first. Accepted values are:
 * <ul>
 * <li>space, punct, symbol, currency, digit - core groups of characters below 'a';</li>
 * <li>any script code except Common and Inherited
 *     (sc ; Zinh ; Inherited ; Qaai and sc ; Zyyy ; Common). Some pairs of scripts sort
 *     primary-equal and always reorder together. For example, Katakana characters are always
 *     reordered with Hiragana;</li>
 * <li>others - where all codes not explicitly mentioned should be ordered. The script code
 *     Zzzz (Unknown Script) is a synonym for others.</li>
 * </ul>
 *
 * @param subtag the reorder code to check
 * @return {@code true} if the subtag is an accepted reorder code
 */
private boolean isScriptReorder(String subtag) {
    final String code = AsciiUtil.toLowerString(subtag);
    if (REORDERING_INCLUDE.contains(code)) {
        return true;
    }
    if (REORDERING_EXCLUDE.contains(code)) {
        return false;
    }
    // Everything else must be a script code of the regular subtype.
    return ValidIdentifiers.isValid(Datatype.script, REGULAR_ONLY, code) != null;
}
/**
 * Validates the contents of a -t- extension by parsing it as a language tag and validating
 * the resulting locale with {@link #isValid(ULocale, Where)}.
 *
 * @param extensionString the extension contents, without the leading "t" singleton
 * @param where receives the failure. If the string cannot be parsed as a language tag, it is
 *        set to {@code Datatype.t} plus the subtag at the reported error position; for any
 *        other exception, to {@code Datatype.t} plus the exception message
 * @return {@code true} if the extension parses and the parsed locale is valid
 */
private boolean isValidT(String extensionString, Where where) {
    // TODO: is empty -t- valid?
    // TODO stop at first tag ([a-z][0-9]) and check their validity separately
    try {
        ULocale locale = new ULocale.Builder().setLanguageTag(extensionString).build();
        return isValid(locale, where);
    } catch (IllformedLocaleException e) {
        // getErrorIndex() is negative when the position is unknown or not applicable; fall
        // back to the start of the string so substring() cannot throw from this handler.
        int startIndex = e.getErrorIndex();
        if (startIndex < 0 || startIndex > extensionString.length()) {
            startIndex = 0;
        }
        String remainder = extensionString.substring(startIndex);
        // split() drops trailing empty strings, so a remainder made only of separators
        // yields an empty array; report the remainder itself in that case.
        String[] list = SEPARATOR.split(remainder);
        return where.set(Datatype.t, list.length > 0 ? list[0] : remainder);
    } catch (Exception e) {
        return where.set(Datatype.t, e.getMessage());
    }
}
/**
 * Checks a single code against the valid identifiers of the given datatype, limited to the
 * configured data subtypes. The language code "root" (any case) and empty codes are always
 * accepted.
 *
 * @param datatype the kind of code being checked
 * @param code the code to check; must not be {@code null}
 * @param where receives the datatype and code on failure; may be {@code null}, in which case
 *        nothing is recorded
 * @return {@code true} if the code is accepted
 */
private boolean isValid(Datatype datatype, String code, Where where) {
    if (datatype == Datatype.language && code.equalsIgnoreCase("root")) {
        return true;
    }
    if (code.isEmpty()) {
        return true;
    }
    if (ValidIdentifiers.isValid(datatype, datasubtypes, code) != null) {
        return true;
    }
    if (where == null) {
        return false;
    }
    return where.set(datatype, code);
}
/**
 * Describes how many type subtags a key may take.
 * <ul>
 * <li>{@code single} - the default for keys not listed below</li>
 * <li>{@code multiple} - keys "x0", "kr" and "vt"</li>
 * <li>{@code incremental} - key "ca"</li>
 * </ul>
 */
public enum ValueType {
    single, multiple, incremental;

    /** Keys classified as {@code multiple}. */
    private static final Set<String> MULTIPLE_KEYS =
            new HashSet<String>(Arrays.asList("x0", "kr", "vt"));
    /** Keys classified as {@code incremental}. */
    private static final Set<String> INCREMENTAL_KEYS =
            new HashSet<String>(Arrays.asList("ca"));

    /**
     * Classifies a key.
     *
     * @param key the key to classify
     * @return {@code multiple} or {@code incremental} for the listed keys, otherwise
     *         {@code single}
     */
    static ValueType get(String key) {
        if (MULTIPLE_KEYS.contains(key)) {
            return multiple;
        }
        return INCREMENTAL_KEYS.contains(key) ? incremental : single;
    }
}
/*
 * Notes on keys whose type is not a single subtag (see ValueType).
 *
 * Type: any, multiple values
 *   {"OK", "en-t-x0-SPECIAL"}
 *   {"OK", "en-u-kr-REORDER_CODE"}, // Collation reorder codes; one or more collation reorder codes, see LDML Part 5: Collation
 *   {"OK", "en-u-vt-CODEPOINTS"}, // Deprecated collation parameter key for variable top; the variable top (one or more Unicode code points: LDML Appendix Q)
 *
 * Multiple values, specific sequences
 *   <type name="islamic-umalqura" description="Islamic calendar, Umm al-Qura" since="24"/>
 */
}