blob: 150a9db2868057bf370382c2b4b324dfc78a1a13 [file] [log] [blame]
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use:
import static;
import static;
import static;
import static java.util.Arrays.asList;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.logging.Logger;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.cldr.api.PathMatcher;
/**
 * Mapper for generating locale distance tables from CLDR language data.
 *
 * <p>Note that this is an atypical mapper which does a lot more processing than other
 * ICU mapper classes and relies on several auxiliary classes (which is why it's in a
 * different package). Conceptually it's still a "mapper" though, just not a simple one.
 *
 * <p>This mapper was converted from the LocaleDistanceBuilder code in the ICU4J project.
 */
public final class LocaleDistanceMapper {
private static final Logger logger = Logger.getLogger(LocaleDistanceMapper.class.getName());
// All the language matching data comes from the "written_new" language data in
// "common/supplemental/languageInfo.xml".
private static final PathMatcher WRITTEN_LANGUAGE_PREFIX =
// Definitions of region containment variables used when expressing match distances. E.g.:
// <matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/>
private static final PathMatcher VARIABLE_PATH =
private static final AttributeKey VARIABLE_ID = AttributeKey.keyOf("matchVariable", "id");
private static final AttributeKey VARIABLE_VALUE = AttributeKey.keyOf("matchVariable", "value");
// Language distance data, including wildcards and variable references (possibly negated). E.g.:
// <languageMatch desired="ja_Latn" supported="ja_Jpan" distance="5" oneway="true"/>
// <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/>
// <languageMatch desired="en_*_$!enUS" supported="en_*_GB" distance="3"/>
private static final PathMatcher LANGUAGE_MATCH_PATH =
private static final AttributeKey MATCH_DESIRED =
AttributeKey.keyOf("languageMatch", "desired");
private static final AttributeKey MATCH_SUPPORTED =
AttributeKey.keyOf("languageMatch", "supported");
private static final AttributeKey MATCH_DISTANCE =
AttributeKey.keyOf("languageMatch", "distance");
// Optional, assume false if not present.
private static final AttributeKey MATCH_ONEWAY =
AttributeKey.keyOf("languageMatch", "oneway");
// Singleton element containing the list of special case "paradigm" locales, which should
// always be preferred if there is a tie. E.g.:
// <paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/>
// Since there are no distinguishing attributes for this path, there can only be one
// instance which we can just lookup directly.
private static final CldrPath PARADIGM_LOCALES_PATH = CldrPath.parseDistinguishingPath(
private static final AttributeKey PARADIGM_LOCALES =
AttributeKey.keyOf("paradigmLocales", "locales");
// NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", "").
private static final Splitter LIST_SPLITTER =
Splitter.on(' ').trimResults().omitEmptyStrings();
// Output resource bundle paths, split into two basic groups for likely locale mappings
// and match data.
private static final RbPath LIKELY_LANGUAGES = RbPath.of("likely", "languageAliases");
private static final RbPath LIKELY_M49 = RbPath.of("likely", "m49");
private static final RbPath LIKELY_REGIONS = RbPath.of("likely", "regionAliases");
private static final RbPath LIKELY_TRIE = RbPath.of("likely", "trie:bin");
private static final RbPath LIKELY_LSRNUM = RbPath.of("likely", "lsrnum:intvector");
private static final RbPath MATCH_TRIE = RbPath.of("match", "trie:bin");
private static final RbPath MATCH_REGION_TO_PARTITIONS = RbPath.of("match", "regionToPartitions:bin");
private static final RbPath MATCH_PARTITIONS = RbPath.of("match", "partitions");
private static final RbPath MATCH_PARADIGMNUM = RbPath.of("match", "paradigmnum:intvector");
private static final RbPath MATCH_DISTANCES = RbPath.of("match", "distances:intvector");
// To split locale specifications (e.g. "ja_Latn" or "en_*_$!enUS").
private static final Splitter UNDERSCORE = Splitter.on('_');
// The encoding scheme only allows us to encode up to 27 M.49 codes in the list below.
// The size is checked later, when the M49 list is read.
private static final List<String> M49 = Arrays.asList("001", "143", "419");
/**
 * Processes data from the given supplier to generate locale matcher ICU data.
 *
 * @param src the CLDR data supplier to process.
 * @return the IcuData instance to be written to a file.
 */
public static IcuData process(CldrDataSupplier src) {
return process(src.getDataForType(SUPPLEMENTAL));
@VisibleForTesting // It's easier to supply a fake data instance than a fake supplier.
static IcuData process(CldrData data) {
IcuData icuData = new IcuData("langInfo", false);
if (M49.size() > 27) {
throw new IllegalStateException(
"The M49 list is too long. We can only encode up to 27 M49 codes.");
XLikelySubtags.Data likelyData =;
icuData.add(LIKELY_LANGUAGES, ofMapEntries(likelyData.languageAliases));
icuData.add(LIKELY_M49, RbValue.of(M49));
icuData.add(LIKELY_REGIONS, ofMapEntries(likelyData.regionAliases));
icuData.add(LIKELY_TRIE, ofBytes(likelyData.trie));
icuData.add(LIKELY_LSRNUM, ofLsrNum(asList(likelyData.lsrs)));
LocaleDistance.Data distanceData = buildDistanceData(data);
icuData.add(MATCH_TRIE, ofBytes(distanceData.trie));
icuData.add(MATCH_REGION_TO_PARTITIONS, ofBytes(distanceData.regionToPartitionsIndex));
icuData.add(MATCH_PARTITIONS, RbValue.of(distanceData.partitionArrays));
icuData.add(MATCH_PARADIGMNUM, ofLsrNum(distanceData.paradigmLSRs));
icuData.add(MATCH_DISTANCES, RbValue.of(;
return icuData;
/**
 * A simple holder for language, script and region which allows for wildcards (i.e. "*")
 * and variables to represent partitions of regions (e.g. "$enUS"). Minimal additional
 * validation is done on incoming fields as data is assumed to be correct.
 */
private static final class LsrSpec {
/**
 * Parse a raw specification string (e.g. "en", "ja_Latn", "*_*_*", "ar_*_$maghreb"
 * or "en_*_GB") into a structured spec. Note that if the specification string
 * contains a "bare" region (e.g. "en_*_GB") then it is registered as a variable in
 * the given RegionMapper builder, so the returned {@code LsrSpec} will be
 * {@code "en_*_$GB"}.
 */
public static LsrSpec parse(String rawSpec, PartitionInfo.Builder rmb) {
List<String> parts = UNDERSCORE.splitToList(rawSpec);
checkArgument(parts.size() <= 3, "invalid raw LSR specification: %s", rawSpec);
String language = parts.get(0);
Optional<String> script = parts.size() > 1 ? Optional.of(parts.get(1)) : Optional.empty();
// While parsing the region part, ensure any "bare" region subtags are converted
// to variables (e.g. "GB" -> "$GB") and registered with the parition map.
Optional<String> region =
parts.size() > 2 ? Optional.of(rmb.ensureVariable(parts.get(2))) : Optional.empty();
return new LsrSpec(language, script, region);
// A language subtag (e.g. "en") or "*".
private final String language;
// If present, a script subtag (e.g. "Latn") or "*".
private final Optional<String> script;
// If present, a registered variable with '$' prefix (e.g. "$foo" or "$GB") or "*".
private final Optional<String> regionVariable;
private LsrSpec(String language, Optional<String> script, Optional<String> regionVariable) {
this.language = language;
this.script = script;
this.regionVariable = regionVariable;
// Implementation shortcuts assume:
// - If the language subtags are '*', the other-level subtags must also be '*' (if present).
// If there are rules that do not fit these constraints, we need to revise the implementation.
if (isAny(language)) {
s -> checkArgument(isAny(s), "expected wildcard script, got: %s", script));
r -> checkArgument(isAny(r), "expected wildcard region, got: %s", regionVariable));
public String getLanguage() {
return language;
public String getScript() {
return script.orElseThrow(() -> new IllegalArgumentException("no script available: " + this));
public String getRegionVariable() {
return regionVariable.orElseThrow(() -> new IllegalArgumentException("no region available: " + this));
public int size() {
return regionVariable.isPresent() ? 3 : script.isPresent() ? 2 : 1;
public String toString() {
return language + -> "_" + s).orElse("") + -> "_" + r).orElse("");
/**
 * Represents a {@code <languageMatch>} rule derived from supplemental data, such as:
 * <pre>{@code
 * <languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/>
 * }</pre>
 * or:
 * <pre>{@code
 * <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/>
 * }</pre>
 *
 * <p>The job of a {@code Rule} is to provide a mechanism for capturing the data in
 * {@code <languageMatch>} elements and subsequently adding that information to a
 * {@link DistanceTable.Builder} in a structured way.
 */
private static final class LanguageMatchRule {
private final LsrSpec desired;
private final LsrSpec supported;
private final int distance;
private final boolean oneway;
/**
 * Creates a rule from a pair of parsed specifications.
 *
 * @param desired the spec from the "desired" attribute (must not be null).
 * @param supported the spec from the "supported" attribute (must not be null).
 * @param distance the match distance for this pair.
 * @param oneway whether the rule applies only in the desired-to-supported direction.
 * @throws IllegalArgumentException if the two specs differ in size, or if a subtag
 *     is a wildcard in one spec but not in the other.
 */
public LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway) {
    this.desired = checkNotNull(desired);
    this.supported = checkNotNull(supported);
    this.distance = distance;
    this.oneway = oneway;
    // Implementation shortcuts assume:
    // - At any level, either both or neither spec subtags are *.
    // If there are rules that do not fit these constraints, we need to revise the implementation.
    checkArgument(desired.size() == supported.size(),
        "mismatched rule specifications in: %s, %s", desired, supported);
    checkArgument(isAny(desired.language) == isAny(supported.language),
        "wildcard mismatch for languages in: %s, %s", desired, supported);
    checkArgument(isAny(desired.script) == isAny(supported.script),
        "wildcard mismatch for scripts in: %s, %s", desired, supported);
    // This check is about the region variables, so the message must say so (it used to
    // repeat the "languages" message, which pointed at the wrong subtag).
    checkArgument(isAny(desired.regionVariable) == isAny(supported.regionVariable),
        "wildcard mismatch for regions in: %s, %s", desired, supported);
}
// Returns the size of this rule, taken from the desired spec. The constructor rejects
// rules whose desired and supported specs differ in size, so either spec would give
// the same answer.
int size() {
return desired.size();
boolean isDefaultRule() {
// We already know that in LsrSpec, if the language is "*" then all subtags are too.
return isAny(desired.language);
/**
 * Adds this rule to the given distance table, using the given partition map to
 * resolve any region variables present in the desired or supported specs.
 */
void addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions) {
// Note that rather than using the rule's "size" to mediate the different
// cases, we could have had 3 distinct sub-types of a common rule API (e.g.
// "LanguageRule", "ScriptRule" and "RegionRule"), each with a different
// addTo() callback. However this would have been quite a lot more code
// for not much real gain.
switch (size()) {
case 1: // Language only.
distanceTable.addDistance(distance, oneway,
desired.getLanguage(), supported.getLanguage());
case 2: // Language and script present.
distanceTable.addDistance(distance, oneway,
desired.getLanguage(), supported.getLanguage(),
desired.getScript(), supported.getScript());
case 3: // Language, script and region variable present.
// Add the rule distance for every combination of desired/supported
// partition IDs for the region variables. This is important for
// variables like "$americas" which overlap with multiple partitions.
// Note that in this case (because region variables map to sets of
// partition IDs) we can get situations where "shouldReverse" is true,
// but the desired/supported pairs being passed in are identical (e.g.
// different region variables map to distinct partition groups which
// share some common elements).
// This is fine, providing that the distance table is going to ignore
// identical mappings (which it does). Alternatively we could just
// re-calculate "shouldReverse" inside this loop to account for partition
// IDs rather than region variables.
ImmutableSet<String> desiredPartitionIds =
ImmutableSet<String> supportedPartitionIds =
for (String desiredPartitionId : desiredPartitionIds) {
for (String supportedPartitionId : supportedPartitionIds) {
distanceTable.addDistance(distance, oneway,
desired.getLanguage(), supported.getLanguage(),
desired.getScript(), supported.getScript(),
desiredPartitionId, supportedPartitionId);
throw new IllegalStateException("invalid size for LsrSpec: " + this);
public String toString() {
return String.format(
"Rule{ desired=%s, supported=%s, distance=%d, oneway=%b }",
desired, supported, distance, oneway);
private static LocaleDistance.Data buildDistanceData(CldrData supplementalData) {
// Resolve any explicitly declared region variables into the partition map.
// Territory containment information is used to recursively resolve region
// variables (e.g. "$enUS") into a collection of non-macro regions.
PartitionInfo.Builder partitionBuilder =
supplementalData.accept(DTD, v -> {
CldrPath path = v.getPath();
if (VARIABLE_PATH.matches(path)) {
partitionBuilder.addVariableExpression(v.get(VARIABLE_ID), v.get(VARIABLE_VALUE));
// Parse the rules from <languageMatch> elements. Note that the <languageMatch>
// element is marked as "ORDERED" in the DTD, which means the elements always
// appear in the same order as in the CLDR XML file (even when using DTD order).
// This is one of the relatively rare situations in which using DTD order will
// not isolate the ICU data from reordering of the CLDR data. In particular this
// matters when specifying language matcher preferences (such as "en_*_GB" vs
// "en_*_!enUS").
// We could almost process the rules while reading them from the source data, but
// rules may contain region codes rather than variables, and we need to create a
// variable for each such region code before the RegionMapper is built, and
// before processing the rules (this happens when the LsrSpec is parsed).
List<LanguageMatchRule> rules = new ArrayList<>();
supplementalData.accept(DTD, v -> {
CldrPath path = v.getPath();
if (LANGUAGE_MATCH_PATH.matches(path)) {
int distance = Integer.parseInt(v.get(MATCH_DISTANCE));
// Lenient against there being no "oneway" attribute.
boolean oneway = "true".equalsIgnoreCase(v.get(MATCH_ONEWAY));
LsrSpec desired = LsrSpec.parse(v.get(MATCH_DESIRED), partitionBuilder);
LsrSpec supported = LsrSpec.parse(v.get(MATCH_SUPPORTED), partitionBuilder);
LanguageMatchRule rule = new LanguageMatchRule(desired, supported, distance, oneway);
logger.fine(() -> String.format("rule: %s", rule));
// Check that the rules are in the expected order. Rule order is important in ensuring
// data correctness and incorrect order may violate business logic assumptions later.
// TODO: Consider what other ordering/sanity checks make sense here.
for (int n = 0, prevSize = 1; n < rules.size(); n++) {
LanguageMatchRule rule = rules.get(n);
checkArgument(rule.size() >= prevSize, "<languageMatch> elements out of order at: %s", rule);
checkArgument(rule.size() == prevSize || (n > 0 && rules.get(n - 1).isDefaultRule()),
"missing default rule before: %s", rule);
prevSize = rule.size();
checkState( == rules.size(), "duplicated rule in: %s", rules);
// Build region partition data after all the variables have been accounted for
// (including the implicit variables found while processing LsrSpecs).
PartitionInfo partitions =;
// Add all the rules (in order) to the distance table.
DistanceTable.Builder distanceTableBuilder = DistanceTable.builder();
rules.forEach(r -> r.addTo(distanceTableBuilder, partitions));
DistanceTable distanceTable =;
// Note: Using LocaleDistance.Data as a fairly "dumb" container for the return values
// requires us to do slightly awkward things, like passing mutable arrays and LSR
// instances around, but the advantage it has is that this data structure is also what's
// used in client code, so if the likely subtags data changes, it will be a forcing
// function to change this code.
return new LocaleDistance.Data(
private static Set<LSR> getParadigmLsrs(CldrData supplementalData) {
// LinkedHashSet for stable order; otherwise a unit test is flaky.
CldrValue cldrValue = supplementalData.get(PARADIGM_LOCALES_PATH);
checkState(cldrValue != null,
"<paradigmLocales> element was missing: %s", PARADIGM_LOCALES_PATH);
String localesList = cldrValue.get(PARADIGM_LOCALES);
checkState(localesList != null,
"<paradigmLocales> 'locales' attribute was missing: %s", cldrValue);
Set<LSR> paradigmLSRs = new LinkedHashSet<>();
for (String paradigm : LIST_SPLITTER.split(localesList)) {
LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(new ULocale(paradigm), false);
// Clear the LSR flags to make the data equality test in LocaleDistanceTest happy.
paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS));
checkArgument(paradigmLSRs.size() % 2 == 0, "unpaired paradigm locales: %s", paradigmLSRs);
return paradigmLSRs;
// Returns an RbValue serialized from a map as a sequence of alternating (key, value)
// pairs (formatted as one pair per line in the IcuData file).
// E.g.
// foo{
// key1, value1,
// ...
// keyN, valueN,
// }
private static RbValue ofMapEntries(Map<String, String> map) {
return RbValue.of(
.flatMap(e -> Stream.of(e.getKey(), e.getValue()))
// Returns an RbValue serialized from a sequence of LSR instances as a sequence of numbers
// representing (language, region, script) tuples (formatted as one number per line in the IcuData file).
private static RbValue ofLsrNum(Collection<LSR> lsrs) {
return RbValue.of(
.flatMapToInt(lsr -> IntStream.of(LSRToNum(lsr)))
// This method is added only to support encodeToIntForResource()
// It only support [a-z]{2,3} and will not work for other cases.
// TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
static private int encodeLanguageToInt(String language) {
assert language.length() >= 2;
assert language.length() <= 3;
assert language.charAt(0) >= 'a';
assert language.charAt(0) <= 'z';
assert language.charAt(1) >= 'a';
assert language.charAt(1) <= 'z';
assert language.length() == 2 || language.charAt(2) >= 'a';
assert language.length() == 2 || language.charAt(2) <= 'z';
return language.charAt(0) - 'a' + 1 +
27 * (language.charAt(1) - 'a' + 1) +
((language.length() == 2) ? 0 : 27 * 27 * (language.charAt(2) - 'a' + 1));
// This method is added only to support encodeToIntForResource()
// It only support [A-Z][a-z]{3} which defined in UScript and does not work for other cases.
// TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
static private int encodeScriptToInt(String script) {
int ret = UScript.getCodeFromName(script);
assert ret != UScript.INVALID_CODE;
return ret;
// This method is added only to support encodeToIntForResource()
// It only support [A-Z]{2}|001|143|419 and does not work for other cases.
// TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
static private int encodeRegionToInt(String region, List<String> m49) {
assert region.length() >= 2;
assert region.length() <= 3;
// Do not have enough bits to store the all 1000 possible combination of \d{3}
// Only support what is in M49.
if (region.length() == 3) {
int index = m49.indexOf(region);
assert index >= 0;
if (index < 0) {
throw new IllegalStateException(
"Please add '" + region + "' to M49 in");
return index;
assert region.charAt(0) >= 'A';
assert region.charAt(0) <= 'Z';
assert region.charAt(1) >= 'A';
assert region.charAt(1) <= 'Z';
// 'AA' => 1+27*1 = 28
// ...
// 'AZ' => 1+27*26 = 703
// 'BA' => 2+27*1 = 29
// ...
// 'IN' => 9+27*14 = 387
// 'ZZ' => 26+27*26 = 728
return (region.charAt(0) - 'A' + 1) + 27 * (region.charAt(1) - 'A' + 1);
// This is designed to only support encoding some LSR into resources but not for other cases.
// TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
static int encodeToIntForResource(LSR lsr) {
return (encodeLanguageToInt(lsr.language) + (27*27*27) * encodeRegionToInt(lsr.region, M49)) |
(encodeScriptToInt(lsr.script) << 24);
private static int LSRToNum(LSR lsr) {
// Special number for "", "", "" return 0
if (lsr.language.isEmpty() && lsr.script.isEmpty() && lsr.region.isEmpty()) {
return 0;
// Special number for "skip", "script", "" return 1
if (lsr.language.equals("skip") && lsr.script.equals("script") && lsr.region.isEmpty()) {
return 1;
// TODO(ftang) Change to the following line after LSR.encodeToIntForResource is available to the tool.
// return lsr.encodeToIntForResource();
return encodeToIntForResource(lsr);
// Returns an RbValue serialized from a byte array, as a concatenated sequence of rows of
// hex values. This is intended only for RbPaths using the ":bin" suffix.
// E.g.
// foo{
// 0123456789abcdef0123456789abcdef
// ...
// 1c0de4c0ffee
// }
// Note that typically no indentation is used when writing this binary "blob".
private static RbValue ofBytes(byte[] data) {
ImmutableList.Builder<String> hexValues = ImmutableList.builder();
List<Byte> bytes = Bytes.asList(data);
for (List<Byte> line : Iterables.partition(bytes, 16)) {
hexValues.add( -> String.format("%02x", b)).collect(Collectors.joining()));
return RbValue.of(;
// Returns if the subtag is the '*' wildcard. This is not to be confused with the
// "ANY" character used in DistanceTable.
private static boolean isAny(String subtag) {
return subtag.equals("*");
// Returns if the subtag exists and is the '*' wildcard.
private static boolean isAny(Optional<String> subtag) {
// Main method for running this mapper directly with logging enabled.
// CLDR_DIR is picked up from system properties or environment variables.
// Arguments: <output-file> [<log-level>]
// NOTE(review): process() is overloaded in this class; which overload the method
// reference below binds to depends on the parameter type of
// DebugWriter.writeForDebugging, which is not visible here.
public static void main(String[] args) throws IOException {
DebugWriter.writeForDebugging(args, LocaleDistanceMapper::process);