blob: 8be6af89b53a31ece235443f98b1e243fe46e48d [file] [log] [blame]
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.Ascii.toLowerCase;
import static com.google.common.base.Preconditions.checkState;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataType.BCP47;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Ascii;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
/**
* A mapper to collect BCP-47 data from {@link CldrDataType#BCP47 BCP47} data under paths
* matching:
* <pre>{@code
* //ldmlBCP47/keyword/key[@name=*]/type[@name=*]
* }</pre>
*/
public final class Bcp47Mapper {
// Other attributes (e.g. "alias") are value attributes and don't need to be matched here.
private static final AttributeKey KEY_NAME = keyOf("key", "name");
private static final AttributeKey KEY_ALIAS = keyOf("key", "alias");
private static final AttributeKey KEY_VALUE_TYPE = keyOf("key", "valueType");
private static final AttributeKey TYPE_NAME = keyOf("type", "name");
private static final AttributeKey TYPE_ALIASES = keyOf("type", "alias");
private static final AttributeKey PREFERRED_TYPE_NAME = keyOf("type", "preferred");
// Deprecation of the data is not the same as deprecation of attributes themselves. This
// deprecation relates to identifying data which exists, but is not longer the right way to
// represent things (which means it can be important for clients to know about).
private static final AttributeKey KEY_DEPRECATED = keyOf("key", "deprecated");
private static final AttributeKey TYPE_DEPRECATED = keyOf("type", "deprecated");
// Attributes that can be emitted under the /keyInfo or /typeInfo paths for auxiliary
// information in the ICU data. If the value is equal to the declared default, it is ignored.
// NOTE: The need for hard-coded default values is a hack because there's not nice way (yet)
// to determine the default for implicit values via the DTD. Ideally this would be automatic
// and the AttributeKey class would be able to have a method like "isDefault(String value)".
private static final ImmutableMap<AttributeKey, String> INFO_ATTRIBUTES =
ImmutableMap.of(KEY_VALUE_TYPE, "", KEY_DEPRECATED, "false", TYPE_DEPRECATED, "false");
private static final RbPath RB_KEYMAP = RbPath.of("keyMap");
private static final RbPath RB_TYPE_ALIAS = RbPath.of("typeAlias", "timezone:alias");
private static final RbPath RB_MAP_ALIAS = RbPath.of("typeMap", "timezone:alias");
private static final RbPath RB_BCP_ALIAS = RbPath.of("bcpTypeAlias", "tz:alias");
private static final CldrDataProcessor<Bcp47Mapper> BCP47_PROCESSOR;
static {
CldrDataProcessor.Builder<Bcp47Mapper> processor = CldrDataProcessor.builder();
processor
.addAction("//ldmlBCP47/keyword/key[@name=*]", (m, p) -> m.new ValueCollector(p))
.addValueAction("type[@name=*]", ValueCollector::collect);
BCP47_PROCESSOR = processor.build();
}
/**
* Processes data from the given supplier to generate Timezone and BCP-47 ICU data.
*
* @param src the CLDR data supplier to process.
* @return A list of IcuData instances containing BCP-47 data to be written to files.
*/
public static ImmutableList<IcuData> process(CldrDataSupplier src) {
return process(src.getDataForType(BCP47));
}
@VisibleForTesting // It's easier to supply a fake data instance than a fake supplier.
static ImmutableList<IcuData> process(CldrData cldrData) {
Bcp47Mapper mapper = BCP47_PROCESSOR.process(cldrData, new Bcp47Mapper(), DTD);
mapper.addKeyMapValues();
return ImmutableList.of(mapper.keyTypeData, mapper.tzData);
}
// Outer visitor which handles "key" paths by installing sub-visitor methods to process
// each child "type" element. Depending on the key name, values are stored in different
// IcuData instances.
private final IcuData tzData = new IcuData("timezoneTypes", false);
private final IcuData keyTypeData = new IcuData("keyTypeData", false);
// A map collecting each key and values as they are visited.
// TODO: Convert this to a Map<RbPath, String> which involves removing the '@' prefix hack.
private Map<String, String> keyMap = new LinkedHashMap<>();
private Bcp47Mapper() { }
// Post processing to add additional captured attribute values and some special cases.
private void addKeyMapValues() {
IcuData keyData = keyTypeData;
// Add all the keyMap values into the IcuData file.
for (Entry<String, String> kmData : keyMap.entrySet()) {
String bcpKey = kmData.getKey();
String key = kmData.getValue();
if (bcpKey.startsWith("@")) {
// Undoing the weird hack in addInfoAttributes(). This can be done better.
// We use "parse()" because these are full paths, and not single elements.
keyData.add(RbPath.parse(bcpKey.substring(1)), key);
continue;
}
if (bcpKey.equals(key)) {
// An empty value indicates that the BCP47 key is same as the legacy key.
bcpKey = "";
}
keyData.add(RB_KEYMAP.extendBy(key), bcpKey);
}
// Add aliases for timezone data.
keyData.add(RB_TYPE_ALIAS, "/ICUDATA/timezoneTypes/typeAlias/timezone");
keyData.add(RB_MAP_ALIAS, "/ICUDATA/timezoneTypes/typeMap/timezone");
keyData.add(RB_BCP_ALIAS, "/ICUDATA/timezoneTypes/bcpTypeAlias/tz");
}
private final class ValueCollector {
private final String keyName;
// Mutable data to be written into (differs depending on the key name).
private final IcuData icuData;
ValueCollector(CldrPath prefix) {
this.keyName = Ascii.toLowerCase(KEY_NAME.valueFrom(prefix));
this.icuData = keyName.equals("tz") ? tzData : keyTypeData;
}
private void collect(CldrValue value) {
String typeName = TYPE_NAME.valueFrom(value);
// Note that if a "preferred" type exists, we treat the value specially and add
// it only as an alias. We expected values with a preferred replacement to
// always be explicitly deprecated.
Optional<String> prefName = PREFERRED_TYPE_NAME.optionalValueFrom(value);
if (prefName.isPresent()) {
checkState(KEY_DEPRECATED.booleanValueFrom(value, false)
|| TYPE_DEPRECATED.booleanValueFrom(value, false),
"unexpected 'preferred' attribute for non-deprecated value: %s", value);
icuData.add(RbPath.of("bcpTypeAlias", keyName, typeName), prefName.get());
return;
}
// Note: There are some deprecated values which don't have a preferred
// replacement and these will be processed below (in particular we need to emit
// the fact that they are deprecated).
// Not all key elements have an alias. E.g. in calendar.xml:
// <key name="fw" description="First day of week" since="28">
// But we still add it as a alias to itself (which is later turned into a path with
// an empty value).
String keyAlias = toLowerCase(KEY_ALIAS.valueFrom(value, keyName));
keyMap.put(keyName, keyAlias);
RbPath typeMapPrefix = RbPath.of("typeMap", keyAlias);
List<String> typeAliases = TYPE_ALIASES.listOfValuesFrom(value);
if (typeAliases.isEmpty()) {
// Generate type map entry using empty value (an empty value indicates same
// type name is used for both BCP47 and legacy type).
icuData.add(typeMapPrefix.extendBy(typeName), "");
} else {
String mainAlias = typeAliases.get(0);
icuData.add(typeMapPrefix.extendBy(quoteAlias(mainAlias)), typeName);
// Put additional aliases as secondary aliases referencing the main alias.
RbPath typeAliasPrefix = RbPath.of("typeAlias", keyAlias);
typeAliases.stream()
.skip(1)
.map(Bcp47Mapper::quoteAlias)
.forEach(a -> icuData.add(typeAliasPrefix.extendBy(a), mainAlias));
}
addInfoAttributes(keyName, typeName, value.getValueAttributes());
}
// Add any additional attributes present to the attribute map. Note that this code was
// copied from largely undocumented code, and the precise reasoning for why this is
// needed or why it's done this way is not completely clear. It is very likely that it
// can be simplified.
//
// The '@' symbol added here is just a magic token that gets stripped off again in the
// addKeyMapValues() method, it appears to just be a way to distinguish keys added via
// this method vs during the collect method. A better approach might just be to have two
// maps.
// TODO: Remove the use of '@' and simplify the logic for "info" attributes (infoMap?).
private void addInfoAttributes(
String keyName, String typeName, ImmutableMap<AttributeKey, String> attributes) {
// Only emit deprecation for the "key" level, even if all types below that are also
// marked as deprecated. Only do this for a subset of attributes (INFO_ATTRIBUTES).
Set<AttributeKey> keys =
Sets.intersection(attributes.keySet(), INFO_ATTRIBUTES.keySet());
for (AttributeKey a : keys) {
String value = attributes.get(a);
// Skip empty or default values in attributes.
if (value.isEmpty() || INFO_ATTRIBUTES.get(a).equals(value)) {
continue;
}
// The ID for the xxxInfo paths in ICU is the path fragment at which the
// attribute exists. Since we only process complete paths here, we must do a
// bit of reconstruction based on the element name of the attribute we are
// processing. This relies on explicit knowledge that the paths are "<key>" or
// "<key>/<type>". This all gets less messy if we switch to RbPath.
String id =
a.getElementName().equals("key") ? keyName : keyName + "/" + typeName;
keyMap.put(
"@" + a.getElementName() + "Info/" + a.getAttributeName() + "/" + id,
value);
}
}
}
/**
* Escapes alias values containing '/' so they can appear in resource bundle paths. This
* function replaces '/' with ':' and quotes the result (e.g. foo/bar -> "foo:bar").
*
* <p>This is needed for timezone "metazone" ID strings which are of the form 'Foo/Bar'
* in the CLDR data.
*/
// TODO: Switch to RbPath and do quoting automatically when ICU data is written out.
private static String quoteAlias(String str) {
return str.indexOf('/') == -1 ? str : '"' + str.replace('/', ':') + '"';
}
}