// blob: a016cd238de1c91f080e2c36ee1e7e208da03c13 [file] [log] [blame]
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
import java.util.Optional;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.escape.UnicodeEscaper;
/**
* A mapper to collect break-iterator data from {@link CldrDataType#LDML LDML} data under
* paths matching:
* <pre>{@code
* //ldml/segmentations/segmentation/suppressions/suppression
* //ldml/special/icu:breakIteratorData/...
* }</pre>
*/
// TODO: This class can almost certainly be replace with a small RegexTransformer config.
public final class BreakIteratorMapper {
// The "type" attribute in /suppressions/ is not required so cannot be in the matcher. And
// its default (and only) value is "standard".
// TODO: Understand and document why this is the case.
private static final PathMatcher SUPPRESSION = PathMatcher.of(
"ldml/segmentations/segmentation[@type=*]/suppressions/suppression");
private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type");
// Note: This could be done with an intermediate matcher for
// "ldml/special/icu:breakIteratorData" but there are so few "special" values it's not worth it
private static final PathMatcher BOUNDARIES =
PathMatcher.of("ldml/special/icu:breakIteratorData/icu:boundaries/*");
private static final PathMatcher DICTIONARY =
PathMatcher.of("ldml/special/icu:breakIteratorData/icu:dictionaries/icu:dictionary");
private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency");
private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type");
/**
* Processes data from the given supplier to generate break-iterator data for a set of locale
* IDs.
*
* @param localeId the locale ID to generate data for.
* @param src the CLDR data supplier to process.
* @param icuSpecialData additional ICU data (in the "icu:" namespace)
* @return IcuData containing break-iterator data for the given locale ID.
*/
public static IcuData process(
String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
CldrData cldrData = src.getDataForLocale(localeId, UNRESOLVED);
return process(localeId, cldrData, icuSpecialData);
}
@VisibleForTesting // It's easier to supply a fake data instance than a fake supplier.
static IcuData process(String localeId, CldrData cldrData, Optional<CldrData> icuSpecialData) {
BreakIteratorMapper mapper = new BreakIteratorMapper(localeId);
icuSpecialData.ifPresent(s -> s.accept(ARBITRARY, mapper::addSpecials));
cldrData.accept(DTD, mapper::addSuppression);
return mapper.icuData;
}
// The per-locale ICU data being collected by this visitor.
private final IcuData icuData;
private BreakIteratorMapper(String localeId) {
this.icuData = new IcuData(localeId, true);
}
private void addSuppression(CldrValue v) {
if (SUPPRESSION.matches(v.getPath())) {
String type = SEGMENTATION_TYPE.valueFrom(v);
// TODO: Understand and document why we escape values here, but not for collation data.
icuData.add(
RbPath.of("exceptions", type + ":array"),
ESCAPE_NON_ASCII.escape(v.getValue()));
}
}
private void addSpecials(CldrValue v) {
CldrPath p = v.getPath();
if (BOUNDARIES.matches(p)) {
addDependency(
getDependencyName(v),
getBoundaryType(v),
getBoundaryDependency(v));
} else if (DICTIONARY.matches(p)) {
addDependency(
getDependencyName(v),
DICTIONARY_TYPE.valueFrom(v),
DICTIONARY_DEP.optionalValueFrom(v));
}
}
private void addDependency(String name, String type, Optional<String> dependency) {
icuData.add(
RbPath.of(name, type + ":process(dependency)"),
dependency.orElseThrow(() -> new IllegalArgumentException("missing dependency")));
}
// Must match the BOUNDARIES or DICTIONARY path.
private static String getDependencyName(CldrValue value) {
return stripXmlNamespace(value.getPath().getParent().getName());
}
// Must match the BOUNDARIES path.
private static String getBoundaryType(CldrValue value) {
String elementName = value.getPath().getName();
String type = stripXmlNamespace(elementName);
return keyOf(elementName, "alt")
.optionalValueFrom(value).map(a -> type + "_" + a).orElse(type);
}
// Must match the BOUNDARIES path.
private static Optional<String> getBoundaryDependency(CldrValue value) {
return keyOf(value.getPath().getName(), "icu:dependency").optionalValueFrom(value);
}
// Strips the first prefix of the form "xxx:" from a string.
private static String stripXmlNamespace(String s) {
return s.substring(s.indexOf(':') + 1);
}
/*
* Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert
* backslash to a double backslash. This class is super slow for non-ASCII escaping due to
* using "String.format()", however there's < 100 values that need any escaping, so it's fine.
*/
private static final UnicodeEscaper ESCAPE_NON_ASCII = new UnicodeEscaper() {
private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
@Override
protected char[] escape(int cp) {
// Returning null means "do not escape".
if (0x0020 <= cp && cp <= 0x007F) {
return cp == '\\' ? DOUBLE_BACKSLASH : null;
} else if (cp <= 0xFFFF) {
return String.format("\\u%04X", cp).toCharArray();
}
return String.format("\\U%08X", cp).toCharArray();
}
};
}