| // © 2019 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| package org.unicode.icu.tool.cldrtoicu.mapper; |
| |
| import static com.google.common.base.Preconditions.checkNotNull; |
| import static org.unicode.cldr.api.AttributeKey.keyOf; |
| |
| import java.util.Optional; |
| |
| import org.unicode.cldr.api.AttributeKey; |
| import org.unicode.cldr.api.CldrData; |
| import org.unicode.cldr.api.CldrDataType; |
| import org.unicode.cldr.api.CldrValue; |
| import org.unicode.icu.tool.cldrtoicu.IcuData; |
| import org.unicode.icu.tool.cldrtoicu.RbPath; |
| import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; |
| import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor.SubProcessor; |
| |
| import com.google.common.escape.UnicodeEscaper; |
| |
| /** |
| * A mapper to collect break-iterator data from {@link CldrDataType#LDML LDML} data under |
| * paths matching: |
| * <pre>{@code |
| * //ldml/segmentations/segmentation/suppressions/suppression |
| * //ldml/special/icu:breakIteratorData/... |
| * }</pre> |
| */ |
| // TODO: This class can almost certainly be replaced with a small RegexTransformer config. |
| public final class BreakIteratorMapper { |
| |
| private static final CldrDataProcessor<BreakIteratorMapper> CLDR_PROCESSOR; |
| static { |
| CldrDataProcessor.Builder<BreakIteratorMapper> processor = CldrDataProcessor.builder(); |
| // The "type" attribute in /suppressions/ is not required so cannot be in the matcher. And |
| // its default (and only) value is "standard". |
| // TODO: Understand and document why this is the case. |
| processor.addValueAction( |
| "//ldml/segmentations/segmentation[@type=*]/suppressions/suppression", |
| BreakIteratorMapper::addSuppression); |
| SubProcessor<BreakIteratorMapper> specials = |
| processor.addSubprocessor("//ldml/special/icu:breakIteratorData"); |
| specials.addValueAction("icu:boundaries/*", BreakIteratorMapper::addBoundary); |
| specials.addValueAction( |
| "icu:dictionaries/icu:dictionary", BreakIteratorMapper::addDictionary); |
| CLDR_PROCESSOR = processor.build(); |
| } |
| |
| private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type"); |
| private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency"); |
| private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type"); |
| |
| /** |
| * Processes data from the given supplier to generate break-iterator data for a set of locale |
| * IDs. |
| * |
| * @param icuData the ICU data to be filled. |
| * @param cldrData the unresolved CLDR data to process. |
| * @param icuSpecialData additional ICU data (in the "icu:" namespace) |
| * @return IcuData containing break-iterator data for the given locale ID. |
| */ |
| public static IcuData process( |
| IcuData icuData, CldrData cldrData, Optional<CldrData> icuSpecialData) { |
| |
| BreakIteratorMapper mapper = new BreakIteratorMapper(icuData); |
| icuSpecialData.ifPresent(d -> CLDR_PROCESSOR.process(d, mapper)); |
| CLDR_PROCESSOR.process(cldrData, mapper); |
| return mapper.icuData; |
| } |
| |
| // The per-locale ICU data being collected by this visitor. |
| private final IcuData icuData; |
| |
| private BreakIteratorMapper(IcuData icuData) { |
| this.icuData = checkNotNull(icuData); |
| } |
| |
| private void addSuppression(CldrValue v) { |
| String type = SEGMENTATION_TYPE.valueFrom(v); |
| // TODO: Understand and document why we escape values here, but not for collation data. |
| icuData.add( |
| RbPath.of("exceptions", type + ":array"), ESCAPE_NON_ASCII.escape(v.getValue())); |
| } |
| |
| private void addBoundary(CldrValue v) { |
| addDependency(getDependencyName(v), getBoundaryType(v), getBoundaryDependency(v)); |
| } |
| |
| private void addDictionary(CldrValue v) { |
| addDependency( |
| getDependencyName(v), |
| DICTIONARY_TYPE.valueFrom(v), |
| DICTIONARY_DEP.optionalValueFrom(v)); |
| } |
| |
| private void addDependency(String name, String type, Optional<String> dependency) { |
| icuData.add( |
| RbPath.of(name, type + ":process(dependency)"), |
| dependency.orElseThrow(() -> new IllegalArgumentException("missing dependency"))); |
| } |
| |
| // Must match the BOUNDARIES or DICTIONARY path. |
| private static String getDependencyName(CldrValue value) { |
| return stripXmlNamespace(value.getPath().getParent().getName()); |
| } |
| |
| // Must match the BOUNDARIES path. |
| private static String getBoundaryType(CldrValue value) { |
| String elementName = value.getPath().getName(); |
| String type = stripXmlNamespace(elementName); |
| return keyOf(elementName, "alt") |
| .optionalValueFrom(value).map(a -> type + "_" + a).orElse(type); |
| } |
| |
| // Must match the BOUNDARIES path. |
| private static Optional<String> getBoundaryDependency(CldrValue value) { |
| return keyOf(value.getPath().getName(), "icu:dependency").optionalValueFrom(value); |
| } |
| |
| // Strips the first prefix of the form "xxx:" from a string. |
| private static String stripXmlNamespace(String s) { |
| return s.substring(s.indexOf(':') + 1); |
| } |
| |
| /* |
| * Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert |
| * backslash to a double backslash. This class is super slow for non-ASCII escaping due to |
| * using "String.format()", however there's < 100 values that need any escaping, so it's fine. |
| */ |
| private static final UnicodeEscaper ESCAPE_NON_ASCII = new UnicodeEscaper() { |
| private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray(); |
| |
| @Override |
| protected char[] escape(int cp) { |
| // Returning null means "do not escape". |
| if (0x0020 <= cp && cp <= 0x007F) { |
| return cp == '\\' ? DOUBLE_BACKSLASH : null; |
| } else if (cp <= 0xFFFF) { |
| return String.format("\\u%04X", cp).toCharArray(); |
| } |
| return String.format("\\U%08X", cp).toCharArray(); |
| } |
| }; |
| } |