| // © 2019 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| package org.unicode.icu.tool.cldrtoicu; |
| |
| import static com.google.common.base.Preconditions.checkArgument; |
| import static com.google.common.base.Preconditions.checkNotNull; |
| import static com.google.common.collect.ImmutableList.toImmutableList; |
| import static com.google.common.collect.ImmutableMap.toImmutableMap; |
| import static java.lang.Character.DIRECTIONALITY_LEFT_TO_RIGHT; |
| import static java.util.function.Function.identity; |
| import static java.util.regex.Pattern.CASE_INSENSITIVE; |
| import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY; |
| import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.RESOLVED; |
| import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED; |
| |
| import java.util.Arrays; |
| import java.util.Set; |
| import java.util.function.Function; |
| import java.util.function.IntUnaryOperator; |
| import java.util.function.Predicate; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import java.util.stream.IntStream; |
| |
| import org.unicode.cldr.api.CldrData; |
| import org.unicode.cldr.api.CldrDataSupplier; |
| import org.unicode.cldr.api.CldrDataSupplier.CldrResolution; |
| import org.unicode.cldr.api.CldrDataType; |
| import org.unicode.cldr.api.CldrDraftStatus; |
| import org.unicode.cldr.api.CldrPath; |
| import org.unicode.cldr.api.CldrValue; |
| import org.unicode.cldr.api.FilteredData; |
| import org.unicode.cldr.api.PathMatcher; |
| |
| import com.google.common.base.CharMatcher; |
| import com.google.common.collect.ImmutableList; |
| import com.google.common.collect.ImmutableMap; |
| import com.google.common.collect.ImmutableSet; |
| import com.google.common.collect.Sets; |
| |
| /** |
| * A factory for wrapping data suppliers to add synthetic locales for debugging. The currently |
| * supported synthetic locales are: |
| * <ul> |
| * <li>{@code en_XA}: A pseudo locale which generates expanded text with many non-Latin accents. |
| * <li>{@code ar_XB}: A pseudo locale which generates BiDi text for debugging. |
| * </ul> |
| * |
| * <p>Both pseudo locales are based on {@code "en"} data, and generate values which are readable |
| * by English speaking developers. For example, the CLDR value "Hello World" will be turned into |
| * something like: |
| * <ul> |
| * <li>{@code en_XA}: [Ĥéļļö Ŵöŕļð one two] |
| * <li>{@code ar_XB}: dlroW elloH |
| * </ul> |
| * |
| * <p>In the case of BiDi pseudo localization, bi-directional markers are also inserted into the |
| * text so that, if the system using the data is configured correctly, the results will look |
| * "normal" (i.e. Latin text will appear displayed left-to-right because of the BiDi markers). |
| */ |
| // TODO(CLDR-13381): Move this all into the CLDR API once the dust has settled. |
| public final class PseudoLocales { |
| // Right-to-left override character. |
| private static final String RLO = "\u202e"; |
| // Arabic letter mark character. |
| private static final String ALM = "\u061C"; |
| // Pop direction formatting character. |
| private static final String PDF = "\u202c"; |
| // Prefix to add before each LTR word. |
| private static final String BIDI_PREFIX = ALM + RLO; |
| // Postfix to add after each LTR word. |
| private static final String BIDI_POSTFIX = PDF + ALM; |
| |
| // See getExemplarValue() method for why we don't extract the exemplar list from "en". |
| private enum PseudoType { |
| BIDI("ar_XB", PseudoLocales::bidi, "abcdefghijklmnopqrstuvwxyz" + ALM + RLO + PDF), |
| EXPAND("en_XA", PseudoLocales::expanding, |
| "a\u00e5b\u0180c\u00e7d\u00f0e\u00e9f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm" |
| + "\u0271n\u00f1o\u00f6p\u00feq\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175" |
| + "x\u1e8by\u00fdz\u017e"); |
| |
| private static final ImmutableMap<String, PseudoType> ID_MAP = |
| Arrays.stream(values()).collect(toImmutableMap(PseudoType::getLocaleId, identity())); |
| |
| private static PseudoType fromId(String localeId) { |
| return checkNotNull(ID_MAP.get(localeId), "unknown pseduo locale: %s", localeId); |
| } |
| |
| private static ImmutableSet<String> getLocaleIds() { |
| return ID_MAP.keySet(); |
| } |
| |
| private final String localeId; |
| private final Function<Boolean, PseudoText> textSupplier; |
| // A string whose code points form the exemplar set for the pseudo locale. |
| private final String exemplars; |
| |
| PseudoType(String localeId, Function<Boolean, PseudoText> textSupplier, String exemplars) { |
| this.localeId = localeId; |
| this.textSupplier = textSupplier; |
| this.exemplars = exemplars; |
| } |
| |
| String getLocaleId() { |
| return localeId; |
| } |
| |
| PseudoText getText(boolean isPattern) { |
| return textSupplier.apply(isPattern); |
| } |
| |
| String getExemplars() { |
| return exemplars; |
| } |
| } |
| |
| /** |
| * Returns a wrapped data supplier which will inject {@link CldrData} for the pseudo locales |
| * {@code en_XA} and {@code ar_XB}. These locales should behave in all respects like normal |
| * locales and can be processed accordingly. |
| */ |
| public static CldrDataSupplier addPseudoLocalesTo(CldrDataSupplier src) { |
| return new PseudoSupplier(src); |
| } |
| |
| private static final class PseudoSupplier extends CldrDataSupplier { |
| private final CldrDataSupplier src; |
| private final Set<String> srcIds; |
| private final CldrData enData; |
| private final ImmutableSet<CldrPath> pathsToProcess; |
| |
| PseudoSupplier(CldrDataSupplier src) { |
| this.src = checkNotNull(src); |
| this.srcIds = src.getAvailableLocaleIds(); |
| // Start with resolved data so we can merge values from "en" and "en_001" for coverage |
| // and supply the unfiltered values if someone wants the resolved version of the pseudo |
| // locale data. |
| this.enData = src.getDataForLocale("en", RESOLVED); |
| // But since we don't want to filter paths which come from the "root" locale (such as |
| // aliases) then we need to find the union of "English" paths we expect to filter. |
| this.pathsToProcess = getUnresolvedPaths(src, "en", "en_001"); |
| // Just check that we aren't wrapping an already wrapped supplier. |
| PseudoType.getLocaleIds() |
| .forEach(id -> checkArgument(!srcIds.contains(id), |
| "pseudo locale %s already supported by given data supplier", id)); |
| } |
| |
| private static ImmutableSet<CldrPath> getUnresolvedPaths( |
| CldrDataSupplier src, String... ids) { |
| |
| ImmutableSet.Builder<CldrPath> paths = ImmutableSet.builder(); |
| for (String id : ids) { |
| src.getDataForLocale(id, UNRESOLVED).accept(ARBITRARY, v -> paths.add(v.getPath())); |
| } |
| return paths.build(); |
| } |
| |
| @Override public CldrDataSupplier withDraftStatusAtLeast(CldrDraftStatus draftStatus) { |
| return new PseudoSupplier(src.withDraftStatusAtLeast(draftStatus)); |
| } |
| |
| @Override public CldrData getDataForLocale(String localeId, CldrResolution resolution) { |
| if (PseudoType.getLocaleIds().contains(localeId)) { |
| return new PseudoLocaleData( |
| enData, pathsToProcess, resolution, PseudoType.fromId(localeId)); |
| } else { |
| return src.getDataForLocale(localeId, resolution); |
| } |
| } |
| |
| @Override public Set<String> getAvailableLocaleIds() { |
| return Sets.union(src.getAvailableLocaleIds(), PseudoType.getLocaleIds()); |
| } |
| |
| @Override public CldrData getDataForType(CldrDataType type) { |
| return src.getDataForType(type); |
| } |
| } |
| |
| private interface PseudoText { |
| void addFragment(String text, boolean isLocalizable); |
| } |
| |
| private static final class PseudoLocaleData extends FilteredData { |
| private static final PathMatcher LDML = PathMatcher.of("//ldml"); |
| |
| private static final PathMatcher AUX_EXEMPLARS = |
| ldml("characters/exemplarCharacters[@type=\"auxiliary\"]"); |
| |
| private static final PathMatcher NUMBERING_SYSTEM = |
| ldml("numbers/defaultNumberingSystem"); |
| |
| private static final PathMatcher GREGORIAN_SHORT_STANDARD_PATTERN = |
| ldml("dates/calendars/calendar[@type=\"gregorian\"]/timeFormats/timeFormatLength[@type=\"short\"]/timeFormat[@type=\"standard\"]/pattern[@type=\"standard\"]"); |
| |
| // These paths were mostly derived from looking at the previous implementation's behaviour |
| // and can be modified as needed. |
| private static final Predicate<CldrPath> IS_PSEUDO_PATH = |
| matchAnyLdmlPrefix( |
| "localeDisplayNames", |
| "delimiters", |
| "dates/calendars/calendar", |
| "dates/fields", |
| "dates/timeZoneNames", |
| "listPatterns", |
| "posix/messages", |
| "characterLabels", |
| "typographicNames", |
| "units") |
| .and(matchAnyLdmlPrefix( |
| "localeDisplayNames/localeDisplayPattern", |
| "dates/timeZoneNames/fallbackFormat") |
| .negate()); |
| |
| // The expectation is that all non-alias paths with values under these roots are "date/time |
| // pattern like" (such as "E h:mm:ss B") in which care must be taken to not pseudo localize |
| // the patterns in such as way as to break them. This list must be accurate. |
| private static final Predicate<CldrPath> IS_PATTERN_PATH = matchAnyLdmlPrefix( |
| "dates/calendars/calendar/timeFormats", |
| "dates/calendars/calendar/dateFormats", |
| "dates/calendars/calendar/dateTimeFormats", |
| "dates/timeZoneNames/hourFormat"); |
| |
| private static PathMatcher ldml(String paths) { |
| return LDML.withSuffix(paths); |
| } |
| |
| private static Predicate<CldrPath> matchAnyLdmlPrefix(String... paths) { |
| ImmutableList<Predicate<CldrPath>> collect = |
| Arrays.stream(paths) |
| .map(s -> (Predicate<CldrPath>) ldml(s)::matchesPrefixOf) |
| .collect(toImmutableList()); |
| return p -> collect.stream().anyMatch(e -> e.test(p)); |
| } |
| |
| // Look for any attribute in the path with "narrow" in its value. Since "narrow" values |
| // have strong expectations of width, we should not expand these (but might alter them |
| // otherwise). |
| private static final Predicate<String> IS_NARROW = |
| Pattern.compile("\\[@[a-z]+=\"[^\"]*narrow[^\"]*\"]", CASE_INSENSITIVE).asPredicate(); |
| |
| private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}"); |
| private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'"); |
| |
| private final PseudoType type; |
| private final boolean isResolved; |
| private final ImmutableSet<CldrPath> pathsToProcess; |
| |
| private PseudoLocaleData( |
| CldrData srcData, |
| ImmutableSet<CldrPath> pathsToProcess, |
| CldrResolution resolution, |
| PseudoType type) { |
| |
| super(srcData); |
| this.isResolved = checkNotNull(resolution) == RESOLVED; |
| this.type = checkNotNull(type); |
| this.pathsToProcess = pathsToProcess; |
| } |
| |
| @Override |
| protected CldrValue filter(CldrValue value) { |
| CldrPath path = value.getPath(); |
| |
| // Special case(s) first... |
| // We add the exemplar character list according to the pseudo type. |
| if (AUX_EXEMPLARS.matches(path)) { |
| return getExemplarValue(path); |
| } |
| // Force "latn" for the "ar_XB" pseudo locale (since otherwise it inherits from "ar". |
| // The path we get here was from "en" so should already be "latn", but we just have |
| // to return it in order for it to take effect. |
| if (type == PseudoType.BIDI && NUMBERING_SYSTEM.matches(path)) { |
| checkArgument(value.getValue().equals("latn")); |
| return value; |
| } |
| |
| CldrValue defaultReturnValue = isResolved ? value : null; |
| // This makes it look like we have explicit values only for the included paths. |
| if (!pathsToProcess.contains(path) || !IS_PSEUDO_PATH.test(path)) { |
| return defaultReturnValue; |
| } |
| String fullPath = value.getFullPath(); |
| // For now don't do anything with "narrow" data (this matches the previous behaviour). |
| // We can always add something here later if necessary. |
| if (IS_NARROW.test(fullPath)) { |
| return defaultReturnValue; |
| } |
| // Explicitly return 24 hrs format pattern for the Gregorian short standard pattern |
| // entry to be consistent with the time cycle specified in supplemental.xml for |
| // region 001. 001 is the region the pseudolocales en_XA/ar_XB default to. |
| // This prevents ICU unit test failure. |
| if (GREGORIAN_SHORT_STANDARD_PATTERN.matches(path)) { |
| return CldrValue.parseValue(fullPath, "[H:mm]"); |
| } |
| String text = createMessage(value.getValue(), IS_PATTERN_PATH.test(path)); |
| |
| return CldrValue.parseValue(fullPath, text); |
| } |
| |
| // It's tempting to think that the existing exemplar list in "en" could be parsed to |
| // generate list automatically (rather than having a hard coded list in the type) but |
| // https://unicode.org/reports/tr35/tr35-general.html#ExemplarSyntax |
| // makes it quite clear that this is infeasible, since there are many equivalent |
| // representations of the exemplar characters that could appear in the value |
| // (e.g. "[a b ... z]", "[a-z]", "[{a} {b} ... {z}]") |
| private CldrValue getExemplarValue(CldrPath path) { |
| StringBuilder exemplarList = new StringBuilder("["); |
| type.getExemplars().codePoints() |
| .forEach(cp -> appendExemplarCodePoint(exemplarList, cp).append(' ')); |
| exemplarList.setCharAt(exemplarList.length() - 1, ']'); |
| return CldrValue.parseValue(path.toString(), exemplarList.toString()); |
| } |
| |
| // Append a (possibly escaped) representation of the exemaplar character. |
| private static StringBuilder appendExemplarCodePoint(StringBuilder out, int cp) { |
| // This could be fixed if needed, but for now it's safer to check. |
| checkArgument( |
| Character.isBmpCodePoint(cp), |
| "Only BMP code points are supported for exemplars: 0x%s", Integer.toHexString(cp)); |
| if (Character.isAlphabetic(cp)) { |
| out.appendCodePoint(cp); |
| } else { |
| out.append(String.format("\\u%04X", cp)); |
| } |
| return out; |
| } |
| |
| private String createMessage(String text, boolean isPattern) { |
| // Pattern text is split by the quoted sections (which are localizable) whereas |
| // non-pattern text is split by placeholder (e.g. {0}) which are not localizable. |
| // This is why "isPattern" is used to signal "isLocalizable" in addFragment(). |
| Matcher match = (isPattern ? QUOTED_TEXT : NUMERIC_PLACEHOLDER).matcher(text); |
| // Alternate between unmatched and matched sections in the text, always localizing one |
| // but not the other (depending the type). Append the trailing section at the end. |
| PseudoText out = type.getText(isPattern); |
| int start = 0; |
| for (; match.find(); start = match.end()) { |
| out.addFragment(text.substring(start, match.start()), !isPattern); |
| out.addFragment(match.group(), isPattern); |
| } |
| out.addFragment(text.substring(start), !isPattern); |
| return out.toString(); |
| } |
| } |
| |
| // ---- Expanding Pseudo-localizer (e.g. "November" --> "[Ñöṽéɱƀéŕ one two]") ---- |
| |
| // A map from a string of alternating key/value code-points; e.g. '1' -> '①'. |
| // Note that a subset of this is also used to form the "exemplar" set (see PseudoType). |
| private static final IntUnaryOperator CONVERT_CODEPOINT = toCodePointFunction( |
| " \u2003!\u00a1\"\u2033#\u266f$\u20ac%\u2030&\u214b*\u204e+\u207a,\u060c-\u2010.\u00b7" |
| + "/\u20440\u24ea1\u24602\u24613\u24624\u24635\u24646\u24657\u24668\u24679\u2468" |
| + ":\u2236;\u204f<\u2264=\u2242>\u2265?\u00bf@\u055eA\u00c5B\u0181C\u00c7D\u00d0" |
| + "E\u00c9F\u0191G\u011cH\u0124I\u00ceJ\u0134K\u0136L\u013bM\u1e40N\u00d1O\u00d6" |
| + "P\u00deQ\u01eaR\u0154S\u0160T\u0162U\u00dbV\u1e7cW\u0174X\u1e8aY\u00ddZ\u017d" |
| + "[\u2045\\\u2216]\u2046^\u02c4_\u203f`\u2035a\u00e5b\u0180c\u00e7d\u00f0e\u00e9" |
| + "f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm\u0271n\u00f1o\u00f6p\u00fe" |
| + "q\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175x\u1e8by\u00fdz\u017e|\u00a6" |
| + "~\u02de"); |
| |
| // Converts a source/target alternating code-points into a map. |
| private static IntUnaryOperator toCodePointFunction(String s) { |
| // Not pretty, but there's no nice way to "pair up" successive stream elements without |
| // extra library dependencies, so we collect them and then iterate via index. |
| int[] codePoints = s.codePoints().toArray(); |
| checkArgument((codePoints.length & 1) == 0, |
| "must have an even number of code points (was %s)", codePoints.length); |
| ImmutableMap<Integer, Integer> map = |
| IntStream.range(0, codePoints.length / 2) |
| .boxed() |
| .collect(toImmutableMap(n -> codePoints[2 * n], n -> codePoints[(2 * n) + 1])); |
| return cp -> map.getOrDefault(cp, cp); |
| } |
| |
| // A list of words to be added to text when it is expanded. A whole number of words are |
| // always added (and the fact they are numeric words is irrelevant, could be Lorem Ipsum). |
| // So far nothing goes above "ten" in en_XA, but this can always be trivially extended. |
| private static final String PADDING = "one two three four five six seven eight nine ten"; |
| |
| private static PseudoText expanding(boolean isPattern) { |
| return new PseudoText() { |
| IntStream.Builder codePoints = IntStream.builder(); |
| |
| @Override |
| public void addFragment(String text, boolean isLocalizable) { |
| text.codePoints() |
| .map(isLocalizable ? CONVERT_CODEPOINT : cp -> cp) |
| .forEach(codePoints::add); |
| } |
| |
| @Override |
| public String toString() { |
| int[] cp = codePoints.build().toArray(); |
| // Copy the original code and round up the 50% calculation (it's not important). |
| int endIndex = CharMatcher.whitespace().indexIn(PADDING, (cp.length + 1) / 2); |
| String suffix = PADDING.substring(0, Math.min(endIndex, PADDING.length())); |
| // For pattern strings, any literal text must be quoted (the fragment text |
| // already was). Note that this is why we don't transform single-quotes. |
| if (isPattern) { |
| suffix = "'" + suffix.replace(" ", "' '") + "'"; |
| } |
| // Final output is something like "November" --> "[Ñöṽéɱƀéŕ one two]" |
| // Where the additional padding adds at least 50% to the length of the text. |
| return "[" + new String(cp, 0, cp.length) + " " + suffix + "]"; |
| } |
| }; |
| } |
| |
| // ---- Bidi Pseudo-localizer (e.g. "November" --> "rebmevoN" using BiDi tags)---- |
| |
| // Bidi localization doesn't care if the fragment is a pattern or not. |
| @SuppressWarnings("unused") |
| private static PseudoText bidi(boolean isPattern) { |
| return new PseudoText() { |
| private final StringBuilder out = new StringBuilder(); |
| |
| // This was largely copied from the original CLDRFilePseudolocalizer class and |
| // while it appears to work fine, I don't know enough to comment it clearly. |
| // TODO: Find someone who can add a decent comment here! |
| @Override |
| public void addFragment(String text, boolean isLocalizable) { |
| if (isLocalizable) { |
| boolean wrapping = false; |
| for (int index = 0; index < text.length(); ) { |
| int codePoint = text.codePointAt(index); |
| index += Character.charCount(codePoint); |
| byte directionality = Character.getDirectionality(codePoint); |
| boolean needsWrap = (directionality == DIRECTIONALITY_LEFT_TO_RIGHT); |
| if (needsWrap != wrapping) { |
| wrapping = needsWrap; |
| out.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX); |
| } |
| out.appendCodePoint(codePoint); |
| } |
| if (wrapping) { |
| out.append(BIDI_POSTFIX); |
| } |
| } else { |
| out.append(text); |
| } |
| } |
| |
| @Override |
| public String toString() { |
| return out.toString(); |
| } |
| }; |
| } |
| |
// This class only provides static methods and is never instantiated.
private PseudoLocales() {}
| } |