ICU-21227 Fixing pseudo locale generation to include RTL control chars in exemplars
See #1237
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java
index 36c14e2..1e11e68 100644
--- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java
@@ -60,8 +60,20 @@
*/
// TODO(CLDR-13381): Move this all into the CLDR API once the dust has settled.
public final class PseudoLocales {
+ // Right-to-left override character.
+ private static final String RLO = "\u202e";
+ // Arabic letter mark character.
+ private static final String ALM = "\u061C";
+ // Pop direction formatting character.
+ private static final String PDF = "\u202c";
+ // Prefix to add before each LTR word.
+ private static final String BIDI_PREFIX = ALM + RLO;
+ // Postfix to add after each LTR word.
+ private static final String BIDI_POSTFIX = PDF + ALM;
+
+ // See getExemplarValue() method for why we don't extract the exemplar list from "en".
private enum PseudoType {
- BIDI("ar_XB", PseudoLocales::bidi, "abcdefghijklmnopqrstuvwxyz"),
+ BIDI("ar_XB", PseudoLocales::bidi, "abcdefghijklmnopqrstuvwxyz" + ALM + RLO + PDF),
EXPAND("en_XA", PseudoLocales::expanding,
"a\u00e5b\u0180c\u00e7d\u00f0e\u00e9f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm"
+ "\u0271n\u00f1o\u00f6p\u00feq\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175"
@@ -284,11 +296,25 @@
private CldrValue getExemplarValue(CldrPath path) {
StringBuilder exemplarList = new StringBuilder("[");
type.getExemplars().codePoints()
- .forEach(cp -> exemplarList.appendCodePoint(cp).append(' '));
+ .forEach(cp -> appendExemplarCodePoint(exemplarList, cp).append(' '));
exemplarList.setCharAt(exemplarList.length() - 1, ']');
return CldrValue.parseValue(path.toString(), exemplarList.toString());
}
+ // Append a (possibly escaped) representation of the exemplar character.
+ private static StringBuilder appendExemplarCodePoint(StringBuilder out, int cp) {
+ // The BMP-only restriction could be lifted if needed, but for now it's safer to check.
+ checkArgument(
+ Character.isBmpCodePoint(cp),
+ "Only BMP code points are supported for exemplars: 0x%s", Integer.toHexString(cp));
+ if (Character.isAlphabetic(cp)) {
+ out.appendCodePoint(cp);
+ } else {
+ out.append(String.format("\\u%04X", cp));
+ }
+ return out;
+ }
+
private String createMessage(String text, boolean isPattern) {
// Pattern text is split by the quoted sections (which are localizable) whereas
// non-pattern text is split by placeholder (e.g. {0}) which are not localizable.
@@ -372,17 +398,6 @@
// ---- Bidi Pseudo-localizer (e.g. "November" --> "rebmevoN" using BiDi tags)----
- // Right-to-left override character.
- private static final String RLO = "\u202e";
- // Arabic letter mark character.
- private static final String ALM = "\u061C";
- // Pop direction formatting character.
- private static final String PDF = "\u202c";
- // Prefix to add before each LTR word.
- private static final String BIDI_PREFIX = ALM + RLO;
- // Postfix to add after each LTR word.
- private static final String BIDI_POSTFIX = PDF + ALM;
-
// Bidi localization doesn't care if the fragment is a pattern or not.
@SuppressWarnings("unused")
private static PseudoText bidi(boolean isPattern) {
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PseudoLocalesTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PseudoLocalesTest.java
index 4ffd2f3..c72e67b 100644
--- a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PseudoLocalesTest.java
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PseudoLocalesTest.java
@@ -122,7 +122,7 @@
CldrDataSupplier pseudo = PseudoLocales.addPseudoLocalesTo(src);
assertValuesUnordered(pseudo.getDataForLocale("ar_XB", UNRESOLVED),
- value(exemplarsPath, "[a b c d e f g h i j k l m n o p q r s t u v w x y z]"));
+ value(exemplarsPath, "[a b c d e f g h i j k l m n o p q r s t u v w x y z \\u061C \\u202E \\u202C]"));
assertValuesUnordered(pseudo.getDataForLocale("en_XA", UNRESOLVED),
value(exemplarsPath,
"[a å b ƀ c ç d ð e é f ƒ g ĝ h ĥ i î j ĵ k ķ l ļ m ɱ"