ICU-21227 Fixing pseudo locale generation to include RTL control chars in exemplars See #1237

commit: d085cbb8166afc9fa7b9bcd6eb26e85b6614b665 [log] [tgz]
author: David Beaumont <dbeaumont@google.com> Mon Aug 17 19:56:43 2020 +0000
committer: David Beaumont <david.beaumont+github@gmail.com> Tue Aug 18 20:19:15 2020 +0200
tree: 9cc38445d91012a5e9301be023c52cc0349b9529
parent: d2789a035bae19b1d0dcef9082122f9b955a7faa [diff]
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java
index 36c14e2..1e11e68 100644
--- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java

@@ -60,8 +60,20 @@
  */
 // TODO(CLDR-13381): Move this all into the CLDR API once the dust has settled.
 public final class PseudoLocales {
+    // Right-to-left override character.
+    private static final String RLO = "\u202e";
+    // Arabic letter mark character.
+    private static final String ALM = "\u061C";
+    // Pop direction formatting character.
+    private static final String PDF = "\u202c";
+    // Prefix to add before each LTR word.
+    private static final String BIDI_PREFIX = ALM + RLO;
+    // Postfix to add after each LTR word.
+    private static final String BIDI_POSTFIX = PDF + ALM;
+
+    // See getExemplarValue() method for why we don't extract the exemplar list from "en".
     private enum PseudoType {
-        BIDI("ar_XB", PseudoLocales::bidi, "abcdefghijklmnopqrstuvwxyz"),
+        BIDI("ar_XB", PseudoLocales::bidi, "abcdefghijklmnopqrstuvwxyz" + ALM + RLO + PDF),
         EXPAND("en_XA", PseudoLocales::expanding,
             "a\u00e5b\u0180c\u00e7d\u00f0e\u00e9f\u0192g\u011dh\u0125i\u00eej\u0135k\u0137l\u013cm"
                 + "\u0271n\u00f1o\u00f6p\u00feq\u01ebr\u0155s\u0161t\u0163u\u00fbv\u1e7dw\u0175"
@@ -284,11 +296,25 @@
         private CldrValue getExemplarValue(CldrPath path) {
             StringBuilder exemplarList = new StringBuilder("[");
             type.getExemplars().codePoints()
-                .forEach(cp -> exemplarList.appendCodePoint(cp).append(' '));
+                .forEach(cp -> appendExemplarCodePoint(exemplarList, cp).append(' '));
             exemplarList.setCharAt(exemplarList.length() - 1, ']');
             return CldrValue.parseValue(path.toString(), exemplarList.toString());
         }
 
+        // Appends the exemplar character to 'out', hex-escaping it unless it is alphabetic.
+        private static StringBuilder appendExemplarCodePoint(StringBuilder out, int cp) {
+            // Non-BMP code points could be supported if needed; for now they are rejected up front.
+            checkArgument(
+                Character.isBmpCodePoint(cp),
+                "Only BMP code points are supported for exemplars: 0x%s", Integer.toHexString(cp));
+            if (Character.isAlphabetic(cp)) {
+                out.appendCodePoint(cp);
+            } else {
+                out.append(String.format("\\u%04X", cp));
+            }
+            return out;
+        }
+
         private String createMessage(String text, boolean isPattern) {
             // Pattern text is split by the quoted sections (which are localizable) whereas
             // non-pattern text is split by placeholder (e.g. {0}) which are not localizable.
@@ -372,17 +398,6 @@
 
     // ---- Bidi Pseudo-localizer (e.g. "November" --> "rebmevoN" using BiDi tags)----
 
-    // Right-to-left override character.
-    private static final String RLO = "\u202e";
-    // Arabic letter mark character.
-    private static final String ALM = "\u061C";
-    // Pop direction formatting character.
-    private static final String PDF = "\u202c";
-    // Prefix to add before each LTR word.
-    private static final String BIDI_PREFIX = ALM + RLO;
-    // Postfix to add after each LTR word.
-    private static final String BIDI_POSTFIX = PDF + ALM;
-
     // Bidi localization doesn't care if the fragment is a pattern or not.
     @SuppressWarnings("unused")
     private static PseudoText bidi(boolean isPattern) {

diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PseudoLocalesTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PseudoLocalesTest.java
index 4ffd2f3..c72e67b 100644
--- a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PseudoLocalesTest.java
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PseudoLocalesTest.java

@@ -122,7 +122,7 @@
         CldrDataSupplier pseudo = PseudoLocales.addPseudoLocalesTo(src);
 
         assertValuesUnordered(pseudo.getDataForLocale("ar_XB", UNRESOLVED),
-            value(exemplarsPath, "[a b c d e f g h i j k l m n o p q r s t u v w x y z]"));
+            value(exemplarsPath, "[a b c d e f g h i j k l m n o p q r s t u v w x y z \\u061C \\u202E \\u202C]"));
         assertValuesUnordered(pseudo.getDataForLocale("en_XA", UNRESOLVED),
             value(exemplarsPath,
                 "[a å b ƀ c ç d ð e é f ƒ g ĝ h ĥ i î j ĵ k ķ l ļ m ɱ"
commit	d085cbb8166afc9fa7b9bcd6eb26e85b6614b665	[log] [tgz]
author	David Beaumont <dbeaumont@google.com>	Mon Aug 17 19:56:43 2020 +0000
committer	David Beaumont <david.beaumont+github@gmail.com>	Tue Aug 18 20:19:15 2020 +0200
tree	9cc38445d91012a5e9301be023c52cc0349b9529
parent	d2789a035bae19b1d0dcef9082122f9b955a7faa [diff]