ICU-20917 LocaleMatcher: prefer a more-default locale

commit: 60b567d6abc707d256e4049bb80a4e9d2a93a021 [log] [tgz]
author: Markus Scherer <markus.icu@gmail.com> Sat Dec 21 06:48:17 2019 -0800
committer: Markus Scherer <markus.icu@gmail.com> Thu Jan 02 18:00:52 2020 -0800
tree: 45fd0726a724b4db2ef044cb4931c3541959b8dd
parent: 79fac501010d63231c258dc0d4fb9a9e87ddb8d8 [diff]
diff --git a/icu4c/source/common/locdistance.cpp b/icu4c/source/common/locdistance.cpp
index 50633cc..4304fab 100644
--- a/icu4c/source/common/locdistance.cpp
+++ b/icu4c/source/common/locdistance.cpp

@@ -69,7 +69,7 @@
         errorCode = U_MISSING_RESOURCE_ERROR;
         return;
     }
-    gLocaleDistance = new LocaleDistance(data);
+    gLocaleDistance = new LocaleDistance(data, likely);
     if (gLocaleDistance == nullptr) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return;
@@ -83,7 +83,8 @@
     return gLocaleDistance;
 }
 
-LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
+LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) :
+        likelySubtags(likely),
         trie(data.distanceTrieBytes),
         regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
         paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
@@ -122,6 +123,8 @@
     uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
     // Index of the supported LSR with the lowest distance.
     int32_t bestIndex = -1;
+    // Cached lookup info from XLikelySubtags.compareLikely().
+    int32_t bestLikelyInfo = -1;
     for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
         const LSR &supported = *supportedLSRs[slIndex];
         bool star = false;
@@ -207,13 +210,29 @@
             // Distinguish between equivalent but originally unequal locales via an
             // additional micro distance.
             shiftedDistance |= (desired.flags ^ supported.flags);
-        }
-        if (shiftedDistance < shiftedThreshold) {
-            if (shiftedDistance == 0) {
-                return slIndex << INDEX_SHIFT;
+            if (shiftedDistance < shiftedThreshold) {
+                if (shiftedDistance == 0) {
+                    return slIndex << INDEX_SHIFT;
+                }
+                bestIndex = slIndex;
+                shiftedThreshold = shiftedDistance;
+                bestLikelyInfo = -1;
             }
-            bestIndex = slIndex;
-            shiftedThreshold = shiftedDistance;
+        } else {
+            if (shiftedDistance < shiftedThreshold) {
+                bestIndex = slIndex;
+                shiftedThreshold = shiftedDistance;
+                bestLikelyInfo = -1;
+            } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
+                bestLikelyInfo = likelySubtags.compareLikely(
+                        supported, *supportedLSRs[bestIndex], bestLikelyInfo);
+                if ((bestLikelyInfo & 1) != 0) {
+                    // This supported locale matches as well as the previous best match,
+                    // and neither matches perfectly,
+                    // but this one is "more likely" (has more-default subtags).
+                    bestIndex = slIndex;
+                }
+            }
         }
     }
     return bestIndex >= 0 ?

diff --git a/icu4c/source/common/locdistance.h b/icu4c/source/common/locdistance.h
index 0ee3d0e..88fd73f 100644
--- a/icu4c/source/common/locdistance.h
+++ b/icu4c/source/common/locdistance.h

@@ -82,7 +82,7 @@
         return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
     }
 
-    LocaleDistance(const LocaleDistanceData &data);
+    LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
     LocaleDistance(const LocaleDistance &other) = delete;
     LocaleDistance &operator=(const LocaleDistance &other) = delete;
 
@@ -110,6 +110,8 @@
         return defaultRegionDistance;
     }
 
+    const XLikelySubtags &likelySubtags;
+
     // The trie maps each dlang+slang+dscript+sscript+dregion+sregion
     // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
     // There is also a trie value for each subsequence of whole subtags.

diff --git a/icu4c/source/common/loclikelysubtags.cpp b/icu4c/source/common/loclikelysubtags.cpp
index 27f10b3..1fbf1a1 100644
--- a/icu4c/source/common/loclikelysubtags.cpp
+++ b/icu4c/source/common/loclikelysubtags.cpp

@@ -557,6 +557,106 @@
     return LSR(language, script, region, retainOldMask);
 }
 
+int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
+    // Returns an updated likelyInfo; bit 0 is set if lsr is "more likely" than other.
+    // A non-negative likelyInfo caches a getLikelyIndex() result in bits 2 and up:
+    // bit 1 set = the lookup was for equal language and script (the regions differed),
+    // bit 1 clear = the lookup was for the language alone (the scripts differed).
+    if (uprv_strcmp(lsr.language, other.language) != 0) {
+        return 0xfffffffc;  // negative, lsr not better than other
+    }
+    if (uprv_strcmp(lsr.script, other.script) != 0) {
+        int32_t index;
+        if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
+            index = likelyInfo >> 2;  // reuse the cached language-only lookup
+        } else {
+            index = getLikelyIndex(lsr.language, "");
+            likelyInfo = index << 2;
+        }
+        const LSR &likely = lsrs[index];
+        if (uprv_strcmp(lsr.script, likely.script) == 0) {
+            return likelyInfo | 1;  // lsr has the likely script for its language
+        } else {
+            return likelyInfo & ~1;
+        }
+    }
+    if (uprv_strcmp(lsr.region, other.region) != 0) {
+        int32_t index;
+        if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
+            index = likelyInfo >> 2;  // reuse the cached language+script lookup
+        } else {
+            index = getLikelyIndex(lsr.language, lsr.script);  // 2nd argument is the script, not the region
+            likelyInfo = (index << 2) | 2;
+        }
+        const LSR &likely = lsrs[index];
+        if (uprv_strcmp(lsr.region, likely.region) == 0) {
+            return likelyInfo | 1;  // lsr has the likely region for its language+script
+        } else {
+            return likelyInfo & ~1;
+        }
+    }
+    return likelyInfo & ~1;  // lsr not better than other
+}
+
+// Subset of maximize(): walks the trie for language, then script, and returns the resulting value (asserted < lsrsLength).
+int32_t XLikelySubtags::getLikelyIndex(const char *language, const char *script) const {
+    if (uprv_strcmp(language, "und") == 0) {
+        language = "";  // "und" is looked up as the empty language
+    }
+    if (uprv_strcmp(script, "Zzzz") == 0) {
+        script = "";  // "Zzzz" is looked up as the empty script
+    }

+    BytesTrie iter(trie);
+    uint64_t state;
+    int32_t value;
+    // Small optimization: Array lookup for first language letter.
+    int32_t c0;
+    if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
+            language[1] != 0 &&  // language.length() >= 2
+            (state = trieFirstLetterStates[c0]) != 0) {
+        value = trieNext(iter.resetToState64(state), language, 1);  // continue from the 2nd letter
+    } else {
+        value = trieNext(iter, language, 0);
+    }
+    if (value >= 0) {
+        state = iter.getState64();  // remember the position after the language
+    } else {
+        iter.resetToState64(trieUndState);  // "und" ("*")
+        state = 0;  // 0 marks "language not found" for the script step below
+    }

+    if (value > 0) {
+        // Intermediate or final value from just language.
+        if (value == SKIP_SCRIPT) {
+            value = 0;  // not a final value: forces the trieNext(iter, "", 0) step at the end
+        }
+    } else {
+        value = trieNext(iter, script, 0);
+        if (value >= 0) {
+            state = iter.getState64();
+        } else {
+            if (state == 0) {
+                iter.resetToState64(trieUndZzzzState);  // "und-Zzzz" ("**")
+            } else {
+                iter.resetToState64(state);  // language was found but not this script:
+                value = trieNext(iter, "", 0);  // retry from the language state with an empty script
+                U_ASSERT(value >= 0);
+                state = iter.getState64();
+            }
+        }
+    }

+    if (value > 0) {
+        // Final value from just language or language+script.
+    } else {
+        value = trieNext(iter, "", 0);  // one more empty subtag; presumably the region position, confirm against maximize()
+        U_ASSERT(value > 0);
+    }
+    U_ASSERT(value < lsrsLength);
+    return value;
+}
+
 int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
     UStringTrieResult result;
     uint8_t c;

diff --git a/icu4c/source/common/loclikelysubtags.h b/icu4c/source/common/loclikelysubtags.h
index 8c8a08a..90ddfff 100644
--- a/icu4c/source/common/loclikelysubtags.h
+++ b/icu4c/source/common/loclikelysubtags.h

@@ -85,6 +85,18 @@
     // VisibleForTesting
     LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;
 
+    /**
+     * Tests whether lsr is "more likely" than other.
+     * For example, fr-Latn-FR is more likely than fr-Latn-CH because
+     * FR is the default region for fr-Latn.
+     *
+     * The likelyInfo caches lookup information between calls.
+     * The return value is an updated likelyInfo value,
+     * with bit 0 set if lsr is "more likely".
+     * The initial value of likelyInfo must be negative.
+     */
+    int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
+
     // TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
     // in loclikely.cpp to this new code, including activating this
     // minimizeSubtags() function. The LocaleMatcher does not minimize.
@@ -111,6 +123,8 @@
      */
     LSR maximize(const char *language, const char *script, const char *region) const;
 
+    int32_t getLikelyIndex(const char *language, const char *script) const;
+
     static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
 
     UResourceBundle *langInfoBundle;

diff --git a/icu4c/source/test/testdata/localeMatcherTest.txt b/icu4c/source/test/testdata/localeMatcherTest.txt
index 649c95b..6e5dbdd 100644
--- a/icu4c/source/test/testdata/localeMatcherTest.txt
+++ b/icu4c/source/test/testdata/localeMatcherTest.txt

@@ -733,7 +733,7 @@
 @favor=script
 en-GB >> en-GB
 en-US >> en
-fr >> en-GB
+fr >> en
 ja >> fr
 
 ** test: testEmptyWithDefault
@@ -761,8 +761,8 @@
 en-US >> en
 fr-FR >> fr
 ja-JP >> fr
+zu >> en
 # For a language that doesn't match anything, return the default.
-zu >> en-GB
 zxx >> fr
 
 @favor=script
@@ -770,7 +770,7 @@
 en-US >> en
 fr-FR >> fr
 ja-JP >> fr
-zu >> en-GB
+zu >> en
 zxx >> en
 
 ** test: TestExactMatch
@@ -1322,7 +1322,7 @@
 @favor=script
 und >> und
 ja >> und
-fr-CA >> en-GB
+fr-CA >> en-US
 en-AU >> en-GB
 en-BZ >> en-GB
 en-CA >> en-GB
@@ -1359,8 +1359,8 @@
 @supported=en-GB, en-US, en, en-AU
 und >> und
 ja >> und
-fr-CA >> en-GB
-fr >> en-GB
+fr-CA >> en-US
+fr >> en-US
 @supported=en-AU, ja, ca
 fr >> en-AU
 @supported=pl, ja, ca
@@ -1901,7 +1901,7 @@
 fr-FR >> fr-CA # Sibling match is chosen.
 @supported=fr-CA, fr-FR
 fr >> fr-FR # Inferred region match is chosen.
-fr-SN >> fr-CA
+fr-SN >> fr-FR
 @supported=en, fr-FR
 fr >> fr-FR # Child match is chosen.
 @supported=de, en, it
@@ -1931,7 +1931,7 @@
 fr-FR >> fr-CA
 @supported=fr-CA, fr-FR
 fr >> fr-FR
-fr-SN >> fr-CA
+fr-SN >> fr-FR
 @supported=en, fr-FR
 fr >> fr-FR
 @supported=de, en, it
@@ -1951,3 +1951,10 @@
 zh-CN >> zh-TW
 @supported=ja
 ru >> und
+
+** test: favor a more-default locale among equally imperfect matches
+@supported=fr-CA, fr-CH, fr-FR, fr-GB
+fr-SN >> fr-FR
+@supported=sr-Latn, sr-Cyrl, sr-Grek
+@threshold=60
+sr-Thai >> sr-Cyrl

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java
index b5bd4df..fce5a9c 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java

@@ -255,6 +255,8 @@
         long desLangState = desLangDistance >= 0 && supportedLSRs.length > 1 ? iter.getState64() : 0;
         // Index of the supported LSR with the lowest distance.
         int bestIndex = -1;
+        // Cached lookup info from XLikelySubtags.compareLikely().
+        int bestLikelyInfo = -1;
         for (int slIndex = 0; slIndex < supportedLSRs.length; ++slIndex) {
             LSR supported = supportedLSRs[slIndex];
             boolean star = false;
@@ -340,13 +342,29 @@
                 // Distinguish between equivalent but originally unequal locales via an
                 // additional micro distance.
                 shiftedDistance |= (desired.flags ^ supported.flags);
-            }
-            if (shiftedDistance < shiftedThreshold) {
-                if (shiftedDistance == 0) {
-                    return slIndex << INDEX_SHIFT;
+                if (shiftedDistance < shiftedThreshold) {
+                    if (shiftedDistance == 0) {
+                        return slIndex << INDEX_SHIFT;
+                    }
+                    bestIndex = slIndex;
+                    shiftedThreshold = shiftedDistance;
+                    bestLikelyInfo = -1;
                 }
-                bestIndex = slIndex;
-                shiftedThreshold = shiftedDistance;
+            } else {
+                if (shiftedDistance < shiftedThreshold) {
+                    bestIndex = slIndex;
+                    shiftedThreshold = shiftedDistance;
+                    bestLikelyInfo = -1;
+                } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
+                    bestLikelyInfo = XLikelySubtags.INSTANCE.compareLikely(
+                            supported, supportedLSRs[bestIndex], bestLikelyInfo);
+                    if ((bestLikelyInfo & 1) != 0) {
+                        // This supported locale matches as well as the previous best match,
+                        // and neither matches perfectly,
+                        // but this one is "more likely" (has more-default subtags).
+                        bestIndex = slIndex;
+                    }
+                }
             }
         }
         return bestIndex >= 0 ?

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java
index 543aade..332f035 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java

@@ -367,6 +367,114 @@
         return new LSR(language, script, region, retainOldMask);
     }
 
+    /**
+     * Tests whether lsr is "more likely" than other.
+     * For example, fr-Latn-FR is more likely than fr-Latn-CH because
+     * FR is the default region for fr-Latn.
+     *
+     * <p>The likelyInfo caches lookup information between calls.
+     * The return value is an updated likelyInfo value,
+     * with bit 0 set if lsr is "more likely".
+     * The initial value of likelyInfo must be negative.
+     */
+    int compareLikely(LSR lsr, LSR other, int likelyInfo) {
+        // Returns an updated likelyInfo; bit 0 is set if lsr is "more likely" than other.
+        // A non-negative likelyInfo caches a getLikelyIndex() result in bits 2 and up:
+        // bit 1 set = the lookup was for equal language and script (the regions differed),
+        // bit 1 clear = the lookup was for the language alone (the scripts differed).
+        if (!lsr.language.equals(other.language)) {
+            return 0xfffffffc;  // negative, lsr not better than other
+        }
+        if (!lsr.script.equals(other.script)) {
+            int index;
+            if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
+                index = likelyInfo >> 2;  // reuse the cached language-only lookup
+            } else {
+                index = getLikelyIndex(lsr.language, "");
+                likelyInfo = index << 2;
+            }
+            LSR likely = lsrs[index];
+            if (lsr.script.equals(likely.script)) {
+                return likelyInfo | 1;  // lsr has the likely script for its language
+            } else {
+                return likelyInfo & ~1;
+            }
+        }
+        if (!lsr.region.equals(other.region)) {
+            int index;
+            if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
+                index = likelyInfo >> 2;  // reuse the cached language+script lookup
+            } else {
+                index = getLikelyIndex(lsr.language, lsr.script);  // 2nd argument is the script, not the region
+                likelyInfo = (index << 2) | 2;
+            }
+            LSR likely = lsrs[index];
+            if (lsr.region.equals(likely.region)) {
+                return likelyInfo | 1;  // lsr has the likely region for its language+script
+            } else {
+                return likelyInfo & ~1;
+            }
+        }
+        return likelyInfo & ~1;  // lsr not better than other
+    }
+
+    // Subset of maximize(): walks the trie for language, then script, and returns the resulting value (used by compareLikely() as an index into lsrs).
+    private int getLikelyIndex(String language, String script) {
+        if (language.equals("und")) {
+            language = "";  // "und" is looked up as the empty language
+        }
+        if (script.equals("Zzzz")) {
+            script = "";  // "Zzzz" is looked up as the empty script
+        }

+        BytesTrie iter = new BytesTrie(trie);
+        long state;
+        int value;
+        // Small optimization: Array lookup for first language letter.
+        int c0;
+        if (language.length() >= 2 && 0 <= (c0 = language.charAt(0) - 'a') && c0 <= 25 &&
+                (state = trieFirstLetterStates[c0]) != 0) {
+            value = trieNext(iter.resetToState64(state), language, 1);  // continue from the 2nd letter
+        } else {
+            value = trieNext(iter, language, 0);
+        }
+        if (value >= 0) {
+            state = iter.getState64();  // remember the position after the language
+        } else {
+            iter.resetToState64(trieUndState);  // "und" ("*")
+            state = 0;  // 0 marks "language not found" for the script step below
+        }

+        if (value > 0) {
+            // Intermediate or final value from just language.
+            if (value == SKIP_SCRIPT) {
+                value = 0;  // not a final value: forces the trieNext(iter, "", 0) step at the end
+            }
+        } else {
+            value = trieNext(iter, script, 0);
+            if (value >= 0) {
+                state = iter.getState64();
+            } else {
+                if (state == 0) {
+                    iter.resetToState64(trieUndZzzzState);  // "und-Zzzz" ("**")
+                } else {
+                    iter.resetToState64(state);  // language was found but not this script:
+                    value = trieNext(iter, "", 0);  // retry from the language state with an empty script
+                    assert value >= 0;
+                    state = iter.getState64();
+                }
+            }
+        }

+        if (value > 0) {
+            // Final value from just language or language+script.
+        } else {
+            value = trieNext(iter, "", 0);  // one more empty subtag; presumably the region position, confirm against maximize()
+            assert value > 0;
+        }
+        return value;
+    }
+
     private static final int trieNext(BytesTrie iter, String s, int i) {
         BytesTrie.Result result;
         if (s.isEmpty()) {

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt
index 649c95b..6e5dbdd 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt

@@ -733,7 +733,7 @@
 @favor=script
 en-GB >> en-GB
 en-US >> en
-fr >> en-GB
+fr >> en
 ja >> fr
 
 ** test: testEmptyWithDefault
@@ -761,8 +761,8 @@
 en-US >> en
 fr-FR >> fr
 ja-JP >> fr
+zu >> en
 # For a language that doesn't match anything, return the default.
-zu >> en-GB
 zxx >> fr
 
 @favor=script
@@ -770,7 +770,7 @@
 en-US >> en
 fr-FR >> fr
 ja-JP >> fr
-zu >> en-GB
+zu >> en
 zxx >> en
 
 ** test: TestExactMatch
@@ -1322,7 +1322,7 @@
 @favor=script
 und >> und
 ja >> und
-fr-CA >> en-GB
+fr-CA >> en-US
 en-AU >> en-GB
 en-BZ >> en-GB
 en-CA >> en-GB
@@ -1359,8 +1359,8 @@
 @supported=en-GB, en-US, en, en-AU
 und >> und
 ja >> und
-fr-CA >> en-GB
-fr >> en-GB
+fr-CA >> en-US
+fr >> en-US
 @supported=en-AU, ja, ca
 fr >> en-AU
 @supported=pl, ja, ca
@@ -1901,7 +1901,7 @@
 fr-FR >> fr-CA # Sibling match is chosen.
 @supported=fr-CA, fr-FR
 fr >> fr-FR # Inferred region match is chosen.
-fr-SN >> fr-CA
+fr-SN >> fr-FR
 @supported=en, fr-FR
 fr >> fr-FR # Child match is chosen.
 @supported=de, en, it
@@ -1931,7 +1931,7 @@
 fr-FR >> fr-CA
 @supported=fr-CA, fr-FR
 fr >> fr-FR
-fr-SN >> fr-CA
+fr-SN >> fr-FR
 @supported=en, fr-FR
 fr >> fr-FR
 @supported=de, en, it
@@ -1951,3 +1951,10 @@
 zh-CN >> zh-TW
 @supported=ja
 ru >> und
+
+** test: favor a more-default locale among equally imperfect matches
+@supported=fr-CA, fr-CH, fr-FR, fr-GB
+fr-SN >> fr-FR
+@supported=sr-Latn, sr-Cyrl, sr-Grek
+@threshold=60
+sr-Thai >> sr-Cyrl
commit	60b567d6abc707d256e4049bb80a4e9d2a93a021	[log] [tgz]
author	Markus Scherer <markus.icu@gmail.com>	Sat Dec 21 06:48:17 2019 -0800
committer	Markus Scherer <markus.icu@gmail.com>	Thu Jan 02 18:00:52 2020 -0800
tree	45fd0726a724b4db2ef044cb4931c3541959b8dd
parent	79fac501010d63231c258dc0d4fb9a9e87ddb8d8 [diff]