// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;

import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static java.util.function.Function.identity;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.PathMatcher;

import com.google.common.base.Ascii;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableTable;
import com.google.common.collect.Sets;
import com.google.common.collect.Table;

/**
 * Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic
 * in some mapper classes.
 *
 * When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it is
 * built using the same underlying CLDR data. The only reason mapper classes do not create their
 * own instances directly is the relative cost of processing all the supplemental data each time.
 */
// TODO: This should be moved into the API and leverage some of the existing utility functions.
public final class SupplementalData {
    // Special IDs which are not supported via CLDR, but for which synthetic data is injected.
    // The "TRADITIONAL" variants are here because their calendar differs from the non-variant
    // locale. However CLDR cannot represent this currently because calendar defaults are in
    // supplemental data (rather than locale data) and are keyed only on territory.
    private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
        ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");

    // Matches a single subtag in the capitalized 4-letter "Xxxx" script form (e.g. "Latn"). Used
    // by getParent() to detect when the subtag being truncated is a script.
    private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");

    // Matches any child element of <alias> which has a "type" attribute. The element name is
    // filtered afterwards via Alias.forElementName(), so unsupported alias kinds are ignored.
    private static final PathMatcher ALIAS =
        PathMatcher.of("//supplementalData/metadata/alias/*[@type=*]");

    // Explicit parent locale data: the "locales" attribute lists the child IDs which all share
    // the locale ID given in the "parent" attribute.
    private static final PathMatcher PARENT_LOCALE =
        PathMatcher.of("//supplementalData/parentLocales/parentLocale[@parent=*]");
    private static final AttributeKey PARENT = keyOf("parentLocale", "parent");
    private static final AttributeKey LOCALES = keyOf("parentLocale", "locales");

    // Calendar preference data: the "territories" attribute lists the territories to which the
    // list of calendars in the "ordering" attribute applies.
    // NOTE(review): "CALENDER" is a misspelling of "CALENDAR"; the names are left untouched here
    // because they are referenced from create().
    private static final PathMatcher CALENDER_PREFERENCE =
        PathMatcher.of("//supplementalData/calendarPreferenceData/calendarPreference[@territories=*]");
    private static final AttributeKey CALENDER_TERRITORIES =
        keyOf("calendarPreference", "territories");
    private static final AttributeKey CALENDER_ORDERING =
        keyOf("calendarPreference", "ordering");

    // Likely subtag data: maps the locale ID in the "from" attribute to the one in "to".
    private static final PathMatcher LIKELY_SUBTAGS =
        PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*]");
    private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from");
    private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to");

    // Splits whitespace separated attribute values (e.g. territory alias replacement lists),
    // discarding any empty elements caused by repeated or leading/trailing whitespace.
    private static final Splitter LIST_SPLITTER =
        Splitter.on(whitespace()).omitEmptyStrings();

    // The three kinds of alias read from the supplemental data. Each kind corresponds to a CLDR
    // element called "<kind>Alias" (e.g. "languageAlias"). Replacement values are structurally
    // always a list, but only TERRITORY aliases have a need for more than one value.
    private enum Alias {
        LANGUAGE, SCRIPT, TERRITORY;

        // Lookup from CLDR element name to alias kind. This is safe to build here because the
        // enum constants (and hence their instance fields) are initialized before this field.
        private static final ImmutableMap<String, Alias> TYPE_MAP =
            Stream.of(values()).collect(toImmutableMap(alias -> alias.elementName, identity()));

        // The name of the CLDR element holding aliases of this kind (e.g. "territoryAlias").
        private final String elementName = Ascii.toLowerCase(name()) + "Alias";
        // Key for the attribute holding the deprecated subtag(s) being aliased.
        final AttributeKey typeKey = keyOf(elementName, "type");
        // Key for the attribute holding the replacement value(s).
        final AttributeKey replacementKey = keyOf(elementName, "replacement");

        // Returns the alias kind for the given CLDR element name, or empty if the element is not
        // one of the supported alias kinds.
        static Optional<Alias> forElementName(String name) {
            Alias alias = TYPE_MAP.get(name);
            return Optional.ofNullable(alias);
        }
    }

    /**
     * Creates a supplemental data API instance by making a single pass over the supplemental
     * data of the given CLDR data supplier and collecting the alias, parent locale, calendar
     * preference and likely subtag information from it.
     *
     * @param src the CLDR data supplier.
     * @return the supplemental data API.
     */
    public static SupplementalData create(CldrDataSupplier src) {
        Table<Alias, String, String> aliases = HashBasedTable.create();
        Map<String, String> parents = new HashMap<>();
        Map<String, String> calendars = new HashMap<>();
        Map<String, String> likelySubtags = new HashMap<>();

        src.getDataForType(CldrDataType.SUPPLEMENTAL).accept(
            ARBITRARY,
            value -> {
                if (ALIAS.matches(value.getPath())) {
                    // Only language, script and territory aliases are retained. The replacement
                    // is stored as the raw attribute string, which for territory aliases can be
                    // a list of values (e.g. when countries break up); any such list is split
                    // and resolved when it is looked up in replaceDeprecatedTags().
                    Optional<Alias> kind = Alias.forElementName(value.getPath().getName());
                    if (kind.isPresent()) {
                        Alias alias = kind.get();
                        aliases.put(
                            alias,
                            alias.typeKey.valueFrom(value),
                            alias.replacementKey.valueFrom(value));
                    }
                    return;
                }
                if (PARENT_LOCALE.matches(value.getPath())) {
                    // Every listed child locale shares the same explicit parent.
                    String parent = PARENT.valueFrom(value);
                    for (String child : LOCALES.listOfValuesFrom(value)) {
                        parents.put(child, parent);
                    }
                    return;
                }
                if (CALENDER_PREFERENCE.matches(value.getPath())) {
                    // The default calendar is the first one in the preference ordering.
                    String calendar = CALENDER_ORDERING.listOfValuesFrom(value).get(0);
                    for (String territory : CALENDER_TERRITORIES.listOfValuesFrom(value)) {
                        calendars.put(territory, calendar);
                    }
                    return;
                }
                if (LIKELY_SUBTAGS.matches(value.getPath())) {
                    likelySubtags.put(SUBTAG_FROM.valueFrom(value), SUBTAG_TO.valueFrom(value));
                }
            });

        // The phantom IDs have no CLDR data of their own but must still appear as available.
        Set<String> allIds = Sets.union(src.getAvailableLocaleIds(), PHANTOM_LOCALE_IDS);
        return new SupplementalData(allIds, aliases, parents, calendars, likelySubtags);
    }

    // A deliberately minimal, mutable locale ID "struct" covering only the IDs encountered during
    // ICU data generation. Because it is mutable, it is thoroughly unsuitable for general use
    // (and in particular instances must not be mutated while used as keys in hashed collections).
    private static final class LocaleId {
        // From: https://unicode.org/reports/tr35/#Identifiers
        // Locale ID is:
        //   (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
        //
        // However in CLDR data, there's always a language (even if it's "und"), and never more
        // than one variant, so this can be simplified to:
        //   <language>(_<script>)?(_<region>)?(_<variant>)?
        //
        // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
        //   Note that the specification allows for languages 5-8 characters long, but in reality
        //   this has never occurred yet, so it's ignored in this code.
        //
        // * Script is 4-letter Xxxx script identifier (e.g. "Latn").
        //   The specification permits any casing for script subtags, but since all the data uses
        //   the capitalized "Xxxx" form, that's what this code expects.
        //
        // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
        //   identifier (e.g. "001").
        //
        // * Variants are a bit complex; in the specification they are either 5-8 length
        //   alphanumerics, or length 4 but starting with a digit (this avoids any ambiguity with
        //   script subtags). However because ICU violates this rule by using "TRADITIONAL"
        //   (11 letters), the restriction applied here is merely "5 or more letters".
        //
        // Finally, CLDR data only uses '_' as the separator, whereas the specification allows
        // for either '-' or '_'.
        //
        // The regex for unambiguously capturing the parts of a locale ID from the CLDR data is:
        private static final Pattern LOCALE_ID =
            Pattern.compile("([a-z]{2,3})"
                + "(?:_([A-Z][a-z]{3}))?"
                + "(?:_([A-Z]{2}|[0-9]{3}))?"
                + "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?");

        // Only the language subtag is non-nullable.
        private String languageSubtag;
        private String scriptSubtag;
        private String regionSubtag;
        private String variantSubtag;

        // Parses a CLDR style locale ID (e.g. "en_Latn_GB"), failing with an
        // IllegalArgumentException if it does not match the expected structure.
        static LocaleId parse(String localeId) {
            checkNotNull(localeId, "locale ID cannot be null");
            Matcher parts = LOCALE_ID.matcher(localeId);
            checkArgument(parts.matches(), "invalid locale ID: %s", localeId);
            LocaleId id = of(parts.group(1), parts.group(2), parts.group(3));
            id.setVariant(parts.group(4));
            return id;
        }

        // Creates a variant-free ID from its subtags; script and region may be null or empty.
        static LocaleId of(String language, String script, String region) {
            LocaleId id = new LocaleId();
            id.setLanguage(language);
            id.setScript(script);
            id.setRegion(region);
            return id;
        }

        String getLanguage() {
            return languageSubtag;
        }

        String getScript() {
            return scriptSubtag;
        }

        String getRegion() {
            return regionSubtag;
        }

        String getVariant() {
            return variantSubtag;
        }

        // The language is mandatory, so null or empty values are rejected.
        LocaleId setLanguage(String language) {
            checkNotNull(language, "language subtag must not be null");
            checkArgument(!language.isEmpty(), "language subtag must not be empty");
            languageSubtag = language;
            return this;
        }

        // For the optional subtags below, an empty string is treated the same as null (unset).

        LocaleId setScript(String script) {
            scriptSubtag = Strings.emptyToNull(script);
            return this;
        }

        LocaleId setRegion(String region) {
            regionSubtag = Strings.emptyToNull(region);
            return this;
        }

        LocaleId setVariant(String variant) {
            variantSubtag = Strings.emptyToNull(variant);
            return this;
        }

        // Returns the '_' separated ID containing only the subtags which are set.
        @Override public String toString() {
            StringBuilder id = new StringBuilder(languageSubtag);
            for (String subtag : new String[] {scriptSubtag, regionSubtag, variantSubtag}) {
                if (subtag != null) {
                    id.append('_').append(subtag);
                }
            }
            return id.toString();
        }

        @Override public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (!(o instanceof LocaleId)) {
                return false;
            }
            LocaleId that = (LocaleId) o;
            return Objects.equals(languageSubtag, that.languageSubtag)
                && Objects.equals(scriptSubtag, that.scriptSubtag)
                && Objects.equals(regionSubtag, that.regionSubtag)
                && Objects.equals(variantSubtag, that.variantSubtag);
        }

        @Override public int hashCode() {
            return Arrays.hashCode(
                new Object[] {languageSubtag, scriptSubtag, regionSubtag, variantSubtag});
        }
    }

    // All locale IDs reported by the data supplier, plus the synthetic "phantom" IDs.
    private final ImmutableSet<String> availableIds;
    // Maps (alias kind, deprecated subtag(s)) to the raw replacement attribute value, which may
    // be a whitespace separated list.
    private final ImmutableTable<Alias, String, String> aliasTable;
    // Maps a child locale ID to its explicitly specified parent locale ID.
    private final ImmutableMap<String, String> parentLocaleMap;
    // Maps a territory to the first calendar in its preference ordering.
    private final ImmutableMap<String, String> defaultCalendarMap;
    // Maps the "from" locale ID of a likely subtag entry to its "to" locale ID.
    private final ImmutableMap<String, String> likelySubtagMap;

    // Takes immutable snapshots of the given collections; see create() for how they are built.
    private SupplementalData(
        Set<String> availableIds,
        Table<Alias, String, String> aliasTable,
        Map<String, String> parentLocaleMap,
        Map<String, String> defaultCalendarMap,
        Map<String, String> likelySubtagMap) {

        this.availableIds = ImmutableSet.copyOf(availableIds);
        this.aliasTable = ImmutableTable.copyOf(aliasTable);
        this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
        this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
        this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
    }

    /**
     * Returns the locale IDs known to this instance. This is the set of IDs reported by the CLDR
     * data supplier from which this instance was created, with the addition of the synthetic
     * IDs {@code ja_JP_TRADITIONAL} and {@code th_TH_TRADITIONAL}.
     *
     * @return the immutable set of available locale IDs.
     */
    public ImmutableSet<String> getAvailableLocaleIds() {
        return availableIds;
    }

    /**
     * Returns the "maximized" form of a given locale ID, by adding likely subtags where possible.
     *
     * @param localeId the locale ID to maximize (e.g. {@code "en"}).
     * @return the maximized locale ID, or empty if the ID is {@code "root"} or there is no likely
     *     subtag data from which the missing subtags can be determined.
     * @throws IllegalArgumentException if the given locale ID cannot be parsed.
     */
    public Optional<String> maximize(String localeId) {
        Optional<LocaleId> maximized = addLikelySubtags(localeId);
        return maximized.map(LocaleId::toString);
    }

    /**
     * Returns the locale ID with any deprecated elements replaced. This is an
     * implementation of the algorithm specified in
     * <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML
     * specification</a> but without any "minimizing" of the final result (as happens for
     * canonicalization in the CLDR tools).
     *
     * @param localeId the locale ID to process; {@code "root"} is returned unchanged.
     * @return the locale ID with deprecated region, script and language subtags replaced.
     * @throws IllegalArgumentException if the given locale ID cannot be parsed, or if the alias
     *     data found for it is malformed.
     */
    public String replaceDeprecatedTags(String localeId) {
        if (localeId.equals("root")) {
            return localeId;
        }
        LocaleId id = LocaleId.parse(localeId);

        // Note that (contrary to the order implied by the LDML specification) the region is
        // replaced _before_ the language alias lookup. This is to allow IDs such as "sr_YU" to
        // work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to
        // "sr_Cyrl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out.
        replaceDeprecatedRegion(id, localeId);
        replaceDeprecatedScript(id);
        replaceDeprecatedLanguage(id);
        return id.toString();
    }

    // Replaces the region subtag of the given (mutable) ID if there is a territory alias for it.
    // The original locale ID string is only used for error reporting.
    private void replaceDeprecatedRegion(LocaleId id, String localeId) {
        // ---- LDML Specification ----
        // If the region subtag matches the type attribute of a territoryAlias element in
        // Supplemental Data, replace the region subtag with the replacement value, as follows:
        //
        // * If there is a single territory in the replacement, use it.
        // * If there are multiple territories:
        //   * Look up the most likely territory for the base language code (and script, if there
        //     is one).
        //   * If that likely territory is in the list, use it.
        //   * Otherwise, use the first territory in the list.
        // ----
        // However there is a footnote that says:
        //   Formally, replacement of multiple territories uses Section 4.3 Likely Subtags.
        //   However, there are a small number of cases of multiple territories, so the mappings
        //   can be precomputed. This results in a faster lookup with a very small subset of the
        //   likely subtags data.
        //
        // TODO: Can we simplify this by just using "addLikelySubtags()" when region is missing?
        if (id.getRegion() == null) {
            return;
        }
        String replacementRegions = aliasTable.get(Alias.TERRITORY, id.getRegion());
        if (replacementRegions == null) {
            return;
        }
        List<String> regions = LIST_SPLITTER.splitToList(replacementRegions);
        checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId);
        if (regions.size() == 1) {
            id.setRegion(regions.get(0));
            return;
        }
        String likelyRegion = likelyRegionOf(id.getLanguage(), id.getScript());
        // The explicit null check avoids relying on how the list handles contains(null).
        if (likelyRegion != null && regions.contains(likelyRegion)) {
            id.setRegion(likelyRegion);
        } else {
            id.setRegion(regions.get(0));
        }
    }

    // Returns the most likely region for the given language and (optional) script according to
    // the likely subtag data, or null if it cannot be determined. The language and script are
    // tried together first, falling back to the language on its own.
    private String likelyRegionOf(String language, String script) {
        LocaleId key = LocaleId.of(language, script, null);
        String likelyId = likelySubtagMap.get(key.toString());
        if (likelyId == null) {
            likelyId = likelySubtagMap.get(key.setScript(null).toString());
        }
        return likelyId != null ? LocaleId.parse(likelyId).getRegion() : null;
    }

    // Replaces the script subtag of the given (mutable) ID if there is a script alias for it.
    private void replaceDeprecatedScript(LocaleId id) {
        // While it's not mentioned in the LDML specification, there is data in the alias table for
        // replacement scripts (currently it contains exactly one entry with one value). Because
        // it's not clear if this is intended to only be single values or a list (and how to handle
        // it if it were a list), there's a hard check to ensure it's only ever a single value.
        if (id.getScript() == null) {
            return;
        }
        String replacementScript = aliasTable.get(Alias.SCRIPT, id.getScript());
        if (replacementScript != null) {
            checkArgument(whitespace().matchesNoneOf(replacementScript),
                "unexpected list of replacement scripts: %s", replacementScript);
            id.setScript(replacementScript);
        }
    }

    // Replaces the language subtag of the given (mutable) ID if there is a language alias which
    // applies to it, filling in any other subtags from the alias which the ID does not yet have.
    private void replaceDeprecatedLanguage(LocaleId id) {
        // ---- LDML Specification ----
        // If the language subtag matches the type attribute of a languageAlias element in
        // Supplemental Data, replace the language subtag with the replacement value.
        //
        // If there are additional subtags in the replacement value, add them to the result, but
        // only if there is no corresponding subtag already in the tag.
        // ----
        // Contrary to the precise wording of the specification, we don't just check the language
        // subtag, since language aliases can contain script and even region information. Instead
        // we check the alias table using the same order as defined in subtag maximizing:
        //
        // <language>_<script>_<region>
        // <language>_<region>
        // <language>_<script>
        // <language>
        //
        // There is no need to check for "und" however since that's not aliased to anything, but
        // since it shares the same code it's harmless to do.
        resolveLocaleId(id, s -> aliasTable.get(Alias.LANGUAGE, s))
            .ifPresent(resolvedId -> {
                id.setLanguage(checkNotNull(resolvedId.getLanguage(),
                     "missing language subtag in language alias: %s", resolvedId));
                if (id.getScript() == null) {
                    id.setScript(resolvedId.getScript());
                }
                if (id.getRegion() == null) {
                    id.setRegion(resolvedId.getRegion());
                }
                if (id.getVariant() == null) {
                    id.setVariant(resolvedId.getVariant());
                }
            });
    }

    /**
     * Returns a suitable default calendar for a given locale if it's different from the default
     * calendar inferred by the locale's parent.
     *
     * <p>Note that since the default calendar data is keyed from territory (region subtag) rather
     * than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact
     * that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently
     * handled with hard-coded special casing, but should probably be data driven eventually.
     *
     * @param localeId the locale ID for which the default calendar is required.
     * @return the default calendar, or empty if there is no calendar data for the locale's
     *     territory or the calendar is the same as the one inherited via the parent locales.
     * @throws IllegalStateException if the calendar data has no entry for the world ("001").
     * @throws IllegalArgumentException if the given locale ID cannot be parsed.
     */
    public Optional<String> getDefaultCalendar(String localeId) {
        Optional<String> specialCase = getSpecialCaseCalendar(localeId);
        if (specialCase.isPresent()) {
            return specialCase;
        }
        String calendar = defaultCalendarMap.get(territoryOf(localeId));
        if (calendar == null) {
            return Optional.empty();
        }
        // The map lookup yields null (rather than an empty string) when the entry is missing, so
        // null must be tested explicitly in order to get the intended failure message instead of
        // a NullPointerException.
        String rootCalendar = defaultCalendarMap.get("001");
        checkState(rootCalendar != null && !rootCalendar.isEmpty(), "missing root calendar");
        if (localeId.equals("root")) {
            return Optional.of(rootCalendar);
        }
        // All locales reach "root" eventually, and that maps to territory "001" which
        // we already know has a value, so this loop *must* exit.
        String ancestorId = localeId;
        String parentCalendar;
        do {
            ancestorId = getParent(ancestorId);
            parentCalendar = defaultCalendarMap.get(territoryOf(ancestorId));
        } while (parentCalendar == null);
        // Only report the calendar if it would not otherwise be inherited.
        return parentCalendar.equals(calendar) ? Optional.empty() : Optional.of(calendar);
    }

    // Workaround for the fact that CLDR data cannot express a default calendar which depends on
    // anything other than territory. There are exactly two such cases at the moment and more are
    // unlikely, so they are hard-coded here (keyed on the maximized form of the locale ID).
    private Optional<String> getSpecialCaseCalendar(String localeId) {
        String maximizedId = maximize(localeId).orElse(null);
        if ("ja_Jpan_JP_TRADITIONAL".equals(maximizedId)) {
            return Optional.of("japanese");
        }
        if ("th_Thai_TH_TRADITIONAL".equals(maximizedId)) {
            return Optional.of("buddhist");
        }
        return Optional.empty();
    }

    /**
     * Returns the parent of a non-root locale ID. This is more complex than simple truncation for
     * two reasons:
     * <ul>
     *     <li>There may be an explicit parent locale ID specified in the CLDR data.
     *     <li>Removal of non-default script subtags makes the parent locale "root" (unless there
     *         was an explicit parent specified).
     * </ul>
     * Note that all valid locale ID parent "chains" must end up at "root" eventually.
     *
     * For example (showing parent "chains"):
     * <ul>
     *     <li>{@code en_GB} --> {@code en_001} --> {@code en} --> {@code root}
     *     <li>{@code en_Cyrl_RU} --> {@code en_Cyrl} --> {@code root}
     * </ul>
     *
     * @param localeId the non-root locale ID for which the parent is required.
     * @return the parent locale ID, which may be "root".
     * @throws IllegalStateException if the given locale ID is "root".
     * @throws IllegalArgumentException if a script subtag is being truncated and the remaining
     *     locale ID cannot be parsed.
     */
    public String getParent(String localeId) {
        checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale");
        // An explicit parent locale set in the CLDR data always takes precedence.
        Optional<String> explicitParent = getExplicitParentLocaleOf(localeId);
        if (explicitParent.isPresent()) {
            return explicitParent.get();
        }
        // Find the start of the last ID "part", which is what gets truncated. If there isn't one
        // then this is a base language ID (e.g. "en" or "fr") and its parent is always "root".
        int separatorIndex = localeId.lastIndexOf('_');
        if (separatorIndex < 0) {
            return "root";
        }
        String truncatedId = localeId.substring(0, separatorIndex);
        String removedSubtag = localeId.substring(separatorIndex + 1);

        // If the subtag being removed is a script which is NOT the default script for what
        // remains, the parent is "root" rather than the truncated ID.
        boolean removedNonDefaultScript =
            SCRIPT_SUBTAG.matcher(removedSubtag).matches()
                && !removedSubtag.equals(scriptOf(truncatedId));
        if (removedNonDefaultScript || truncatedId.isEmpty()) {
            return "root";
        }
        return truncatedId;
    }

    /**
     * Returns the explicit parent of a locale ID if specified in the CLDR data.
     *
     * <p>Note that this method will not return a value for most locale IDs, since they do not
     * have an explicit parent set. If you just want "normal" parent of a locale ID, use {@link
     * #getParent(String)}.
     *
     * @param localeId the locale ID to look up; the ID is used as-is, without any parsing.
     * @return the explicit parent locale ID, or empty if none was specified for the given ID.
     */
    public Optional<String> getExplicitParentLocaleOf(String localeId) {
        return Optional.ofNullable(parentLocaleMap.get(localeId));
    }

    // Returns the region of the maximized form of the given locale ID. The "root" locale is
    // mapped to the world ("001") and "ZZ" is returned if the region cannot be determined.
    private String territoryOf(String localeId) {
        if (localeId.equals("root")) {
            return "001";
        }
        Optional<LocaleId> maximized = addLikelySubtags(localeId);
        return maximized.map(LocaleId::getRegion).orElse("ZZ");
    }

    // Returns the script of the maximized form of the given locale ID, or "Zzzz" if the script
    // cannot be determined (which includes the "root" locale).
    private String scriptOf(String localeId) {
        Optional<LocaleId> maximized = addLikelySubtags(localeId);
        return maximized.map(LocaleId::getScript).orElse("Zzzz");
    }

    // From: https://unicode.org/reports/tr35/#Likely_Subtags
    //
    // Add Likely Subtags
    // ------------------
    // Given a source locale X, to return a locale Y where the empty subtags have been filled in
    // by the most likely subtags. A subtag is called empty if it is a missing script or region
    // subtag, or it is a base language subtag with the value "und".
    //
    // Canonicalize
    // ------------
    // Make sure the input locale is in canonical form ...
    // ...
    // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
    //
    // Note that this implementation does not need to handle "grandfathered" tags.
    //
    // Returns empty for "root", or if no likely subtag data matches the given ID.
    private Optional<LocaleId> addLikelySubtags(String localeId) {
        if (localeId.equals("root")) {
            return Optional.empty();
        }

        LocaleId id = LocaleId.parse(localeId);
        // ---- LDML Specification ----
        // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
        if ("Zzzz".equals(id.getScript())) {
            id.setScript(null);
        }
        if ("ZZ".equals(id.getRegion())) {
            id.setRegion(null);
        }
        // ---- LDML Specification ----
        // A subtag is called empty if it is a missing script or region subtag, or it is a base
        // language subtag with the value "und"
        boolean hasEmptySubtag =
            id.getLanguage().equals("und") || id.getScript() == null || id.getRegion() == null;
        if (!hasEmptySubtag) {
            // Nothing is missing, so the ID is already maximized.
            return Optional.of(id);
        }
        LocaleId likely = resolveLocaleId(id, likelySubtagMap::get).orElse(null);
        if (likely == null) {
            return Optional.empty();
        }
        checkArgument(!likely.getLanguage().equals("und"), "invalid subtags: %s", likely);
        // Fill in only the "missing" elements of the original ID from the likely subtags.
        if (id.getLanguage().equals("und")) {
            id.setLanguage(likely.getLanguage());
        }
        if (id.getScript() == null) {
            id.setScript(checkNotNull(likely.getScript()));
        }
        if (id.getRegion() == null) {
            id.setRegion(checkNotNull(likely.getRegion()));
        }
        // Language is not "und" and both script and region subtags are set!
        return Optional.of(id);
    }

    // From: https://unicode.org/reports/tr35/#Likely_Subtags
    //
    // Lookup
    // ------
    // Lookup each of the following in order, and stop on the first match:
    // <language>_<script>_<region>
    // <language>_<region>
    // <language>_<script>
    // <language>
    // "und"_<script>
    //
    // The given function maps a candidate ID string to its resolved ID string (or null if there
    // is no match), and the first non-null result is parsed and returned.
    private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) {
        String language = id.getLanguage();
        String script = id.getScript();
        String region = id.getRegion();

        Stream.Builder<LocaleId> candidates = Stream.builder();
        candidates.add(LocaleId.of(language, script, region));
        candidates.add(LocaleId.of(language, null, region));
        candidates.add(LocaleId.of(language, script, null));
        candidates.add(LocaleId.of(language, null, null));
        // "und"_<script> is only a candidate when there is a script. Without one the candidate
        // would be plain "und", and resolving that (e.g. to "en_Latn_US") is not intended.
        if (script != null) {
            candidates.add(LocaleId.of("und", script, null));
        }
        return candidates.build()
            // Missing subtags make some candidates identical; only the first of each is kept.
            .distinct()
            .map(LocaleId::toString)
            .map(fn)
            .filter(Objects::nonNull)
            .findFirst()
            .map(LocaleId::parse);
    }
}
