ICU-20941 NumberFormatter: format arbitrary compound units, with inflections. See #1588.
diff --git a/icu4c/source/i18n/measunit_extra.cpp b/icu4c/source/i18n/measunit_extra.cpp index 06bf90b..676ff88 100644 --- a/icu4c/source/i18n/measunit_extra.cpp +++ b/icu4c/source/i18n/measunit_extra.cpp
@@ -894,6 +894,12 @@ SingleUnitImpl SingleUnitImpl::forMeasureUnit(const MeasureUnit& measureUnit, UE MeasureUnit SingleUnitImpl::build(UErrorCode& status) const { MeasureUnitImpl temp; temp.appendSingleUnit(*this, status); + // TODO(icu-units#28): the MeasureUnitImpl::build() method uses + // findBySubtype, which is relatively slow. + // - At the time of loading the simple unit IDs, we could also save a + // mapping to the builtin MeasureUnit type and subtype they correspond to. + // - This method could then check dimensionality and index, and if both are + // 1, directly return MeasureUnit instances very quickly. return std::move(temp).build(status); }
diff --git a/icu4c/source/i18n/measunit_impl.h b/icu4c/source/i18n/measunit_impl.h index 645e821..6c5a5ac 100644 --- a/icu4c/source/i18n/measunit_impl.h +++ b/icu4c/source/i18n/measunit_impl.h
@@ -14,6 +14,12 @@ U_NAMESPACE_BEGIN +namespace number { +namespace impl { +class LongNameHandler; +} +} // namespace number + // Export an explicit template instantiation of the LocalPointer that is used as a // data member of MeasureUnitImpl. // (When building DLLs for Windows this is required.) @@ -310,6 +316,10 @@ class U_I18N_API MeasureUnitImpl : public UMemory { * Normalizes a MeasureUnitImpl and generate the identifier string in place. */ void serialize(UErrorCode &status); + + // For calling serialize + // TODO(icu-units#147): revisit serialization + friend class number::impl::LongNameHandler; }; U_NAMESPACE_END
diff --git a/icu4c/source/i18n/number_formatimpl.cpp b/icu4c/source/i18n/number_formatimpl.cpp index fb0eec9..eb904dc 100644 --- a/icu4c/source/i18n/number_formatimpl.cpp +++ b/icu4c/source/i18n/number_formatimpl.cpp
@@ -375,7 +375,7 @@ NumberFormatterImpl::macrosToMicroGenerator(const MacroProps& macros, bool safe, // Outer modifier (CLDR units and currency long names) if (isCldrUnit) { - StringPiece unitDisplayCase(""); + const char *unitDisplayCase = ""; if (macros.unitDisplayCase.isSet()) { unitDisplayCase = macros.unitDisplayCase.fValue; } @@ -398,6 +398,16 @@ NumberFormatterImpl::macrosToMicroGenerator(const MacroProps& macros, bool safe, MeasureUnit unit = macros.unit; if (!utils::unitIsBaseUnit(macros.perUnit)) { unit = unit.product(macros.perUnit.reciprocal(status), status); + // This isn't strictly necessary, but was what we specced out + // when perUnit became a backward-compatibility thing: + // unit/perUnit use case is only valid if both units are + // built-ins, or the product is a built-in. + if (uprv_strcmp(unit.getType(), "") == 0 && + (uprv_strcmp(macros.unit.getType(), "") == 0 || + uprv_strcmp(macros.perUnit.getType(), "") == 0)) { + status = U_UNSUPPORTED_ERROR; + return nullptr; + } } fLongNameHandler.adoptInsteadAndCheckErrorCode(new LongNameHandler(), status); LongNameHandler::forMeasureUnit(macros.locale, unit, unitWidth, unitDisplayCase,
diff --git a/icu4c/source/i18n/number_longnames.cpp b/icu4c/source/i18n/number_longnames.cpp index e740272..41d0e7c 100644 --- a/icu4c/source/i18n/number_longnames.cpp +++ b/icu4c/source/i18n/number_longnames.cpp
@@ -46,13 +46,19 @@ constexpr int32_t GENDER_INDEX = StandardPlural::Form::COUNT + 2; // Number of keys in the array populated by PluralTableSink. constexpr int32_t ARRAY_LENGTH = StandardPlural::Form::COUNT + 3; -// TODO(inflections): load this list from resources, after creating a "&set" +// TODO(icu-units#28): load this list from resources, after creating a "&set" // function for use in ldml2icu rules. const int32_t GENDER_COUNT = 7; const char *gGenders[GENDER_COUNT] = {"animate", "common", "feminine", "inanimate", "masculine", "neuter", "personal"}; +// Converts a UnicodeString to a const char*, either pointing to a string in +// gGenders, or pointing to an empty string if an appropriate string was not +// found. const char *getGenderString(UnicodeString uGender, UErrorCode status) { + if (uGender.length() == 0) { + return ""; + } CharString gender; gender.appendInvariantChars(uGender, status); if (U_FAILURE(status)) { @@ -71,9 +77,16 @@ const char *getGenderString(UnicodeString uGender, UErrorCode status) { last = mid; } } + // We don't return an error in case our gGenders list is incomplete in + // production. + // + // TODO(icu-units#28): a unit test checking all locales' genders are covered + // by gGenders? Else load a complete list of genders found in + // grammaticalFeatures in an initOnce. return ""; } +// Returns the array index that corresponds to the given pluralKeyword. static int32_t getIndex(const char* pluralKeyword, UErrorCode& status) { // pluralKeyword can also be "dnam", "per", or "gender" switch (*pluralKeyword) { @@ -119,13 +132,248 @@ static UnicodeString getWithPlural( return result; } +enum PlaceholderPosition { PH_EMPTY, PH_NONE, PH_BEGINNING, PH_MIDDLE, PH_END }; + +/** + * Returns three outputs extracted from pattern. + * + * @param coreUnit is extracted as per Extract(...) 
in the spec: + * https://unicode.org/reports/tr35/tr35-general.html#compound-units + * @param PlaceholderPosition indicates where in the string the placeholder was + * found. + * @param joinerChar Iff the placeholder was at the beginning or end, joinerChar + * contains the space character (if any) that separated the placeholder from + * the rest of the pattern. Otherwise, joinerChar is set to NUL. + */ +void extractCorePattern(const UnicodeString &pattern, + UnicodeString &coreUnit, + PlaceholderPosition &placeholderPosition, + UChar &joinerChar) { + joinerChar = 0; + if (pattern.startsWith(u"{0}", 3)) { + placeholderPosition = PH_BEGINNING; + if (u_isJavaSpaceChar(pattern[3])) { + joinerChar = pattern[3]; + coreUnit.setTo(pattern, 4, pattern.length() - 4); + // Expecting no double spaces + U_ASSERT(!u_isJavaSpaceChar(pattern[4])); + } else { + coreUnit.setTo(pattern, 3, pattern.length() - 3); + } + } else if (pattern.endsWith(u"{0}", 3)) { + placeholderPosition = PH_END; + int32_t len = pattern.length(); + if (u_isJavaSpaceChar(pattern[len - 4])) { + coreUnit.setTo(pattern, 0, pattern.length() - 4); + joinerChar = pattern[len - 4]; + // Expecting no double spaces + U_ASSERT(!u_isJavaSpaceChar(pattern[len - 5])); + } else { + coreUnit.setTo(pattern, 0, pattern.length() - 3); + } + } else if (pattern.indexOf(u"{0}", 0, 1, pattern.length() - 2) == -1) { + placeholderPosition = PH_NONE; + coreUnit = pattern; + } else { + placeholderPosition = PH_MIDDLE; + coreUnit = pattern; + } +} ////////////////////////// /// BEGIN DATA LOADING /// ////////////////////////// +// Gets the gender of a built-in unit: unit must be a built-in. Returns an empty +// string both in case of unknown gender and in case of unknown unit. 
+const char *getGenderForBuiltin(const Locale &locale, MeasureUnit builtinUnit, UErrorCode &status) { + LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status)); + if (U_FAILURE(status)) { return ""; } + + // Map duration-year-person, duration-week-person, etc. to duration-year, duration-week, ... + // TODO(ICU-20400): Get duration-*-person data properly with aliases. + StringPiece subtypeForResource; + int32_t subtypeLen = static_cast<int32_t>(uprv_strlen(builtinUnit.getSubtype())); + if (subtypeLen > 7 && uprv_strcmp(builtinUnit.getSubtype() + subtypeLen - 7, "-person") == 0) { + subtypeForResource = {builtinUnit.getSubtype(), subtypeLen - 7}; + } else { + subtypeForResource = builtinUnit.getSubtype(); + } + + CharString key; + key.append("units/", status); + key.append(builtinUnit.getType(), status); + key.append("/", status); + key.append(subtypeForResource, status); + key.append("/gender", status); + + UErrorCode localStatus = status; + StackUResourceBundle fillIn; + ures_getByKeyWithFallback(unitsBundle.getAlias(), key.data(), fillIn.getAlias(), &localStatus); + if (U_SUCCESS(localStatus)) { + status = localStatus; + UnicodeString directString = ures_getUnicodeString(fillIn.getAlias(), &status); + return getGenderString(directString, status); + } else { + // TODO(icu-units#28): "$unitRes/gender" does not exist. Do we want to + // check whether the parent "$unitRes" exists? Then we could return + // U_MISSING_RESOURCE_ERROR for incorrect usage (e.g. builtinUnit not + // being a builtin). + return ""; + } +} + +// Loads data from a resource tree with paths matching +// $key/$pluralForm/$gender/$case, with lateral inheritance for missing cases +// and genders. +// +// An InflectedPluralSink is configured to load data for a specific gender and +// case. It loads all plural forms, because selection between plural forms is +// dependent upon the value being formatted. 
+// +// TODO(icu-units#138): Conceptually similar to PluralTableSink, however the +// tree structures are different. After homogenizing the structures, we may be +// able to unify the two classes. +// +// TODO: Spec violation: expects presence of "count" - does not fallback to an +// absent "count"! If this fallback were added, getCompoundValue could be +// superseded? +class InflectedPluralSink : public ResourceSink { + public: + // Accepts `char*` rather than StringPiece because + // ResourceTable::findValue(...) requires a null-terminated `char*`. + // + // NOTE: outArray MUST have a length of at least ARRAY_LENGTH. No bounds + // checking is performed. + explicit InflectedPluralSink(const char *gender, const char *caseVariant, UnicodeString *outArray) + : gender(gender), caseVariant(caseVariant), outArray(outArray) { + // Initialize the array to bogus strings. + for (int32_t i = 0; i < ARRAY_LENGTH; i++) { + outArray[i].setToBogus(); + } + } + + // See ResourceSink::put(). + void put(const char *key, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) U_OVERRIDE { + ResourceTable pluralsTable = value.getTable(status); + if (U_FAILURE(status)) { return; } + for (int32_t i = 0; pluralsTable.getKeyAndValue(i, key, value); ++i) { + int32_t pluralIndex = getIndex(key, status); + if (U_FAILURE(status)) { return; } + if (!outArray[pluralIndex].isBogus()) { + // We already have a pattern + continue; + } + ResourceTable genderTable = value.getTable(status); + if (loadForPluralForm(genderTable, value, status)) { + outArray[pluralIndex] = value.getUnicodeString(status); + } + } + } + + private: + // Tries to load data for the configured gender from `genderTable`. Returns + // true if found, returning the data in `value`. The returned data will be + // for the configured gender if found, falling back to "neuter" and + // no-gender if not. 
+ bool loadForPluralForm(const ResourceTable &genderTable, ResourceValue &value, UErrorCode &status) { + if (uprv_strcmp(gender, "") != 0) { + if (loadForGender(genderTable, gender, value, status)) { + return true; + } + if (uprv_strcmp(gender, "neuter") != 0 && loadForGender(genderTable, "neuter", value, status)) { + return true; + } + } + if (loadForGender(genderTable, "_", value, status)) { + return true; + } + return false; + } + + // Tries to load data for the given gender from `genderTable`. Returns true + // if found, returning the data in `value`. The returned data will be for + // the configured case if found, falling back to "nominative" and no-case if + // not. + bool loadForGender(const ResourceTable &genderTable, + const char *genderVal, + ResourceValue &value, + UErrorCode &status) { + if (!genderTable.findValue(genderVal, value)) { + return false; + } + ResourceTable caseTable = value.getTable(status); + if (uprv_strcmp(caseVariant, "") != 0) { + if (loadForCase(caseTable, caseVariant, value)) { + return true; + } + if (uprv_strcmp(caseVariant, "nominative") != 0 && + loadForCase(caseTable, "nominative", value)) { + return true; + } + } + if (loadForCase(caseTable, "_", value)) { + return true; + } + return false; + } + + // Tries to load data for the given case from `caseTable`. Returns true if + // found, returning the data in `value`. 
+ bool loadForCase(const ResourceTable &caseTable, + const char *caseValue, + ResourceValue &value) { + if (!caseTable.findValue(caseValue, value)) { + return false; + } + return true; + } + + const char *gender; + const char *caseVariant; + UnicodeString *outArray; +}; + +void getInflectedMeasureData(StringPiece subKey, + const Locale &locale, + const UNumberUnitWidth &width, + const char *gender, + const char *caseVariant, + UnicodeString *outArray, + UErrorCode &status) { + InflectedPluralSink sink(gender, caseVariant, outArray); + LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status)); + if (U_FAILURE(status)) { return; } + + CharString key; + key.append("units", status); + if (width == UNUM_UNIT_WIDTH_NARROW) { + key.append("Narrow", status); + } else if (width == UNUM_UNIT_WIDTH_SHORT) { + key.append("Short", status); + } + key.append("/", status); + key.append(subKey, status); + + UErrorCode localStatus = status; + ures_getAllItemsWithFallback(unitsBundle.getAlias(), key.data(), sink, status); + if (width == UNUM_UNIT_WIDTH_SHORT) { + status = localStatus; + return; + } + + // TODO(ICU-13353): The fallback to short does not work in ICU4C. + // Manually fall back to short (this is done automatically in Java). + key.clear(); + key.append("unitsShort/", status); + key.append(subKey, status); + ures_getAllItemsWithFallback(unitsBundle.getAlias(), key.data(), sink, status); +} + class PluralTableSink : public ResourceSink { public: + // NOTE: outArray MUST have a length of at least ARRAY_LENGTH. No bounds + // checking is performed. explicit PluralTableSink(UnicodeString *outArray) : outArray(outArray) { // Initialize the array to bogus strings. for (int32_t i = 0; i < ARRAY_LENGTH; i++) { @@ -154,8 +402,6 @@ class PluralTableSink : public ResourceSink { UnicodeString *outArray; }; -// NOTE: outArray MUST have room for all StandardPlural values. No bounds checking is performed. 
- /** * Populates outArray with `locale`-specific values for `unit` through use of * PluralTableSink. Only the set of basic units are supported! @@ -174,7 +420,7 @@ class PluralTableSink : public ResourceSink { void getMeasureData(const Locale &locale, const MeasureUnit &unit, const UNumberUnitWidth &width, - StringPiece unitDisplayCase, + const char *unitDisplayCase, UnicodeString *outArray, UErrorCode &status) { PluralTableSink sink(outArray); @@ -206,21 +452,26 @@ void getMeasureData(const Locale &locale, // Grab desired case first, if available. Then grab no-case data to fill in // the gaps. - if (width == UNUM_UNIT_WIDTH_FULL_NAME && !unitDisplayCase.empty()) { + if (width == UNUM_UNIT_WIDTH_FULL_NAME && unitDisplayCase[0] != 0) { CharString caseKey; caseKey.append(key, status); caseKey.append("/case/", status); caseKey.append(unitDisplayCase, status); UErrorCode localStatus = U_ZERO_ERROR; + // TODO(icu-units#138): our fallback logic is not spec-compliant: + // lateral fallback should happen before locale fallback. Switch to + // getInflectedMeasureData after homogenizing data format? Find a unit + // test case that demonstrates the incorrect fallback logic (via + // regional variant of an inflected language?) ures_getAllItemsWithFallback(unitsBundle.getAlias(), caseKey.data(), sink, localStatus); - // TODO(icu-units#138): our fallback logic is not spec-compliant: we - // check the given case, then go straight to the no-case data. The spec - // states we should first look for case="nominative". As part of #138, - // either get the spec changed, or add unit tests that warn us if - // case="nominative" data differs from no-case data? } + // TODO(icu-units#138): our fallback logic is not spec-compliant: we + // check the given case, then go straight to the no-case data. The spec + // states we should first look for case="nominative". 
As part of #138, + // either get the spec changed, or add unit tests that warn us if + // case="nominative" data differs from no-case data? UErrorCode localStatus = U_ZERO_ERROR; ures_getAllItemsWithFallback(unitsBundle.getAlias(), key.data(), sink, localStatus); if (width == UNUM_UNIT_WIDTH_SHORT) { @@ -240,6 +491,7 @@ void getMeasureData(const Locale &locale, ures_getAllItemsWithFallback(unitsBundle.getAlias(), key.data(), sink, status); } +// NOTE: outArray MUST have a length of at least ARRAY_LENGTH. void getCurrencyLongNameData(const Locale &locale, const CurrencyUnit &currency, UnicodeString *outArray, UErrorCode &status) { // In ICU4J, this method gets a CurrencyData from CurrencyData.provider. @@ -268,7 +520,10 @@ void getCurrencyLongNameData(const Locale &locale, const CurrencyUnit &currency, } } -UnicodeString getPerUnitFormat(const Locale& locale, const UNumberUnitWidth &width, UErrorCode& status) { +UnicodeString getCompoundValue(StringPiece compoundKey, + const Locale &locale, + const UNumberUnitWidth &width, + UErrorCode &status) { LocalUResourceBundlePointer unitsBundle(ures_open(U_ICUDATA_UNIT, locale.getName(), &status)); if (U_FAILURE(status)) { return {}; } CharString key; @@ -278,9 +533,25 @@ UnicodeString getPerUnitFormat(const Locale& locale, const UNumberUnitWidth &wid } else if (width == UNUM_UNIT_WIDTH_SHORT) { key.append("Short", status); } - key.append("/compound/per", status); + key.append("/compound/", status); + key.append(compoundKey, status); + + UErrorCode localStatus = status; int32_t len = 0; - const UChar* ptr = ures_getStringByKeyWithFallback(unitsBundle.getAlias(), key.data(), &len, &status); + const UChar *ptr = + ures_getStringByKeyWithFallback(unitsBundle.getAlias(), key.data(), &len, &localStatus); + if (U_FAILURE(localStatus) && width != UNUM_UNIT_WIDTH_SHORT) { + // Fall back to short, which contains more compound data + key.clear(); + key.append("unitsShort/compound/", status); + key.append(compoundKey, status); + ptr = 
ures_getStringByKeyWithFallback(unitsBundle.getAlias(), key.data(), &len, &status); + } else { + status = localStatus; + } + if (U_FAILURE(status)) { + return {}; + } return UnicodeString(ptr, len); } @@ -293,12 +564,14 @@ UnicodeString getPerUnitFormat(const Locale& locale, const UNumberUnitWidth &wid * * Instantiating an instance as follows: * - * DerivedComponents d(loc, "case", "per", "foo"); + * DerivedComponents d(loc, "case", "per"); * - * Applying the rule in the XML element above, `d.value0()` will be "foo", and - * `d.value1()` will be "nominative". + * Applying the rule in the XML element above, `d.value0("foo")` will be "foo", + * and `d.value1("foo")` will be "nominative". * - * In case of any kind of failure, value0() and value1() will simply return "". + * The values returned by value0(...) and value1(...) are valid only while the + * instance exists. In case of any kind of failure, value0(...) and value1(...) + * will return "". */ class DerivedComponents { public: @@ -309,10 +582,7 @@ class DerivedComponents { * referenced by compoundValue must exist for longer than the * DerivedComponents instance. 
*/ - DerivedComponents(const Locale &locale, - const char *feature, - const char *structure, - const StringPiece compoundValue) { + DerivedComponents(const Locale &locale, const char *feature, const char *structure) { StackUResourceBundle derivationsBundle, stackBundle; ures_openDirectFillIn(derivationsBundle.getAlias(), NULL, "grammaticalFeatures", &status); ures_getByKey(derivationsBundle.getAlias(), "grammaticalData", derivationsBundle.getAlias(), @@ -323,10 +593,11 @@ class DerivedComponents { return; } UErrorCode localStatus = U_ZERO_ERROR; - // TODO: use standard normal locale resolution algorithms rather than just grabbing language: + // TODO(icu-units#28): use standard normal locale resolution algorithms + // rather than just grabbing language: ures_getByKey(derivationsBundle.getAlias(), locale.getLanguage(), stackBundle.getAlias(), &localStatus); - // TODO: + // TODO(icu-units#28): // - code currently assumes if the locale exists, the rules are there - // instead of falling back to root when the requested rule is missing. // - investigate ures.h functions, see if one that uses res_findResource() @@ -344,38 +615,61 @@ class DerivedComponents { UnicodeString val1 = ures_getUnicodeStringByIndex(stackBundle.getAlias(), 1, &status); if (U_SUCCESS(status)) { if (val0.compare(UnicodeString(u"compound")) == 0) { - sp0 = compoundValue; + compound0_ = true; } else { - memory0.appendInvariantChars(val0, status); - sp0 = memory0.toStringPiece(); + compound0_ = false; + value0_.appendInvariantChars(val0, status); } if (val1.compare(UnicodeString(u"compound")) == 0) { - sp1 = compoundValue; + compound1_ = true; } else { - memory1.appendInvariantChars(val1, status); - sp1 = memory1.toStringPiece(); + compound1_ = false; + value1_.appendInvariantChars(val1, status); } } } - // The returned StringPiece is only valid as long as both the instance - // exists, and the compoundValue passed to the constructor is valid. 
- StringPiece value0() const { - return sp0; + + // Returns a StringPiece that is only valid as long as the instance exists. + StringPiece value0(const StringPiece compoundValue) const { + return compound0_ ? compoundValue : value0_.toStringPiece(); } - // The returned StringPiece is only valid as long as both the instance - // exists, and the compoundValue passed to the constructor is valid. - StringPiece value1() const { - return sp1; + + // Returns a StringPiece that is only valid as long as the instance exists. + StringPiece value1(const StringPiece compoundValue) const { + return compound1_ ? compoundValue : value1_.toStringPiece(); + } + + // Returns a char* that is only valid as long as the instance exists. + const char *value0(const char *compoundValue) const { + return compound0_ ? compoundValue : value0_.data(); + } + + // Returns a char* that is only valid as long as the instance exists. + const char *value1(const char *compoundValue) const { + return compound1_ ? compoundValue : value1_.data(); } private: UErrorCode status = U_ZERO_ERROR; // Holds strings referred to by value0 and value1; - CharString memory0, memory1; - StringPiece sp0, sp1; + bool compound0_, compound1_; + CharString value0_, value1_; }; +// TODO(icu-units#28): test somehow? Associate with an ICU ticket for adding +// testsuite support for testing with synthetic data? +/** + * Loads and returns the value in rules that look like these: + * + * <deriveCompound feature="gender" structure="per" value="0"/> + * <deriveCompound feature="gender" structure="times" value="1"/> + * + * Currently a fake example, but spec compliant: + * <deriveCompound feature="gender" structure="power" value="feminine"/> + * + * NOTE: If U_FAILURE(status), returns an empty string. 
+ */ UnicodeString getDeriveCompoundRule(Locale locale, const char *feature, const char *structure, UErrorCode &status) { StackUResourceBundle derivationsBundle, stackBundle; @@ -397,7 +691,45 @@ getDeriveCompoundRule(Locale locale, const char *feature, const char *structure, } ures_getByKey(stackBundle.getAlias(), "compound", stackBundle.getAlias(), &status); ures_getByKey(stackBundle.getAlias(), feature, stackBundle.getAlias(), &status); - return ures_getUnicodeStringByKey(stackBundle.getAlias(), structure, &status); + UnicodeString uVal = ures_getUnicodeStringByKey(stackBundle.getAlias(), structure, &status); + if (U_FAILURE(status)) { + return {}; + } + U_ASSERT(!uVal.isBogus()); + return uVal; +} + +// Returns the gender string for structures following these rules: +// +// <deriveCompound feature="gender" structure="per" value="0"/> +// <deriveCompound feature="gender" structure="times" value="1"/> +// +// Fake example: +// <deriveCompound feature="gender" structure="power" value="feminine"/> +// +// data0 and data1 should be pattern arrays (UnicodeString[ARRAY_LENGTH]) that +// correspond to value="0" and value="1". +// +// Pass a nullptr to data1 if the structure has no concept of value="1" (e.g. +// "prefix" doesn't). 
+UnicodeString getDerivedGender(Locale locale, + const char *structure, + UnicodeString *data0, + UnicodeString *data1, + UErrorCode &status) { + UnicodeString val = getDeriveCompoundRule(locale, "gender", structure, status); + if (val.length() == 1) { + switch (val[0]) { + case u'0': + return data0[GENDER_INDEX]; + case u'1': + if (data1 == nullptr) { + return {}; + } + return data1[GENDER_INDEX]; + } + } + return val; } //////////////////////// @@ -430,27 +762,84 @@ const UChar *trimSpaceChars(const UChar *s, int32_t &length) { void LongNameHandler::forMeasureUnit(const Locale &loc, const MeasureUnit &unitRef, const UNumberUnitWidth &width, - StringPiece unitDisplayCase, + const char *unitDisplayCase, const PluralRules *rules, const MicroPropsGenerator *parent, LongNameHandler *fillIn, UErrorCode &status) { - // Not valid for mixed units that aren't built-in units, and there should - // not be any built-in mixed units! + // From https://unicode.org/reports/tr35/tr35-general.html#compound-units - + // Points 1 and 2 are mostly handled by MeasureUnit: + // + // 1. If the unitId is empty or invalid, fail + // 2. Put the unitId into normalized order + // + // We just need to check if it is a MeasureUnit this constructor handles: + // this constructor does not handle mixed units U_ASSERT(uprv_strcmp(unitRef.getType(), "") != 0 || unitRef.getComplexity(status) != UMEASURE_UNIT_MIXED); U_ASSERT(fillIn != nullptr); - if (uprv_strcmp(unitRef.getType(), "") == 0) { - // Not a built-in unit. Split it up, since we can already format - // "builtin-per-builtin". - // TODO(ICU-20941): support more generic case than builtin-per-builtin. + if (uprv_strcmp(unitRef.getType(), "") != 0) { + // Handling built-in units: + // + // 3. 
Set result to be getValue(unitId with length, pluralCategory, caseVariant) + // - If result is not empty, return it + UnicodeString simpleFormats[ARRAY_LENGTH]; + getMeasureData(loc, unitRef, width, unitDisplayCase, simpleFormats, status); + if (U_FAILURE(status)) { + return; + } + fillIn->rules = rules; + fillIn->parent = parent; + fillIn->simpleFormatsToModifiers(simpleFormats, + {UFIELD_CATEGORY_NUMBER, UNUM_MEASURE_UNIT_FIELD}, status); + if (!simpleFormats[GENDER_INDEX].isBogus()) { + fillIn->gender = getGenderString(simpleFormats[GENDER_INDEX], status); + } + return; + + // TODO(icu-units#145): figure out why this causes a failure in + // format/MeasureFormatTest/TestIndividualPluralFallback and other + // tests, when it should have been an alternative for the lines above: + + // forArbitraryUnit(loc, unitRef, width, unitDisplayCase, fillIn, status); + // fillIn->rules = rules; + // fillIn->parent = parent; + // return; + } else { + forArbitraryUnit(loc, unitRef, width, unitDisplayCase, fillIn, status); + fillIn->rules = rules; + fillIn->parent = parent; + return; + } +} + +void LongNameHandler::forArbitraryUnit(const Locale &loc, + const MeasureUnit &unitRef, + const UNumberUnitWidth &width, + const char *unitDisplayCase, + LongNameHandler *fillIn, + UErrorCode &status) { + if (U_FAILURE(status)) { + return; + } + if (fillIn == nullptr) { + status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + // Numbered list items are from the algorithms at + // https://unicode.org/reports/tr35/tr35-general.html#compound-units: + // + // 4. Divide the unitId into numerator (the part before the "-per-") and + // denominator (the part after the "-per-"). 
If both are empty, fail + MeasureUnitImpl unit; + MeasureUnitImpl perUnit; + { MeasureUnitImpl fullUnit = MeasureUnitImpl::forMeasureUnitMaybeCopy(unitRef, status); if (U_FAILURE(status)) { return; } - MeasureUnitImpl unit; - MeasureUnitImpl perUnit; for (int32_t i = 0; i < fullUnit.singleUnits.length(); i++) { SingleUnitImpl *subUnit = fullUnit.singleUnits[i]; if (subUnit->dimensionality > 0) { @@ -460,120 +849,454 @@ void LongNameHandler::forMeasureUnit(const Locale &loc, perUnit.appendSingleUnit(*subUnit, status); } } - forCompoundUnit(loc, std::move(unit).build(status), std::move(perUnit).build(status), width, - unitDisplayCase, rules, parent, fillIn, status); - return; } - UnicodeString simpleFormats[ARRAY_LENGTH]; - getMeasureData(loc, unitRef, width, unitDisplayCase, simpleFormats, status); - if (U_FAILURE(status)) { - return; + // TODO(icu-units#28): check placeholder logic, see if it needs to be + // present here instead of only in processPatternTimes: + // + // 5. Set both globalPlaceholder and globalPlaceholderPosition to be empty + + DerivedComponents derivedPerCases(loc, "case", "per"); + + // 6. numeratorUnitString + UnicodeString numeratorUnitData[ARRAY_LENGTH]; + processPatternTimes(std::move(unit), loc, width, derivedPerCases.value0(unitDisplayCase), + numeratorUnitData, status); + + // 7. denominatorUnitString + UnicodeString denominatorUnitData[ARRAY_LENGTH]; + processPatternTimes(std::move(perUnit), loc, width, derivedPerCases.value1(unitDisplayCase), + denominatorUnitData, status); + + // TODO(icu-units#139): + // - implement DerivedComponents for "plural/times" and "plural/power": + // French has different rules, we'll be producing the wrong results + // currently. (Prove via tests!) + // - implement DerivedComponents for "plural/per", "plural/prefix", + // "case/times", "case/power", and "case/prefix" - although they're + // currently hardcoded. Languages with different rules are surely on the + // way. 
+ // + // Currently we only use "case/per", "plural/times", "case/times", and + // "case/power". + // + // This may have impact on multiSimpleFormatsToModifiers(...) below too? + // These rules are currently (ICU 69) all the same and hard-coded below. + UnicodeString perUnitPattern; + if (!denominatorUnitData[PER_INDEX].isBogus()) { + // If we have no denominator, we obtain the empty string: + perUnitPattern = denominatorUnitData[PER_INDEX]; + } else { + // 8. Set perPattern to be getValue([per], locale, length) + UnicodeString rawPerUnitFormat = getCompoundValue("per", loc, width, status); + // rawPerUnitFormat is something like "{0} per {1}"; we need to substitute in the secondary unit. + SimpleFormatter perPatternFormatter(rawPerUnitFormat, 2, 2, status); + if (U_FAILURE(status)) { + return; + } + // Plural and placeholder handling for 7. denominatorUnitString: + // TODO(icu-units#139): hardcoded: + // <deriveComponent feature="plural" structure="per" value0="compound" value1="one"/> + UnicodeString denominatorFormat = + getWithPlural(denominatorUnitData, StandardPlural::Form::ONE, status); + // Some "one" pattern may not contain "{0}". For example in "ar" or "ne" locale. + SimpleFormatter denominatorFormatter(denominatorFormat, 0, 1, status); + if (U_FAILURE(status)) { + return; + } + UnicodeString denominatorPattern = denominatorFormatter.getTextWithNoArguments(); + int32_t trimmedLen = denominatorPattern.length(); + const UChar *trimmed = trimSpaceChars(denominatorPattern.getBuffer(), trimmedLen); + UnicodeString denominatorString(false, trimmed, trimmedLen); + // 9. If the denominatorString is empty, set result to + // [numeratorString], otherwise set result to format(perPattern, + // numeratorString, denominatorString) + // + // TODO(icu-units#28): Why does UnicodeString need to be explicit in the + // following line? 
+ perPatternFormatter.format(UnicodeString(u"{0}"), denominatorString, perUnitPattern, status); + if (U_FAILURE(status)) { + return; + } } - fillIn->rules = rules; - fillIn->parent = parent; - fillIn->simpleFormatsToModifiers(simpleFormats, {UFIELD_CATEGORY_NUMBER, UNUM_MEASURE_UNIT_FIELD}, - status); - if (!simpleFormats[GENDER_INDEX].isBogus()) { - fillIn->gender = getGenderString(simpleFormats[GENDER_INDEX], status); + if (perUnitPattern.length() == 0) { + fillIn->simpleFormatsToModifiers(numeratorUnitData, + {UFIELD_CATEGORY_NUMBER, UNUM_MEASURE_UNIT_FIELD}, status); + } else { + fillIn->multiSimpleFormatsToModifiers(numeratorUnitData, perUnitPattern, + {UFIELD_CATEGORY_NUMBER, UNUM_MEASURE_UNIT_FIELD}, status); } + + // Gender + // + // TODO(icu-units#28): find out what gender to use in the absence of a first + // value - e.g. what's the gender of "per-second"? Mentioned in CLDR-14253. + // + // gender/per deriveCompound rules don't say: + // <deriveCompound feature="gender" structure="per" value="0"/> <!-- gender(gram-per-meter) ← gender(gram) --> + fillIn->gender = getGenderString( + getDerivedGender(loc, "per", numeratorUnitData, denominatorUnitData, status), status); } -void LongNameHandler::forCompoundUnit(const Locale &loc, - const MeasureUnit &unit, - const MeasureUnit &perUnit, - const UNumberUnitWidth &width, - StringPiece unitDisplayCase, - const PluralRules *rules, - const MicroPropsGenerator *parent, - LongNameHandler *fillIn, - UErrorCode &status) { +void LongNameHandler::processPatternTimes(MeasureUnitImpl &&productUnit, + Locale loc, + const UNumberUnitWidth &width, + const char *caseVariant, + UnicodeString *outArray, + UErrorCode &status) { if (U_FAILURE(status)) { return; } - if (uprv_strcmp(unit.getType(), "") == 0 || uprv_strcmp(perUnit.getType(), "") == 0) { - // TODO(ICU-20941): Unsanctioned unit. Not yet fully supported. Set an - // error code. 
Once we support not-built-in units here, unitRef may be - // anything, but if not built-in, perUnit has to be "none". + if (productUnit.complexity == UMEASURE_UNIT_MIXED) { + // These are handled by MixedUnitLongNameHandler status = U_UNSUPPORTED_ERROR; return; } - if (fillIn == nullptr) { - status = U_INTERNAL_PROGRAM_ERROR; - return; + +#if U_DEBUG + for (int32_t pluralIndex = 0; pluralIndex < ARRAY_LENGTH; pluralIndex++) { + U_ASSERT(outArray[pluralIndex].length() == 0); + U_ASSERT(!outArray[pluralIndex].isBogus()); } +#endif - DerivedComponents derivedPerCases(loc, "case", "per", unitDisplayCase); - - UnicodeString primaryData[ARRAY_LENGTH]; - getMeasureData(loc, unit, width, derivedPerCases.value0(), primaryData, status); + if (productUnit.identifier.isEmpty()) { + // TODO(icu-units#28): consider when serialize should be called. + // identifier might also be empty for MeasureUnit(). + productUnit.serialize(status); + } if (U_FAILURE(status)) { return; } - UnicodeString secondaryData[ARRAY_LENGTH]; - getMeasureData(loc, perUnit, width, derivedPerCases.value1(), secondaryData, status); + if (productUnit.identifier.length() == 0) { + // MeasureUnit(): no units: return empty strings. + return; + } + + MeasureUnit builtinUnit; + if (MeasureUnit::findBySubType(productUnit.identifier.toStringPiece(), &builtinUnit)) { + // TODO(icu-units#145): spec doesn't cover builtin-per-builtin, it + // breaks them all down. Do we want to drop this? + // - findBySubType isn't super efficient, if we skip it and go to basic + // singles, we don't have to construct MeasureUnit's anymore. + // - Check all the existing unit tests that fail without this: is it due + // to incorrect fallback via getMeasureData? + // - Do those unit tests cover this code path representatively? + if (builtinUnit != MeasureUnit()) { + getMeasureData(loc, builtinUnit, width, caseVariant, outArray, status); + } + return; + } + + // 2. 
Set timesPattern to be getValue(times, locale, length) + UnicodeString timesPattern = getCompoundValue("times", loc, width, status); + SimpleFormatter timesPatternFormatter(timesPattern, 2, 2, status); if (U_FAILURE(status)) { return; } - // TODO(icu-units#139): implement these rules: - // <deriveComponent feature="plural" structure="per" ...> - // This has impact on multiSimpleFormatsToModifiers(...) below too. - // These rules are currently (ICU 69) all the same and hard-coded below. - UnicodeString perUnitFormat; - if (!secondaryData[PER_INDEX].isBogus()) { - perUnitFormat = secondaryData[PER_INDEX]; - } else { - UnicodeString rawPerUnitFormat = getPerUnitFormat(loc, width, status); - if (U_FAILURE(status)) { - return; + PlaceholderPosition globalPlaceholder[ARRAY_LENGTH]; + UChar globalJoinerChar = 0; + // Numbered list items are from the algorithms at + // https://unicode.org/reports/tr35/tr35-general.html#compound-units: + // + // pattern(...) point 5: + // - Set both globalPlaceholder and globalPlaceholderPosition to be empty + // + // 3. Set result to be empty + for (int32_t pluralIndex = 0; pluralIndex < ARRAY_LENGTH; pluralIndex++) { + // Initial state: empty string pattern, via all falling back to OTHER: + if (pluralIndex == StandardPlural::Form::OTHER) { + outArray[pluralIndex].remove(); + } else { + outArray[pluralIndex].setToBogus(); } - // rawPerUnitFormat is something like "{0} per {1}"; we need to substitute in the secondary unit. - SimpleFormatter compiled(rawPerUnitFormat, 2, 2, status); - if (U_FAILURE(status)) { - return; - } - UnicodeString secondaryFormat = getWithPlural(secondaryData, StandardPlural::Form::ONE, status); - if (U_FAILURE(status)) { - return; - } - // Some "one" pattern may not contain "{0}". For example in "ar" or "ne" locale. 
- SimpleFormatter secondaryCompiled(secondaryFormat, 0, 1, status); - if (U_FAILURE(status)) { - return; - } - UnicodeString secondaryFormatString = secondaryCompiled.getTextWithNoArguments(); - int32_t trimmedSecondaryLen = secondaryFormatString.length(); - const UChar *trimmedSecondaryString = - trimSpaceChars(secondaryFormatString.getBuffer(), trimmedSecondaryLen); - UnicodeString secondaryString(false, trimmedSecondaryString, trimmedSecondaryLen); - // TODO: Why does UnicodeString need to be explicit in the following line? - compiled.format(UnicodeString(u"{0}"), secondaryString, perUnitFormat, status); - if (U_FAILURE(status)) { - return; - } + globalPlaceholder[pluralIndex] = PH_EMPTY; } - fillIn->rules = rules; - fillIn->parent = parent; - fillIn->multiSimpleFormatsToModifiers(primaryData, perUnitFormat, - {UFIELD_CATEGORY_NUMBER, UNUM_MEASURE_UNIT_FIELD}, status); - // Gender - UnicodeString uVal = getDeriveCompoundRule(loc, "gender", "per", status); - if (U_FAILURE(status)) { - return; + // Empty string represents "compound" (propagate the plural form). + const char *pluralCategory = ""; + DerivedComponents derivedTimesPlurals(loc, "plural", "times"); + DerivedComponents derivedTimesCases(loc, "case", "times"); + DerivedComponents derivedPowerCases(loc, "case", "power"); + + // 4. For each single_unit in product_unit + for (int32_t singleUnitIndex = 0; singleUnitIndex < productUnit.singleUnits.length(); + singleUnitIndex++) { + SingleUnitImpl *singleUnit = productUnit.singleUnits[singleUnitIndex]; + const char *singlePluralCategory; + const char *singleCaseVariant; + // TODO(icu-units#28): ensure we have unit tests that change/fail if we + // assign incorrect case variants here: + if (singleUnitIndex < productUnit.singleUnits.length() - 1) { + // 4.1. 
If hasMultiple + singlePluralCategory = derivedTimesPlurals.value0(pluralCategory); + singleCaseVariant = derivedTimesCases.value0(caseVariant); + pluralCategory = derivedTimesPlurals.value1(pluralCategory); + caseVariant = derivedTimesCases.value1(caseVariant); + } else { + singlePluralCategory = derivedTimesPlurals.value1(pluralCategory); + singleCaseVariant = derivedTimesCases.value1(caseVariant); + } + + // 4.2. Get the gender of that single_unit + MeasureUnit builtinUnit; + if (!MeasureUnit::findBySubType(singleUnit->getSimpleUnitID(), &builtinUnit)) { + // Ideally all simple units should be known, but they're not: + // 100-kilometer is internally treated as a simple unit, but it is + // not a built-in unit and does not have formatting data in CLDR 39. + // + // TODO(icu-units#28): test (desirable) invariants in unit tests. + status = U_UNSUPPORTED_ERROR; + return; + } + const char *gender = getGenderForBuiltin(loc, builtinUnit, status); + + // 4.3. If singleUnit starts with a dimensionality_prefix, such as 'square-' + U_ASSERT(singleUnit->dimensionality > 0); + int32_t dimensionality = singleUnit->dimensionality; + UnicodeString dimensionalityPrefixPatterns[ARRAY_LENGTH]; + if (dimensionality != 1) { + // 4.3.1. set dimensionalityPrefixPattern to be + // getValue(that dimensionality_prefix, locale, length, singlePluralCategory, singleCaseVariant, gender), + // such as "{0} kwadratowym" + CharString dimensionalityKey("compound/power", status); + dimensionalityKey.appendNumber(dimensionality, status); + getInflectedMeasureData(dimensionalityKey.toStringPiece(), loc, width, gender, + singleCaseVariant, dimensionalityPrefixPatterns, status); + if (U_FAILURE(status)) { + // At the time of writing, only power2 and power3 are supported. + // Attempting to format other powers results in a + // U_RESOURCE_TYPE_MISMATCH. 
We convert the error if we + // understand it: + if (status == U_RESOURCE_TYPE_MISMATCH && dimensionality > 3) { + status = U_UNSUPPORTED_ERROR; + } + return; + } + + // TODO(icu-units#139): + // 4.3.2. set singlePluralCategory to be power0(singlePluralCategory) + + // 4.3.3. set singleCaseVariant to be power0(singleCaseVariant) + singleCaseVariant = derivedPowerCases.value0(singleCaseVariant); + // 4.3.4. remove the dimensionality_prefix from singleUnit + singleUnit->dimensionality = 1; + } + + // 4.4. if singleUnit starts with an si_prefix, such as 'centi' + UMeasurePrefix prefix = singleUnit->unitPrefix; + UnicodeString prefixPattern; + if (prefix != UMEASURE_PREFIX_ONE) { + // 4.4.1. set siPrefixPattern to be getValue(that si_prefix, locale, + // length), such as "centy{0}" + CharString prefixKey; + // prefixKey looks like "1024p3" or "10p-2": + prefixKey.appendNumber(umeas_getPrefixBase(prefix), status); + prefixKey.append('p', status); + prefixKey.appendNumber(umeas_getPrefixPower(prefix), status); + // Contains a pattern like "centy{0}". + prefixPattern = getCompoundValue(prefixKey.toStringPiece(), loc, width, status); + + // 4.4.2. set singlePluralCategory to be prefix0(singlePluralCategory) + // + // TODO(icu-units#139): that refers to these rules: + // <deriveComponent feature="plural" structure="prefix" value0="one" value1="compound"/> + // though I'm not sure what other value they might end up having. + // + // 4.4.3. set singleCaseVariant to be prefix0(singleCaseVariant) + // + // TODO(icu-units#139): that refers to: + // <deriveComponent feature="case" structure="prefix" value0="nominative" + // value1="compound"/> but the prefix (value0) doesn't have case, the rest simply + // propagates. + + // 4.4.4. remove the si_prefix from singleUnit + singleUnit->unitPrefix = UMEASURE_PREFIX_ONE; + } + + // 4.5. 
Set corePattern to be the getValue(singleUnit, locale, length, + // singlePluralCategory, singleCaseVariant), such as "{0} metrem" + UnicodeString singleUnitArray[ARRAY_LENGTH]; + // At this point we are left with a Simple Unit: + U_ASSERT(uprv_strcmp(singleUnit->build(status).getIdentifier(), singleUnit->getSimpleUnitID()) == + 0); + getMeasureData(loc, singleUnit->build(status), width, singleCaseVariant, singleUnitArray, + status); + if (U_FAILURE(status)) { + // Shouldn't happen if we have data for all single units + return; + } + + // Calculate output gender + if (!singleUnitArray[GENDER_INDEX].isBogus()) { + U_ASSERT(!singleUnitArray[GENDER_INDEX].isEmpty()); + UnicodeString uVal; + + if (prefix != UMEASURE_PREFIX_ONE) { + singleUnitArray[GENDER_INDEX] = + getDerivedGender(loc, "prefix", singleUnitArray, nullptr, status); + } + + // Powers use compoundUnitPattern1, dimensionalityPrefixPatterns may + // have a "gender" element + // + // TODO(icu-units#28): untested: no locale data uses this currently: + if (dimensionality != 1) { + singleUnitArray[GENDER_INDEX] = getDerivedGender(loc, "power", singleUnitArray, + dimensionalityPrefixPatterns, status); + } + + UnicodeString timesGenderRule = getDeriveCompoundRule(loc, "gender", "times", status); + if (timesGenderRule.length() == 1) { + switch (timesGenderRule[0]) { + case u'0': + if (singleUnitIndex == 0) { + U_ASSERT(outArray[GENDER_INDEX].isBogus()); + outArray[GENDER_INDEX] = singleUnitArray[GENDER_INDEX]; + } + break; + case u'1': + if (singleUnitIndex == productUnit.singleUnits.length() - 1) { + U_ASSERT(outArray[GENDER_INDEX].isBogus()); + outArray[GENDER_INDEX] = singleUnitArray[GENDER_INDEX]; + } + } + } else { + if (outArray[GENDER_INDEX].isBogus()) { + outArray[GENDER_INDEX] = timesGenderRule; + } + } + } + + // Calculate resulting patterns for each plural form + for (int32_t pluralIndex = 0; pluralIndex < StandardPlural::Form::COUNT; pluralIndex++) { + StandardPlural::Form plural = 
static_cast<StandardPlural::Form>(pluralIndex); + + // singleUnitArray[pluralIndex] looks something like "{0} Meter" + if (outArray[pluralIndex].isBogus()) { + if (singleUnitArray[pluralIndex].isBogus()) { + // Let the usual plural fallback mechanism take care of this + // plural form + continue; + } else { + // Since our singleUnit can have a plural form that outArray + // doesn't yet have (relying on fallback to OTHER), we start + // by grabbing it with the normal plural fallback mechanism + outArray[pluralIndex] = getWithPlural(outArray, plural, status); + if (U_FAILURE(status)) { + return; + } + } + } + + if (uprv_strcmp(singlePluralCategory, "") != 0) { + plural = static_cast<StandardPlural::Form>(getIndex(singlePluralCategory, status)); + } + + // 4.6. Extract(corePattern, coreUnit, placeholder, placeholderPosition) from that pattern. + UnicodeString coreUnit; + PlaceholderPosition placeholderPosition; + UChar joinerChar; + extractCorePattern(getWithPlural(singleUnitArray, plural, status), coreUnit, + placeholderPosition, joinerChar); + + // 4.7 If the position is middle, then fail + if (placeholderPosition == PH_MIDDLE) { + status = U_UNSUPPORTED_ERROR; + return; + } + + // 4.8. If globalPlaceholder is empty + if (globalPlaceholder[pluralIndex] == PH_EMPTY) { + globalPlaceholder[pluralIndex] = placeholderPosition; + globalJoinerChar = joinerChar; + } else { + // Expect all units involved to have the same placeholder position + U_ASSERT(globalPlaceholder[pluralIndex] == placeholderPosition); + // TODO(icu-units#28): Do we want to add a unit test that checks + // for consistent joiner chars? Probably not, given how + // inconsistent they are. File a CLDR ticket with examples? + } + // Now coreUnit would be just "Meter" + + // 4.9. If siPrefixPattern is not empty + if (prefix != UMEASURE_PREFIX_ONE) { + SimpleFormatter prefixCompiled(prefixPattern, 1, 1, status); + if (U_FAILURE(status)) { + return; + } + + // 4.9.1. 
Set coreUnit to be the combineLowercasing(locale, length, siPrefixPattern, + // coreUnit) + UnicodeString tmp; + // combineLowercasing(locale, length, prefixPattern, coreUnit) + // + // TODO(icu-units#28): run this only if prefixPattern does not + // contain space characters - do languages "as", "bn", "hi", + // "kk", etc have concepts of upper and lower case?: + if (width == UNUM_UNIT_WIDTH_FULL_NAME) { + coreUnit.toLower(loc); + } + prefixCompiled.format(coreUnit, tmp, status); + if (U_FAILURE(status)) { + return; + } + coreUnit = tmp; + } + + // 4.10. If dimensionalityPrefixPattern is not empty + if (dimensionality != 1) { + SimpleFormatter dimensionalityCompiled( + getWithPlural(dimensionalityPrefixPatterns, plural, status), 1, 1, status); + if (U_FAILURE(status)) { + return; + } + + // 4.10.1. Set coreUnit to be the combineLowercasing(locale, length, + // dimensionalityPrefixPattern, coreUnit) + UnicodeString tmp; + // combineLowercasing(locale, length, prefixPattern, coreUnit) + // + // TODO(icu-units#28): run this only if prefixPattern does not + // contain space characters - do languages "as", "bn", "hi", + // "kk", etc have concepts of upper and lower case?: + if (width == UNUM_UNIT_WIDTH_FULL_NAME) { + coreUnit.toLower(loc); + } + dimensionalityCompiled.format(coreUnit, tmp, status); + if (U_FAILURE(status)) { + return; + } + coreUnit = tmp; + } + + if (outArray[pluralIndex].length() == 0) { + // 4.11. If the result is empty, set result to be coreUnit + outArray[pluralIndex] = coreUnit; + } else { + // 4.12. 
Otherwise set result to be format(timesPattern, result, coreUnit) + UnicodeString tmp; + timesPatternFormatter.format(outArray[pluralIndex], coreUnit, tmp, status); + outArray[pluralIndex] = tmp; + } + } } - U_ASSERT(!uVal.isBogus() && uVal.length() == 1); - switch (uVal[0]) { - case u'0': - fillIn->gender = getGenderString(primaryData[GENDER_INDEX], status); - break; - case u'1': - fillIn->gender = getGenderString(secondaryData[GENDER_INDEX], status); - break; - default: - // Data error. Assert-fail in debug mode, else return no gender. - U_ASSERT(false); + for (int32_t pluralIndex = 0; pluralIndex < StandardPlural::Form::COUNT; pluralIndex++) { + if (globalPlaceholder[pluralIndex] == PH_BEGINNING) { + UnicodeString tmp; + tmp.append(u"{0}", 3); + if (globalJoinerChar != 0) { + tmp.append(globalJoinerChar); + } + tmp.append(outArray[pluralIndex]); + outArray[pluralIndex] = tmp; + } else if (globalPlaceholder[pluralIndex] == PH_END) { + if (globalJoinerChar != 0) { + outArray[pluralIndex].append(globalJoinerChar); + } + outArray[pluralIndex].append(u"{0}", 3); + } } } @@ -623,7 +1346,7 @@ LongNameHandler* LongNameHandler::forCurrencyLongNames(const Locale &loc, const getCurrencyLongNameData(loc, currency, simpleFormats, status); if (U_FAILURE(status)) { return nullptr; } result->simpleFormatsToModifiers(simpleFormats, {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, status); - // TODO(inflections): currency gender? + // TODO(icu-units#28): currency gender? 
return result; } @@ -648,8 +1371,12 @@ void LongNameHandler::multiSimpleFormatsToModifiers(const UnicodeString *leadFor UnicodeString leadFormat = getWithPlural(leadFormats, plural, status); if (U_FAILURE(status)) { return; } UnicodeString compoundFormat; - trailCompiled.format(leadFormat, compoundFormat, status); - if (U_FAILURE(status)) { return; } + if (leadFormat.length() == 0) { + compoundFormat = trailFormat; + } else { + trailCompiled.format(leadFormat, compoundFormat, status); + if (U_FAILURE(status)) { return; } + } SimpleFormatter compoundCompiled(compoundFormat, 0, 1, status); if (U_FAILURE(status)) { return; } fModifiers[i] = SimpleModifier(compoundCompiled, field, false, {this, SIGNUM_POS_ZERO, plural}); @@ -673,16 +1400,24 @@ const Modifier* LongNameHandler::getModifier(Signum /*signum*/, StandardPlural:: void MixedUnitLongNameHandler::forMeasureUnit(const Locale &loc, const MeasureUnit &mixedUnit, const UNumberUnitWidth &width, - StringPiece unitDisplayCase, + const char *unitDisplayCase, const PluralRules *rules, const MicroPropsGenerator *parent, MixedUnitLongNameHandler *fillIn, UErrorCode &status) { - U_ASSERT(mixedUnit.getComplexity(status) == UMEASURE_UNIT_MIXED); U_ASSERT(fillIn != nullptr); + if (U_FAILURE(status)) { + return; + } MeasureUnitImpl temp; const MeasureUnitImpl &impl = MeasureUnitImpl::forMeasureUnit(mixedUnit, temp, status); + if (impl.complexity != UMEASURE_UNIT_MIXED) { + // Should be using the normal LongNameHandler + status = U_UNSUPPORTED_ERROR; + return; + } + fillIn->fMixedUnitCount = impl.singleUnits.length(); fillIn->fMixedUnitData.adoptInstead(new UnicodeString[fillIn->fMixedUnitCount * ARRAY_LENGTH]); for (int32_t i = 0; i < fillIn->fMixedUnitCount; i++) { @@ -814,7 +1549,7 @@ const Modifier *MixedUnitLongNameHandler::getMixedUnitModifier(DecimalQuantity & const Modifier *MixedUnitLongNameHandler::getModifier(Signum /*signum*/, StandardPlural::Form /*plural*/) const { - // TODO(units): investigate this method when 
investigating where + // TODO(icu-units#28): investigate this method when investigating where // ModifierStore::getModifier() gets used. To be sure it remains // unreachable: UPRV_UNREACHABLE; @@ -824,7 +1559,7 @@ const Modifier *MixedUnitLongNameHandler::getModifier(Signum /*signum*/, LongNameMultiplexer *LongNameMultiplexer::forMeasureUnits(const Locale &loc, const MaybeStackVector<MeasureUnit> &units, const UNumberUnitWidth &width, - StringPiece unitDisplayCase, + const char *unitDisplayCase, const PluralRules *rules, const MicroPropsGenerator *parent, UErrorCode &status) {
diff --git a/icu4c/source/i18n/number_longnames.h b/icu4c/source/i18n/number_longnames.h index 8b004f0..cc66aff 100644 --- a/icu4c/source/i18n/number_longnames.h +++ b/icu4c/source/i18n/number_longnames.h
@@ -62,7 +62,7 @@ class LongNameHandler : public MicroPropsGenerator, public ModifierStore, public static void forMeasureUnit(const Locale &loc, const MeasureUnit &unitRef, const UNumberUnitWidth &width, - StringPiece unitDisplayCase, + const char *unitDisplayCase, const PluralRules *rules, const MicroPropsGenerator *parent, LongNameHandler *fillIn, @@ -102,18 +102,25 @@ class LongNameHandler : public MicroPropsGenerator, public ModifierStore, public // Allow macrosToMicroGenerator to call the private default constructor. friend class NumberFormatterImpl; - // Fills in LongNameHandler fields for formatting compound units identified - // via `unit` and `perUnit`. Both `unit` and `perUnit` need to be built-in - // units (for which data exists). - static void forCompoundUnit(const Locale &loc, - const MeasureUnit &unit, - const MeasureUnit &perUnit, - const UNumberUnitWidth &width, - StringPiece unitDisplayCase, - const PluralRules *rules, - const MicroPropsGenerator *parent, - LongNameHandler *fillIn, - UErrorCode &status); + // Fills in LongNameHandler fields for formatting units identified by `unit`. + static void forArbitraryUnit(const Locale &loc, + const MeasureUnit &unit, + const UNumberUnitWidth &width, + const char *unitDisplayCase, + LongNameHandler *fillIn, + UErrorCode &status); + + // Roughly corresponds to patternTimes(...) in the spec: + // https://unicode.org/reports/tr35/tr35-general.html#compound-units + // + // productUnit is an rvalue reference to indicate this function consumes it, + // leaving it in a not-useful / undefined state. + static void processPatternTimes(MeasureUnitImpl &&productUnit, + Locale loc, + const UNumberUnitWidth &width, + const char *caseVariant, + UnicodeString *outArray, + UErrorCode &status); // Sets fModifiers to use the patterns from `simpleFormats`. 
void simpleFormatsToModifiers(const UnicodeString *simpleFormats, Field field, UErrorCode &status); @@ -122,7 +129,7 @@ class LongNameHandler : public MicroPropsGenerator, public ModifierStore, public // and `trailFormat` appended to each. // // With a leadFormat of "{0}m" and a trailFormat of "{0}/s", it produces a - // pattern of "{0}m/s" by inserting the leadFormat pattern into trailFormat. + // pattern of "{0}m/s" by inserting each leadFormat pattern into trailFormat. void multiSimpleFormatsToModifiers(const UnicodeString *leadFormats, UnicodeString trailFormat, Field field, UErrorCode &status); }; @@ -153,7 +160,7 @@ class MixedUnitLongNameHandler : public MicroPropsGenerator, public ModifierStor static void forMeasureUnit(const Locale &loc, const MeasureUnit &mixedUnit, const UNumberUnitWidth &width, - StringPiece unitDisplayCase, + const char *unitDisplayCase, const PluralRules *rules, const MicroPropsGenerator *parent, MixedUnitLongNameHandler *fillIn, @@ -230,7 +237,7 @@ class LongNameMultiplexer : public MicroPropsGenerator, public UMemory { static LongNameMultiplexer *forMeasureUnits(const Locale &loc, const MaybeStackVector<MeasureUnit> &units, const UNumberUnitWidth &width, - StringPiece unitDisplayCase, + const char *unitDisplayCase, const PluralRules *rules, const MicroPropsGenerator *parent, UErrorCode &status);
diff --git a/icu4c/source/i18n/unicode/measunit.h b/icu4c/source/i18n/unicode/measunit.h index cedf6ea..ead3dee 100644 --- a/icu4c/source/i18n/unicode/measunit.h +++ b/icu4c/source/i18n/unicode/measunit.h
@@ -32,6 +32,12 @@ U_NAMESPACE_BEGIN class StringEnumeration; class MeasureUnitImpl; +namespace number { +namespace impl { +class LongNameHandler; +} +} // namespace number + #ifndef U_HIDE_DRAFT_API /** * Enumeration for unit complexity. There are three levels: @@ -3708,6 +3714,9 @@ class U_I18N_API MeasureUnit: public UObject { LocalArray<MeasureUnit> splitToSingleUnitsImpl(int32_t& outCount, UErrorCode& status) const; friend class MeasureUnitImpl; + + // For access to findBySubType + friend class number::impl::LongNameHandler; }; #ifndef U_HIDE_DRAFT_API // @draft ICU 68
diff --git a/icu4c/source/test/intltest/numbertest.h b/icu4c/source/test/intltest/numbertest.h index 12ce045..3285ff7 100644 --- a/icu4c/source/test/intltest/numbertest.h +++ b/icu4c/source/test/intltest/numbertest.h
@@ -59,6 +59,7 @@ class NumberFormatterApiTest : public IntlTestWithFieldPosition { void notationCompact(); void unitMeasure(); void unitCompoundMeasure(); + void unitArbitraryMeasureUnits(); void unitSkeletons(); void unitUsage(); void unitUsageErrorCodes(); @@ -174,6 +175,7 @@ class NumberFormatterApiTest : public IntlTestWithFieldPosition { int32_t length); struct UnitInflectionTestCase { + const char *unitIdentifier; const char *locale; const char *unitDisplayCase; double value; @@ -181,10 +183,10 @@ class NumberFormatterApiTest : public IntlTestWithFieldPosition { }; void runUnitInflectionsTestCases(UnlocalizedNumberFormatter unf, - const UChar *skeleton, - const UChar *conciseSkeleton, + UnicodeString skeleton, const UnitInflectionTestCase *cases, - int32_t numCases); + int32_t numCases, + IcuTestErrorCode &status); }; class DecimalQuantityTest : public IntlTest {
diff --git a/icu4c/source/test/intltest/numbertest_api.cpp b/icu4c/source/test/intltest/numbertest_api.cpp index 8ddaa53..77be3ae 100644 --- a/icu4c/source/test/intltest/numbertest_api.cpp +++ b/icu4c/source/test/intltest/numbertest_api.cpp
@@ -80,6 +80,7 @@ void NumberFormatterApiTest::runIndexedTest(int32_t index, UBool exec, const cha TESTCASE_AUTO(notationCompact); TESTCASE_AUTO(unitMeasure); TESTCASE_AUTO(unitCompoundMeasure); + TESTCASE_AUTO(unitArbitraryMeasureUnits); TESTCASE_AUTO(unitSkeletons); TESTCASE_AUTO(unitUsage); TESTCASE_AUTO(unitUsageErrorCodes); @@ -584,22 +585,21 @@ void NumberFormatterApiTest::unitMeasure() { u"0.0088 meters", u"0 meters"); -// // TODO(ICU-20941): Support formatting for not-built-in units -// assertFormatDescending( -// u"Hectometers", -// u"measure-unit/length-hectometer", -// u"unit/hectometer", -// NumberFormatter::with().unit(MeasureUnit::forIdentifier("hectometer", status)), -// Locale::getEnglish(), -// u"87,650 hm", -// u"8,765 hm", -// u"876.5 hm", -// u"87.65 hm", -// u"8.765 hm", -// u"0.8765 hm", -// u"0.08765 hm", -// u"0.008765 hm", -// u"0 hm"); + assertFormatDescending( + u"Hectometers", + u"unit/hectometer", + u"unit/hectometer", + NumberFormatter::with().unit(MeasureUnit::forIdentifier("hectometer", status)), + Locale::getEnglish(), + u"87,650 hm", + u"8,765 hm", + u"876.5 hm", + u"87.65 hm", + u"8.765 hm", + u"0.8765 hm", + u"0.08765 hm", + u"0.008765 hm", + u"0 hm"); // TODO: Implement Measure in C++ // assertFormatSingleMeasure( @@ -717,15 +717,14 @@ void NumberFormatterApiTest::unitMeasure() { 5, u"5 a\u00F1os"); - // TODO(ICU-20941): arbitrary unit formatting -// assertFormatSingle( -// u"Hubble Constant", -// u"unit/kilometer-per-megaparsec-second", -// u"unit/kilometer-per-megaparsec-second", -// NumberFormatter::with().unit(MeasureUnit::forIdentifier("kilometer-per-megaparsec-second", status)), -// Locale("en"), -// 74, // Approximate 2019-03-18 measurement -// u"74 km/s.Mpc"); + assertFormatSingle( + u"Hubble Constant - usually expressed in km/s/Mpc", + u"unit/kilometer-per-megaparsec-second", + u"unit/kilometer-per-megaparsec-second", + NumberFormatter::with().unit(MeasureUnit::forIdentifier("kilometer-per-second-per-megaparsec", 
status)), + Locale("en"), + 74, // Approximate 2019-03-18 measurement + u"74 km/Mpc⋅sec"); assertFormatSingle( u"Mixed unit", @@ -1060,7 +1059,7 @@ void NumberFormatterApiTest::unitCompoundMeasure() { status.assertSuccess(); // Error is only returned once we try to format. FormattedNumber num = nf.formatDouble(2.4, status); if (!status.expectErrorAndReset(U_UNSUPPORTED_ERROR)) { - errln(UnicodeString("Expected failure, got: \"") + + errln(UnicodeString("Expected failure for unit/furlong-pascal per-unit/length-meter, got: \"") + nf.formatDouble(2.4, status).toString(status) + "\"."); status.assertSuccess(); } @@ -1088,6 +1087,167 @@ void NumberFormatterApiTest::unitCompoundMeasure() { u"2.4 m/s\u00B2"); } +void NumberFormatterApiTest::unitArbitraryMeasureUnits() { + IcuTestErrorCode status(*this, "unitArbitraryMeasureUnits()"); + + // TODO: fix after data bug is resolved? See CLDR-14510. +// assertFormatSingle( +// u"Binary unit prefix: kibibyte", +// u"unit/kibibyte", +// u"unit/kibibyte", +// NumberFormatter::with().unit(MeasureUnit::forIdentifier("kibibyte", status)), +// Locale("en-GB"), +// 2.4, +// u"2.4 KiB"); + + assertFormatSingle( + u"Binary unit prefix: kibibyte full-name", + u"unit/kibibyte unit-width-full-name", + u"unit/kibibyte unit-width-full-name", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("kibibyte", status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME), + Locale("en-GB"), + 2.4, + u"2.4 kibibytes"); + + assertFormatSingle( + u"Binary unit prefix: kibibyte full-name", + u"unit/kibibyte unit-width-full-name", + u"unit/kibibyte unit-width-full-name", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("kibibyte", status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME), + Locale("de"), + 2.4, + u"2,4 Kibibyte"); + + assertFormatSingle( + u"Binary prefix for non-digital units: kibimeter", + u"unit/kibimeter", + u"unit/kibimeter", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("kibimeter", status)), + Locale("en-GB"), + 
2.4, + u"2.4 Kim"); + + assertFormatSingle( + u"SI prefix falling back to root: microohm", + u"unit/microohm", + u"unit/microohm", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("microohm", status)), + Locale("de-CH"), + 2.4, + u"2.4 μΩ"); + + assertFormatSingle( + u"de-CH fallback to de: microohm unit-width-full-name", + u"unit/microohm unit-width-full-name", + u"unit/microohm unit-width-full-name", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("microohm", status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME), + Locale("de-CH"), + 2.4, + u"2.4\u00A0Mikroohm"); + + assertFormatSingle( + u"No prefixes, 'times' pattern: joule-furlong", + u"unit/joule-furlong", + u"unit/joule-furlong", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("joule-furlong", status)), + Locale("en"), + 2.4, + u"2.4 J⋅fur"); + + assertFormatSingle( + u"No numeratorUnitString: per-second", + u"unit/per-second", + u"unit/per-second", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("per-second", status)), + Locale("de-CH"), + 2.4, + u"2.4/s"); + + assertFormatSingle( + u"No numeratorUnitString: per-second unit-width-full-name", + u"unit/per-second unit-width-full-name", + u"unit/per-second unit-width-full-name", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("per-second", status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME), + Locale("de-CH"), + 2.4, + u"2.4 pro Sekunde"); + + assertFormatSingle( + u"Prefix in the denominator: nanogram-per-picobarrel", + u"unit/nanogram-per-picobarrel", + u"unit/nanogram-per-picobarrel", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("nanogram-per-picobarrel", status)), + Locale("en-ZA"), + 2.4, + u"2,4 ng/pbbl"); + + assertFormatSingle( + u"Prefix in the denominator: nanogram-per-picobarrel unit-width-full-name", + u"unit/nanogram-per-picobarrel unit-width-full-name", + u"unit/nanogram-per-picobarrel unit-width-full-name", + NumberFormatter::with() + 
.unit(MeasureUnit::forIdentifier("nanogram-per-picobarrel", status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME), + Locale("en-ZA"), + 2.4, + u"2,4 nanograms per picobarrel"); + + // Valid MeasureUnit, but unformattable, because we only have patterns for + // pow2 and pow3 at this time: + LocalizedNumberFormatter lnf = NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("pow4-mile", status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME) + .locale("en-ZA"); + lnf.formatInt(1, status); + status.expectErrorAndReset(U_RESOURCE_TYPE_MISMATCH); + + assertFormatSingle( + u"kibijoule-foot-per-cubic-gigafurlong-square-second unit-width-full-name", + u"unit/kibijoule-foot-per-cubic-gigafurlong-square-second unit-width-full-name", + u"unit/kibijoule-foot-per-cubic-gigafurlong-square-second unit-width-full-name", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("kibijoule-foot-per-cubic-gigafurlong-square-second", + status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME), + Locale("en-ZA"), + 2.4, + u"2,4 kibijoule-feet per cubic gigafurlong-square second"); + + assertFormatSingle( + u"kibijoule-foot-per-cubic-gigafurlong-square-second unit-width-full-name", + u"unit/kibijoule-foot-per-cubic-gigafurlong-square-second unit-width-full-name", + u"unit/kibijoule-foot-per-cubic-gigafurlong-square-second unit-width-full-name", + NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("kibijoule-foot-per-cubic-gigafurlong-square-second", + status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME), + Locale("de-CH"), + 2.4, + u"2.4\u00A0Kibijoule⋅Fuss pro Kubikgigafurlong⋅Quadratsekunde"); + + // TODO(ICU-21504): We want to be able to format this, but "100-kilometer" + // is not yet supported when it's not part of liter-per-100-kilometer: + lnf = NumberFormatter::with() + .unit(MeasureUnit::forIdentifier("kilowatt-hour-per-100-kilometer", status)) + .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME) + .locale("en-ZA"); + lnf.formatInt(1, status); + status.expectErrorAndReset(U_UNSUPPORTED_ERROR); +} + 
// TODO: merge these tests into numbertest_skeletons.cpp instead of here: void NumberFormatterApiTest::unitSkeletons() { const struct TestCase { @@ -1929,28 +2089,37 @@ void NumberFormatterApiTest::unitCurrency() { } void NumberFormatterApiTest::runUnitInflectionsTestCases(UnlocalizedNumberFormatter unf, - const UChar *skeleton, - const UChar *conciseSkeleton, + UnicodeString skeleton, const UnitInflectionTestCase *cases, - int32_t numCases) { + int32_t numCases, + IcuTestErrorCode &status) { for (int32_t i = 0; i < numCases; i++) { UnitInflectionTestCase t = cases[i]; + status.assertSuccess(); + MeasureUnit mu = MeasureUnit::forIdentifier(t.unitIdentifier, status); + if (status.errIfFailureAndReset("MeasureUnit::forIdentifier(\"%s\", ...) failed", + t.unitIdentifier)) { + continue; + }; + UnicodeString skelString = UnicodeString("unit/") + t.unitIdentifier + u" " + skeleton; const UChar *skel; const UChar *cSkel; if (t.unitDisplayCase == nullptr || t.unitDisplayCase[0] == 0) { - unf = unf.unitDisplayCase(""); - skel = skeleton; - cSkel = conciseSkeleton; + unf = unf.unit(mu).unitDisplayCase(""); + skel = skelString.getTerminatedBuffer(); + cSkel = skelString.getTerminatedBuffer(); } else { - unf = unf.unitDisplayCase(t.unitDisplayCase); + unf = unf.unit(mu).unitDisplayCase(t.unitDisplayCase); + // No skeleton support for unitDisplayCase yet. skel = nullptr; cSkel = nullptr; } - assertFormatSingle((UnicodeString("\"") + skeleton + u"\", locale=\"" + t.locale + - u"\", case=\"" + (t.unitDisplayCase ? t.unitDisplayCase : "") + - u"\", value=" + t.value) + assertFormatSingle((UnicodeString("Unit: \"") + t.unitIdentifier + ("\", \"") + skeleton + + u"\", locale=\"" + t.locale + u"\", case=\"" + + (t.unitDisplayCase ? 
t.unitDisplayCase : "") + u"\", value=" + t.value) .getTerminatedBuffer(), skel, cSkel, unf, Locale(t.locale), t.value, t.expected); + status.assertSuccess(); } } @@ -1959,110 +2128,122 @@ void NumberFormatterApiTest::unitInflections() { UnlocalizedNumberFormatter unf; const UChar *skeleton; - const UChar *conciseSkeleton; { // Simple inflected form test - test case based on the example in CLDR's // grammaticalFeatures.xml - unf = NumberFormatter::with().unit(NoUnit::percent()).unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); - skeleton = u"percent unit-width-full-name"; - conciseSkeleton = u"% unit-width-full-name"; + unf = NumberFormatter::with().unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); + skeleton = u"unit-width-full-name"; const UnitInflectionTestCase percentCases[] = { - {"ru", nullptr, 10, u"10 процентов"}, // many - {"ru", "genitive", 10, u"10 процентов"}, // many - {"ru", nullptr, 33, u"33 процента"}, // few - {"ru", "genitive", 33, u"33 процентов"}, // few - {"ru", nullptr, 1, u"1 процент"}, // one - {"ru", "genitive", 1, u"1 процента"}, // one + {"percent", "ru", nullptr, 10, u"10 процентов"}, // many + {"percent", "ru", "genitive", 10, u"10 процентов"}, // many + {"percent", "ru", nullptr, 33, u"33 процента"}, // few + {"percent", "ru", "genitive", 33, u"33 процентов"}, // few + {"percent", "ru", nullptr, 1, u"1 процент"}, // one + {"percent", "ru", "genitive", 1, u"1 процента"}, // one }; - runUnitInflectionsTestCases(unf, skeleton, conciseSkeleton, percentCases, - UPRV_LENGTHOF(percentCases)); + runUnitInflectionsTestCases(unf, skeleton, percentCases, UPRV_LENGTHOF(percentCases), status); } { - // Testing "de" rules: - // <deriveComponent feature="case" structure="per" value0="compound" value1="accusative"/> - // <deriveComponent feature="plural" structure="per" value0="compound" value1="one"/> - // - // per-patterns use accusative, but happen to match nominative, so we're - // not testing value1 in the first rule above. 
- - unf = NumberFormatter::with().unit(MeasureUnit::getMeter()).unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); - skeleton = u"unit/meter unit-width-full-name"; - conciseSkeleton = u"unit/meter unit-width-full-name"; + // General testing of inflection rules + unf = NumberFormatter::with().unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); + skeleton = u"unit-width-full-name"; const UnitInflectionTestCase meterCases[] = { - {"de", nullptr, 1, u"1 Meter"}, - {"de", "genitive", 1, u"1 Meters"}, - {"de", nullptr, 2, u"2 Meter"}, - {"de", "dative", 2, u"2 Metern"}, - }; - runUnitInflectionsTestCases(unf, skeleton, conciseSkeleton, meterCases, - UPRV_LENGTHOF(meterCases)); + // Check up on the basic values that the compound patterns below are + // derived from: + {"meter", "de", nullptr, 1, u"1 Meter"}, + {"meter", "de", "genitive", 1, u"1 Meters"}, + {"meter", "de", nullptr, 2, u"2 Meter"}, + {"meter", "de", "dative", 2, u"2 Metern"}, + {"mile", "de", nullptr, 1, u"1 Meile"}, + {"mile", "de", nullptr, 2, u"2 Meilen"}, + {"day", "de", nullptr, 1, u"1 Tag"}, + {"day", "de", "genitive", 1, u"1 Tages"}, + {"day", "de", nullptr, 2, u"2 Tage"}, + {"day", "de", "dative", 2, u"2 Tagen"}, + {"decade", "de", nullptr, 1, u"1\u00A0Jahrzehnt"}, + {"decade", "de", nullptr, 2, u"2\u00A0Jahrzehnte"}, - unf = NumberFormatter::with().unit(MeasureUnit::getDay()).unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); - skeleton = u"unit/day unit-width-full-name"; - conciseSkeleton = u"unit/day unit-width-full-name"; - const UnitInflectionTestCase dayCases[] = { - {"de", nullptr, 1, u"1 Tag"}, - {"de", "genitive", 1, u"1 Tages"}, - {"de", nullptr, 2, u"2 Tage"}, - {"de", "dative", 2, u"2 Tagen"}, - }; - runUnitInflectionsTestCases(unf, skeleton, conciseSkeleton, dayCases, UPRV_LENGTHOF(dayCases)); + // Testing de "per" rules: + // <deriveComponent feature="case" structure="per" value0="compound" value1="accusative"/> + // <deriveComponent feature="plural" structure="per" value0="compound" value1="one"/> + // per-patterns use 
accusative, but since the accusative form + // matches the nominative form, we're not effectively testing value1 + // in the "case & per" rule above. - // Day has a perUnitPattern - unf = NumberFormatter::with() - .unit(MeasureUnit::forIdentifier("meter-per-day", status)) - .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); - skeleton = u"unit/meter-per-day unit-width-full-name"; - conciseSkeleton = u"unit/meter-per-day unit-width-full-name"; - const UnitInflectionTestCase meterPerDayCases[] = { - {"de", nullptr, 1, u"1 Meter pro Tag"}, - {"de", "genitive", 1, u"1 Meters pro Tag"}, - {"de", nullptr, 2, u"2 Meter pro Tag"}, - {"de", "dative", 2, u"2 Metern pro Tag"}, - // testing code path that falls back to "root" but does not inflect: - {"af", nullptr, 1, u"1 meter per dag"}, - {"af", "dative", 1, u"1 meter per dag"}, - }; - runUnitInflectionsTestCases(unf, skeleton, conciseSkeleton, meterPerDayCases, - UPRV_LENGTHOF(meterPerDayCases)); + // We have a perUnitPattern for "day" in de, so "per" rules are not + // applied for these: + {"meter-per-day", "de", nullptr, 1, u"1 Meter pro Tag"}, + {"meter-per-day", "de", "genitive", 1, u"1 Meters pro Tag"}, + {"meter-per-day", "de", nullptr, 2, u"2 Meter pro Tag"}, + {"meter-per-day", "de", "dative", 2, u"2 Metern pro Tag"}, - // Decade does not have a perUnitPattern at this time (CLDR 39 / ICU - // 69), so we can test for the correct form of the per part: - unf = NumberFormatter::with() - .unit(MeasureUnit::forIdentifier("parsec-per-decade", status)) - .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); - skeleton = u"unit/parsec-per-decade unit-width-full-name"; - conciseSkeleton = u"unit/parsec-per-decade unit-width-full-name"; - // Fragile test cases: these cases will break when whitespace is more - // consistently applied. 
- const UnitInflectionTestCase parsecPerDecadeCases[] = { - {"de", nullptr, 1, u"1\u00A0Parsec pro Jahrzehnt"}, - {"de", "genitive", 1, u"1 Parsec pro Jahrzehnt"}, - {"de", nullptr, 2, u"2\u00A0Parsec pro Jahrzehnt"}, - {"de", "dative", 2, u"2 Parsec pro Jahrzehnt"}, + // testing code path that falls back to "root" grammaticalFeatures + // but does not inflect: + {"meter-per-day", "af", nullptr, 1, u"1 meter per dag"}, + {"meter-per-day", "af", "dative", 1, u"1 meter per dag"}, + + // Decade does not have a perUnitPattern at this time (CLDR 39 / ICU + // 69), so we can use it to test for selection of correct plural form. + // - Note: fragile test cases, these cases will break when + // whitespace is more consistently applied. + {"parsec-per-decade", "de", nullptr, 1, u"1\u00A0Parsec pro Jahrzehnt"}, + {"parsec-per-decade", "de", "genitive", 1, u"1 Parsec pro Jahrzehnt"}, + {"parsec-per-decade", "de", nullptr, 2, u"2\u00A0Parsec pro Jahrzehnt"}, + {"parsec-per-decade", "de", "dative", 2, u"2 Parsec pro Jahrzehnt"}, + + // Testing de "times", "power" and "prefix" rules: + // + // <deriveComponent feature="plural" structure="times" value0="one" value1="compound"/> + // <deriveComponent feature="case" structure="times" value0="nominative" value1="compound"/> + // + // <deriveComponent feature="plural" structure="prefix" value0="one" value1="compound"/> + // <deriveComponent feature="case" structure="prefix" value0="nominative" value1="compound"/> + // + // Prefixes in German don't change with plural or case, so these + // tests can't test value0 of the following two rules: + // <deriveComponent feature="plural" structure="power" value0="one" value1="compound"/> + // <deriveComponent feature="case" structure="power" value0="nominative" value1="compound"/> + {"square-decimeter-dekameter", "de", nullptr, 1, u"1 Quadratdezimeter⋅Dekameter"}, + {"square-decimeter-dekameter", "de", "genitive", 1, u"1 Quadratdezimeter⋅Dekameters"}, + {"square-decimeter-dekameter", "de", 
nullptr, 2, u"2 Quadratdezimeter⋅Dekameter"}, + {"square-decimeter-dekameter", "de", "dative", 2, u"2 Quadratdezimeter⋅Dekametern"}, + // Feminine "Meile" better demonstrates singular-vs-plural form: + {"cubic-mile-dekamile", "de", nullptr, 1, u"1 Kubikmeile⋅Dekameile"}, + {"cubic-mile-dekamile", "de", nullptr, 2, u"2 Kubikmeile⋅Dekameilen"}, + + // French handles plural "times" and "power" structures differently: + // plural form impacts all "numerator" units (denominator remains + // singular like German), and "pow2" prefixes have different forms + // <deriveComponent feature="plural" structure="times" value0="compound" value1="compound"/> + // <deriveComponent feature="plural" structure="power" value0="compound" value1="compound"/> + // TODO: this looks wrong, and will change if CLDR-14533 causes a change: + {"square-decimeter-square-second", "fr", nullptr, 1, u"1\u00A0décimètre carréseconde carrée"}, + {"square-decimeter-square-second", "fr", nullptr, 2, u"2\u00A0décimètres carréssecondes carrées"}, }; - runUnitInflectionsTestCases(unf, skeleton, conciseSkeleton, parsecPerDecadeCases, - UPRV_LENGTHOF(parsecPerDecadeCases)); + runUnitInflectionsTestCases(unf, skeleton, meterCases, UPRV_LENGTHOF(meterCases), status); } { // Testing inflection of mixed units: - unf = NumberFormatter::with() - .unit(MeasureUnit::forIdentifier("meter-and-centimeter", status)) - .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); - skeleton = u"unit/meter-and-centimeter unit-width-full-name"; - conciseSkeleton = u"unit/meter-and-centimeter unit-width-full-name"; + unf = NumberFormatter::with().unitWidth(UNUM_UNIT_WIDTH_FULL_NAME); + skeleton = u"unit-width-full-name"; const UnitInflectionTestCase meterPerDayCases[] = { + {"meter", "de", nullptr, 1, u"1 Meter"}, + {"meter", "de", "genitive", 1, u"1 Meters"}, + {"meter", "de", "dative", 2, u"2 Metern"}, + {"centimeter", "de", nullptr, 1, u"1 Zentimeter"}, + {"centimeter", "de", "genitive", 1, u"1 Zentimeters"}, + {"centimeter", "de", "dative", 10, 
u"10 Zentimetern"}, // TODO(CLDR-14502): check that these inflections are correct, and // whether CLDR needs any rules for them (presumably CLDR spec // should mention it, if it's a consistent rule): - {"de", nullptr, 1.01, u"1 Meter, 1 Zentimeter"}, - {"de", "genitive", 1.01, u"1 Meters, 1 Zentimeters"}, - {"de", "genitive", 1.1, u"1 Meters, 10 Zentimeter"}, - {"de", "dative", 1.1, u"1 Meter, 10 Zentimetern"}, - {"de", "dative", 2.1, u"2 Metern, 10 Zentimetern"}, + {"meter-and-centimeter", "de", nullptr, 1.01, u"1 Meter, 1 Zentimeter"}, + {"meter-and-centimeter", "de", "genitive", 1.01, u"1 Meters, 1 Zentimeters"}, + {"meter-and-centimeter", "de", "genitive", 1.1, u"1 Meters, 10 Zentimeter"}, + {"meter-and-centimeter", "de", "dative", 1.1, u"1 Meter, 10 Zentimetern"}, + {"meter-and-centimeter", "de", "dative", 2.1, u"2 Metern, 10 Zentimetern"}, }; - runUnitInflectionsTestCases(unf, skeleton, conciseSkeleton, meterPerDayCases, - UPRV_LENGTHOF(meterPerDayCases)); + runUnitInflectionsTestCases(unf, skeleton, meterPerDayCases, UPRV_LENGTHOF(meterPerDayCases), + status); } // TODO: add a usage case that selects between preferences with different // genders (e.g. year, month, day, hour). 
@@ -2078,16 +2259,26 @@ void NumberFormatterApiTest::unitGender() { const char *expectedGender; } cases[] = { {"de", "meter", "masculine"}, + {"de", "second", "feminine"}, {"de", "minute", "feminine"}, {"de", "hour", "feminine"}, {"de", "day", "masculine"}, {"de", "year", "neuter"}, + {"fr", "meter", "masculine"}, + {"fr", "second", "feminine"}, {"fr", "minute", "feminine"}, {"fr", "hour", "feminine"}, {"fr", "day", "masculine"}, - // grammaticalFeatures deriveCompound "per" rule: + // grammaticalFeatures deriveCompound "per" rule takes the gender of the + // numerator unit: {"de", "meter-per-hour", "masculine"}, - {"af", "meter-per-hour", ""}, + {"fr", "meter-per-hour", "masculine"}, + {"af", "meter-per-hour", ""}, // ungendered language + // French "times" takes gender from first value, German takes the + // second. Prefix and power do not have an impact on gender for these + // languages: + {"de", "square-decimeter-square-second", "feminine"}, + {"fr", "square-decimeter-square-second", "masculine"}, // TODO(ICU-21494): determine whether list genders behave as follows, // and implement proper getListGender support (covering more than just // two genders): @@ -2101,13 +2292,22 @@ void NumberFormatterApiTest::unitGender() { FormattedNumber fn; for (const TestCase &t : cases) { // TODO(icu-units#140): make this work for more than just UNUM_UNIT_WIDTH_FULL_NAME + // formatter = NumberFormatter::with() + // .unit(MeasureUnit::forIdentifier(t.unitIdentifier, status)) + // .locale(Locale(t.locale)); + // fn = formatter.formatDouble(1.1, status); + // assertEquals(UnicodeString("Testing gender with default width, unit: ") + t.unitIdentifier + + // ", locale: " + t.locale, + // t.expectedGender, fn.getGender(status)); + // status.assertSuccess(); + formatter = NumberFormatter::with() .unit(MeasureUnit::forIdentifier(t.unitIdentifier, status)) .unitWidth(UNUM_UNIT_WIDTH_FULL_NAME) .locale(Locale(t.locale)); fn = formatter.formatDouble(1.1, status); - 
assertEquals(UnicodeString("Testing gender, unit: ") + t.unitIdentifier + - ", locale: " + t.locale, + assertEquals(UnicodeString("Testing gender with UNUM_UNIT_WIDTH_FULL_NAME, unit: ") + + t.unitIdentifier + ", locale: " + t.locale, t.expectedGender, fn.getGender(status)); status.assertSuccess(); }