ICU-21246 Handle kilogram SI prefix correctly
Fix kilogram parsing: ignore 'kilogram' as a stem, we have 'gram'.
Failures in the added unit test before the fix: withSIPrefix resulted
in 'microkilogram', and kilogram's prefix was considered to be
"ONE" (i.e. 10^0).
diff --git a/icu4c/source/i18n/measunit_extra.cpp b/icu4c/source/i18n/measunit_extra.cpp
index 0ae76b5..c702e13 100644
--- a/icu4c/source/i18n/measunit_extra.cpp
+++ b/icu4c/source/i18n/measunit_extra.cpp
@@ -14,6 +14,7 @@
#include "charstr.h"
#include "cmemory.h"
+#include "cstring.h"
#include "measunit_impl.h"
#include "resource.h"
#include "uarrsort.h"
@@ -115,20 +116,37 @@
};
/**
- * A ResourceSink that collects table keys from a resource.
+ * A ResourceSink that collects simple unit identifiers from the keys of the
+ * convertUnits table into an array, and adds these values to a TrieBuilder,
+ * with associated values being their index into this array plus a specified
+ * offset, to a trie.
*
- * This class is for use by ures_getAllItemsWithFallback. Example code:
+ * Example code:
*
* UErrorCode status = U_ZERO_ERROR;
- * const char* unitIdentifiers[200];
- * TableKeysSink identifierSink(unitIdentifiers, 200);
+ * BytesTrieBuilder b(status);
+ * const char *unitIdentifiers[200];
+ * SimpleUnitIdentifiersSink identifierSink(unitIdentifiers, 200, b, kTrieValueOffset);
* LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));
* ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status);
*/
-class TableKeysSink : public icu::ResourceSink {
+class SimpleUnitIdentifiersSink : public icu::ResourceSink {
public:
- explicit TableKeysSink(const char **out, int32_t outSize)
- : outArray(out), outSize(outSize), outIndex(0) {
+ /**
+ * Constructor.
+ * @param out Array of char* to which the simple unit identifiers will be
+ * saved.
+ * @param outSize The size of `out`.
+ * @param trieBuilder The trie builder to which the simple unit identifier
+ * should be added. The trie builder must outlive this resource sink.
+ * @param trieValueOffset This is added to the index of the identifier in
+ * the `out` array, before adding to `trieBuilder` as the value
+ * associated with the identifier.
+ */
+ explicit SimpleUnitIdentifiersSink(const char **out, int32_t outSize, BytesTrieBuilder &trieBuilder,
+ int32_t trieValueOffset)
+ : outArray(out), outSize(outSize), trieBuilder(trieBuilder), trieValueOffset(trieValueOffset),
+ outIndex(0) {
}
/**
@@ -154,13 +172,25 @@
for (int32_t i = 0; table.getKeyAndValue(i, key, value); ++i) {
U_ASSERT(i < table.getSize());
U_ASSERT(outIndex < outSize);
- outArray[outIndex++] = key;
+ if (uprv_strcmp(key, "kilogram") == 0) {
+ // For parsing, we use "gram", the prefixless metric mass unit. We
+ // thus ignore the SI Base Unit of Mass: it exists due to being the
+ // mass conversion target unit, but not needed for MeasureUnit
+ // parsing.
+ continue;
+ }
+ outArray[outIndex] = key;
+ trieBuilder.add(key, trieValueOffset + outIndex, status);
+ outIndex++;
}
}
private:
const char **outArray;
int32_t outSize;
+ BytesTrieBuilder &trieBuilder;
+ int32_t trieValueOffset;
+
int32_t outIndex;
};
@@ -228,6 +258,8 @@
ures_getByKey(unitsBundle.getAlias(), "convertUnits", NULL, &status));
if (U_FAILURE(status)) { return; }
+ // Allocate enough space: with identifierSink below skipping kilogram, we're
+ // probably allocating one more than needed.
int32_t simpleUnitsCount = convertUnits.getAlias()->fSize;
int32_t arrayMallocSize = sizeof(char *) * simpleUnitsCount;
gSimpleUnits = static_cast<const char **>(uprv_malloc(arrayMallocSize));
@@ -237,11 +269,9 @@
}
uprv_memset(gSimpleUnits, 0, arrayMallocSize);
- TableKeysSink identifierSink(gSimpleUnits, simpleUnitsCount);
+ // Populate gSimpleUnits and build the associated trie.
+ SimpleUnitIdentifiersSink identifierSink(gSimpleUnits, simpleUnitsCount, b, kSimpleUnitOffset);
ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status);
- for (int i = 0; i < simpleUnitsCount; i++) {
- b.add(gSimpleUnits[i], kSimpleUnitOffset + i, status);
- }
// Build the CharsTrie
// TODO: Use SLOW or FAST here?
diff --git a/icu4c/source/test/intltest/measfmttest.cpp b/icu4c/source/test/intltest/measfmttest.cpp
index b562022..380246d 100644
--- a/icu4c/source/test/intltest/measfmttest.cpp
+++ b/icu4c/source/test/intltest/measfmttest.cpp
@@ -81,6 +81,7 @@
void TestNumericTimeSomeSpecialFormats();
void TestIdentifiers();
void TestInvalidIdentifiers();
+ void TestKilogramIdentifier();
void TestCompoundUnitOperations();
void TestDimensionlessBehaviour();
void Test21060_AddressSanitizerProblem();
@@ -206,6 +207,7 @@
TESTCASE_AUTO(TestNumericTimeSomeSpecialFormats);
TESTCASE_AUTO(TestIdentifiers);
TESTCASE_AUTO(TestInvalidIdentifiers);
+ TESTCASE_AUTO(TestKilogramIdentifier);
TESTCASE_AUTO(TestCompoundUnitOperations);
TESTCASE_AUTO(TestDimensionlessBehaviour);
TESTCASE_AUTO(Test21060_AddressSanitizerProblem);
@@ -3323,6 +3325,45 @@
}
}
+// Kilogram is a "base unit", although it's also "gram" with a kilo- prefix.
+// This tests that it is handled in the preferred manner.
+void MeasureFormatTest::TestKilogramIdentifier() {
+ IcuTestErrorCode status(*this, "TestKilogramIdentifier");
+
+ // SI unit of mass
+ MeasureUnit kilogram = MeasureUnit::forIdentifier("kilogram", status);
+ // Metric mass unit
+ MeasureUnit gram = MeasureUnit::forIdentifier("gram", status);
+ // Microgram: still a built-in type
+ MeasureUnit microgram = MeasureUnit::forIdentifier("microgram", status);
+ // Nanogram: not a built-in type at this time
+ MeasureUnit nanogram = MeasureUnit::forIdentifier("nanogram", status);
+ status.assertSuccess();
+
+ assertEquals("parsed kilogram equals built-in kilogram", MeasureUnit::getKilogram().getType(),
+ kilogram.getType());
+ assertEquals("parsed kilogram equals built-in kilogram", MeasureUnit::getKilogram().getSubtype(),
+ kilogram.getSubtype());
+ assertEquals("parsed gram equals built-in gram", MeasureUnit::getGram().getType(), gram.getType());
+ assertEquals("parsed gram equals built-in gram", MeasureUnit::getGram().getSubtype(),
+ gram.getSubtype());
+ assertEquals("parsed microgram equals built-in microgram", MeasureUnit::getMicrogram().getType(),
+ microgram.getType());
+ assertEquals("parsed microgram equals built-in microgram", MeasureUnit::getMicrogram().getSubtype(),
+ microgram.getSubtype());
+ assertEquals("nanogram", "", nanogram.getType());
+ assertEquals("nanogram", "nanogram", nanogram.getIdentifier());
+
+ assertEquals("prefix of kilogram", UMEASURE_SI_PREFIX_KILO, kilogram.getSIPrefix(status));
+ assertEquals("prefix of gram", UMEASURE_SI_PREFIX_ONE, gram.getSIPrefix(status));
+ assertEquals("prefix of microgram", UMEASURE_SI_PREFIX_MICRO, microgram.getSIPrefix(status));
+ assertEquals("prefix of nanogram", UMEASURE_SI_PREFIX_NANO, nanogram.getSIPrefix(status));
+
+ MeasureUnit tmp = kilogram.withSIPrefix(UMEASURE_SI_PREFIX_MILLI, status);
+ assertEquals(UnicodeString("Kilogram + milli should be milligram, got: ") + tmp.getIdentifier(),
+ MeasureUnit::getMilligram().getIdentifier(), tmp.getIdentifier());
+}
+
void MeasureFormatTest::TestCompoundUnitOperations() {
IcuTestErrorCode status(*this, "TestCompoundUnitOperations");