ICU-11380 genuca tool changes for root radical-stroke order; merge tools/trunk r36730 into tags/release-54; this was used to generate ICU 54 ucadata.icu
X-SVN-Rev: 36800
diff --git a/unicode/c/genuca/genuca.cpp b/unicode/c/genuca/genuca.cpp
index b131936..62c3226 100644
--- a/unicode/c/genuca/genuca.cpp
+++ b/unicode/c/genuca/genuca.cpp
@@ -25,6 +25,7 @@
#include "unicode/utypes.h"
#include "unicode/errorcode.h"
#include "unicode/localpointer.h"
+#include "unicode/utf8.h"
#include "charstr.h"
#include "cmemory.h"
#include "collation.h"
@@ -60,8 +61,16 @@
U_NAMESPACE_USE
+enum HanOrderValue {  // Han character order selected with the --hanOrder command-line option.
+ HAN_NO_ORDER = -1,  // Initial value of hanOrder; if still set after option parsing, main() prints the usage text.
+ HAN_IMPLICIT,  // "--hanOrder implicit": the [Unified_Ideograph] ranges are given to the builder.
+ HAN_RADICAL_STROKE  // "--hanOrder radical-stroke": the ranges from the [radical ...] lines are given to the builder.
+};
+
static UBool beVerbose=FALSE, withCopyright=TRUE;
+static HanOrderValue hanOrder = HAN_NO_ORDER;
+
static UVersionInfo UCAVersion={ 0, 0, 0, 0 };
static UDataInfo ucaDataInfo={
@@ -225,11 +234,54 @@
return -1; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE.
}
+/**
+ * Collects Unified_Ideograph code point ranges in the given order; that order is handed to the builder, which maps them to primary CEs.
+ */
+class HanOrder {
+public:
+ HanOrder(UErrorCode &errorCode) : ranges(errorCode), set(), done(FALSE) {}  // Starts empty and not done.
+
+ void addRange(UChar32 start, UChar32 end, UErrorCode &errorCode) {  // Appends start..end to the ordered list and to the set.
+ int32_t length = ranges.size();
+ if(length > 0 && (ranges.elementAti(length - 1) + 1) == start) {  // The last stored element is the previous range's end.
+ // The previous range end is just before this range start: Merge adjacent ranges.
+ ranges.setElementAt(end, length - 1);
+ } else {
+ ranges.addElement(start, errorCode);
+ ranges.addElement(end, errorCode);
+ }
+ set.add(start, end);  // Same code points without ordering; callers use it for containment and equality checks.
+ }
+
+ void setBuilderHanOrder(CollationBaseDataBuilder &builder, UErrorCode &errorCode) {  // Passes the collected ranges to the builder.
+ if(U_FAILURE(errorCode)) { return; }  // On a prior failure nothing is passed on and done stays unchanged.
+ builder.initHanRanges(ranges.getBuffer(), ranges.size(), errorCode);
+ done = TRUE;
+ }
+
+ void setDone() {  // Marks this order as complete without touching the builder.
+ done = TRUE;
+ }
+
+ UBool isDone() { return done; }  // TRUE once setDone() or a successful-entry setBuilderHanOrder() has run.
+
+ const UnicodeSet &getSet() { return set; }  // All code points added so far.
+
+private:
+ UVector32 ranges;  // Flat list of pairs in insertion order: start0, end0, start1, end1, ...
+ UnicodeSet set;  // Union of all ranges passed to addRange().
+ UBool done;  // Completion flag read via isDone().
+};
+
+static HanOrder *implicitHanOrder = NULL;  // Allocated when the [Unified_Ideograph] line is read; non-NULL also marks that line as already seen.
+static HanOrder *radicalStrokeOrder = NULL;  // Allocated on the first [radical ...] line, which must come after [Unified_Ideograph].
+
enum ActionType {
READCE,
READPRIMARY,
READBYTE,
READUNIFIEDIDEOGRAPH,
+ READRADICAL,
READUCAVERSION,
READLEADBYTETOSCRIPTS,
IGNORE
@@ -256,6 +308,7 @@
{"[last trailing", 0, READCE},
{"[Unified_Ideograph", 0, READUNIFIEDIDEOGRAPH},
+ {"[radical", 0, READRADICAL},
{"[fixed first implicit byte", 0, IGNORE},
{"[fixed last implicit byte", 0, IGNORE},
@@ -321,7 +374,12 @@
return;
}
} else if(what_to_do == READUNIFIEDIDEOGRAPH) {
- UVector32 unihan(*status);
+ if(implicitHanOrder != NULL) {
+ fprintf(stderr, "duplicate [Unified_Ideograph] lines\n");
+ *status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+ implicitHanOrder = new HanOrder(*status);
if(U_FAILURE(*status)) { return; }
for(;;) {
if(*pointer == ']') { break; }
@@ -342,11 +400,79 @@
*status = U_INVALID_FORMAT_ERROR;
return;
}
- unihan.addElement((UChar32)start, *status);
- unihan.addElement((UChar32)end, *status);
+ implicitHanOrder->addRange((UChar32)start, (UChar32)end, *status);
pointer = skipWhiteSpace(s);
}
- builder.initHanRanges(unihan.getBuffer(), unihan.size(), *status);
+ if(hanOrder == HAN_IMPLICIT) {
+ implicitHanOrder->setBuilderHanOrder(builder, *status);
+ }
+ implicitHanOrder->setDone();
+ } else if(what_to_do == READRADICAL) {
+ if(radicalStrokeOrder == NULL) {
+ if(implicitHanOrder == NULL) {
+ fprintf(stderr, "[radical] section before [Unified_Ideograph] line\n");
+ *status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+ radicalStrokeOrder = new HanOrder(*status);
+ if(U_FAILURE(*status)) { return; }
+ } else if(radicalStrokeOrder->isDone()) {
+ fprintf(stderr, "duplicate [radical] sections\n");
+ *status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+ UBool ok;
+ if(uprv_strcmp(pointer, "end]") == 0) {
+ if(radicalStrokeOrder->getSet() != implicitHanOrder->getSet()) {
+ fprintf(stderr, "[radical end]: "
+ "some of [Unified_Ideograph] missing from [radical] lines\n");
+ *status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+ if(hanOrder == HAN_RADICAL_STROKE) {
+ radicalStrokeOrder->setBuilderHanOrder(builder, *status);
+ }
+ radicalStrokeOrder->setDone();
+ } else {
+ // Read Han characters and ranges between : and ].
+ // Ignore the radical data before the :.
+ char *startPointer = uprv_strchr(pointer, ':');
+ char *limitPointer = uprv_strchr(pointer, ']');
+ if(startPointer == NULL || limitPointer == NULL ||
+ (startPointer + 1) >= limitPointer) {
+ fprintf(stderr, "[radical]: no Han characters listed between : and ]\n");
+ *status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+ pointer = startPointer + 1;
+ int32_t length = (int32_t)(limitPointer - pointer);
+ for(int32_t i = 0; i < length;) {
+ UChar32 start;
+ U8_NEXT(pointer, i, length, start);
+ UChar32 end;
+ if(pointer[i] == '-') {
+ ++i;
+ U8_NEXT(pointer, i, length, end);
+ } else {
+ end = start;
+ }
+ if(radicalStrokeOrder->getSet().containsSome(start, end)) {
+ fprintf(stderr, "[radical]: some of U+%04x..U+%04x occur "
+ "multiple times in the radical-stroke order\n",
+ start, end);
+ *status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+ if(!implicitHanOrder->getSet().contains(start, end)) {
+ fprintf(stderr, "[radical]: some of U+%04x..U+%04x are "
+ "not Unified_Ideograph\n",
+ start, end);
+ *status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+ radicalStrokeOrder->addRange(start, end, *status);
+ }
+ }
} else if (what_to_do == READUCAVERSION) {
u_versionFromString(UCAVersion, pointer);
if(beVerbose) {
@@ -429,8 +555,8 @@
if(U_FAILURE(*status)) {
return FALSE;
}
- char buffer[2048];
- char *result = fgets(buffer, 2048, data);
+ char buffer[30000];
+ char *result = fgets(buffer, sizeof(buffer), data);
if(result == NULL) {
if(feof(data)) {
return FALSE;
@@ -863,7 +989,10 @@
(long)totalSize + 32); // 32 bytes = DataHeader rounded up to 16-byte boundary
CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
- UNewDataMemory *pData=udata_create(path, "icu", "ucadata", &ucaDataInfo,
+ const char *dataName =
+ hanOrder == HAN_IMPLICIT ? "ucadata-implicithan" :
+ "ucadata-unihan";
+ UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo,
withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n",
@@ -1023,14 +1152,16 @@
HELP_H,
HELP_QUESTION_MARK,
VERBOSE,
- COPYRIGHT
+ COPYRIGHT,
+ HAN_ORDER
};
static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
- UOPTION_COPYRIGHT
+ UOPTION_COPYRIGHT,
+ UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG)  // Read as options[HAN_ORDER]; main() accepts the values "implicit" and "radical-stroke".
};
extern "C" int
@@ -1045,6 +1176,17 @@
"error in command line argument \"%s\"\n",
argv[-argc]);
}
+ if(options[HAN_ORDER].doesOccur) {
+ const char *order = options[HAN_ORDER].value;
+ if(uprv_strcmp(order, "implicit") == 0) {
+ hanOrder = HAN_IMPLICIT;
+ } else if(uprv_strcmp(order, "radical-stroke") == 0) {
+ hanOrder = HAN_RADICAL_STROKE;
+ }
+ }
+ if(hanOrder == HAN_NO_ORDER) {
+ argc = -1;
+ }
if( argc<2 ||
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
) {
@@ -1053,7 +1195,7 @@
* required supported string length is 509 bytes.
*/
fprintf(stderr,
- "Usage: %s [-options] path/to/ICU/src/root\n"
+ "Usage: %s [-options] --hanOrder (implicit|radical-stroke) path/to/ICU/src/root\n"
"\n"
"Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n"
"writes source and binary data files with the collation root data.\n"
@@ -1063,7 +1205,8 @@
"Options:\n"
"\t-h or -? or --help this usage text\n"
"\t-v or --verbose verbose output\n"
- "\t-c or --copyright include a copyright notice\n");
+ "\t-c or --copyright include a copyright notice\n"
+ "\t --hanOrder implicit or radical-stroke\n");
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}