| // © 2017 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2003, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * |
| * File colprobe.cpp |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 03/18/2003 weiv Creation. |
| ******************************************************************************* |
| */ |
| |
| #include "uoptions.h" |
| #include "unicode/ucol.h" |
| #include "unicode/ucoleitr.h" |
| #include "unicode/ures.h" |
| #include "unicode/uniset.h" |
| #include "unicode/usetiter.h" |
| #include "unicode/ustring.h" |
| #include "unicode/uchar.h" |
| #include "unicode/uscript.h" |
| #include "unicode/locid.h" |
| #include "unicode/ucnv.h" |
| #include "uprops.h" |
| #include "hash.h" |
| #include "ucol_imp.h" |
| |
| #include "unicode/ustdio.h" |
| #include "unicode/utrans.h" |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <fcntl.h> |
| |
| // unix tolower |
| #include <ctype.h> |
| // unix setlocale |
| #include <locale.h> |
| |
| #include "colprobe.h" |
| |
| #include "line.h" |
| #include "sortedlines.h" |
| #include "strengthprobe.h" |
| |
| void testWin(StrengthProbe &probe, UErrorCode &status) ; |
| |
| #if defined WIN32 |
| #include <io.h> |
| #include <windows.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <direct.h> |
| |
| int createDir(const char* dirName) { |
| struct _stat myStat; |
| int result = _stat(dirName, &myStat); |
| |
| if(result == -1) { |
| result = _mkdir(dirName); |
| return result; |
| } else if(myStat.st_mode & _S_IFDIR) { |
| return 0; |
| } else { |
| return 1; |
| } |
| } |
| |
| //#elif defined POSIX |
| #else |
| #include <sys/stat.h> |
| #include <unistd.h> |
| |
| int createDir(const char* dirName) { |
| struct stat myStat; |
| int result = stat(dirName, &myStat); |
| |
| if(result == -1) { |
| result = mkdir(dirName, S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IWOTH|S_IXOTH); |
| return result; |
| } else if(S_ISDIR(myStat.st_mode)) { |
| return 0; |
| } else { |
| return 1; |
| } |
| } |
| // |
| // Stubs for Windows API functions when building on UNIXes. |
| // |
| typedef int DWORD; |
| inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}; |
| //#else |
| //#error "Not POSIX or Windows. Won't work." |
| #endif |
| |
| #include "line.h" |
| |
| static UBool gVerbose = FALSE; |
| static UBool gDebug = FALSE; |
| static UBool gQuiet = FALSE; |
| static UBool gExemplar = FALSE; |
| |
| DWORD gWinLCID; |
| int gCount; |
| UCollator *gCol; |
| UCollator *gUCA; |
| UConverter *utf8cnv; |
| CompareFn gComparer; |
| int gRefNum; |
| UnicodeSet gExcludeSet; |
| UnicodeSet gRepertoire; |
| |
| const UChar separatorChar = 0x0030; |
| |
| UPrinter *logger; |
| UPrinter *debug; |
| UPrinter *tailoringBundle; |
| UPrinter *referenceBundle; |
| UPrinter *bundle; |
| FILE *fTailoringDump; |
| FILE *fDefaultDump; |
| |
| const char *progName = "colprobe"; |
| |
| const char *gLocale = NULL; |
| int32_t platformIndex = -1; |
| int32_t gPlatformNo = 0; |
| int32_t gPlatformIndexes[10]; |
| int32_t gLocaleNo = 0; |
| const char* gLocales[100]; |
| UBool gRulesStdin = FALSE; |
| const char *outputFormat = "HTML"; |
| const char *outExtension = "html"; |
| |
| enum { |
| HELP1, |
| HELP2, |
| VERBOSE, |
| QUIET, |
| VERSION, |
| ICUDATADIR, |
| COPYRIGHT, |
| LOCALE, |
| PLATFORM, |
| DEBUG, |
| EXEMPLAR, |
| RULESSTDIN, |
| REFERENCE, |
| EXCLUDESET, |
| REPERTOIRE, |
| INTERACTIVE, |
| PRINTREF, |
| DIFF, |
| OUTPUT |
| }; |
| |
| UOption options[]={ |
| /*0*/ UOPTION_HELP_H, |
| /*1*/ UOPTION_HELP_QUESTION_MARK, |
| /*2*/ UOPTION_VERBOSE, |
| /*3*/ UOPTION_QUIET, |
| /*4*/ UOPTION_VERSION, |
| /*5*/ UOPTION_ICUDATADIR, |
| /*6*/ UOPTION_COPYRIGHT, |
| /*7*/ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG), |
| /*8*/ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG), |
| /*9*/ UOPTION_DEF("debug", 'D', UOPT_NO_ARG), |
| /*10*/ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG), |
| /*11*/ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG), |
| /*12*/ UOPTION_DEF("ref", 'c', UOPT_REQUIRES_ARG), |
| /*13*/ UOPTION_DEF("excludeset", 'x', UOPT_REQUIRES_ARG), |
| /*14*/ UOPTION_DEF("repertoire", 't', UOPT_REQUIRES_ARG), |
| /*15*/ UOPTION_DEF("interactive", 'I', UOPT_NO_ARG), |
| /*16*/ UOPTION_DEF("printref", 0, UOPT_NO_ARG), |
| /*17*/ UOPTION_DEF("diff", 0, UOPT_NO_ARG), |
| /*18*/ UOPTION_DEF("output", 0, UOPT_REQUIRES_ARG) |
| }; |
| |
| UChar compA[256]; |
| UChar compB[256]; |
| int32_t compALen = 0; |
| int32_t compBLen = 0; |
| |
| char compUTF8A[256]; |
| char compUTF8B[256]; |
| int32_t compUTF8ALen = 0; |
| int32_t compUTF8BLen = 0; |
| |
| int UNIXstrcmp(const void *a, const void *b) { |
| UErrorCode status = U_ZERO_ERROR; |
| gCount++; |
| int t; |
| compALen = unorm_normalize((*(Line **)a)->name, (*(Line **)a)->len, UNORM_NFC, 0, compA, 256, &status); |
| compBLen = unorm_normalize((*(Line **)b)->name, (*(Line **)b)->len, UNORM_NFC, 0, compB, 256, &status); |
| compUTF8ALen = ucnv_fromUChars(utf8cnv, compUTF8A, 256, compA, compALen, &status); |
| compUTF8A[compUTF8ALen] = 0; |
| compUTF8BLen = ucnv_fromUChars(utf8cnv, compUTF8B, 256, compB, compBLen, &status); |
| compUTF8B[compUTF8BLen] = 0; |
| t = strcoll(compUTF8A, compUTF8B); |
| return t; |
| } |
| |
| int UNIXgetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) { |
| UErrorCode status = U_ZERO_ERROR; |
| compALen = unorm_normalize(string, len, UNORM_NFC, 0, compA, 256, &status); |
| compUTF8ALen = ucnv_fromUChars(utf8cnv, compUTF8A, 256, compA, compALen, &status); |
| compUTF8A[compUTF8ALen] = 0; |
| return (strxfrm((char *)buffer, compUTF8A, buffCapacity)+1); |
| } |
| |
| #ifdef WIN32 |
| int Winstrcmp(const void *a, const void *b) { |
| UErrorCode status = U_ZERO_ERROR; |
| gCount++; |
| int t; |
| //compALen = unorm_compose(compA, 256, (*(Line **)a)->name, (*(Line **)a)->len, FALSE, 0, &status); |
| //compBLen = unorm_compose(compB, 256, (*(Line **)b)->name, (*(Line **)b)->len, FALSE, 0, &status); |
| compALen = unorm_normalize((*(Line **)a)->name, (*(Line **)a)->len, UNORM_NFC, 0, compA, 256, &status); |
| compBLen = unorm_normalize((*(Line **)b)->name, (*(Line **)b)->len, UNORM_NFC, 0, compB, 256, &status); |
| t = CompareStringW(gWinLCID, SORT_STRINGSORT, //0, |
| compA, compALen, |
| compB, compBLen); |
| |
| /* |
| t = CompareStringW(gWinLCID, 0, |
| (*(Line **)a)->name, (*(Line **)a)->len, |
| (*(Line **)b)->name, (*(Line **)b)->len); |
| */ |
| return t-2; |
| } |
| |
| int WingetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) { |
| UErrorCode status = U_ZERO_ERROR; |
| compALen = unorm_normalize(string, len, UNORM_NFC, 0, compA, 256, &status); |
| return LCMapStringW(gWinLCID, LCMAP_SORTKEY | SORT_STRINGSORT, compA, compALen, (unsigned short *)buffer, buffCapacity); |
| } |
| |
| #if 0 |
| int Winstrcmp(const void *a, const void *b) { |
| UErrorCode status = U_ZERO_ERROR; |
| uint8_t b1[256], b2[256]; |
| int32_t b1Len, b2Len; |
| b1Len = WingetSortKey((*(Line **)a)->name, (*(Line **)a)->len, b1, 256); |
| b2Len = WingetSortKey((*(Line **)b)->name, (*(Line **)b)->len, b2, 256); |
| |
| b1[b1Len] = 0; |
| b2[b2Len] = 0; |
| |
| return strcmp((const char *)b1, (const char *)b2); |
| } |
| #endif |
| |
| #else |
| int Winstrcmp(const void *a, const void *b) { |
| if(a == b); |
| return 0; |
| } |
| int WingetSortKey(const UChar *, int32_t , uint8_t *, int32_t ) { |
| return 0; |
| } |
| #endif |
| |
| int ICUstrcmp(const void *a, const void *b) { |
| gCount++; |
| UCollationResult t; |
| t = ucol_strcoll(gCol, |
| (*(Line **)a)->name, (*(Line **)a)->len, |
| (*(Line **)b)->name, (*(Line **)b)->len); |
| if (t == UCOL_LESS) return -1; |
| if (t == UCOL_GREATER) return +1; |
| return 0; |
| } |
| |
| int ICUgetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) { |
| return ucol_getSortKey(gCol, string, len, buffer, buffCapacity); |
| } |
| |
| struct { |
| const char* name; |
| CompareFn comparer; |
| GetSortKeyFn skgetter; |
| } platforms[] = { |
| { "icu", ICUstrcmp, ICUgetSortKey }, |
| { "w2k", Winstrcmp, WingetSortKey}, |
| { "winxp", Winstrcmp, WingetSortKey}, |
| { "aix", UNIXstrcmp, UNIXgetSortKey}, |
| { "linux", UNIXstrcmp, UNIXgetSortKey} |
| }; |
| |
| |
| void stringToLower(char *string) { |
| uint32_t i = 0; |
| for(i = 0; i < strlen(string); i++) { |
| string[i] = tolower(string[i]); |
| } |
| } |
| |
| void usage(const char *name) { |
| logger->log("Usage: %s --locale loc_name --platform platform\n", name); |
| } |
| |
| void listKnownPlatforms() { |
| uint32_t i = 0; |
| logger->log("Known platforms:\n"); |
| for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) { |
| logger->log("\t%s\n", platforms[i]); |
| } |
| } |
| |
| void addPlatform(const char *platform) { |
| uint32_t i; |
| //stringToLower(platform); |
| int32_t oldPlatformNo = gPlatformNo; |
| |
| for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) { |
| if(strcmp(platform, platforms[i].name) == 0) { |
| gPlatformIndexes[gPlatformNo++] = i; |
| } |
| } |
| if(gPlatformNo == oldPlatformNo) { |
| logger->log("Unknown platform %s\n", platform); |
| listKnownPlatforms(); |
| } |
| } |
| |
| void processArgs(int argc, char* argv[], UErrorCode &status) |
| { |
| int32_t i = 0; |
| U_MAIN_INIT_ARGS(argc, argv); |
| |
| argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options); |
| |
| if(argc < 0) { |
| logger->log("Unknown option: %s\n", argv[-argc]); |
| usage(progName); |
| return; |
| } |
| |
| if(options[0].doesOccur || options[1].doesOccur) { |
| usage(progName); |
| return; |
| } |
| if(options[VERBOSE].doesOccur) { |
| gVerbose = TRUE; |
| } |
| if(options[DEBUG].doesOccur) { |
| gDebug = TRUE; |
| gVerbose = TRUE; |
| } |
| if(options[EXEMPLAR].doesOccur) { |
| gExemplar = TRUE; |
| } |
| if(options[QUIET].doesOccur) { |
| gQuiet = TRUE; |
| } |
| |
| // ASCII based options specified on the command line |
| // this is for testing purposes, will allow to load |
| // up ICU rules and then poke through them. |
| // In that case, we test only ICU and don't need |
| // a locale. |
| if(options[RULESSTDIN].doesOccur) { |
| gRulesStdin = TRUE; |
| addPlatform("icu"); |
| return; |
| } |
| |
| if(options[LOCALE].doesOccur) { |
| gLocale = options[LOCALE].value; |
| } else { |
| gLocale = argv[1]; |
| //for(i = 1; i < argc; i++) { |
| //gLocales[gLocaleNo++] = argv[i]; |
| //} |
| } |
| |
| if(options[PLATFORM].doesOccur) { |
| addPlatform(options[PLATFORM].value); |
| } else { // there is a list of platforms |
| addPlatform("icu"); |
| } |
| |
| if(options[REFERENCE].doesOccur) { |
| for(i = 0; i < (int32_t)(sizeof(platforms)/sizeof(platforms[0])); i++) { |
| if(strcmp(options[REFERENCE].value, platforms[i].name) == 0) { |
| gRefNum = i; |
| break; |
| } |
| } |
| if(i == sizeof(platforms)/sizeof(platforms[0])) { |
| logger->log("Unknown reference %s!\n", options[REFERENCE].value); |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| } else { |
| gRefNum = 0; |
| } |
| |
| if(options[EXCLUDESET].doesOccur) { |
| gExcludeSet.applyPattern(UnicodeString(options[EXCLUDESET].value), status); |
| if(U_FAILURE(status)) { |
| logger->log("Cannot construct exclude set from argument %s. Error %s\n", options[EXCLUDESET].value, u_errorName(status)); |
| return; |
| } else { |
| UnicodeString pattern; |
| logger->log(gExcludeSet.toPattern(pattern, TRUE), TRUE); |
| } |
| } |
| |
| if(options[REPERTOIRE].doesOccur) { |
| gRepertoire.applyPattern(UnicodeString(options[REPERTOIRE].value), status); |
| if(U_FAILURE(status)) { |
| logger->log("Cannot construct repertoire from argument %s. Error %s\n", options[REPERTOIRE].value, u_errorName(status)); |
| return; |
| } |
| } |
| |
| if(options[OUTPUT].doesOccur) { |
| outputFormat = options[OUTPUT].value; |
| if(strcmp(outputFormat, "HTML") == 0) { |
| outExtension = "html"; |
| } else if(strcmp(outputFormat, "XML") == 0) { |
| outExtension = "xml"; |
| } else { |
| outExtension = "txt"; |
| } |
| } |
| |
| } |
| |
| // Check whether upper case comes before lower case or vice-versa |
| int32_t |
| checkCaseOrdering(void) { |
| UChar stuff[][3] = { |
| { 0x0061, separatorChar, 0x0061}, //"aa", |
| { 0x0061, separatorChar, 0x0041 }, //"a\\u00E0", |
| { 0x0041, separatorChar, 0x0061 }, //"\\u00E0a", |
| { 0x0041, separatorChar, 0x0041 }, //"\\u00E0a", |
| //{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0" |
| }; |
| const int32_t size = sizeof(stuff)/sizeof(stuff[0]); |
| |
| Line **sortedLines = new Line*[size]; |
| Line lines[size]; |
| |
| int32_t i = 0; |
| int32_t ordered = 0, reversed = 0; |
| |
| for(i = 0; i < size; i++) { |
| lines[i].setName(stuff[i], 3); |
| } |
| //setArray(sortedLines, lines, size); |
| qsort(sortedLines, size, sizeof(Line*), gComparer); |
| |
| for(i = 0; i < size; i++) { |
| if(*(sortedLines+i) == &lines[i]) { |
| ordered++; |
| } |
| if(*(sortedLines+i) == &lines[size-i-1]) { |
| reversed++; |
| } |
| } |
| |
| delete[] sortedLines; |
| if(ordered == size) { |
| return 0; // in normal order |
| } else if(reversed == size) { |
| return 1; // in reversed order |
| } else { |
| return -1; // unknown order |
| } |
| } |
| |
| void |
| getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) { |
| // first we fill out structures with exemplar characters. |
| UResourceBundle *res = ures_open(NULL, locale, &status); |
| UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status); |
| exemplars.clear(); |
| exemplars.applyPattern(exemplarString, status); |
| ures_close(res); |
| } |
| |
| |
| void |
| getFileNames(const char *name, char *tailoringName, char *tailoringDumpName, char *defaultName, char *defaultDumpName, char *diffName) { |
| if(tailoringName) { |
| strcpy(tailoringName, platforms[gPlatformIndexes[0]].name); |
| strcat(tailoringName, "/"); |
| strcat(tailoringName, name); |
| strcat(tailoringName, "_raw."); |
| strcat(tailoringName, outExtension); |
| } |
| if(tailoringDumpName) { |
| strcpy(tailoringDumpName, platforms[gPlatformIndexes[0]].name); |
| strcat(tailoringDumpName, "/"); |
| strcat(tailoringDumpName, name); |
| strcat(tailoringDumpName, ".dump"); |
| } |
| |
| if(diffName) { |
| strcpy(diffName, platforms[gPlatformIndexes[0]].name); |
| strcat(diffName, "/"); |
| strcat(diffName, name); |
| strcat(diffName, "_collation."); |
| strcat(diffName, outExtension); |
| } |
| |
| if(defaultName) { |
| strcpy(defaultName, platforms[gRefNum].name); |
| strcat(defaultName, "/"); |
| strcat(defaultName, name); |
| strcat(defaultName, "_default_raw."); |
| strcat(defaultName, outExtension); |
| } |
| |
| if(defaultDumpName) { |
| strcpy(defaultDumpName, platforms[gRefNum].name); |
| strcat(defaultDumpName, "/"); |
| strcat(defaultDumpName, name); |
| strcat(defaultDumpName, "_default.dump"); |
| } |
| } |
| |
| void |
| setFiles(const char *name, UErrorCode &status) { |
| if(U_FAILURE(status)) { |
| return; |
| } |
| int32_t i = 0; |
| char tailoringName[256]; |
| char tailoringDumpName[256]; |
| char defaultName[256]; |
| char defaultDumpName[256]; |
| char diffName[256]; |
| |
| getFileNames(name, tailoringName, tailoringDumpName, defaultName, defaultDumpName, diffName); |
| if(options[PLATFORM].doesOccur && !options[DIFF].doesOccur) { |
| if(createDir(platforms[gPlatformIndexes[0]].name) == 0) { |
| tailoringBundle = new UPrinter(tailoringName, "en", "utf-8", NULL, FALSE); |
| fTailoringDump = fopen(tailoringDumpName, "wb"); |
| } else { |
| status = U_FILE_ACCESS_ERROR; |
| return; |
| } |
| } |
| |
| if(options[REFERENCE].doesOccur && !options[DIFF].doesOccur) { |
| if(createDir(platforms[gRefNum].name) == 0) { |
| referenceBundle = new UPrinter(defaultName, "en", "utf-8", NULL, FALSE); |
| fDefaultDump = fopen(defaultDumpName, "wb"); |
| } else { |
| status = U_FILE_ACCESS_ERROR; |
| return; |
| } |
| } |
| |
| if((options[PLATFORM].doesOccur && options[REFERENCE].doesOccur) || options[DIFF].doesOccur) { |
| if(createDir(platforms[gPlatformIndexes[0]].name) == 0) { |
| bundle = new UPrinter(diffName, "en", "utf-8", NULL, FALSE); |
| } |
| } |
| if(options[DIFF].doesOccur) { |
| fTailoringDump = fopen(tailoringDumpName, "rb"); |
| fDefaultDump = fopen(defaultDumpName, "rb"); |
| } |
| } |
| |
| |
| UErrorCode status = U_ZERO_ERROR; |
| static UnicodeSet UNASSIGNED(UnicodeString("[:Cn:]"), status); |
| static UnicodeSet GENERAL_ACCENTS(UnicodeString("[[:block=Combining Diacritical Marks:]-[:Cn:]]"), status); |
| //static UnicodeSet ASCII_BASE(UnicodeString("[[:ASCII:]-[:L:]-[:N:]]"), status); |
| static UnicodeSet ASCII_BASE(UnicodeString("[[:ASCII:]]"), status); |
| static UnicodeSet ALPHABETIC(UnicodeString("[:alphabetic:]"), status); |
| //static UnicodeSet CONTROL(UnicodeString("[[:control:][\\u0000-\\u002F]]"), status); |
| static UnicodeSet BMP(UnicodeString("[\\u0000-\\uFFFF]"), status); |
| |
| static UnicodeSet CONTROL(UnicodeString("[:control:]"), status); |
| |
| UCollator * |
| setLocale(const char* locale, UErrorCode &status) |
| { |
| gWinLCID = uloc_getLCID(locale); |
| setlocale(LC_COLLATE, locale); |
| |
| if(gCol) { |
| ucol_close(gCol); |
| } |
| gCol = ucol_open(locale, &status); |
| ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); |
| //ucol_setAttribute(col, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status); |
| //ucol_setAttribute(col, UCOL_STRENGTH, UCOL_QUATERNARY, &status); |
| |
| return gCol; |
| } |
| |
| |
| |
| UCollator * |
| setReference(UErrorCode &status) |
| { |
| gWinLCID = uloc_getLCID("en"); |
| setlocale(LC_COLLATE, "en_US.UTF-8"); |
| if(gCol) { |
| ucol_close(gCol); |
| } |
| gCol = ucol_open("root", &status); |
| ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); |
| return gCol; |
| } |
| |
| void |
| processInteractive() { |
| char command[256]; |
| while(fgets(command, 256, stdin)) { |
| |
| } |
| } |
| |
| UChar probeChars[][4] = { |
| { 0x0061, 0x0062, 0x00E1, 0x0041 }, // latin with a-grave |
| { 0x0041, 0x0042, 0x00C1, 0x0061 }, // upper first |
| { 0x006E, 0x006F, 0x00F1, 0x004E }, // latin with n-tilda |
| { 0x004E, 0x004F, 0x00D1, 0x006E }, // upper first |
| { 0x0433, 0x0493, 0x0491, 0x0413 }, // Cyrillic |
| { 0x0413, 0x0492, 0x0490, 0x0433 }, // upper first |
| { 0x3045, 0x3047, 0x3094, 0x3046 } // Hiragana/Katakana (last resort) |
| |
| }; |
| |
| void |
| processCollator(UCollator *col, UErrorCode &status) { |
| int32_t i = 0; |
| uint32_t j = 0; |
| gCol = col; |
| UChar ruleString[16384]; |
| char myLoc[256]; |
| |
| int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, 16384); |
| logger->log(UnicodeString(ruleString, ruleStringLength), TRUE); |
| const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status); |
| if(locale == NULL) { |
| locale = "en"; |
| } |
| strcpy(myLoc, locale); |
| UnicodeSet exemplarUSet; |
| UnicodeSet RefRepertoire; |
| |
| UnicodeSet tailored; |
| |
| tailored = *((UnicodeSet *)ucol_getTailoredSet(gCol, &status)); |
| tailored.removeAll(CONTROL); |
| |
| |
| UnicodeString pattern; |
| int sanityResult; |
| |
| UnicodeSet hanSet; |
| UBool hanAppears = FALSE; |
| |
| debug->log("\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[0]].name); |
| gComparer = platforms[gPlatformIndexes[0]].comparer; |
| |
| StrengthProbe probe(platforms[gPlatformIndexes[0]].comparer, platforms[gPlatformIndexes[0]].skgetter, 0x0030, probeChars[0][0], probeChars[0][1], probeChars[0][2], probeChars[0][3]); |
| sanityResult = probe.checkSanity(); |
| j = 0; |
| while(sanityResult && j+1 < sizeof(probeChars)/sizeof(probeChars[0])) { |
| j++; |
| sanityResult = probe.setProbeChars(probeChars[j][0], probeChars[j][1], probeChars[j][2], probeChars[j][3]); |
| } |
| if(sanityResult) { |
| logger->log("Bad choice of probe characters! Sanity returned %i. Exiting\n", sanityResult, sanityResult); |
| return; |
| } |
| logger->log("Probe chars: %C, %C, %C, %C\n", probeChars[j][0], probeChars[j][1], probeChars[j][2], probeChars[j][3]); |
| |
| debug->off(); |
| |
| if(gRepertoire.size()) { |
| exemplarUSet = gRepertoire; |
| } else { |
| generateRepertoire(locale, exemplarUSet, hanAppears, status); |
| } |
| exemplarUSet.addAll(tailored); |
| hanSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status); |
| exemplarUSet.removeAll(hanSet); |
| |
| logger->log(exemplarUSet.toPattern(pattern, TRUE), TRUE); |
| |
| exemplarUSet = flatten(exemplarUSet, status); |
| logger->log(exemplarUSet.toPattern(pattern, TRUE), TRUE); |
| |
| if(!options[PRINTREF].doesOccur) { |
| |
| logger->log("\n*** Detecting ordering for the locale\n\n"); |
| |
| debug->on(); |
| SortedLines lines(exemplarUSet, gExcludeSet, probe, logger, debug); |
| lines.analyse(status); |
| lines.calculateSortKeys(); |
| debug->log("\n*** Final order\n\n"); |
| debug->log(lines.toPrettyString(TRUE, TRUE), TRUE); |
| lines.toFile(fTailoringDump, TRUE, status); |
| tailoringBundle->log(lines.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, TRUE, TRUE, hanAppears), TRUE); |
| //debug->off(); |
| |
| if(options[REFERENCE].doesOccur) { |
| status = U_ZERO_ERROR; |
| lines.getRepertoire(RefRepertoire); |
| setReference(status); |
| |
| logger->log(exemplarUSet.toPattern(pattern, TRUE), TRUE); |
| logger->log(RefRepertoire.toPattern(pattern, TRUE), TRUE); |
| |
| StrengthProbe RefProbe(platforms[gRefNum].comparer, platforms[gRefNum].skgetter); |
| logger->log("\n*** Detecting ordering for reference\n\n"); |
| SortedLines RefLines(exemplarUSet, gExcludeSet, RefProbe, logger, debug); |
| RefLines.analyse(status); |
| referenceBundle->log(RefLines.toOutput(outputFormat, myLoc, platforms[gRefNum].name, NULL, TRUE, TRUE, FALSE), TRUE); |
| RefLines.toFile(fDefaultDump, TRUE, status); |
| |
| lines.reduceDifference(RefLines); |
| logger->log("\n*** Final rules\n\n"); |
| logger->log(lines.toPrettyString(TRUE), TRUE); |
| bundle->log(lines.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name, TRUE, TRUE, hanAppears), TRUE); |
| } |
| } else { |
| setReference(status); |
| StrengthProbe RefProbe(platforms[gRefNum].comparer, platforms[gRefNum].skgetter); |
| logger->log("\n*** Detecting ordering for reference\n\n"); |
| SortedLines RefLines(exemplarUSet, gExcludeSet, RefProbe, logger, debug); |
| RefLines.analyse(status); |
| logger->log(RefLines.toPrettyString(TRUE), TRUE); |
| referenceBundle->log(RefLines.toOutput(outputFormat, myLoc, platforms[gRefNum].name, NULL, TRUE, TRUE, FALSE), TRUE); |
| } |
| if(hanAppears) { |
| // there are Han characters. This is a huge block. The best we can do is to just sort it, compare to empty |
| // and spit it out. Anything else would be a suicide (actually is - kernel just kills you :) |
| logger->log("\n*** Detecting order for Han\n"); |
| debug->off(); |
| setLocale(gLocale, status); |
| exemplarUSet.clear(); |
| exemplarUSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status); |
| exemplarUSet = flatten(exemplarUSet, status); |
| SortedLines han(exemplarUSet, gExcludeSet, probe, logger, debug); |
| han.sort(TRUE, TRUE); |
| han.classifyRepertoire(); |
| han.getBounds(status); |
| tailoringBundle->log("Han ordering:<br>\n"); |
| tailoringBundle->log(han.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, TRUE, FALSE, FALSE), TRUE); |
| bundle->log(han.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, TRUE, FALSE, FALSE), TRUE); |
| } |
| ucol_close(gCol); |
| } |
| |
| void |
| processLocale(const char *locale, UErrorCode &status) { |
| setLocale(locale, status); |
| setFiles(locale, status); |
| if(U_FAILURE(status)) { |
| return; |
| } |
| |
| debug->log("Locale %s (LCID:%06X, unix:%s)\n", locale, gWinLCID, setlocale(LC_COLLATE, NULL)); |
| tailoringBundle->log("// Ordering for locale %s (LCID:%06X, unix:%s), platform %s reference %s<br>\n", |
| locale, gWinLCID, setlocale(LC_COLLATE, NULL), |
| platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name); |
| if(options[REFERENCE].doesOccur) { |
| referenceBundle->log("// Reference for locale %s (LCID:%06X, unix:%s), platform %s reference %s<br>\n", |
| locale, gWinLCID, setlocale(LC_COLLATE, NULL), |
| platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name); |
| } |
| |
| |
| processCollator(gCol, status); |
| } |
| |
| |
| |
| UBool |
| hasCollationElements(const char *locName) { |
| |
| UErrorCode status = U_ZERO_ERROR; |
| UResourceBundle *ColEl = NULL; |
| |
| UResourceBundle *loc = ures_open(NULL, locName, &status);; |
| |
| if(U_SUCCESS(status)) { |
| status = U_ZERO_ERROR; |
| ColEl = ures_getByKey(loc, "CollationElements", ColEl, &status); |
| if(status == U_ZERO_ERROR) { /* do the test - there are real elements */ |
| ures_close(ColEl); |
| ures_close(loc); |
| return TRUE; |
| } |
| ures_close(ColEl); |
| ures_close(loc); |
| } |
| return FALSE; |
| } |
| |
| int |
| main(int argc, |
| char* argv[]) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| logger = new UPrinter(stdout, "en", "latin-1"); |
| debug = new UPrinter(stderr, "en", "latin-1"); |
| |
| /* |
| USet *wsp = uprv_openRuleWhiteSpaceSet(&status); |
| uset_add(wsp, 0x0041); |
| uset_remove(wsp, 0x0041); |
| UnicodeString pat; |
| ((UnicodeSet *)wsp)->toPattern(pat, TRUE); |
| pat.setCharAt(pat.length(), 0); |
| escapeString(pat.getBuffer(), pat.length(), log); |
| u_fflush(log); |
| */ |
| |
| processArgs(argc, argv, status); |
| int32_t i = 0; |
| |
| |
| |
| if(U_FAILURE(status) || gPlatformNo == 0) { |
| return -1; |
| } |
| |
| utf8cnv = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now. |
| gUCA = ucol_open("root", &status); |
| |
| if(options[INTERACTIVE].doesOccur) { |
| processInteractive(); |
| } else { |
| if(gRulesStdin) { |
| char buffer[1024]; |
| UChar ruleBuffer[16384]; |
| UChar *rules = ruleBuffer; |
| int32_t maxRuleLen = 16384; |
| int32_t rLen = 0; |
| while(fgets(buffer, 1024, stdin)) { |
| if(buffer[0] != '/' && buffer[1] != '/') { |
| rLen = u_unescape(buffer, rules, maxRuleLen); |
| rules += rLen; |
| maxRuleLen -= rLen; |
| } |
| } |
| UParseError parseError; |
| //escapeString(ruleBuffer, rules-ruleBuffer, log);// |
| debug->log("%U\n", ruleBuffer); |
| |
| UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status); |
| if(U_SUCCESS(status)) { |
| setFiles("stdinRules", status); |
| processCollator(col, status); |
| } else { |
| logger->log("Error %s\n", u_errorName(status)); |
| } |
| } else if(options[DIFF].doesOccur) { |
| logger->log("Diffing two dumps\n"); |
| // must have locale, platform and ref in order to be |
| // able to find dump files. |
| setFiles(gLocale, status); |
| |
| if(fTailoringDump && fDefaultDump) { |
| SortedLines tailoring(fTailoringDump, logger, debug, status); |
| logger->log(tailoring.toString(TRUE), TRUE); |
| SortedLines reference(fDefaultDump, logger, debug, status); |
| logger->log(reference.toString(TRUE), TRUE); |
| tailoring.reduceDifference(reference); |
| logger->log("\n*** Final rules\n\n"); |
| logger->log(tailoring.toPrettyString(TRUE), TRUE); |
| //result->log(lines.toPrettyString(TRUE), TRUE); |
| bundle->log(tailoring.toOutput(outputFormat, gLocale, platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name, TRUE, TRUE, FALSE), TRUE); |
| } |
| |
| } else { |
| if(gLocale) { |
| processLocale(gLocale, status); |
| } else if(gLocaleNo) { |
| for(i = 0; i < gLocaleNo; i++) { |
| processLocale(gLocales[i], status); |
| } |
| } else { // do the loop through all the locales |
| int32_t noOfLoc = uloc_countAvailable(); |
| const char *locName = NULL; |
| for(i = 0; i<noOfLoc; i++) { |
| status = U_ZERO_ERROR; |
| locName = uloc_getAvailable(i); |
| if(hasCollationElements(locName)) { |
| processLocale(locName, status); |
| } |
| } |
| } |
| } |
| } |
| |
| |
| ucol_close(gUCA); |
| ucnv_close(utf8cnv); |
| |
| delete logger; |
| delete debug; |
| if(tailoringBundle) { |
| delete tailoringBundle; |
| } |
| if(referenceBundle) { |
| delete referenceBundle; |
| } |
| if(bundle) { |
| delete bundle; |
| } |
| if(fTailoringDump) { |
| fclose(fTailoringDump); |
| } |
| if(fDefaultDump) { |
| fclose(fDefaultDump); |
| } |
| return 0; |
| } |
| |
| |
| UnicodeString propertyAndValueName(UProperty prop, int32_t i) { |
| UnicodeString result; |
| result.append(u_getPropertyName(prop, U_LONG_PROPERTY_NAME)); |
| result.append("="); |
| result.append(u_getPropertyValueName(prop, i, U_LONG_PROPERTY_NAME)); |
| |
| //+ "(" + prop + "," + i + ") "; |
| return result; |
| } |
| |
| |
| void generateRepertoire(const char *locale, UnicodeSet &rep, UBool &hanAppears, UErrorCode &status) { |
| UnicodeString dispName; |
| debug->log("Getting repertoire for %s\n", locale); |
| tailoringBundle->log("// Scripts in repertoire: "); |
| if(options[REFERENCE].doesOccur) { |
| referenceBundle->log("// Scripts in repertoire: "); |
| } |
| rep.clear(); |
| UnicodeSet delta; |
| |
| UScriptCode script[256]; |
| int32_t i = 0; |
| // now add the scripts for the locale |
| UProperty prop = UCHAR_SCRIPT; |
| int32_t scriptLength = uscript_getCode(locale, script, 256, &status); |
| if(scriptLength) { |
| for (i = 0; i < scriptLength; ++i) { |
| if(script[i] == USCRIPT_HAN) { |
| hanAppears = TRUE; |
| continue; |
| } |
| delta.applyIntPropertyValue(prop, script[i], status); |
| debug->log("Adding "); |
| debug->log(propertyAndValueName(prop, script[i]), TRUE); |
| tailoringBundle->log("// "); |
| tailoringBundle->log(propertyAndValueName(prop, script[i]), TRUE); |
| if(options[REFERENCE].doesOccur) { |
| referenceBundle->log("// "); |
| referenceBundle->log(propertyAndValueName(prop, script[i]), TRUE); |
| } |
| rep.addAll(delta); |
| } |
| } else { |
| delta.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_LATIN, status); |
| rep.addAll(delta); |
| } |
| |
| // now see which blocks those overlap, and add |
| prop = UCHAR_BLOCK; |
| int32_t min = u_getIntPropertyMinValue(prop); |
| int32_t max = u_getIntPropertyMaxValue(prop); |
| UnicodeSet checkDelta; |
| for (i = min; i <= max; ++i) { |
| // skip certain blocks |
| const char *name = u_getPropertyValueName(prop, i, U_LONG_PROPERTY_NAME); |
| if (strcmp(name, "Superscripts_and_Subscripts") == 0 |
| || strcmp(name, "Letterlike_Symbols") == 0 |
| || strcmp(name, "Alphabetic_Presentation_Forms") == 0 |
| || strcmp(name, "Halfwidth_and_Fullwidth_Forms") == 0) continue; |
| |
| delta.applyIntPropertyValue(prop, i, status).removeAll(UNASSIGNED); |
| if (!rep.containsSome(delta)) continue; |
| if (rep.containsAll(delta)) continue; // just to see what we are adding |
| debug->log("Adding "); |
| debug->log(propertyAndValueName(prop, i), TRUE); |
| tailoringBundle->log("// "); |
| tailoringBundle->log(propertyAndValueName(prop, i), TRUE); |
| if(options[REFERENCE].doesOccur) { |
| referenceBundle->log("// "); |
| referenceBundle->log(propertyAndValueName(prop, i), TRUE); |
| } |
| rep.addAll(delta); |
| } |
| |
| // add ASCII and general accents |
| rep.addAll(GENERAL_ACCENTS).addAll(ASCII_BASE); |
| rep.removeAll(CONTROL); |
| //delta.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status); |
| //rep.removeAll(delta); |
| |
| // now add the exemplar characters |
| // can't get at them from Java right now |
| tailoringBundle->log("<br>\n"); |
| if(options[REFERENCE].doesOccur) { |
| referenceBundle->log("<br>\n"); |
| } |
| } |
| |
| UnicodeSet flatten(const UnicodeSet &source, UErrorCode &status) { |
| UnicodeSet result; |
| UnicodeSetIterator it(source); |
| UnicodeString item, itemNFKD, toNormalize; |
| while (it.next()) { |
| // would be nicer if UnicodeSetIterator had a getString function |
| if (it.isString()) { |
| Normalizer::normalize(it.getString(), UNORM_NFD, 0, item, status); |
| Normalizer::normalize(it.getString(), UNORM_NFKD, 0, itemNFKD, status); |
| } else { |
| toNormalize.setTo(it.getCodepoint()); |
| Normalizer::normalize(toNormalize, UNORM_NFD, 0, item, status); |
| Normalizer::normalize(toNormalize, UNORM_NFKD, 0, itemNFKD, status); |
| } |
| result.addAll(item); |
| result.addAll(itemNFKD); |
| } |
| return result; |
| } |
| |
| |
| void testWin(StrengthProbe &probe, UErrorCode &status) |
| { |
| UnicodeSet trailings(UnicodeString("[\\uFE7D\\uFE7C\\u30FD\\uFF70\\u30FC\\u309D\\u3032\\u3031\\u3005\\u0651]"), status); |
| char intChar[] = "\\uFE7D\\uFE7C\\u30FD\\uFF70\\u30FC\\u309D\\u3032\\u3031\\u3005\\u0651"; |
| UChar interesting[256]; |
| int32_t intLen = u_unescape(intChar, interesting, 256); |
| UChar i = 0; |
| UChar j = 0, k = 0; |
| int32_t count; |
| Line myCh, combo, trial, inter, kLine; |
| for(i = 0; i < intLen; i++) { |
| inter.setTo(interesting[i]); |
| logger->log(inter.toString(TRUE), TRUE); |
| logger->log("----------------------\n"); |
| for(j = 0; j < 0xFFFF; j++) { |
| myCh.setTo(j); |
| if(probe.distanceFromEmptyString(myCh) == UCOL_IDENTICAL) { |
| continue; |
| } |
| logger->log(myCh.toString(TRUE)); |
| combo.setTo(j); |
| combo.append(interesting[i]); |
| count = 0; |
| for(k = 0; k < 0xFFFF; k++) { |
| kLine.setTo(k); |
| trial.setTo(j); |
| trial.append(k); |
| if(probe.compare(kLine, inter) < 0) { |
| if(probe.compare(trial, combo) >= 0) { |
| count++; |
| } |
| } |
| } |
| logger->log("%i %i\n", count, count); |
| } |
| } |
| } |
| |