| // © 2017 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2003, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * |
| * File colprobe.cpp |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 03/18/2003 weiv Creation. |
| ******************************************************************************* |
| */ |
| |
| #include "uoptions.h" |
| #include "unicode/ucol.h" |
| #include "unicode/ucoleitr.h" |
| #include "unicode/ures.h" |
| #include "unicode/uniset.h" |
| #include "unicode/usetiter.h" |
| #include "unicode/ustring.h" |
| #include "unicode/uchar.h" |
| #include "unicode/uscript.h" |
| #include "uprops.h" |
| #include "hash.h" |
| #include "ucol_imp.h" |
| |
| #include "unicode/ustdio.h" |
| #include "unicode/utrans.h" |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <io.h> |
| #include <fcntl.h> |
| |
| #include "colprobe.h" |
| |
| |
| #ifdef WIN32 |
| #include <windows.h> |
| #else |
| // |
| // Stubs for Windows API functions when building on UNIXes. |
| // |
| typedef int DWORD; |
| inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}; |
| #include <sys/time.h> |
| unsigned long timeGetTime() { |
| struct timeval t; |
| gettimeofday(&t, 0); |
| unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. |
| val += t.tv_usec / 1000; |
| return val; |
| }; |
| inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}; |
| const int LCMAP_SORTKEY = 0; |
| #define MAKELCID(a,b) 0 |
| const int SORT_DEFAULT = 0; |
| #endif |
| |
| #include "line.h" |
| |
| static UBool gVerbose = FALSE; |
| static UBool gDebug = FALSE; |
| static UBool gQuiet = FALSE; |
| static UBool gExemplar = FALSE; |
| |
| DWORD gWinLCID; |
| int gCount; |
| Line **gICULines; |
| UCollator *gCol; |
| UCollator *gUCA; |
| Line source; |
| Line target; |
| Line *gSource = &source; |
| Line *gTarget = ⌖ |
| Hashtable gElements(FALSE); |
| Hashtable gExpansions(FALSE); |
| CompareFn gComparer; |
| |
| const UChar separatorChar = 0x0030; |
| |
| UFILE *out = NULL; |
| UFILE *err = NULL; |
| UFILE *log = NULL; |
| |
| const char *progName = "colprobe"; |
| |
| const char *gLocale = NULL; |
| //char platform[256]; |
| int32_t platformIndex = -1; |
| int32_t gPlatformNo = 0; |
| int32_t gPlatformIndexes[10]; |
| int32_t gLocaleNo = 0; |
| const char* gLocales[100]; |
| UBool gRulesStdin = FALSE; |
| |
| enum { |
| HELP1, |
| HELP2, |
| VERBOSE, |
| QUIET, |
| VERSION, |
| ICUDATADIR, |
| COPYRIGHT, |
| LOCALE, |
| PLATFORM, |
| DEBUG, |
| EXEMPLAR, |
| RULESSTDIN |
| }; |
| |
| UOption options[]={ |
| /*0*/ UOPTION_HELP_H, |
| /*1*/ UOPTION_HELP_QUESTION_MARK, |
| /*2*/ UOPTION_VERBOSE, |
| /*3*/ UOPTION_QUIET, |
| /*4*/ UOPTION_VERSION, |
| /*5*/ UOPTION_ICUDATADIR, |
| /*6*/ UOPTION_COPYRIGHT, |
| /*7*/ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG), |
| /*8*/ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG), |
| /*9*/ UOPTION_DEF("debug", 'D', UOPT_NO_ARG), |
| /*10*/ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG), |
| /*11*/ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG) |
| }; |
| |
| int Winstrcmp(const void *a, const void *b) { |
| gCount++; |
| int t; |
| t = CompareStringW(gWinLCID, 0, |
| (*(Line **)a)->name, (*(Line **)a)->len, |
| (*(Line **)b)->name, (*(Line **)b)->len); |
| return t-2; |
| } |
| |
| int ICUstrcmp(const void *a, const void *b) { |
| gCount++; |
| UCollationResult t; |
| t = ucol_strcoll(gCol, |
| (*(Line **)a)->name, (*(Line **)a)->len, |
| (*(Line **)b)->name, (*(Line **)b)->len); |
| if (t == UCOL_LESS) return -1; |
| if (t == UCOL_GREATER) return +1; |
| return 0; |
| } |
| |
| struct { |
| const char* name; |
| CompareFn comparer; |
| } platforms[] = { |
| { "icu", ICUstrcmp }, |
| { "win", Winstrcmp} |
| }; |
| |
| |
| void deleteLineElement(void *line) { |
| delete((Line *)line); |
| } |
| |
// Lower-cases a NUL-terminated string in place using tolower().
// A NULL pointer is accepted and ignored.
void stringToLower(char *string) {
  char *p;
  if(string == NULL) {
    return;
  }
  // Walk to the terminator once instead of calling strlen() on every pass.
  for(p = string; *p != '\0'; p++) {
    // tolower() requires a value representable as unsigned char (or EOF);
    // a plain char may be negative, so convert first.
    *p = (char)tolower((unsigned char)*p);
  }
}
| |
| void usage(const char *name) { |
| u_fprintf(out, "Usage: %s --locale loc_name --platform platform\n", name); |
| } |
| |
| void listKnownPlatforms() { |
| int32_t i = 0; |
| u_fprintf(err, "Known platforms:\n"); |
| for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) { |
| u_fprintf(err, "\t%s\n", platforms[i]); |
| } |
| } |
| |
| void addPlatform(const char *platform) { |
| int32_t i; |
| //stringToLower(platform); |
| int32_t oldPlatformNo = gPlatformNo; |
| |
| for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) { |
| if(strcmp(platform, platforms[i].name) == 0) { |
| gPlatformIndexes[gPlatformNo++] = i; |
| } |
| } |
| if(gPlatformNo == oldPlatformNo) { |
| u_fprintf(err, "Unknown platform %s\n", platform); |
| listKnownPlatforms(); |
| } |
| } |
| |
| void processArgs(int argc, char* argv[], UErrorCode &status) |
| { |
| int32_t i = 0; |
| U_MAIN_INIT_ARGS(argc, argv); |
| |
| argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options); |
| |
| if(argc < 0) { |
| u_fprintf(err, "Unknown option: %s\n", argv[-argc]); |
| usage(progName); |
| return; |
| } |
| |
| if(options[0].doesOccur || options[1].doesOccur) { |
| usage(progName); |
| return; |
| } |
| if(options[VERBOSE].doesOccur) { |
| gVerbose = TRUE; |
| } |
| if(options[DEBUG].doesOccur) { |
| gDebug = TRUE; |
| gVerbose = TRUE; |
| } |
| if(options[EXEMPLAR].doesOccur) { |
| gExemplar = TRUE; |
| } |
| if(options[QUIET].doesOccur) { |
| gQuiet = TRUE; |
| } |
| /* |
| for(i = 8; i < 9; i++) { |
| if(!options[i].doesOccur) { |
| u_fprintf(err, "Option %s is required!\n", options[i].longName); |
| usage(progName); |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| if(options[i].value == NULL) { |
| u_fprintf(err, "Option %s needs an argument!\n", options[i].longName); |
| usage(progName); |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| } |
| */ |
| // ASCII based options specified on the command line |
| // this is for testing purposes, will allow to load |
| // up ICU rules and then poke through them. |
| // In that case, we test only ICU and don't need |
| // a locale. |
| if(options[RULESSTDIN].doesOccur) { |
| gRulesStdin = TRUE; |
| addPlatform("icu"); |
| return; |
| } |
| |
| if(options[LOCALE].doesOccur) { |
| gLocale = options[LOCALE].value; |
| } else { |
| for(i = 1; i < argc; i++) { |
| gLocales[gLocaleNo++] = argv[i]; |
| } |
| } |
| if(options[PLATFORM].doesOccur) { |
| //strcpy(platform, options[PLATFORM].value); |
| //addPlatform("icu"); |
| addPlatform(options[PLATFORM].value); |
| } else { // there is a list of platforms |
| u_fprintf(err, "Option %s is required!\n", options[i].longName); |
| usage(progName); |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| |
| // |
| // Set up a Windows LCID |
| // |
| gWinLCID = uloc_getLCID(gLocale); |
| /* |
| if (gLocale != 0) { |
| gWinLCID = MAKELCID(gLocale, SORT_DEFAULT); |
| } |
| else { |
| gWinLCID = uloc_getLCID(gLocale); |
| } |
| */ |
| |
| } |
| |
| void printRules(const UChar *name, int32_t len, UFILE *file) { |
| // very rudimentary pretty rules print |
| int32_t i = 0; |
| UChar toPrint[16384]; |
| int32_t toPrintIndex = 0; |
| for(i = 0; i < len; i++) { |
| if(name[i] == 0x0026) { |
| if(toPrintIndex) { |
| toPrint[toPrintIndex] = 0; |
| u_fprintf(file, "%U\n", toPrint); |
| toPrintIndex = 0; |
| toPrint[toPrintIndex++] = name[i]; |
| } else { |
| toPrint[toPrintIndex++] = name[i]; |
| } |
| } else { |
| toPrint[toPrintIndex++] = name[i]; |
| } |
| } |
| if(toPrintIndex) { |
| toPrint[toPrintIndex] = 0; |
| u_fprintf(file, "%U\n", toPrint); |
| toPrintIndex = 0; |
| } |
| |
| |
| } |
| |
| void escapeString(const UChar *name, int32_t len, UFILE *file) { |
| u_fprintf(file, "%U", name); |
| /* |
| int32_t j = 0; |
| for(j = 0; j < len; j++) { |
| if(name[j] >= 0x20 && name[j] < 0x80) { |
| u_fprintf(file, "%c", name[j]); |
| } else { |
| u_fprintf(file, "\\u%04X", name[j]); |
| } |
| } |
| */ |
| } |
| void escapeALine(Line *line, UFILE *file) { |
| escapeString(line->name, line->len, file); |
| } |
| |
| void escapeExpansion(Line *line, UFILE *file) { |
| escapeString(line->expansionString, line->expLen, file); |
| } |
| |
| void showNames(Line *line, UFILE *file) { |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t j = 0; |
| char charName[256]; |
| for(j = 0; j < line->len; j++) { |
| u_charName(line->name[j], U_EXTENDED_CHAR_NAME, charName, 256, &status); |
| u_fprintf(file, "%s ", charName); |
| } |
| } |
| |
| void setArray(Line **array, Line *contents, int32_t size) { |
| int32_t i = 0; |
| for(i = 0; i < size; i++) { |
| array[i] = contents+i; |
| } |
| } |
| |
| // set an array from a Hashtable |
| int32_t |
| setArray(Line **array, Hashtable *table = &gElements) { |
| int32_t size = table->count(); |
| int32_t hashIndex = -1; |
| const UHashElement *hashElement = NULL; |
| int32_t count = 0; |
| while((hashElement = table->nextElement(hashIndex)) != NULL) { |
| array[count++] = (Line *)hashElement->value.pointer; |
| } |
| return size; |
| } |
| |
| UBool trySwamped(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) { |
| u_strcpy(gSource->name, (*smaller)->name); |
| gSource->name[(*smaller)->len] = separatorChar; |
| gSource->name[(*smaller)->len+1] = chars[0]; |
| gSource->name[(*smaller)->len+2] = 0; |
| gSource->len = (*smaller)->len+2; |
| |
| u_strcpy(gTarget->name, (*greater)->name); |
| gTarget->name[(*greater)->len] = separatorChar; |
| gTarget->name[(*greater)->len+1] = chars[1]; |
| gTarget->name[(*greater)->len+2] = 0; |
| gTarget->len = (*greater)->len+2; |
| |
| if(comparer(&gSource, &gTarget) > 0) { |
| return TRUE; |
| } else { |
| return FALSE; |
| } |
| } |
| |
| UBool trySwamps(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) { |
| gSource->name[0] = chars[0]; |
| gSource->name[1] = separatorChar; |
| u_strcpy(gSource->name+2, (*smaller)->name); |
| gSource->len = (*smaller)->len+2; |
| |
| gTarget->name[0] = chars[1]; |
| gTarget->name[1] = separatorChar; |
| u_strcpy(gTarget->name+2, (*greater)->name); |
| gTarget->len = (*greater)->len+2; |
| |
| if(comparer(&gSource, &gTarget) < 0) { |
| return TRUE; |
| } else { |
| return FALSE; |
| } |
| } |
| |
| UColAttributeValue |
| probeStrength(Line** prevLine, Line **currLine, CompareFn comparer) { |
| // Primary swamps secondary |
| // have pairs where [0] 2> [1] |
| UChar primSwamps[][2] = { |
| { 0x00E0, 0x0061 }, |
| { 0x0450, 0x0435 }, |
| { 0x31a3, 0x310d } |
| }; |
| // Secondary swamps tertiary |
| // have pairs where [0] 3> [1] |
| UChar secSwamps[][2] = { |
| { 0x0053, 0x0073 }, |
| { 0x0415, 0x0435 }, |
| { 0x31b6, 0x310e } |
| }; |
| // Secondary is swamped by primary |
| // have pairs where [0] 1> [1] |
| UChar secSwamped[][2] = { |
| { 0x0062, 0x0061 }, |
| { 0x0436, 0x0454 }, |
| { 0x310e, 0x310d } |
| }; |
| // Tertiary is swamped by secondary |
| // have pairs where [0] 2> [1] |
| UChar terSwamped[][2] = { |
| { 0x00E0, 0x0061 }, |
| { 0x0450, 0x0435 }, |
| { 0x31a3, 0x310d } |
| }; |
| int32_t i = 0; |
| // Tertiary swamps equal? |
| int result = 0; |
| // Choose the pair |
| i = 0; |
| /* |
| if((*prevLine)->name[0] > 0xFF && (*currLine)->name[0] > 0xFF) { |
| i = 0; |
| } else if((*prevLine)->name[0] < 0x0400 && (*currLine)->name[0] < 0x0400) { |
| i = 1; |
| } else { |
| i = 2; |
| } |
| */ |
| // are they equal? |
| if((result = comparer(prevLine, currLine)) == 0) { |
| return UCOL_IDENTICAL; |
| } else if(result > 0) { |
| //fprintf(stderr, "lines should be ordered!"); |
| return UCOL_OFF; |
| } else if(trySwamps(prevLine, currLine, primSwamps[i], comparer)) { |
| return UCOL_PRIMARY; |
| } else if(trySwamps(prevLine, currLine, secSwamps[i], comparer)) { |
| return UCOL_SECONDARY; |
| } else if(trySwamped(prevLine, currLine, terSwamped[i], comparer)) { |
| // is there a tertiary difference |
| return UCOL_TERTIARY; |
| } else { |
| //fprintf(stderr, "Unknown strength!\n"); |
| return UCOL_ON; |
| } |
| } |
| |
| // This function tries to probe the set of lines |
| // (already sorted by qsort) and deduct the strengths |
| void |
| analyzeStrength(Line **lines, int32_t size, CompareFn comparer) { |
| int32_t i = 0; |
| |
| for(i = 1; i < size; i++) { |
| Line **prevLine = lines+i-1; |
| Line **currLine = lines+i; |
| (*currLine)->strength = probeStrength(prevLine, currLine, comparer); |
| (*currLine)->sortedIndex = i; |
| (*currLine)->previous = *prevLine; |
| (*prevLine)->next = *currLine; |
| |
| } |
| |
| } |
| |
| void printStrength(UColAttributeValue strength, UFILE *file) { |
| u_fprintf(file, " "); |
| switch(strength) { |
| case UCOL_IDENTICAL: |
| u_fprintf(file, "="); |
| break; |
| case UCOL_TERTIARY: |
| //u_fprintf(file, "<3"); |
| u_fprintf(file, "<<<"); |
| break; |
| case UCOL_SECONDARY: |
| //u_fprintf(file, "<2"); |
| u_fprintf(file, "<<"); |
| break; |
| case UCOL_PRIMARY: |
| //u_fprintf(file, "<1"); |
| u_fprintf(file, "<"); |
| break; |
| case UCOL_OFF: |
| u_fprintf(file, ">?"); |
| default: |
| u_fprintf(file, "?!"); |
| break; |
| } |
| u_fprintf(file, " "); |
| } |
| |
| void printStrength(Line *line, UFILE *file) { |
| printStrength(line->strength, file); |
| } |
| |
| void printLine(Line *line, UFILE *file) { |
| escapeALine(line, file); |
| if(line->isExpansion) { |
| u_fprintf(file, "/"); |
| escapeExpansion(line, file); |
| } |
| } |
| |
| void printOrdering(Line **lines, int32_t size, UFILE *file, UBool useLinks = FALSE) { |
| int32_t i = 0; |
| |
| //printLine(*lines); |
| //escapeALine(*lines); // Print first line |
| |
| Line *line = NULL; |
| Line *previous = *lines; |
| if(previous->isReset) { |
| u_fprintf(file, "\n& "); |
| escapeALine(previous, file); |
| } else if(!previous->isRemoved) { |
| printLine(previous, file); |
| } |
| i = 1; |
| while(i < size && previous->next) { |
| if(useLinks) { |
| line = previous->next; |
| } else { |
| line = *(lines+i); |
| } |
| if(line->isReset) { |
| u_fprintf(file, "\n& "); |
| escapeALine(line, file); |
| } else if(!line->isRemoved) { |
| if(file == out) { |
| u_fprintf(file, "\n"); |
| } |
| if(i > 0) { |
| printStrength(line, file); |
| } |
| printLine(line, file); |
| //escapeALine(line, file); |
| } |
| previous = line; |
| i++; |
| } |
| u_fprintf(file, "\n"); |
| } |
| |
| |
| void setIndexes(Line **lines, int32_t size) { |
| int32_t i = 0; |
| (*lines)->sortedIndex = 0; |
| for(i = 1; i < size; i++) { |
| Line *line = *(lines+i); |
| Line *prev = *(lines+i-1); |
| line->previous = prev; |
| prev->next = line; |
| line->sortedIndex = i; |
| } |
| } |
| |
| |
| // this seems to be a dead end |
| void |
| noteExpansion(Line **gLines, Line *line, int32_t size, CompareFn comparer) { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| UnicodeString key(line->name, line->len); |
| //Line *toInsert = (Line *)gElements.get(key); |
| Line *toInsert = (Line *)gExpansions.get(key); |
| if(toInsert != NULL) { |
| toInsert->isExpansion = TRUE; |
| u_strcpy(toInsert->expansionString, line->expansionString); |
| toInsert->expLen = line->expLen; |
| toInsert->previous->next = toInsert->next; |
| toInsert->next->previous = toInsert->previous; |
| gElements.remove(key); |
| } else { |
| toInsert = new Line(*line); |
| toInsert->isExpansion = TRUE; |
| gElements.put(UnicodeString(toInsert->name, toInsert->len), toInsert, status); |
| } |
| |
| int32_t i = 0; |
| Line testLine; |
| Line *l = &testLine; |
| for(i = 0; i < size; i++) { |
| u_strcpy(testLine.name, (*(gLines+i))->name); |
| u_strcat(testLine.name, line->expansionString); |
| testLine.len = (*(gLines+i))->len + line->expLen; |
| if(comparer(&l, &line) > 0) { |
| toInsert->previous = *(gLines+i-1); |
| toInsert->next = *(gLines+i); |
| toInsert->previous->next = toInsert; |
| toInsert->next->previous = toInsert; |
| break; |
| } |
| } |
| if(gVerbose) { |
| u_fprintf(log, "Adding expansion\n"); |
| escapeALine(line, log); |
| u_fprintf(log, "/"); |
| escapeExpansion(line, log); |
| u_fprintf(log, " "); |
| } |
| } |
| |
| void |
| positionExpansions(Line **gLines, int32_t size, CompareFn comparer) { |
| int result = 0; |
| Line *line = NULL; |
| Line *toMove = NULL; |
| int32_t i = 0, j = 0; |
| Line **sortedExpansions = new Line*[gExpansions.count()]; |
| int32_t sortedExpansionsSize = setArray(sortedExpansions, &gExpansions); |
| qsort(sortedExpansions, sortedExpansionsSize, sizeof(Line *), comparer); |
| // Make a list of things in the vincinity of expansion candidate |
| for(j = 0; j < sortedExpansionsSize; j++) { |
| line = *(sortedExpansions+j); |
| UnicodeString key(line->name, line->len); |
| toMove = (Line *)gElements.get(key); |
| int32_t i = 0; |
| Line testLine, prevTestLine; |
| Line *l = &testLine; |
| Line *prevL = &prevTestLine; |
| // This can be further optimized, since we now know that we have a |
| // sorted list of expansions, so current can start from toMove, since all |
| // the elements before it are already smaller. In the beginning it needs to |
| // be on gLines, though. |
| Line *current = *gLines; |
| while(current) { |
| if(current == toMove) { |
| // we are wading through a sorted list |
| // if we found ourselves, it means that we |
| // are already in a right place, so no moving |
| // is needed, but we need to make sure we have |
| // the right strength. |
| toMove->strength = probeStrength(&prevL, &toMove, comparer); |
| if(0) { |
| u_fprintf(log, "Positioned expansion without moving "); |
| printLine(toMove, log); |
| u_fprintf(log, " new ordering: \n"); |
| printOrdering(gLines, size, log, TRUE); |
| } |
| break; |
| } else { |
| u_strcpy(testLine.name, current->name); |
| if(!current->isExpansion) { |
| u_strcat(testLine.name, line->expansionString); |
| testLine.len = current->len + line->expLen; |
| } else { |
| testLine.len = current->len; |
| } |
| if(comparer(&l, &line) > 0) { |
| // remove from chain |
| if(toMove->next) { |
| toMove->next->strength = probeStrength(&(toMove->previous), &(toMove->next), comparer); |
| toMove->next->previous = toMove->previous; |
| } |
| if(toMove->previous) { |
| toMove->previous->next = toMove->next; |
| } |
| |
| // insert |
| toMove->previous = current->previous; |
| toMove->next = current; |
| |
| if(current->previous) { |
| current->previous->next = toMove; |
| } |
| current->previous = toMove; |
| |
| toMove->strength = probeStrength(&prevL, &toMove, comparer); |
| toMove->next->strength = probeStrength(&toMove, &l, comparer); |
| if(0) { |
| u_fprintf(log, "Positioned expansion "); |
| printLine(toMove, log); |
| u_fprintf(log, " new ordering: \n"); |
| printOrdering(gLines, size, log, TRUE); |
| } |
| if(toMove->strength == UCOL_IDENTICAL) { |
| // check for craziness such as s = ss/s |
| // such line would consist of previous (or next) concatenated with the expansion value |
| // make a test |
| UChar fullString[256]; |
| u_strcpy(fullString, toMove->previous->name); |
| u_strcat(fullString, toMove->expansionString); |
| if(u_strcmp(fullString, toMove->name) == 0) { |
| toMove->previous->next = toMove->next; |
| toMove->next->previous = toMove->previous; |
| toMove->isRemoved = TRUE; |
| u_fprintf(log, "Removed: "); |
| printLine(toMove, log); |
| u_fprintf(log, "\n"); |
| } |
| } else if(toMove->next->strength == UCOL_IDENTICAL) { |
| UChar fullString[256]; |
| u_strcpy(fullString, toMove->next->name); |
| u_strcat(fullString, toMove->expansionString); |
| if(u_strcmp(fullString, toMove->name) == 0) { |
| toMove->next->strength = toMove->strength; |
| toMove->previous->next = toMove->next; |
| toMove->next->previous = toMove->previous; |
| toMove->isRemoved = TRUE; |
| u_fprintf(log, "Removed because of back: "); |
| printLine(toMove, log); |
| u_fprintf(log, "\n"); |
| } |
| } |
| break; |
| } |
| prevTestLine = testLine; |
| } |
| current = current->next; |
| } |
| } |
| delete[] sortedExpansions; |
| } |
| |
| |
| void |
| noteExpansion(Line *line) { |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString key(line->name, line->len); |
| Line *el = (Line *)gElements.get(key); |
| if(el != NULL) { |
| el->isExpansion = TRUE; |
| u_strcpy(el->expansionString, line->expansionString); |
| el->expLen = line->expLen; |
| } else { |
| Line *toInsert = new Line(*line); |
| toInsert->isExpansion = TRUE; |
| gElements.put(UnicodeString(line->name, line->len), toInsert, status); |
| } |
| |
| Line *el2 = (Line *)gExpansions.get(key); |
| el2->isExpansion = TRUE; |
| u_strcpy(el2->expansionString, line->expansionString); |
| el2->expLen = line->expLen; |
| |
| if(gDebug) { |
| u_fprintf(log, "Adding expansion\n"); |
| printLine(line, log); |
| u_fprintf(log, "\n"); |
| } |
| } |
| |
| void |
| noteContraction(Line *line) { |
| UErrorCode status = U_ZERO_ERROR; |
| Line *toInsert = new Line(*line); |
| toInsert->isContraction = TRUE; |
| gElements.put(UnicodeString(line->name, line->len), toInsert, status); |
| if(gVerbose) { |
| u_fprintf(log, "Adding contraction\n"); |
| escapeALine(line, log); |
| u_fprintf(log, " "); |
| } |
| } |
| |
| void |
| noteElement(Line *line) { |
| UErrorCode status = U_ZERO_ERROR; |
| Line *toInsert = new Line(*line); |
| gElements.put(UnicodeString(line->name, line->len), toInsert, status); |
| if(0) { //if(gDebug) |
| escapeALine(line, log); |
| u_fprintf(log, " "); |
| } |
| } |
| |
| |
| |
| // This function checks if a combination of characters has changed place with the |
| // adjacent elements. If so, these are most probably contractions. |
| // However, it still needs to be checked if these contractions are fake - the |
| // test is simple - if xy is suspected contraction, if we get that x/y is expansion, then |
| // xy is a fake contraction. |
| int32_t |
| analyzeContractions(Line** lines, int32_t size, CompareFn comparer) { |
| int32_t i = 0, j = 0; |
| int32_t outOfOrder = 0; |
| UColAttributeValue strength = UCOL_OFF; |
| UColAttributeValue currStrength = UCOL_OFF; |
| Line **prevLine = lines; |
| Line **currLine = NULL; |
| Line **backupLine = NULL; |
| UBool prevIsContraction = FALSE, currIsContraction = FALSE; |
| // Problem here is detecting a contraction that is at the very end of the sorted list |
| for(i = 1; i < size; i++) { |
| currLine = lines+i; |
| strength = probeStrength(prevLine, currLine, comparer); |
| if(strength == UCOL_OFF || strength != (*currLine)->strength) { |
| prevIsContraction = FALSE; |
| currIsContraction = FALSE; |
| if(!outOfOrder) { |
| if(gVerbose) { |
| u_fprintf(log, "Possible contractions: "); |
| } |
| } |
| // now we have two elements that are different. The question is, |
| // which one of them is the contraction - which one has moved. |
| // Could be the previous, but could also be the current. |
| |
| outOfOrder++; |
| |
| // First, lets check whether the previous has jumped back |
| j = i+1; |
| // skip all the nexts that have smaller strength, they don't have an effect |
| while(j < size && (*(lines+j))->strength > (*currLine)->strength) { |
| j++; |
| } |
| // check if there are other elements of same or greater strength |
| while(j < size && |
| (strength = probeStrength(prevLine, (backupLine = lines+j), comparer)) == UCOL_OFF) { |
| j++; |
| // if we skipped more than one, it might be in fact a contraction |
| prevIsContraction = TRUE; |
| } |
| if(prevIsContraction) { |
| noteContraction(*prevLine); |
| j = i-2; |
| // add all the previous elements with smaller strength, since they also |
| // will jump over and are contractions |
| while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) { |
| strength = probeStrength(lines+j, currLine, comparer); |
| if(strength == UCOL_OFF) { |
| noteContraction(*(lines+j)); |
| } |
| j--; |
| } |
| } |
| |
| // now we check if the current element is jumping forward, |
| // the dance steps are analogous to above. |
| j = i - 2; |
| while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) { |
| j--; |
| } |
| while(j >= 0 && |
| (strength = probeStrength((backupLine = lines+j), currLine, comparer)) == UCOL_OFF) { |
| j--; |
| currIsContraction = TRUE; |
| } |
| if(currIsContraction) { |
| if(gVerbose) { |
| escapeALine(*currLine, log); |
| u_fprintf(log, " "); |
| } |
| j = i+1; |
| while(j < size && (*(lines+j))->strength > (*currLine)->strength) { |
| strength = probeStrength(prevLine, lines+j, comparer); |
| if(strength == UCOL_OFF) { |
| noteContraction(*(lines+j)); |
| } |
| j++; |
| } |
| } |
| |
| // Not sure about either. List both and then check |
| if(!(prevIsContraction || currIsContraction)) { |
| noteContraction(*prevLine); |
| noteContraction(*currLine); |
| } |
| } |
| prevLine = currLine; |
| } |
| if(outOfOrder) { |
| if(gVerbose) { |
| u_fprintf(log, "\n"); |
| } |
| } |
| return outOfOrder; |
| } |
| |
| int32_t |
| detectContractions(Line **gLines, Line *lines, int32_t size, CompareFn comparer) { |
| int32_t i = 0, j = 0; |
| int32_t noContractions = 0; |
| // Create and compare doubles: |
| Line *backupLines = new Line[size]; |
| Line::copyArray(backupLines, lines, size); |
| // detect contractions |
| |
| Line **gLinesBackup = NULL; //new Line*[size]; |
| |
| for(i = 0; i < size; i++) { |
| // preserve index and previous |
| Line::copyArray(lines, backupLines, size); |
| for(j = 0; j < size; j++) { |
| u_strcpy(lines[j].name, backupLines[i].name); |
| u_strcat(lines[j].name, backupLines[j].name); |
| lines[j].len = backupLines[i].len+backupLines[j].len; |
| } |
| |
| if((noContractions += analyzeContractions(gLines, size, comparer)) && gDebug) { |
| if(gLinesBackup == NULL) { |
| gLinesBackup = new Line*[size]; |
| } |
| // Show the sorted doubles, for debugging |
| setArray(gLinesBackup, lines, size); |
| qsort(gLinesBackup, size, sizeof(Line *), comparer); |
| //setIndexes(gLinesBackup, size); |
| analyzeStrength(gLinesBackup, size, comparer); |
| printOrdering(gLinesBackup, size, log); |
| } |
| if(!gQuiet) { |
| u_fprintf(log, "."); |
| } |
| } |
| if(!gQuiet) { |
| u_fprintf(log, "\n"); |
| } |
| delete[] backupLines; |
| if(gLinesBackup) { |
| delete[] gLinesBackup; |
| } |
| return noContractions; |
| } |
| |
| // gLines in this function is an array of sorted pointers. |
| // Contractions are already included. |
| int32_t |
| detectExpansions(Line **gLines, int32_t size, CompareFn comparer) { |
| UErrorCode status = U_ZERO_ERROR; |
| // detect expansions |
| |
| UColAttributeValue startStrength = UCOL_OFF, endStrength = UCOL_OFF, |
| strength = UCOL_OFF, previousStrength = UCOL_OFF; |
| Line start, end, src; |
| Line *startP = &start, *endP = &end, *srcP = &src; |
| Line *current = NULL; |
| memset(startP, 0, sizeof(Line)); |
| memset(endP, 0, sizeof(Line)); |
| memset(srcP, 0, sizeof(Line)); |
| int32_t srcLen; |
| int32_t i = 0, j = 0, k = 0; |
| for(i = 0; i < size; i++) { |
| u_strcpy(start.name, (*(gLines+i))->name); |
| u_strcpy(end.name, (*(gLines+i))->name); |
| srcLen = (*(gLines+i))->len; |
| u_strcpy(start.name+srcLen, (*(gLines))->name); |
| start.len = srcLen + (*(gLines))->len; |
| u_strcpy(end.name+srcLen, (*(gLines+size-1))->name); |
| end.len = srcLen + (*(gLines+size-1))->len; |
| |
| for(k = 0; k < size; k++) { // k is index of a thing that is not doubled |
| current = *(gLines+k); |
| // see if we have moved to front |
| // has it moved to the very beginning |
| if((startStrength = probeStrength((gLines+k), &startP, comparer)) != UCOL_OFF) { |
| continue; // this one is in the front |
| } |
| // has it moved to the very end? |
| if((endStrength = probeStrength(&endP, (gLines+k), comparer)) != UCOL_OFF) { |
| continue; // this one is in the back |
| } |
| // Potential Expansion |
| if(gDebug) { //gVerbose |
| u_fprintf(log, "Possible expansion: "); |
| escapeALine(*(gLines+k), log); |
| u_fprintf(log, " "); |
| } |
| // Now we have to make sure that this is really an expansion |
| // First, we have to find it |
| u_strcpy(src.name, (*(gLines+i))->name); |
| for(j = 0; j < size; j++) { |
| u_strcpy(src.name+srcLen, (*(gLines+j))->name); |
| src.len = srcLen + (*(gLines+j))->len; |
| if((strength = probeStrength(&srcP, (gLines+k), comparer)) == UCOL_OFF) { |
| strength = probeStrength((gLines+k), &srcP, comparer); |
| // we found it *(gLines+j-1) is the element that is interesting |
| // since gLines+j-1 < gLines+k < gLines+j |
| if(gDebug) { //gVerbose |
| u_fprintf(log, "i = %i, k = %i, j = %i ", i, k, j); |
| escapeALine(*(gLines+i), log); |
| escapeALine(*(gLines+j-1), log); |
| printStrength(previousStrength, log); |
| escapeALine(current, log); |
| printStrength(strength, log); |
| escapeALine(*(gLines+i), log); |
| escapeALine(*(gLines+j), log); |
| u_fprintf(log, "\n"); |
| } |
| // check whether it is a contraction that is the same as an expansion |
| // or a multi character that doesn't do anything |
| current->addExpansionHit(i, j); |
| current->isExpansion = TRUE; |
| current->expIndex = k; |
| // cache expansion |
| gExpansions.put(UnicodeString(current->name, current->len), current, status); //new Line(*current) |
| break; |
| } |
| previousStrength = strength; |
| } |
| } |
| if(!gQuiet) { |
| u_fprintf(log, "."); |
| } |
| } |
| if(!gQuiet) { |
| u_fprintf(log, "\n"); |
| } |
| // now we have identified possible expansions. We need to find out how do they expand. |
| // Let's iterate over expansions cache - it's easier. |
| const UHashElement *el = NULL; |
| int32_t hashIndex = -1; |
| Line *doubles = new Line[size*10]; |
| Line **sorter = new Line*[size*10]; |
| int32_t currSize = 0; |
| int32_t newSize = 0; |
| Line *prev = NULL; |
| Line *next = NULL; |
| Line *origin = NULL; |
| int result = 0; |
| // Make a list of things in the vincinity of expansion candidate |
| // in expansionPrefixes and expansionAfter we have stored the |
| // prefixes of stuff that caused the detection of an expansion |
| // and a position where the expansion was. |
| // For example (icu, de__PHONEBOOK), we had: |
| // aE <<< \u00E4 < af |
| // AD < \u00E4 <<< Ae |
| // From that we will construct the following sequence: |
| // AD < aE <<< \u00E4/ <<< Ae < af |
| // then we will take the vincinity of \u00E4: |
| // aE <<< \u00E4/ <<< Ae |
| // then we will choose the smallest expansion to be the expansion |
| // part: 'e'. |
| // if there is equality, we choose the equal part: |
| // (win32, de__PHONEBOOK): |
| // AD < \u00E4/ = ae <<< aE <<< Ae |
| // we choose 'e'. |
| |
| while((el = gExpansions.nextElement(hashIndex)) != NULL) { |
| newSize = 0; |
| current = (Line *)el->value.pointer; |
| currSize = size*current->expansionPrefixesSize; |
| if(gDebug) { |
| escapeALine(current, log); |
| u_fprintf(log, " Number: %i\n", current->expansionPrefixesSize); |
| } |
| // construct the doubles |
| for(i = 0; i < current->expansionPrefixesSize; i++) { |
| doubles[newSize].suffix = current->expansionAfter[i]-1; |
| doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]-1)); |
| doubles[newSize].suffix = current->expansionAfter[i]; |
| doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i])); |
| } |
| // add the expansion we're observing |
| doubles[newSize++] = *current; |
| setArray(sorter, doubles, newSize); |
| qsort(sorter, newSize, sizeof(Line*), comparer); |
| analyzeStrength(sorter, newSize, comparer); |
| if(gDebug) { |
| printOrdering(sorter, newSize, log); |
| } |
| i = 0; |
| while(**(sorter+i) != *current) { |
| i++; |
| } |
| // find the two additions |
| if((*(sorter+i))->strength == UCOL_IDENTICAL) { |
| // if we ae id |
| origin = *(gLines+((*(sorter+i-1))->suffix)); |
| u_strcpy(current->expansionString, origin->name); |
| current->expLen = origin->len; |
| } else if(i < newSize-1 && (*(sorter+i+1))->strength == UCOL_IDENTICAL) { |
| origin = *(gLines+((*(sorter+i+1))->suffix)); |
| u_strcpy(current->expansionString, origin->name); |
| current->expLen = origin->len; |
| } else { |
| if(i > 0) { |
| prev = *(gLines+(*(sorter+i-1))->suffix); |
| if(i < newSize-1) { |
| next = *(gLines+(*(sorter+i+1))->suffix); |
| result = comparer(&prev, &next); |
| if(result <= 0) { |
| u_strcpy(current->expansionString, prev->name); |
| current->expLen = prev->len; |
| } else { |
| u_strcpy(current->expansionString, next->name); |
| current->expLen = next->len; |
| } |
| } |
| } |
| if(0) { //if(gDebug) |
| u_fprintf(log, "Expansion is: "); |
| escapeALine(current, log); |
| u_fprintf(log, "/"); |
| escapeExpansion(current, log); |
| u_fprintf(log, "\n"); |
| } |
| } |
| noteExpansion(current); |
| //noteExpansion(gLines, current, size, comparer); |
| if(!gQuiet) { |
| u_fprintf(log, "."); |
| } |
| } |
| if(!gQuiet) { |
| u_fprintf(log, "\n"); |
| } |
| delete[] doubles; |
| delete[] sorter; |
| return gExpansions.count(); |
| } |
| |
| UBool |
| isTailored(Line *line, UErrorCode &status) { |
| UBool result = FALSE; |
| UCollationElements *tailoring = ucol_openElements(gCol, line->name, line->len, &status); |
| UCollationElements *uca = ucol_openElements(gUCA, line->name, line->len, &status); |
| |
| int32_t tailElement = UCOL_NULLORDER; |
| int32_t ucaElement = UCOL_NULLORDER; |
| |
| do { |
| do { |
| tailElement = ucol_next(tailoring, &status); |
| } while(tailElement == 0); |
| do { |
| ucaElement = ucol_next(uca, &status); |
| } while(ucaElement == 0); |
| if(tailElement != ucaElement) { |
| result = TRUE; |
| break; |
| } |
| } while (tailElement != UCOL_NULLORDER && ucaElement != UCOL_NULLORDER); |
| |
| ucol_closeElements(tailoring); |
| ucol_closeElements(uca); |
| return result; |
| } |
| |
| void |
| reduceUntailored(Line **gLines, int32_t size){ |
| UErrorCode status = U_ZERO_ERROR; |
| Line *current = *(gLines); |
| Line *previous = NULL; |
| while(current) { |
| // if the current line is not tailored according to the UCA |
| if(!isTailored(current, status)) { |
| // we remove it |
| current->isRemoved = TRUE; |
| } else { |
| // if it's tailored |
| if(current->previous && current->previous->isRemoved == TRUE) { |
| previous = current->previous; |
| while(previous && (previous->strength > current->strength || previous->isExpansion || previous->isContraction) && previous->isRemoved) { |
| if(previous->previous && previous->previous->isRemoved) { |
| previous = previous->previous; |
| } else { |
| break; |
| } |
| } |
| if(previous) { |
| previous->isReset = TRUE; |
| } else { |
| (*(gLines))->isReset = TRUE; |
| } |
| } |
| } |
| current = current->next; |
| } |
| } |
| |
| void |
| constructAndAnalyze(Line **gLines, Line *lines, int32_t size, CompareFn comparer) { |
| int32_t i = 0, j = 0, k = 0; |
| // setup our compare arrays to point to single set. |
| |
| // For contractions we need a block of data |
| setArray(gLines, lines, size); |
| //size = setArray(gLines); |
| |
| qsort(gLines, size, sizeof(Line *), comparer); |
| |
| // Establish who is previous according to the sort order |
| //setIndexes(gLines, size); |
| |
| analyzeStrength(gLines, size, comparer); |
| if(gVerbose) { |
| u_fprintf(log, "Ordering:\n"); |
| printOrdering(gLines, size, log); |
| } |
| |
| //showDifferences(exemplarSetSize); |
| //dumpData(exemplarSetSize); |
| |
| if(!gQuiet) { |
| u_fprintf(log, "Detecting contractions?\n"); |
| } |
| int32_t noContractions = 0; |
| noContractions = detectContractions(gLines, lines, size, comparer); |
| if(!gQuiet) { |
| u_fprintf(log, "Detected %i contractions\n", noContractions); |
| } |
| |
| // now we have suspected contractions in the table |
| // we have to re-sort the things |
| size = setArray(gLines); |
| qsort(gLines, size, sizeof(Line *), comparer); |
| analyzeStrength(gLines, size, comparer); |
| |
| if(!gQuiet) { |
| u_fprintf(log, "Detecting expansions\n"); |
| } |
| int32_t noExpansions = detectExpansions(gLines, size, comparer); |
| if(!gQuiet) { |
| u_fprintf(log, "Detected %i expansions\n", noExpansions); |
| } |
| |
| positionExpansions(gLines, size, comparer); |
| |
| if(gVerbose) { |
| u_fprintf(log, "After positioning expansions:\n"); |
| printOrdering(gLines, size, log, TRUE); |
| } |
| //reduceUntailored(gLines, size); |
| if(!gQuiet) { |
| u_fprintf(out, "Final result\n"); |
| } |
| printOrdering(gLines, size, out, TRUE); |
| printOrdering(gLines, size, log, TRUE); |
| } |
| |
| // Check whether upper case comes before lower case or vice-versa |
| int32_t |
| checkCaseOrdering(void) { |
| UChar stuff[][3] = { |
| { 0x0061, separatorChar, 0x0061}, //"aa", |
| { 0x0061, separatorChar, 0x0041 }, //"a\\u00E0", |
| { 0x0041, separatorChar, 0x0061 }, //"\\u00E0a", |
| { 0x0041, separatorChar, 0x0041 }, //"\\u00E0a", |
| //{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0" |
| }; |
| const int32_t size = sizeof(stuff)/sizeof(stuff[0]); |
| |
| Line **sortedLines = new Line*[size]; |
| Line lines[size]; |
| |
| int32_t i = 0; |
| int32_t ordered = 0, reversed = 0; |
| |
| for(i = 0; i < size; i++) { |
| lines[i].setName(stuff[i], 3); |
| } |
| setArray(sortedLines, lines, size); |
| qsort(sortedLines, size, sizeof(Line*), gComparer); |
| |
| for(i = 0; i < size; i++) { |
| if(*(sortedLines+i) == &lines[i]) { |
| ordered++; |
| } |
| if(*(sortedLines+i) == &lines[size-i-1]) { |
| reversed++; |
| } |
| } |
| |
| delete[] sortedLines; |
| if(ordered == size) { |
| return 0; // in normal order |
| } else if(reversed == size) { |
| return 1; // in reversed order |
| } else { |
| return -1; // unknown order |
| } |
| } |
| |
| |
| // Check whether the secondaries are in the straight or reversed order |
| int32_t |
| checkSecondaryOrdering(void) { |
| UChar stuff[][5] = { |
| { 0x0061, separatorChar, 0x0061, separatorChar, 0x00E0 }, //"aa", |
| { 0x0061, separatorChar, 0x00E0, separatorChar, 0x0061 }, //"a\\u00E0", |
| { 0x00E0, separatorChar, 0x0061, separatorChar, 0x0061 }, //"\\u00E0a", |
| //{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0" |
| }; |
| const int32_t size = sizeof(stuff)/sizeof(stuff[0]); |
| |
| Line **sortedLines = new Line*[size]; |
| Line lines[size]; |
| |
| int32_t i = 0; |
| int32_t ordered = 0, reversed = 0; |
| |
| for(i = 0; i < size; i++) { |
| lines[i].setName(stuff[i], 5); |
| } |
| setArray(sortedLines, lines, size); |
| qsort(sortedLines, size, sizeof(Line*), gComparer); |
| |
| for(i = 0; i < size; i++) { |
| if(*(sortedLines+i) == &lines[i]) { |
| ordered++; |
| } |
| if(*(sortedLines+i) == &lines[size-i-1]) { |
| reversed++; |
| } |
| } |
| |
| delete[] sortedLines; |
| if(ordered == size) { |
| return 0; // in normal order |
| } else if(reversed == size) { |
| return 1; // in reversed order |
| } else { |
| return -1; // unknown order |
| } |
| } |
| |
| // We have to remove ignorable characters from the exemplar set, |
| // otherwise, we get messed up results |
| void removeIgnorableChars(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) { |
| UnicodeSet ignorables, primaryIgnorables; |
| UnicodeSetIterator exemplarUSetIter(exemplarUSet); |
| exemplarUSetIter.reset(); |
| Line empty; |
| Line *emptyP = ∅ |
| Line current; |
| Line *currLine = ¤t; |
| UColAttributeValue strength = UCOL_OFF; |
| |
| |
| while(exemplarUSetIter.next()) { |
| if(exemplarUSetIter.isString()) { // process a string |
| u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length()); |
| currLine->len = exemplarUSetIter.getString().length(); |
| strength = probeStrength(&emptyP, &currLine, comparer); |
| if(strength == UCOL_IDENTICAL) { |
| ignorables.add(exemplarUSetIter.getString()); |
| } else if(strength > UCOL_PRIMARY) { |
| primaryIgnorables.add(exemplarUSetIter.getString()); |
| } |
| } else { // process code point |
| UBool isError = FALSE; |
| UChar32 codePoint = exemplarUSetIter.getCodepoint(); |
| currLine->len = 0; |
| U16_APPEND(currLine->name, currLine->len, 25, codePoint, isError); |
| strength = probeStrength(&emptyP, &currLine, comparer); |
| if(strength == UCOL_IDENTICAL) { |
| ignorables.add(codePoint); |
| } else if(strength > UCOL_PRIMARY) { |
| primaryIgnorables.add(codePoint); |
| } |
| } |
| } |
| |
| |
| |
| exemplarUSet.removeAll(ignorables); |
| exemplarUSet.removeAll(primaryIgnorables); |
| |
| UnicodeString removedPattern; |
| if(ignorables.size()) { |
| u_fprintf(log, "Ignorables:\n"); |
| ignorables.toPattern(removedPattern, TRUE); |
| removedPattern.setCharAt(removedPattern.length(), 0); |
| escapeString(removedPattern.getBuffer(), removedPattern.length(), log); |
| u_fprintf(log, "\n"); |
| } |
| if(primaryIgnorables.size()) { |
| u_fprintf(log, "Primary ignorables:\n"); |
| primaryIgnorables.toPattern(removedPattern, TRUE); |
| removedPattern.setCharAt(removedPattern.length(), 0); |
| escapeString(removedPattern.getBuffer(), removedPattern.length(), log); |
| u_fprintf(log, "\n"); |
| } |
| |
| } |
| |
| // TODO: develop logic for choosing boundary characters - right now it is hardcoded |
| // It should be a function of used scripts. Also, check whether we need to save |
| // used script names |
| void addUtilityChars(UnicodeSet &exemplarUSet, UErrorCode &status) { |
| |
| // in order to get nice rules, we need to add some characters to the |
| // starting set. These are mostly parts of compatibility composed characters, |
| // such as L-middle dot (middle dot is 0x00B7). If we don't add these, we would |
| // get a reset at a funky character, such as L-middle dot. This list will probably |
| // grow. |
| exemplarUSet.add(0x00B7); |
| |
| // these things represent a script before the target script and |
| // a script after. More logic should be added so that these characters are |
| // chosen automatically |
| |
| exemplarUSet.add(0x0038); |
| exemplarUSet.add(0x0039); |
| |
| //exemplarUSet.add(0x0433); |
| //exemplarUSet.add(0x0436); |
| exemplarUSet.add(0xfa29); |
| exemplarUSet.add(0xfa28); |
| } |
| |
| void |
| getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) { |
| // first we fill out structures with exemplar characters. |
| UResourceBundle *res = ures_open(NULL, locale, &status); |
| int32_t exemplarLength = 0; |
| UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status); |
| exemplars.clear(); |
| exemplars.applyPattern(exemplarString, status); |
| ures_close(res); |
| } |
| |
| void |
| prepareStartingSet(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) { |
| int32_t i = 0; |
| UnicodeString exemplarString; |
| exemplarUSet.toPattern(exemplarString); |
| // Produce case closure of exemplar characters |
| // Then we want to figure out what is the script of the exemplar characters |
| // just pick several and see their script |
| const char* usedScriptNames[USCRIPT_CODE_LIMIT]; |
| int32_t numberOfUsedScripts = 0; |
| char scriptSetPattern[256]; |
| UnicodeString pattern; // for debugging |
| UChar32 exChar = -1; |
| while(exemplarUSet.size() != 0 && (exChar = exemplarUSet.charAt(0)) != -1) { |
| int32_t scriptNo = u_getIntPropertyValue(exChar, UCHAR_SCRIPT); |
| usedScriptNames[numberOfUsedScripts] = u_getPropertyValueName(UCHAR_SCRIPT, scriptNo, U_SHORT_PROPERTY_NAME); |
| sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[numberOfUsedScripts]); |
| numberOfUsedScripts++; |
| UnicodeSet scriptSet(UnicodeString(scriptSetPattern, ""), status); |
| exemplarUSet.removeAll(scriptSet); |
| exemplarUSet.toPattern(pattern, TRUE); |
| } |
| exemplarUSet.clear(); |
| |
| // always add ASCII |
| //exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0020-\\u007f]", ""), status)); |
| exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0041-\\u005b]", ""), status)); |
| if(gExemplar) { |
| exemplarUSet.applyPattern(exemplarString, status); |
| exemplarUSet.closeOver(USET_CASE); |
| if(!gQuiet) { |
| u_fprintf(out, "ICU exemplar characters:\n"); |
| escapeString(exemplarString.getBuffer(), exemplarString.length(), out); |
| u_fprintf(out, "\n"); |
| } |
| } else { |
| if(!gQuiet) { |
| u_fprintf(out, "Using scripts:\n"); |
| } |
| // add interesting scripts |
| for(i = 0; i < numberOfUsedScripts; i++) { |
| sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[i]); |
| exemplarUSet.addAll(UnicodeSet(UnicodeString(scriptSetPattern, ""), status)); |
| if(!gQuiet) { |
| u_fprintf(out, "%s\n", scriptSetPattern); |
| } |
| } |
| } |
| |
| |
| removeIgnorableChars(exemplarUSet, comparer, status); |
| |
| addUtilityChars(exemplarUSet, status); |
| |
| /* |
| // try to check whether tailored set and exemplar characters match. |
| USet *tailored = ucol_getTailoredSet(gCol, &status); |
| UBool tailoredContained = exemplarUSet.containsAll(*((UnicodeSet *)tailored)); |
| if(!tailoredContained) { |
| ((UnicodeSet *)tailored)->removeAll(exemplarUSet); |
| UnicodeString pattern; |
| ((UnicodeSet *)tailored)->toPattern(pattern, TRUE); |
| } |
| uset_close(tailored); |
| */ |
| |
| //return exemplarUSet; |
| } |
| |
| void |
| setOutputFile(const char *name, UErrorCode &status) { |
| int32_t i = 0; |
| char filename[256]; |
| strcpy(filename, name); |
| for(i = 0; i < gPlatformNo; i++) { |
| strcat(filename, "_"); |
| strcat(filename, platforms[gPlatformIndexes[i]].name); |
| } |
| if(gExemplar) { |
| strcat(filename, "_exemplar"); |
| } else { |
| strcat(filename, "_script"); |
| } |
| strcat(filename, ".utf16.txt"); |
| out = u_fopen(filename, "wb", "en", "utf-16"); |
| } |
| |
| void |
| processCollator(UCollator *col, UErrorCode &status) { |
| int32_t i = 0; |
| gCol = col; |
| UChar ruleString[16384]; |
| int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, 16384); |
| if(!gQuiet) { |
| u_fprintf(out, "ICU rules:\n"); |
| printRules(ruleString, ruleStringLength, out); |
| printRules(ruleString, ruleStringLength, log); |
| //escapeString(ruleString, ruleStringLength, out); |
| u_fprintf(out, "\n"); |
| } |
| const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status); |
| UnicodeSet exemplarUSet; |
| if(locale) { |
| getExemplars(locale, exemplarUSet, status); |
| } else { |
| exemplarUSet = *((UnicodeSet *)ucol_getTailoredSet(gCol, &status)); |
| } |
| |
| |
| for(i = 0; i < gPlatformNo; i++) { |
| u_fprintf(out, "\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[i]].name); |
| gComparer = platforms[gPlatformIndexes[i]].comparer; |
| |
| prepareStartingSet(exemplarUSet, gComparer, status); |
| int32_t itemLen = 0; |
| // get the number of all the items from the set (both codepoints and strings) |
| int32_t exemplarSetSize = exemplarUSet.size(); |
| UnicodeSetIterator exemplarUSetIter(exemplarUSet); |
| |
| // allocate ICU lines |
| gICULines = new Line*[exemplarSetSize*5]; |
| int32_t j = 0; |
| int32_t linesCount = 0; |
| Line *lines = new Line[exemplarSetSize]; |
| |
| int32_t reversedSecondary = checkSecondaryOrdering(); |
| if(reversedSecondary == 0) { |
| u_fprintf(out, "Secondaries do not seem to be reversed\n"); |
| } else if(reversedSecondary == 1) { |
| u_fprintf(out, "Secondaries are reversed\n"); |
| if(gComparer == ICUstrcmp) { |
| ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status); |
| } |
| } else { |
| u_fprintf(out, "Cannot conclude if secondaries are reversed\n"); |
| } |
| |
| int32_t reversedCase = checkCaseOrdering(); |
| if(reversedCase == 0) { |
| u_fprintf(out, "Case does not seem to be reversed\n"); |
| } else if(reversedCase == 1) { |
| u_fprintf(out, "Case is reversed\n"); |
| if(gComparer == ICUstrcmp) { |
| ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_OFF, &status); |
| } |
| } else { |
| u_fprintf(out, "Cannot conclude if case is reversed\n"); |
| } |
| |
| exemplarUSetIter.reset(); |
| gElements.removeAll(); |
| gExpansions.removeAll(); |
| linesCount = 0; |
| |
| while(exemplarUSetIter.next()) { |
| Line *currLine = lines+linesCount; |
| if(exemplarUSetIter.isString()) { // process a string |
| u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length()); |
| currLine->len = exemplarUSetIter.getString().length(); |
| } else { // process code point |
| UBool isError = FALSE; |
| currLine->len = 0; |
| U16_APPEND(currLine->name, currLine->len, 25, exemplarUSetIter.getCodepoint(), isError); |
| } |
| currLine->name[currLine->len] = 0; // zero terminate, for our evil ways |
| currLine->index = linesCount; |
| linesCount++; |
| noteElement(currLine); |
| } |
| constructAndAnalyze(gICULines, lines, exemplarSetSize, gComparer); |
| |
| delete[] lines; |
| } |
| |
| |
| // cleanup globals |
| delete[] gICULines; |
| u_fflush(out); |
| u_fclose(out); |
| ucol_close(gCol); |
| } |
| |
| void |
| processLocale(const char *locale, UErrorCode &status) { |
| gWinLCID = uloc_getLCID(locale); |
| |
| UCollator *col = ucol_open(locale, &status); |
| |
| setOutputFile(locale, status); |
| |
| u_fprintf(out, "Locale %s (LCID:%06X)\n", locale, gWinLCID); |
| |
| processCollator(col, status); |
| } |
| |
| UBool |
| hasCollationElements(const char *locName) { |
| |
| UErrorCode status = U_ZERO_ERROR; |
| UResourceBundle *ColEl = NULL; |
| |
| UResourceBundle *loc = ures_open(NULL, locName, &status);; |
| |
| if(U_SUCCESS(status)) { |
| status = U_ZERO_ERROR; |
| ColEl = ures_getByKey(loc, "CollationElements", ColEl, &status); |
| if(status == U_ZERO_ERROR) { /* do the test - there are real elements */ |
| ures_close(ColEl); |
| ures_close(loc); |
| return TRUE; |
| } |
| ures_close(ColEl); |
| ures_close(loc); |
| } |
| return FALSE; |
| } |
| |
| int |
| main(int argc, |
| char* argv[]) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| err = u_finit(stderr, "en", "latin-1"); |
| log = u_finit(stdout, "en", "latin-1"); |
| |
| /* |
| USet *wsp = uprv_openRuleWhiteSpaceSet(&status); |
| uset_add(wsp, 0x0041); |
| uset_remove(wsp, 0x0041); |
| UnicodeString pat; |
| ((UnicodeSet *)wsp)->toPattern(pat, TRUE); |
| pat.setCharAt(pat.length(), 0); |
| escapeString(pat.getBuffer(), pat.length(), log); |
| u_fflush(log); |
| */ |
| |
| UTransliterator *anyHex = utrans_open("[^\\u000a\\u0020-\\u007f] Any-Hex/Java", UTRANS_FORWARD, NULL, 0, NULL, &status); |
| u_fsettransliterator(log, U_WRITE, anyHex, &status); |
| |
| processArgs(argc, argv, status); |
| int32_t i = 0; |
| |
| |
| gElements.setValueDeleter(deleteLineElement); |
| |
| |
| if(U_FAILURE(status) || gPlatformNo == 0) { |
| return -1; |
| } |
| |
| gUCA = ucol_open("root", &status); |
| |
| if(gRulesStdin) { |
| char buffer[1024]; |
| UChar ruleBuffer[16384]; |
| UChar *rules = ruleBuffer; |
| int32_t maxRuleLen = 16384; |
| int32_t rLen = 0; |
| while(gets(buffer)) { |
| if(buffer[0] != '/' && buffer[1] != '/') { |
| rLen = u_unescape(buffer, rules, maxRuleLen); |
| rules += rLen; |
| maxRuleLen -= rLen; |
| } |
| } |
| UParseError parseError; |
| //escapeString(ruleBuffer, rules-ruleBuffer, log);// |
| u_fprintf(log, "%U\n", ruleBuffer); |
| |
| UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status); |
| if(U_SUCCESS(status)) { |
| setOutputFile("stdinRules", status); |
| processCollator(col, status); |
| } else { |
| u_fprintf(err, "Error %s\n", u_errorName(status)); |
| } |
| } else { |
| |
| if(gLocale) { |
| processLocale(gLocale, status); |
| } else if(gLocaleNo) { |
| for(i = 0; i < gLocaleNo; i++) { |
| processLocale(gLocales[i], status); |
| } |
| } else { // do the loop through all the locales |
| int32_t noOfLoc = uloc_countAvailable(); |
| const char *locName = NULL; |
| for(i = 0; i<noOfLoc; i++) { |
| status = U_ZERO_ERROR; |
| locName = uloc_getAvailable(i); |
| if(hasCollationElements(locName)) { |
| processLocale(locName, status); |
| } |
| } |
| } |
| } |
| |
| |
| ucol_close(gUCA); |
| |
| u_fflush(log); |
| u_fclose(log); |
| u_fflush(err); |
| u_fclose(err); |
| |
| return 0; |
| } |