| // © 2017 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| #include "sortedlines.h" |
| |
| static int codePointCmp(const void *a, const void *b) { |
| return u_strcmp((*(Line **)a)->name, (*(Line **)b)->name); |
| } |
| |
| SortedLines::SortedLines(const UnicodeSet &set, const UnicodeSet &excludeBounds, const StrengthProbe &probe, |
| UPrinter *logger, UPrinter *debug) : |
| toSort(NULL), |
| toSortCapacity(0), |
| lines(NULL), |
| size(0), |
| capacity(0), |
| repertoire(set), |
| excludeBounds(excludeBounds), |
| probe(probe), |
| first(NULL), |
| last(NULL), |
| logger(logger), |
| debug(debug), |
| contractionsTable(NULL), |
| duplicators(NULL), |
| maxExpansionPrefixSize(0), |
| wordSort(FALSE), |
| frenchSecondary(FALSE), |
| upperFirst(FALSE), |
| sortkeys(NULL), |
| sortkeyOffset(0) |
| { |
| memset(UB, 0, sizeof(UB)); |
| int32_t i = 0; |
| for(i = 0; i < UCOL_OFF; i++) { |
| UB[i] = ∅ |
| } |
| init(); |
| } |
| |
| SortedLines::~SortedLines() |
| { |
| delete[] lines; |
| if(sortkeys) { |
| delete[] sortkeys; |
| } |
| if(toSort) { |
| delete[] toSort; |
| } |
| if(contractionsTable) { |
| delete contractionsTable; |
| } |
| if(duplicators) { |
| delete duplicators; |
| } |
| } |
| |
| void |
| SortedLines::getBounds(UErrorCode &status) { |
| // first sort through the set |
| debug->log(toString(), TRUE); |
| int32_t i = 0, j = 0; |
| UColAttributeValue strength = UCOL_OFF; |
| for(i = 0; i < size; i++) { |
| if(toSort[i]->strengthFromEmpty < strength) { |
| if(i && strength < UCOL_OFF) { |
| //u_strcpy(UB[strength], toSort[i-1]->name); |
| j = 1; |
| while(excludeBounds.contains(UnicodeString(toSort[i-j]->name, toSort[i-j]->len))) { |
| j++; |
| } |
| UB[strength] = toSort[i-j]; |
| } |
| strength = toSort[i]->strengthFromEmpty; |
| if(strength == UCOL_PRIMARY) { |
| probe.SE = toSort[i]->name[0]; |
| } |
| } |
| } |
| //u_strcpy(UB[strength], toSort[size-1]->name); |
| // a different solution for bounds: go from end and see if the guys on the top |
| // cause duplication for things |
| UChar dupch[] = { 0x0020, 0x0030, 0x0042, 0x0051, 0x0062, 0x0071, 0x0391, 0x0396, 0x03b1, 0x03b6 }; |
| j = 1; |
| Line dup; |
| Line bound; |
| int32_t dups = 0; |
| while(j < size) { |
| dups = 0; |
| for(i = 0; i < sizeof(dupch)/sizeof(dupch[0]); i++) { |
| dup.setTo(dupch[i]); |
| dup.append(dupch[i]); |
| bound.setTo(dupch[i]); |
| bound.append(toSort[size-j]->name, toSort[size-j]->len); |
| if(probe.getStrength(dup, bound) >= UCOL_IDENTICAL) { |
| dups++; |
| } |
| } |
| if(dups == 0) { |
| break; |
| } else { |
| if(!duplicators) { |
| duplicators = new Hashtable(); |
| } |
| duplicators->put(UnicodeString(toSort[size-j]->name, toSort[size-j]->len), &toSort[size-j], status); |
| debug->log(toSort[size-j]->toString()); |
| debug->log(" is not good enough to be an upper bound\n"); |
| j++; |
| } |
| } |
| if(j == size) { |
| debug->log("Oi! I'm hallucinating. Will use the first upper bound"); |
| delete duplicators; |
| duplicators = NULL; |
| j = 1; |
| } |
| /* |
| j = 1; |
| while(excludeBounds.contains(UnicodeString(toSort[size-j]->name, toSort[size-j]->len))) { |
| j++; |
| } |
| */ |
| UB[strength] = toSort[size-j]; |
| for(i = 0; i < UCOL_OFF; i++) { |
| if(UB[i]) { |
| //debug->log(UB[i], TRUE); |
| debug->log(UB[i]->toString(TRUE), TRUE); |
| } |
| } |
| } |
| |
| // classifies repertoire according to the strength of their difference |
| // from the empty string |
| void |
| SortedLines::classifyRepertoire() { |
| UColAttributeValue strongestStrengthFromEmpty = UCOL_OFF; |
| int32_t lastChange = 0; |
| int32_t i = 0, j = 0; |
| while(i < size) // && probe.distanceFromEmptyString(*toSort[i]) > UCOL_PRIMARY) |
| { |
| toSort[i]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i]); |
| if(toSort[i]->strengthFromEmpty < strongestStrengthFromEmpty) { |
| strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty; |
| lastChange = i; |
| } else if (toSort[i]->strengthFromEmpty > strongestStrengthFromEmpty) { |
| // there is a problem in detection. Most probably a quaternary. |
| // why don't we try to interpolate |
| UColAttributeValue nextStrength = UCOL_OFF; |
| UColAttributeValue prevStrength = UCOL_OFF; |
| UColAttributeValue st = UCOL_OFF; |
| |
| logger->log("Interpolating to get the distance from empty for Line "); |
| logger->log(toSort[i]->toString(TRUE), TRUE); |
| |
| if(i) { |
| st = probe.getStrength(*toSort[i-1], *toSort[i]); |
| if(st == UCOL_OFF) { |
| logger->log("Cannot deduce distance from empty using previous element. Something is very wrong! Line:"); |
| logger->log(toSort[i]->toString(TRUE), TRUE); |
| } else if(st == UCOL_IDENTICAL || st >= toSort[i-1]->strengthFromEmpty) { |
| prevStrength = toSort[i-1]->strengthFromEmpty; |
| } else if(st < toSort[i-1]->strengthFromEmpty) { |
| prevStrength = st; |
| } |
| toSort[i]->strengthFromEmpty = prevStrength; |
| } |
| if(i < size-2) { |
| toSort[i+1]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i+1]); |
| st = probe.getStrength(*toSort[i+1], *toSort[i]); |
| if(st == UCOL_OFF) { |
| logger->log("Cannot deduce distance from empty using next element. Something is very wrong! Line:"); |
| logger->log(toSort[i]->toString(TRUE), TRUE); |
| } else if(st == UCOL_IDENTICAL || st < toSort[i+1]->strengthFromEmpty) { |
| nextStrength = toSort[i+1]->strengthFromEmpty; |
| } else if(st >= toSort[i+1]->strengthFromEmpty) { |
| nextStrength = st; |
| } |
| if(i) { |
| if(prevStrength != nextStrength) { |
| logger->log("Inconsistent results from interpolation! Results will most likely be wrong\n"); |
| } |
| } |
| toSort[i]->strengthFromEmpty = nextStrength; |
| } |
| /* |
| UColAttributeValue problemStrength = UCOL_PRIMARY; |
| for(j = lastChange; j < i ; j++) { |
| if(toSort[j]->strength > problemStrength) { |
| problemStrength = toSort[j]->strength; |
| } |
| } |
| for(j = lastChange; j < i ; j++) { |
| toSort[j]->strengthFromEmpty = problemStrength; |
| } |
| strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty; |
| lastChange = i; |
| debug->log("Problem detected in distances from empty. Most probably word sort is on\n"); |
| */ |
| wordSort = TRUE; |
| } |
| i++; |
| } |
| debug->log("Distances from empty string\n"); |
| debug->log(toStringFromEmpty(), TRUE); |
| } |
| |
| void |
| SortedLines::analyse(UErrorCode &status) { |
| frenchSecondary = probe.isFrenchSecondary(status); |
| if(U_FAILURE(status)) { |
| logger->log("Test for French secondary failed. Bailing out!\n"); |
| return; |
| } |
| logger->log("French secondary value is %i\n", frenchSecondary, frenchSecondary); |
| upperFirst = probe.isUpperFirst(status); |
| if(U_FAILURE(status)) { |
| logger->log("Test for upper first failed. Bailing out!\n"); |
| return; |
| } |
| logger->log("upper first value is %i\n", upperFirst, upperFirst); |
| sort(TRUE, TRUE); |
| classifyRepertoire(); |
| getBounds(status); |
| //sort(TRUE, TRUE); |
| addContractionsToRepertoire(status); |
| //sort(TRUE, TRUE); |
| debug->log("\n*** Order after detecting contractions\n\n"); |
| calculateSortKeys(); |
| debug->log(toPrettyString(FALSE, TRUE), TRUE); |
| detectExpansions(); |
| } |
| |
| void SortedLines::init() |
| { |
| size = repertoire.size(); |
| capacity = 5*size; |
| lines = new Line[capacity]; |
| init(repertoire, lines); |
| } |
| |
| void SortedLines::init(UnicodeSet &rep, Line *lin) |
| { |
| |
| UnicodeSetIterator exemplarUSetIter(rep); |
| int32_t size = 0; |
| |
| while(exemplarUSetIter.next()) { |
| Line *currLine = lin+size; |
| if(exemplarUSetIter.isString()) { // process a string |
| currLine->setTo(exemplarUSetIter.getString()); |
| } else { // process code point |
| currLine->setTo(exemplarUSetIter.getCodepoint()); |
| } |
| currLine->name[currLine->len] = 0; // zero terminate, for our evil ways |
| //currLine->index = size; |
| size++; |
| } |
| } |
| |
| void |
| SortedLines::setSortingArray(Line **sortingArray, Line *elements, int32_t sizeToSort) { |
| int32_t i = 0; |
| for(i = 0; i < sizeToSort; i++) { |
| sortingArray[i] = &elements[i]; |
| } |
| } |
| |
| int32_t |
| SortedLines::setSortingArray(Line **sortingArray, Hashtable *table) { |
| int32_t size = table->count(); |
| int32_t hashIndex = -1; |
| const UHashElement *hashElement = NULL; |
| int32_t count = 0; |
| while((hashElement = table->nextElement(hashIndex)) != NULL) { |
| sortingArray[count++] = (Line *)hashElement->value.pointer; |
| } |
| return size; |
| } |
| |
| void |
| SortedLines::sort(Line **sortingArray, int32_t sizeToSort, UBool setStrengths, UBool link) { |
| int32_t i = 0; |
| int32_t equalStart = 0; |
| UColAttributeValue equalStrength = UCOL_OFF; |
| |
| qsort(sortingArray, sizeToSort, sizeof(Line *), probe.comparer); |
| |
| if(setStrengths) { // analyze strengths |
| for(i = 1; i < sizeToSort; i++) { |
| sortingArray[i]->strength = probe.getStrength(*sortingArray[i-1], *sortingArray[i]); |
| } |
| // for equal guys, do the code point ordering |
| |
| i = 1; |
| while(i < sizeToSort) |
| { |
| if(sortingArray[i]->strength == UCOL_IDENTICAL) { |
| equalStart = i - 1; |
| equalStrength = sortingArray[equalStart]->strength; |
| sortingArray[equalStart]->strength = UCOL_IDENTICAL; |
| while(i < sizeToSort && sortingArray[i]->strength == UCOL_IDENTICAL) { |
| i++; |
| } |
| qsort(sortingArray+equalStart, i-equalStart, sizeof(Line *), codePointCmp); |
| sortingArray[equalStart]->strength = equalStrength; |
| } else { |
| i++; |
| } |
| } |
| |
| } |
| |
| |
| |
| if(link) { // do the linking |
| for(i = 0; i < sizeToSort - 1; i++) { |
| Line *curr = *(sortingArray+i); |
| curr->next = *(sortingArray+i+1); |
| (*(sortingArray+i+1))->previous = curr; |
| } |
| } |
| } |
| |
| void |
| SortedLines::sort(UBool setStrengths, UBool link) { |
| if(toSortCapacity < size || !toSort) { |
| if(toSort) { |
| delete[] toSort; |
| } |
| toSort = new Line*[size*2]; |
| toSortCapacity = size*2; |
| } |
| |
| setSortingArray(toSort, lines, size); |
| sort(toSort, size, setStrengths, link); |
| |
| first = last = NULL; |
| |
| if(link) { // do the linking |
| first = *toSort; |
| last = *(toSort+size-1); |
| } |
| } |
| |
| void |
| SortedLines::updateBounds(UnicodeSet &set) { |
| Line line; |
| UnicodeString s1; |
| UnicodeSetIterator it1(set); |
| while(it1.next()) { |
| if(!debug->isOn()) { |
| logger->log("."); |
| } |
| if(it1.isString()) { // process a string |
| s1.setTo(it1.getString()); |
| } else { // process code point |
| s1.setTo(it1.getCodepoint()); |
| } |
| //line.setTo(s1); |
| UColAttributeValue strength = probe.distanceFromEmptyString(s1); |
| if(probe.compare(UnicodeString(UB[strength]->name), s1) < 0) { |
| // TODO: leak here - fixit! |
| UB[strength] = new Line(s1); |
| //u_strcpy(UB[strength], s1.getTerminatedBuffer()); |
| } |
| } |
| |
| |
| |
| } |
| |
| void SortedLines::addAll(Line* toAdd, int32_t toAddSize) |
| { |
| if(size+toAddSize > capacity) { |
| int32_t doGrowingBreakpoint = 0; |
| // we need to do growing here |
| } |
| int32_t i = 0; |
| |
| for(i = 0; i < toAddSize; i++) { |
| lines[size+i] = toAdd[i]; |
| } |
| size += toAddSize; |
| } |
| |
| void SortedLines::setDistancesFromEmpty(Line* array, int32_t arraySize) |
| { |
| int32_t i = 0; |
| for(i = 0; i < arraySize; i++) { |
| array[i].strengthFromEmpty = probe.distanceFromEmptyString(array[i]); |
| } |
| } |
| |
| |
| // adds contractions in to repertoire |
| int32_t SortedLines::addContractionsToRepertoire(UErrorCode &status) |
| { |
| logger->log("\n*** Detecting contractions\n\n"); |
| contractionsTable = new Hashtable(); |
| int32_t noConts = 0; |
| int32_t allocateSize = 50*size; |
| // first check for simple contractions |
| Line* delta = new Line[allocateSize]; |
| Line** deltaSorted = new Line*[allocateSize]; |
| Line* lesserToAddTo = new Line[allocateSize]; |
| Line* newDelta = new Line[allocateSize]; |
| Line** newDeltaSorted = new Line*[allocateSize]; |
| Line* deltaP = delta; |
| Line** deltaPP = deltaSorted; |
| Line* newDeltaP = newDelta; |
| int32_t deltaSize = 0, lesserToAddToSize = 0, newDeltaSize = 0; |
| logger->log("++ Contraction detection generation 0\n"); |
| noConts = detectContractions(toSort, size, toSort, size, |
| delta, deltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status); |
| setSortingArray(deltaSorted, delta, deltaSize); |
| sort(deltaSorted, deltaSize, TRUE); |
| |
| setDistancesFromEmpty(delta, deltaSize); |
| int32_t deltaPSize = deltaSize; |
| //updateBounds(delta); |
| |
| int32_t generation = 0; |
| // if we found any, we have to try multiple contractions |
| // However, we want to prevent the contractions explosion |
| // if the number of simple contractions is greater than the |
| // starting size, chances are that we either have an algorithmic |
| // contraction (like iteration marks on w2k) or something |
| // is seriosly wrong. |
| if(deltaPSize < size/2) { |
| while (deltaPSize && generation < 1) { |
| generation++; |
| logger->log("\n++ Contraction detection generation %i\n", generation, generation); |
| // find more, but avoid testing the combinations we already have |
| noConts += detectContractions(toSort, size, deltaPP, deltaPSize, |
| newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status); |
| noConts += detectContractions(deltaPP, deltaPSize, toSort, size, |
| newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status); |
| calculateSortKeys(); |
| |
| addAll(deltaP, deltaPSize); |
| setSortingArray(toSort, lines, size); |
| sort(TRUE, TRUE); |
| setSortingArray(newDeltaSorted, newDeltaP, newDeltaSize); |
| sort(newDeltaSorted, newDeltaSize, TRUE); |
| |
| // if no new ones, bail |
| //if (newDeltaSize == 0) break; |
| |
| deltaPSize = newDeltaSize; |
| newDeltaSize = 0; |
| if(deltaP == delta) { |
| deltaP = newDelta; |
| deltaPP = newDeltaSorted; |
| newDeltaP = delta; |
| } else { |
| deltaP = delta; |
| deltaPP = deltaSorted; |
| newDeltaP = newDelta; |
| } |
| setDistancesFromEmpty(deltaP, deltaPSize); |
| } |
| } |
| status = U_ZERO_ERROR; |
| // add stuff from the last batch |
| addAll(deltaP, deltaPSize); |
| |
| // warning: we don't add the lesser ones in recursively, since they will |
| // infinitely loop |
| setDistancesFromEmpty(lesserToAddTo, lesserToAddToSize); |
| addAll(lesserToAddTo, lesserToAddToSize); |
| setSortingArray(toSort, lines, size); |
| sort(TRUE, TRUE); |
| |
| delete[] deltaSorted; |
| delete[] delta; |
| delete[] lesserToAddTo; |
| delete[] newDeltaSorted; |
| delete[] newDelta; |
| return noConts; |
| } |
| |
| |
| int32_t SortedLines::detectContractions(Line **firstRep, int32_t firstSize, |
| Line **secondRep, int32_t secondSize, |
| Line *toAddTo, int32_t &toAddToSize, |
| Line *lesserToAddTo, int32_t &lesserToAddToSize, |
| int32_t capacity, UErrorCode &status) |
| { |
| int32_t noConts = 0; |
| int i = 0, j = 0, k = 0; |
| Line lower, upper, trial, toAdd, helper; |
| UChar32 firstStart, firstEnd, secondStart; |
| UChar NFCTrial[256]; |
| int32_t NFCTrialLen = 0; |
| UBool thai; |
| i = -1; |
| while(i < firstSize-1 && U_SUCCESS(status)) { |
| i++; |
| if(!debug->isOn()) { |
| logger->log("\rTesting %05i/%05i. Found %05i conts.", i, firstSize, noConts); |
| } |
| U16_GET(firstRep[i]->name, 0, 0, firstRep[i]->len, firstStart); |
| if(uscript_getScript(firstStart, &status) == USCRIPT_HAN || firstRep[i]->strengthFromEmpty > UCOL_PRIMARY) //UCOL_TERTIARY) |
| { |
| continue; |
| } |
| lower = *firstRep[i]; |
| for(j = 0; j < secondSize; j++) { |
| if(noConts == capacity) { |
| return noConts; |
| } |
| U16_GET(secondRep[j]->name, 0, 0, secondRep[j]->len, secondStart); |
| if(firstStart == 0x41 && secondStart == 0x308) { |
| int32_t putBreakPointHere = 0; |
| } |
| if(uscript_getScript(secondStart, &status) == USCRIPT_HAN) // || secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) |
| { |
| continue; |
| } |
| if(duplicators && duplicators->get(UnicodeString(secondRep[j]->name, secondRep[j]->len)) != NULL) { |
| debug->log("Skipping duplicator "); |
| debug->log(secondRep[j]->toString(), TRUE); |
| continue; |
| } |
| |
| if(firstRep[i]->name[0] == 0x61 && secondRep[j]->name[0] == 0x308) { |
| int32_t putBreakpointhere = 0; |
| } |
| upper.setToConcat(firstRep[i], UB[UCOL_PRIMARY]); |
| //upper.setToConcat(firstRep[i], UB[secondRep[j]->strengthFromEmpty]); |
| toAdd.setToConcat(firstRep[i], secondRep[j]); |
| U16_GET(firstRep[i]->name, 0, firstRep[i]->len-1, firstRep[i]->len, firstEnd); |
| if((thai = u_hasBinaryProperty(firstEnd, UCHAR_LOGICAL_ORDER_EXCEPTION))) { |
| // this means that the lower is single reordering character |
| // if we do the lower test without taking this into account, |
| // we'll comparing the secondRep directly to Thai. We add UB[UCOL_PRIMARY] to |
| // end of lower and in the middle of trial, so we will have |
| // lower = Thai + UB, trial Thai + UB + x, resolving to |
| // UB + Thai vs UB + Thai + x. |
| // for upper bound, we do the similar, so we have |
| // upper = Thai + UB + UB, trial = Thai + UB + x, |
| // resolving to UB + Thai + UB vs UB + Thai + x |
| if(secondRep[j]->firstCC) { |
| UChar32 UBChar; |
| U16_GET(UB[UCOL_SECONDARY]->name, 0, 0, UB[UCOL_SECONDARY]->len, UBChar); |
| if(secondRep[j]->firstCC > u_getCombiningClass(UBChar)) { |
| continue; |
| } |
| } |
| upper = *firstRep[i]; |
| upper.append(*UB[UCOL_PRIMARY]); |
| //upper.append(*UB[secondRep[j]->strengthFromEmpty]); |
| upper.append(*UB[UCOL_PRIMARY]); |
| lower.append(*UB[UCOL_PRIMARY]); |
| trial = *firstRep[i]; |
| trial.append(*UB[UCOL_PRIMARY]); |
| trial.append(*secondRep[j]); |
| } else if((firstRep[i]->lastCC > secondRep[j]->firstCC && secondRep[j]->firstCC && !frenchSecondary) |
| || (firstRep[i]->firstCC < secondRep[j]->lastCC && firstRep[i]->firstCC && frenchSecondary)) { |
| // Skip because normalization will reorder |
| // there will be a chance to check this again, since if we |
| // try a+b, we will also try b+a |
| continue; |
| } else if(frenchSecondary && (firstRep[i]->strengthFromEmpty > UCOL_PRIMARY && secondRep[j]->strengthFromEmpty > UCOL_PRIMARY)) { |
| continue; |
| }else if(firstRep[i]->lastCC && secondRep[j]->firstCC && frenchSecondary) { |
| trial.setToConcat(secondRep[j], firstRep[i]); |
| } else { |
| trial.setToConcat(firstRep[i], secondRep[j]); |
| } |
| // Now let's check the trial. The problem is that when you combine characters, |
| // you can end up with concatenation that is unknown for the examined API. |
| NFCTrialLen = unorm_normalize(trial.name, trial.len, UNORM_NFC, 0, NFCTrial, 256, &status); |
| if((u_strcmp(trial.name, NFCTrial) == 0) || u_strFindLast(NFCTrial, NFCTrialLen, secondRep[j]->name, secondRep[j]->len)) { |
| if(secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) { |
| continue; |
| } |
| } |
| UChar32 c; |
| U16_GET(NFCTrial, 0, 0, NFCTrialLen, c); |
| helper.setTo(c); |
| if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) { |
| continue; |
| } |
| if(NFCTrialLen > 1) { |
| U16_GET(NFCTrial, 0, NFCTrialLen-1, NFCTrialLen, c); |
| helper.setTo(c); |
| if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) { |
| continue; |
| } |
| } |
| |
| if (probe.compare(lower, trial) >= 0) { // if lower is bigger than trial |
| // this might be ok, but I'm having doubts. Here is an additional check: |
| if(firstRep[i]->len == 1 || secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) { |
| // I'm basically saying that I'll add this kind of contraction for cases where I combine |
| // one letter with an accent OR when I'm combining more than one symbol with a letter. |
| noteContraction("L", lesserToAddTo, lesserToAddToSize, firstRep[i], secondRep[j], noConts, status); |
| } |
| } |
| else if (probe.compare(trial, upper) > 0) { // trial is bigger than upper?? |
| noteContraction("U", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); |
| } |
| #if 0 |
| else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY) |
| { |
| Line expansionLine; |
| if(getExpansionLine(trial, *firstRep[i], *secondRep[j], expansionLine) && |
| expansionLine.len && !(expansionLine == *secondRep[j])) { |
| noteContraction("D", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); |
| } |
| } |
| #endif |
| else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && probe.getStrength(lower, trial) < secondRep[j]->strengthFromEmpty) { |
| noteContraction("D1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); |
| } |
| else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) |
| { |
| // I have added an additional check. The checks versus upper and lower bound should be sufficient |
| // when the right side is a combining mark. There might be a reordering of combining marks, but |
| // that should be already visible in their order. |
| // compare the sequence |
| // Y- <? Y <? Y+ |
| // and |
| // XY- <? XY <? XY+ |
| Line xym, xyp, xy; |
| UBool xymIsContraction = FALSE, toAddIsContraction = FALSE; |
| if(j) { |
| if(((!secondRep[j-1]->firstCC || firstRep[i]->lastCC < secondRep[j-1]->firstCC) && !frenchSecondary) |
| ||((!firstRep[i]->firstCC || firstRep[i]->firstCC > secondRep[j-1]->lastCC) && frenchSecondary)) { |
| xym.setToConcat(firstRep[i], secondRep[j-1]); |
| toAdd.strength = probe.getStrength(xym, toAdd); |
| if(secondRep[j]->strength != toAdd.strength) { |
| // there is possibility that either xym or xy are contractions |
| // There are two situations: |
| // xym > xy or xym <n xy and ym <k y but n != k |
| // if they are reordered, we are going to see if each of them |
| // is further reordered |
| if(toAdd.strength == UCOL_OFF) { |
| // check whether toAdd shifted more down |
| k = j - 2; |
| while(k>=0 && secondRep[k]->strength > secondRep[j]->strength) { |
| k--; |
| } |
| while(!toAddIsContraction && k>=0) { |
| xyp.setToConcat(firstRep[i], secondRep[k]); |
| if(contractionsTable->get(UnicodeString(xyp.name, xyp.len)) != NULL) { |
| k--; |
| continue; |
| } |
| if(probe.compare(xyp, xym) >= 0) { |
| // xyp looks like a contraction |
| noteContraction("!1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); |
| toAddIsContraction = TRUE; |
| } else { |
| break; |
| } |
| } |
| // first let's see if xym has moved beyond |
| if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) { |
| k = j+1; |
| // ignore weaker strengths |
| while(k < secondSize && secondRep[k]->strength > secondRep[j]->strength) { |
| k++; |
| } |
| // check if we skipped the following guy |
| if(k < secondSize) { |
| xyp.setToConcat(firstRep[i], secondRep[k]); |
| if(probe.compare(xyp, xym) <= 0) { |
| // xyp looks like a contraction |
| noteContraction("!2", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status); |
| xymIsContraction = TRUE; |
| } |
| } |
| } else { |
| xymIsContraction = TRUE; |
| } |
| // if they have reordered, but none has moved, then we add them both |
| // and hope for the best |
| if(!xymIsContraction && !toAddIsContraction) { |
| // it is possible that there is an NFC version version of one of the |
| // strings. If we have XY > XZ, but NFC(XZ) = W and X < W, we might have |
| // have a false contraction. |
| trial.len = unorm_normalize(toAdd.name, toAdd.len, UNORM_NFC, 0, trial.name, 25, &status); |
| //UColAttributeValue strength = probe.getStrength(*firstRep[i], trial); |
| if(trial == toAdd) { |
| noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status); |
| noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); |
| } else { |
| noteContraction("!4", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); |
| } |
| } |
| } else { // only the strength has changed |
| // check whether the previous is contraction and if not, add the current |
| if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) { |
| noteContraction("!5", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); |
| } |
| } |
| } |
| } |
| } |
| } |
| if(thai) { // restore lower |
| lower = *firstRep[i]; |
| } |
| } |
| } |
| return noConts; |
| } |
| |
| void |
| SortedLines::noteContraction(const char* msg, Line *toAddTo, int32_t &toAddToSize, Line *left, Line *right, int32_t &noConts, UErrorCode &status) |
| { |
| Line toAdd; |
| toAdd.setToConcat(left, right); |
| toAdd.left = left; |
| toAdd.right = right; |
| // if we're adding an accent to an existing contraction, we want to check |
| #if 0 |
| Line test, trial1, trial2; |
| if(right->strengthFromEmpty > UCOL_PRIMARY) { |
| if(left->right && left->right->previous && left->right->next) { |
| test.setToConcat(left->left, left->right->previous); |
| trial1.setToConcat(&test, right); |
| |
| test.setToConcat(left->left, left->right->next); |
| trial2.setToConcat(&test, right); |
| if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) { |
| // this means that the contraction has been broken by the newly added accent |
| // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above |
| debug->log("Con -"); |
| debug->log(msg); |
| debug->log(toAdd.toString(FALSE), TRUE); |
| return; |
| } |
| } else { |
| if(right->previous && right->next) { |
| trial1.setToConcat(left, right->previous); |
| trial2.setToConcat(left, right->next); |
| if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) { |
| // this means that the contraction has been broken by the newly added accent |
| // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above |
| debug->log("Con -"); |
| debug->log(msg); |
| debug->log(toAdd.toString(FALSE), TRUE); |
| return; |
| } |
| } |
| if(left->previous && left->next) { |
| trial1.setToConcat(left->previous, right); |
| trial2.setToConcat(left->next, right); |
| if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) { |
| // this means that the contraction has been broken by the newly added accent |
| // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above |
| debug->log("Con -"); |
| debug->log(msg); |
| debug->log(toAdd.toString(FALSE), TRUE); |
| return; |
| } |
| } |
| |
| } |
| } |
| if(right->right && right->right->strengthFromEmpty > UCOL_PRIMARY && right->left->previous && right->left->next) { // maybe we already had a contraction with an accent |
| test.setToConcat(right->left->previous, right->right); |
| trial1.setToConcat(left, &test); |
| test.setToConcat(right->left->next, right->right); |
| trial2.setToConcat(left, &test); |
| if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) { |
| // this means that the contraction has been broken by the newly added accent |
| // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above |
| debug->log("Con -"); |
| debug->log(msg); |
| debug->log(toAdd.toString(FALSE), TRUE); |
| return; |
| } |
| } |
| #endif |
| if(contractionsTable->get(UnicodeString(toAdd.name, toAdd.len)) == NULL) { |
| if(probe.distanceFromEmptyString(toAdd) <= UCOL_TERTIARY) { |
| toAddTo[toAddToSize++] = toAdd; |
| contractionsTable->put(UnicodeString(toAdd.name, toAdd.len), &toAdd, status); |
| noConts++; |
| debug->log(msg); |
| debug->log(" Con + "); |
| debug->log(toAdd.toString(FALSE), TRUE); |
| |
| if(!left->sortKey) { |
| calculateSortKey(*left); |
| } |
| debug->log(left->dumpSortkey()); |
| debug->log(" + "); |
| |
| if(!right->sortKey) { |
| calculateSortKey(*right); |
| } |
| debug->log(right->dumpSortkey()); |
| debug->log(" = "); |
| |
| calculateSortKey(toAdd); |
| debug->log(toAdd.dumpSortkey(), TRUE); |
| if(noConts > size/2) { |
| status = U_BUFFER_OVERFLOW_ERROR; |
| } |
| } |
| } |
| } |
| |
| |
| UBool |
| SortedLines::getExpansionLine(const Line &expansion, const Line &previous, const Line &exp, Line &expansionLine) |
| { |
| int expIndexSize = 0; |
| UColAttributeValue expStrength = UCOL_OFF; |
| int32_t comparisonResult = 0; |
| int32_t i = 0, k = 0, prevK = 0; |
| Line trial; |
| UBool sequenceCompleted = FALSE; |
| int32_t expIndexes[256]; |
| int32_t expIndexesSize = 0; |
| |
| if(!sequenceCompleted) { |
| expIndexSize = 0; |
| expansionLine.clear(); |
| |
| // we will start from strength between the expansion |
| // and the target (toSort[i] and toSort[j]. First we |
| // will add as many primaries as possible. Then we will |
| // try to add secondary pieces and then tertiary. |
| // found an expansion - what is the expanding sequence? |
| |
| expStrength = UCOL_PRIMARY; |
| while(!sequenceCompleted) { |
| k = 0; |
| prevK = 0; |
| while(k < size) { |
| if(expansionLine.len > 15) { |
| sequenceCompleted = TRUE; |
| break; |
| } |
| while(k < size && toSort[k]->strength != UCOL_PRIMARY) |
| { |
| k++; |
| } |
| // nothing found |
| if(k == size) { |
| break; |
| } |
| // we need to skip over reordering things. If they were worthy, they would |
| // have been detected in the previous iteration. |
| //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) { |
| //k++; |
| //continue; |
| //} |
| trial = previous; |
| trial.append(expansionLine); |
| trial.append(*toSort[k]); |
| if(toSort[k]->name[0] == 0x0067) { |
| int32_t putBreakPointHere = 0; |
| } |
| comparisonResult = probe.compare(trial, expansion); |
| if(comparisonResult == 0) { |
| expansionLine = *toSort[k]; |
| return TRUE; |
| } else if (comparisonResult > 0) { |
| if(prevK) { |
| if(exp == *toSort[prevK]) { |
| expansionLine = exp; |
| return TRUE; |
| } |
| i = prevK; |
| while(i < k-1) { |
| i++; |
| if(toSort[i]->strength > exp.strength) { |
| continue; |
| } |
| trial = previous; |
| trial.append(expansionLine); |
| trial.append(*toSort[i]); |
| if(probe.compare(trial, expansion) > 0) { |
| break; |
| } |
| } |
| // we got into situation where we have ch > ch+dot-below |
| // however, ch is a contraction and therefore we cannot use |
| // it properly. If we have hit on a contraction, we'll just try |
| // to continue. Probably need more logic here. |
| if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) { |
| expansionLine.append(*toSort[i-1]); |
| expIndexes[expIndexSize++] = i-1; |
| break; |
| } else { |
| int32_t putBreakPointHere = 0; |
| } |
| } else { |
| sequenceCompleted = TRUE; |
| break; |
| } |
| //break; |
| } |
| prevK = k; |
| k++; |
| } |
| if(!prevK || k == size) { |
| break; |
| } |
| } |
| } |
| return expIndexSize > 0; |
| } |
| |
| int32_t |
| SortedLines::gooseUp(int32_t resetIndex, int32_t expansionIndex, Line &expLine, int32_t *expIndexes, int32_t &expIndexSize, UColAttributeValue strength) |
| { |
| int32_t i = expansionIndex, k = resetIndex+1, n = 0, m = 0, start = 0; |
| UBool haveChanges = FALSE; |
| Line trial, prefix, suffix; |
| // we will first try goosing up the reset index |
| //while(toSort[k]->strength >= strength) |
| for( ; toSort[k]->strength == strength; k++) |
| { |
| //if(toSort[k]->strength > strength) { |
| //continue; |
| //} |
| trial.setToConcat(toSort[k], &expLine); |
| if(probe.compare(trial, *toSort[i]) > 0) { |
| break; |
| } |
| } |
| resetIndex = k-1; |
| |
| // goose up individual characters |
| prefix = *toSort[resetIndex]; |
| for(n = 0; n < expIndexSize; n++) { |
| suffix.clear(); |
| for(m = n+1; m < expIndexSize; m++) { |
| suffix.append(*toSort[expIndexes[m]]); |
| } |
| k = expIndexes[n]+1; |
| //while(toSort[k]->strength >= strength) |
| for( ; toSort[k]->strength == strength; k++) |
| { |
| //if(toSort[k]->strength > strength) { |
| //continue; |
| //} |
| trial.setToConcat(&prefix, toSort[k]); |
| trial.append(suffix); |
| if(probe.compare(trial, *toSort[i]) > 0) { |
| break; |
| } |
| } |
| if(k > expIndexes[n]+1) { |
| haveChanges = TRUE; |
| expIndexes[n] = k-1; |
| } |
| prefix.append(*toSort[expIndexes[n]]); |
| } |
| |
| // try inserting ignorables |
| UColAttributeValue lastStr = UCOL_OFF; |
| k = 0; |
| while(toSort[k]->strengthFromEmpty > strength) { |
| k++; |
| } |
| if(toSort[k]->strengthFromEmpty == strength) { |
| start = k; |
| prefix = *toSort[resetIndex]; |
| n = 0; |
| while(n <= expIndexSize) { |
| suffix.clear(); |
| for(m = n; m < expIndexSize; m++) { |
| suffix.append(*toSort[expIndexes[m]]); |
| } |
| k = start; |
| while(toSort[k]->strengthFromEmpty == strength) { |
| trial.setToConcat(&prefix, toSort[k]); |
| trial.append(suffix); |
| lastStr = probe.getStrength(trial, *toSort[i]); |
| if(lastStr == UCOL_OFF) { // shot over - we won't find anything here |
| break; |
| } else if(lastStr > strength) { |
| for(m = expIndexSize; m > n; m--) { |
| expIndexes[m] = expIndexes[m-1]; |
| } |
| expIndexes[n] = k; |
| expIndexSize++; |
| haveChanges = TRUE; |
| break; |
| } |
| #if 0 |
| if(probe.compare(trial, *toSort[i]) > 0) { |
| // if the first one skips, that means that |
| // this position doesn't work |
| if(k > start) { |
| // insert an ignorable on position n |
| for(m = expIndexSize; m > n; m--) { |
| expIndexes[m] = expIndexes[m-1]; |
| } |
| expIndexes[n] = k-1; |
| expIndexSize++; |
| haveChanges = TRUE; |
| if(n == expIndexSize-1) { // added to the end of the string |
| UColAttributeValue str = probe.getStrength(trial, *toSort[i]); |
| int32_t putBreakHere = 0; |
| } |
| } |
| break; |
| } else { |
| lastStr = probe.getStrength(trial, *toSort[i]); |
| } |
| #endif |
| k++; |
| } |
| prefix.append(*toSort[expIndexes[n]]); |
| n++; |
| } |
| } |
| |
| if(haveChanges) { |
| expLine.clear(); |
| for(m = 0; m < expIndexSize; m++) { |
| expLine.append(*toSort[expIndexes[m]]); |
| } |
| } |
| return resetIndex; |
| } |
| |
| int32_t |
| SortedLines::detectExpansions() |
| { |
| logger->log("\n*** Detecting expansions\n\n"); |
| int32_t exCount = 0; |
| int32_t i = 0, j = 0, k = 0, prevK = 0; |
| Line *previous, trial, expansionLine; |
| UBool foundExp = FALSE, sequenceCompleted = FALSE; |
| UColAttributeValue strength = UCOL_OFF; |
| UColAttributeValue maxStrength = UCOL_IDENTICAL; |
| UColAttributeValue expStrength = UCOL_OFF; |
| int32_t expIndexes[256]; |
| int32_t expIndexSize = 0; |
| memset(expIndexes, 0, sizeof(expIndexes)); |
| |
| // for each element, we look back to find whether there is such a q for which |
| // q <n x < qUBn. These are possible expansions. When going backwards we skip |
| // over already detected expansions. |
| i = 0; |
| // it turns out that looking at accents as possible expansions is |
| // quite a stupid thing to do, especially on non ICU platforms. |
| // Previously this line skipped over identicals only, but |
| // now we are going to skip all the way to non-ignorables. |
| while(toSort[i]->strengthFromEmpty > UCOL_PRIMARY) { |
| i++; |
| } |
| i++; |
| for( ; i < size; i++) { |
| if(toSort[i]->name[0]==0x0063 && toSort[i]->name[1] == 0x68) // && toSort[i]->name[1] == 0x308)0043 0043 0219 |
| { |
| int32_t putBreakpointhere = 0; |
| } |
| foundExp = FALSE; |
| sequenceCompleted = FALSE; |
| strength = toSort[i]->strength; |
| if(strength == UCOL_IDENTICAL && toSort[i-1]->isExpansion == TRUE) { |
| u_strcpy(toSort[i]->expansionString, toSort[i-1]->expansionString); |
| toSort[i]->expLen = toSort[i-1]->expLen; |
| toSort[i]->isExpansion = TRUE; |
| toSort[i]->expIndex = toSort[i-1]->expIndex; |
| toSort[i]->expStrength = UCOL_IDENTICAL; |
| //toSort[i]->expStrength = toSort[i-1]->expStrength; |
| foundExp = TRUE; |
| sequenceCompleted = TRUE; |
| } |
| //logger->log("%i %i\n", i, j); |
| while(!foundExp && strength <= maxStrength) { |
| j = i-1; |
| while(j && (toSort[j]->isExpansion == TRUE || toSort[j]->isRemoved == TRUE)) { |
| //if(toSort[j]->strength < strength) { |
| //strength = toSort[j]->strength; |
| //} |
| j--; |
| } |
| |
| //while(j && toSort[j]->strength > strength) |
| while(j && toSort[j]->strength > probe.getStrength(*toSort[j], *toSort[i])) |
| { |
| j--; |
| } |
| //if(toSort[j]->strength == strength) { |
| previous = toSort[j]; |
| if(previous->strengthFromEmpty >= UCOL_IDENTICAL || |
| (previous->strengthFromEmpty == UCOL_SECONDARY |
| && strength == UCOL_SECONDARY |
| && previous->lastCC > UB[strength]->firstCC)) { |
| break; |
| //continue; |
| } |
| //trial.setToConcat(previous, UB[strength]); |
| trial.setToConcat(previous, UB[probe.getStrength(*toSort[j], *toSort[i])]); |
| if(probe.compare(trial, *toSort[i]) > 0) { |
| foundExp = TRUE; |
| } |
| //} |
| if(strength == UCOL_QUATERNARY) { |
| strength = UCOL_IDENTICAL; |
| } else { |
| strength = (UColAttributeValue)(strength + 1); |
| } |
| } |
| // calculate the expanding sequence |
| if(foundExp && !sequenceCompleted) { |
| expIndexSize = 0; |
| expansionLine.clear(); |
| exCount++; |
| // we will start from strength between the expansion |
| // and the target (toSort[i] and toSort[j]. First we |
| // will add as many primaries as possible. Then we will |
| // try to add secondary pieces and then tertiary. |
| // found an expansion - what is the expanding sequence? |
| |
| expStrength = UCOL_PRIMARY; |
| while(!sequenceCompleted) { |
| k = 0; |
| prevK = 0; |
| while(k < size) { |
| if(expansionLine.len > 15) { |
| sequenceCompleted = TRUE; |
| break; |
| } |
| while(k < size && toSort[k]->strength != UCOL_PRIMARY) { |
| k++; |
| } |
| // nothing found |
| if(k == size) { |
| break; |
| } |
| // we need to skip over reordering things. If they were worthy, they would |
| // have been detected in the previous iteration. |
| //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) { |
| //k++; |
| //continue; |
| //} |
| trial = *previous; |
| trial.append(expansionLine); |
| trial.append(*toSort[k]); |
| if(toSort[k]->name[0] == 0x0067) { |
| int32_t putBreakPointHere = 0; |
| } |
| if(probe.compare(trial, *toSort[i]) > 0) { |
| if(prevK) { |
| // we got into situation where we have ch > ch+dot-below |
| // however, ch is a contraction and therefore we cannot use |
| // it properly. If we have hit on a contraction, we'll just try |
| // to continue. Probably need more logic here. |
| if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) { |
| expansionLine.append(*toSort[prevK]); |
| expIndexes[expIndexSize++] = prevK; |
| break; |
| } else { |
| int32_t putBreakPointHere = 0; |
| } |
| } else { |
| sequenceCompleted = TRUE; |
| break; |
| } |
| //break; |
| } |
| prevK = k; |
| k++; |
| } |
| if(!prevK || k == size) { |
| break; |
| } |
| } |
| // after this we have primaries lined up. |
| // we are going to goose up with secondaries and |
| // tertiaries |
| trial.setToConcat(toSort[j], &expansionLine); |
| expStrength = probe.getStrength(trial, *toSort[i]); |
| if(expStrength > UCOL_PRIMARY) { |
| if(expStrength == UCOL_SECONDARY || expStrength == UCOL_OFF) { |
| j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_SECONDARY); |
| trial.setToConcat(toSort[j], &expansionLine); |
| expStrength = probe.getStrength(trial, *toSort[i]); |
| if(expStrength == UCOL_TERTIARY) { |
| j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY); |
| } |
| } else if(expStrength == UCOL_TERTIARY) { |
| j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY); |
| } |
| } |
| trial.setToConcat(toSort[j], &expansionLine); |
| expStrength = probe.getStrength(trial, *toSort[i]); |
| if(expansionLine.len) { |
| if(expansionLine.name[0] == 0x73 && expansionLine.name[1] == 0x7a) { |
| int32_t putBreakpointhere = 0; |
| } |
| UBool isExpansionLineAContraction = (contractionsTable->get(UnicodeString(expansionLine.name, expansionLine.len)) != NULL); |
| // we have an expansion line and an expansion. There could be some expansions where |
| // the difference between expansion line and the end of expansion sequence is less or |
| // equal than the expansion strength. These should probably be removed. |
| int32_t diffLen = toSort[i]->len - expansionLine.len; |
| if(diffLen > 0) { |
| trial.setTo(UnicodeString(toSort[i]->name + diffLen, toSort[i]->len - diffLen)); |
| } else { |
| trial = *toSort[i]; |
| } |
| UColAttributeValue s1 = probe.getStrength(trial, expansionLine); |
| if(s1 == UCOL_OFF) { |
| s1 = probe.getStrength(expansionLine, trial); |
| } |
| if((!isExpansionLineAContraction && s1 >= expStrength) || (diffLen <= 0 && s1 == UCOL_IDENTICAL)) { |
| contractionsTable->remove(UnicodeString(toSort[i]->name, toSort[i]->len)); |
| toSort[i]->isRemoved = TRUE; |
| if(toSort[i]->next && toSort[i]->previous) { |
| toSort[i]->previous->next = toSort[i]->next; |
| } |
| if(toSort[i]->previous && toSort[i]->next) { |
| toSort[i]->next->previous = toSort[i]->previous; |
| } |
| debug->log("Exp -N: "); |
| debug->log(toSort[i]->toString(FALSE)); |
| debug->log(" / "); |
| debug->log(expansionLine.toString(FALSE), TRUE); |
| } |
| else |
| { |
| u_strncat(toSort[i]->expansionString, expansionLine.name, expansionLine.len); |
| toSort[i]->isExpansion = TRUE; |
| toSort[i]->expStrength = expStrength; |
| toSort[i]->expLen = expansionLine.len; |
| toSort[i]->expansionString[toSort[i]->expLen] = 0; |
| toSort[i]->expIndex = j; |
| } |
| } |
| } |
| if(toSort[i]->isExpansion == TRUE) { |
| if(debug->isOn()) { |
| debug->log("Exp + : &"); |
| debug->log(toSort[j]->toString(FALSE)); |
| debug->log(toSort[i]->strengthToString(toSort[i]->expStrength, TRUE)); |
| debug->log(toSort[i]->toString(FALSE)); |
| debug->log(" "); |
| if(!toSort[j]->sortKey) { |
| calculateSortKey(*toSort[j]); |
| } |
| debug->log(toSort[j]->dumpSortkey()); |
| debug->log(" ... "); |
| if(!toSort[i]->sortKey) { |
| calculateSortKey(*toSort[i]); |
| } |
| debug->log(toSort[i]->dumpSortkey()); |
| calculateSortKey(expansionLine); |
| debug->log("/"); |
| debug->log(expansionLine.dumpSortkey(), TRUE); |
| } |
| |
| } |
| } |
| // after detecting expansions, we want to position them. |
| // it is better to position expansions after all have been detected, |
| // since otherwise we will change the ordering. |
| for(i = size-1; i >= 0; i--) { |
| if(toSort[i]->isExpansion) { |
| if(toSort[i]->name[0] == 0x2A3) { |
| int32_t putBreakPointHere = 0; |
| } |
| if(i) { |
| if(toSort[i]->previous) { |
| toSort[i]->previous->next = toSort[i]->next; |
| } |
| } |
| if(i < size-1) { |
| if(toSort[i]->next) { |
| toSort[i]->next->previous = toSort[i]->previous; |
| } |
| } |
| j = toSort[i]->expIndex; |
| toSort[i]->next = toSort[j]->next; |
| toSort[i]->previous = toSort[j]; |
| toSort[j]->next = toSort[i]; |
| if(toSort[i]->next) { |
| toSort[i]->next->previous = toSort[i]; |
| } |
| toSort[i]->strength = toSort[i]->expStrength; |
| } |
| } |
| return exCount; |
| } |
| |
| |
| Line * |
| SortedLines::getFirst() { |
| current = first; |
| return current; |
| } |
| |
| Line * |
| SortedLines::getLast() { |
| current = last; |
| return current; |
| } |
| |
| void |
| SortedLines::add(Line *line, UBool linkIn) { |
| if(size++ == capacity) { |
| // grow |
| } |
| lines[size] = *line; |
| Line *toAdd = &lines[size]; |
| if(linkIn && first) { |
| Line *current = first; |
| while(current != NULL && probe.comparer(¤t, &toAdd) < 0) { |
| current = current->next; |
| } |
| if(current == NULL) { |
| toAdd->previous = last; |
| toAdd->next = NULL; |
| if(last != NULL) { |
| last->next = toAdd; |
| } |
| last = toAdd; |
| if(first == NULL) { |
| first = toAdd; |
| } |
| } else { // current != NULL |
| toAdd->next = current; |
| toAdd->previous = current->previous; |
| if(current->previous) { |
| current->previous->next = toAdd; |
| } else { |
| first = toAdd; |
| } |
| current->previous = toAdd; |
| } |
| } |
| } |
| |
| |
| Line * |
| SortedLines::getNext() |
| { |
| if(current != NULL) { |
| current=current->next; |
| } |
| return current; |
| } |
| |
| Line * |
| SortedLines::getPrevious() |
| { |
| if(current != NULL) { |
| current=current->previous; |
| } |
| return current; |
| } |
| |
| Line * |
| SortedLines::operator[](int32_t index) |
| { |
| int32_t i = 0; |
| Line *c = first; |
| for(i = 0; i < index; i++) { |
| if(c != NULL) { |
| c = c->next; |
| } |
| } |
| return c; |
| } |
| |
| UnicodeString |
| SortedLines::arrayToString(Line** sortedLines, int32_t linesSize, UBool pretty, UBool useLinks, UBool printSortKeys) { |
| UnicodeString result; |
| int32_t i = 0; |
| |
| Line *line = NULL; |
| Line *previous = sortedLines[0]; |
| if(printSortKeys && !sortkeys) { |
| printSortKeys = FALSE; |
| } |
| if(previous->isReset) { |
| result.append(" & "); |
| result.append(previous->name, previous->len); |
| if(pretty) { |
| result.append(" # "); |
| result.append(previous->stringToName(previous->name, previous->len)); |
| result.append("\n"); |
| } |
| } else if(!previous->isRemoved) { |
| result.append(previous->toString(pretty)); |
| if(pretty) { |
| result.append("\n"); |
| } |
| } |
| i = 1; |
| while((i < linesSize && !useLinks) || (previous->next && useLinks)) { |
| if(useLinks) { |
| line = previous->next; |
| } else { |
| line = sortedLines[i]; |
| } |
| if(line->isReset) { |
| result.append(" &"); |
| result.append(line->name, line->len); |
| if(pretty) { |
| result.append(" # "); |
| result.append(line->stringToName(line->name, line->len)); |
| result.append("\n"); |
| } |
| } else if(!line->isRemoved) { |
| if(i > 0) { |
| result.append(line->strengthToString(line->strength, pretty)); |
| } |
| result.append(line->toString(pretty)); |
| if(printSortKeys) { |
| result.append(line->dumpSortkey()); |
| } |
| if(pretty) { |
| result.append("\n"); |
| } |
| } |
| previous = line; |
| i++; |
| } |
| return result; |
| } |
| |
| SortedLines::SortedLines(FILE *file, UPrinter *logger, UPrinter *debug, UErrorCode &status) : |
| toSort(NULL), |
| toSortCapacity(0), |
| lines(NULL), |
| size(0), |
| capacity(0), |
| first(NULL), |
| last(NULL), |
| logger(logger), |
| debug(debug), |
| contractionsTable(NULL), |
| duplicators(NULL), |
| maxExpansionPrefixSize(0), |
| wordSort(FALSE), |
| frenchSecondary(FALSE), |
| upperFirst(FALSE), |
| sortkeys(NULL), |
| sortkeyOffset(0) |
| { |
| debug->log("*** loading a dump\n"); |
| memset(UB, 0, sizeof(UB)); |
| int32_t i = 0; |
| for(i = 0; i < UCOL_OFF; i++) { |
| UB[i] = ∅ |
| } |
| |
| int32_t newFrench, newUpperFirst; |
| fscanf(file, "%i,%i,%i\n", &size, &newFrench, &newUpperFirst); |
| debug->log("Read size %i, frenchSecondary %i and upperFirst %i\n", size, newFrench, newUpperFirst); |
| frenchSecondary = (UBool)newFrench; |
| upperFirst = (UBool)newUpperFirst; |
| capacity = size; |
| lines = new Line[capacity]; |
| i = 0; |
| |
| char buff[256]; |
| |
| while(fgets(buff, 256, file)) { |
| if(i % 20 == 0) { |
| logger->log("\rLine: %04i", i, buff); |
| } |
| lines[i].initFromString(buff, 256, status); |
| if(i) { |
| lines[i].previous = &lines[i-1]; |
| lines[i-1].next = &lines[i]; |
| } |
| i++; |
| } |
| size = i; |
| toSort = new Line*[size]; |
| setSortingArray(toSort, lines, size); |
| first = &lines[0]; |
| last = &lines[size-1]; |
| } |
| |
| void |
| SortedLines::toFile(FILE *file, UBool useLinks, UErrorCode &status) |
| { |
| fprintf(file, "%i,%i,%i\n", size, frenchSecondary, upperFirst); |
| int32_t i = 1; |
| Line *previous = toSort[0]; |
| Line *line = NULL; |
| char buff[256]; |
| previous->write(buff, 256, status); |
| fprintf(file, "%s\n", buff); |
| fflush(file); |
| while(previous->next) { |
| if(useLinks) { |
| line = previous->next; |
| } else { |
| line = toSort[i]; |
| } |
| line->write(buff, 256, status); |
| fprintf(file, "%s\n", buff); |
| i++; |
| previous = line; |
| } |
| } |
| |
| |
| |
| UnicodeString |
| SortedLines::toStringFromEmpty() { |
| UBool useLinks = FALSE; |
| UBool pretty = FALSE; |
| UnicodeString result; |
| int32_t i = 0; |
| |
| Line *line = NULL; |
| Line *previous = toSort[0]; |
| if(previous->isReset) { |
| result.append(" & "); |
| if(pretty) { |
| result.append("\n"); |
| } |
| result.append(previous->name, previous->len); |
| } else if(!previous->isRemoved) { |
| result.append(previous->toString(pretty)); |
| if(pretty) { |
| result.append("\n"); |
| } |
| } |
| i = 1; |
| while(i < size || previous->next) { |
| if(useLinks) { |
| line = previous->next; |
| } else { |
| line = toSort[i]; |
| } |
| if(line->isReset) { |
| result.append(" &"); |
| result.append(line->name, line->len); |
| if(pretty) { |
| result.append(" # "); |
| result.append(line->stringToName(line->name, line->len)); |
| result.append("\n"); |
| } |
| } else if(!line->isRemoved) { |
| if(i > 0) { |
| result.append(line->strengthToString(line->strengthFromEmpty, pretty)); |
| } |
| result.append(line->toString(pretty)); |
| if(pretty) { |
| result.append("\n"); |
| } |
| } |
| previous = line; |
| i++; |
| } |
| return result; |
| } |
| |
| UnicodeString |
| SortedLines::toString(UBool useLinks) |
| { |
| return arrayToString(toSort, size, FALSE, useLinks, FALSE); |
| } |
| |
| |
| UnicodeString |
| SortedLines::toPrettyString(UBool useLinks, UBool printSortKeys) |
| { |
| return arrayToString(toSort, size, TRUE, useLinks, printSortKeys); |
| } |
| |
| UnicodeString |
| SortedLines::toOutput(const char *format, |
| const char *locale, const char *platform, const char *reference, |
| UBool useLinks, UBool initialize, UBool moreToCome) { |
| if(strcmp(format, "HTML") == 0) { |
| return toHTML(locale, platform, reference, useLinks, initialize, moreToCome); |
| } else if(strcmp(format, "XML") == 0) { |
| return toXML(locale, platform, reference, useLinks, initialize, moreToCome); |
| } else { |
| return toBundle(locale, platform, reference, useLinks, initialize, moreToCome); |
| } |
| } |
| |
| |
| UnicodeString |
| SortedLines::toHTML(const char *locale, |
| const char *platform, const char *reference, |
| UBool useLinks, UBool initialize, UBool moreToCome) |
| { |
| UnicodeString result; |
| int32_t i = 0; |
| if(initialize) { |
| result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n"); |
| result.append("# Collation data resource bundle generated for locale: "); |
| result.append(locale); |
| result.append("<br>\n# For platform "); |
| result.append(platform); |
| result.append(" reference platform "); |
| result.append(reference); |
| result.append("<br><br>\n\n\n"); |
| |
| result.append(locale); |
| if(platform) { |
| result.append("_"); |
| result.append(platform); |
| } |
| if(reference) { |
| result.append("_vs_"); |
| result.append(reference); |
| } |
| result.append(" {<br>\n"); |
| |
| result.append(" collations {<br>\n standard {<br>\n Sequence {<br>\n"); |
| } |
| |
| if(frenchSecondary) { |
| result.append("[backwards 2]<br>\n"); |
| } |
| if(upperFirst) { |
| result.append("[casefirst upper]<br>\n"); |
| } |
| |
| Line *line = toSort[0]; |
| |
| i = 0; |
| while((i < size && !useLinks) || (line->next && useLinks)) { |
| if(line->isReset || !line->isRemoved) { |
| result.append(line->toHTMLString()); |
| } |
| i++; |
| if(useLinks) { |
| line = line->next; |
| } else { |
| line = toSort[i]; |
| } |
| } |
| if(!moreToCome) { |
| result.append(" }<br>\n }<br>\n }<br>\n}<br>\n"); |
| |
| result.append("</html>\n"); |
| } |
| |
| return result; |
| } |
| |
| UnicodeString |
| SortedLines::toXML(const char *locale, |
| const char *platform, const char *reference, |
| UBool useLinks, UBool initialize, UBool moreToCome) |
| { |
| UnicodeString result; |
| int32_t i = 0; |
| if(initialize) { |
| result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n"); |
| result.append("# Collation data resource bundle generated for locale: "); |
| result.append(locale); |
| result.append("<br>\n# For platform "); |
| result.append(platform); |
| result.append(" reference platform "); |
| result.append(reference); |
| result.append("<br><br>\n\n\n"); |
| |
| result.append(locale); |
| if(platform) { |
| result.append("_"); |
| result.append(platform); |
| } |
| if(reference) { |
| result.append("_vs_"); |
| result.append(reference); |
| } |
| result.append(" {<br>\n"); |
| |
| result.append(" collations {<br>\n standard {<br>\n Sequence {<br>\n"); |
| } |
| |
| if(frenchSecondary) { |
| result.append("[backwards 2]<br>\n"); |
| } |
| if(upperFirst) { |
| result.append("[casefirst upper]<br>\n"); |
| } |
| |
| Line *line = toSort[0]; |
| |
| i = 0; |
| while((i < size && !useLinks) || (line->next && useLinks)) { |
| if(line->isReset || !line->isRemoved) { |
| result.append(line->toHTMLString()); |
| } |
| i++; |
| if(useLinks) { |
| line = line->next; |
| } else { |
| line = toSort[i]; |
| } |
| } |
| if(!moreToCome) { |
| result.append(" }<br>\n }<br>\n }<br>\n}<br>\n"); |
| |
| result.append("</html>\n"); |
| } |
| |
| return result; |
| } |
| |
| UnicodeString |
| SortedLines::toBundle(const char *locale, |
| const char *platform, const char *reference, |
| UBool useLinks, UBool initialize, UBool moreToCome) |
| { |
| UnicodeString result; |
| int32_t i = 0; |
| |
| if(initialize) { |
| result.append("// Collation data resource bundle generated for locale: "); |
| result.append(locale); |
| result.append("\n// For platform "); |
| result.append(platform); |
| result.append(" reference platform "); |
| result.append(reference); |
| result.append("\n\n\n"); |
| |
| result.append(locale); |
| /* |
| if(platform) { |
| result.append("_"); |
| result.append(platform); |
| } |
| if(reference) { |
| result.append("_vs_"); |
| result.append(reference); |
| } |
| */ |
| result.append(" {\n"); |
| |
| result.append(" collations {\n standard {\n Sequence {\n"); |
| } |
| |
| if(frenchSecondary) { |
| result.append("[backwards 2]\n"); |
| } |
| if(upperFirst) { |
| result.append("[casefirst upper]\n"); |
| } |
| |
| Line *line = toSort[0]; |
| |
| i = 0; |
| while((i < size && !useLinks) || (line->next && useLinks)) { |
| if(line->isReset || !line->isRemoved) { |
| result.append(line->toBundleString()); |
| } |
| i++; |
| if(useLinks) { |
| line = line->next; |
| } else { |
| line = toSort[i]; |
| } |
| } |
| |
| if(!moreToCome) { |
| result.append(" }\n }\n }\n}\n"); |
| } |
| |
| return result; |
| } |
| |
| |
| int32_t |
| SortedLines::getSize() const { |
| return repertoire.size(); |
| } |
| |
| void |
| SortedLines::reduceDifference(SortedLines& reference) { |
| UErrorCode status = U_ZERO_ERROR; |
| if(upperFirst) { |
| swapCase(); |
| } |
| // both sorted lines structures need to have established links and strengths |
| // We walk down both structures and note differences. These |
| // differences will modify this by removng elements, setting resets |
| // etc... |
| // we will prefer insertions from tailoring to reference, then deletions |
| // there are two tables that keep seen elements. |
| Hashtable *seenThis = new Hashtable(); |
| Hashtable *seenReference = new Hashtable(); |
| |
| |
| UBool found = FALSE; |
| UBool finished = FALSE; |
| const int32_t lookForward = 20; |
| int32_t tailoringMove = 0; |
| //int32_t referenceSize = reference.getSize(); |
| Line *refLine = reference.getFirst(); |
| Line *refLatestEqual = refLine; |
| refLine = refLine->next; |
| Line *myLine = getFirst(); |
| Line *myLatestEqual = myLine; |
| myLatestEqual->isRemoved = TRUE; |
| myLine = myLine->next; |
| while(myLine && refLine) { |
| found = FALSE; |
| while(myLine && refLine && myLine->equals(*refLine)) { |
| myLatestEqual = myLine; |
| myLatestEqual->isRemoved = TRUE; |
| myLine = myLine->next; |
| refLatestEqual = refLine; |
| refLine = refLine->next; |
| if(refLine == NULL && myLine == NULL) { |
| finished = TRUE; |
| } |
| } |
| if(myLine) { |
| myLine->cumulativeStrength = myLine->strength; |
| } |
| if(refLine) { |
| refLine->cumulativeStrength = refLine->strength; |
| } |
| |
| // here is the difference |
| while(!found && !finished) { |
| tailoringMove = 0; |
| if(myLine && refLine) { |
| if(myLine->cumulativeStrength > refLine->cumulativeStrength) { |
| // tailoring z <<< x, UCA z < y |
| while(myLine->cumulativeStrength > refLine->cumulativeStrength) { |
| myLine = myLine->next; |
| if(myLine) { |
| transferCumulativeStrength(myLine->previous, myLine); |
| } else { |
| break; |
| } |
| } |
| } else if(myLine->cumulativeStrength < refLine->cumulativeStrength) { |
| // tailoring z < x, UCA z <<< y |
| while(myLine->cumulativeStrength < refLine->cumulativeStrength) { |
| seenReference->put(UnicodeString(refLine->name, refLine->len), refLine, status); |
| refLine = refLine->next; |
| if(refLine) { |
| transferCumulativeStrength(refLine->previous, refLine); |
| } else { |
| break; |
| } |
| } |
| } |
| // this is the interesting point. Now we search for character match |
| while(myLine && refLine && (!myLine->equals(*refLine) || myLine->strength == UCOL_IDENTICAL) |
| && tailoringMove < lookForward) { |
| if(seenThis->get(UnicodeString(refLine->name, refLine->len))) { |
| // we are not interested in stuff from the reference that is already accounted |
| // for in the tailoring. |
| refLine = refLine->next; |
| if(refLine) { |
| transferCumulativeStrength(refLine->previous, refLine); |
| } |
| } else { |
| myLine = myLine->next; |
| if(myLine) { |
| transferCumulativeStrength(myLine->previous, myLine); |
| if(!seenReference->get(UnicodeString(myLine->name, myLine->len))) { |
| tailoringMove++; |
| } |
| } |
| } |
| } |
| } |
| if(refLine == NULL) { // ran out of reference |
| // this is the tail of tailoring - the last insertion |
| myLine = NULL; |
| found = TRUE; |
| } else if(tailoringMove == lookForward || myLine == NULL) { // run over treshold or out of tailoring |
| tailoringMove = 0; |
| // we didn't find insertion after all |
| // we will try substitution next |
| // reset the tailoring pointer |
| myLine = myLatestEqual->next; |
| // move the reference |
| refLine = refLine->next; |
| if(refLine) { |
| transferCumulativeStrength(refLine->previous, refLine); |
| } |
| } else { // we found an insertion |
| tailoringMove = 0; |
| if(myLine->strength != refLine->strength) { |
| while(myLine && refLine && *myLine == *refLine |
| && (myLine->strength != refLine->strength |
| || myLine->strength == UCOL_IDENTICAL)) { |
| myLine = myLine->next; |
| refLine = refLine->next; |
| } |
| if(*myLine != *refLine) { |
| continue; |
| } |
| } |
| if(myLine && refLine && myLine->previous->strength < myLine->strength) { |
| myLine = myLine->next; |
| refLine = refLine->next; |
| if(*myLine != *refLine) { |
| continue; |
| } |
| } |
| found = TRUE; |
| } |
| if(found) { |
| if(myLatestEqual->next != myLine || refLine == NULL) { |
| Line *myStart = NULL; |
| // this is a reset and a sequence |
| // myLatestEqual points at the last point that was the same |
| // This point will be a reset |
| if(myLine && refLine) { // if there is anything more to do - it might be worth saving it |
| myStart = myLatestEqual; |
| while(myStart != myLine) { |
| seenThis->put(UnicodeString(myStart->name, myStart->len), myStart, status); |
| myStart = myStart->next; |
| } |
| } |
| // Try to weed out stuff that is not affected, like: |
| // Tailoring: |
| // <<<S<<\u017F<\u0161<<<\u0160<t |
| // UCA: |
| // <<<S<<\u0161<<<\u0160<<\u017F<t |
| // Result: |
| // &S<<\u017F<\u0161<<<\u0160 |
| // we have a sequence that spans from myLatestEqual to myLine (that one could be NULL, |
| // so we have to go down from myLatestEqual. |
| // Basically, for every element, we want to see the strongest cumulative difference |
| // from the reset point. If the cumulative difference is the same in both the reference and |
| // tailoring, that element could be removed. |
| calculateCumulativeStrengths(myLatestEqual, myLine); |
| calculateCumulativeStrengths(refLatestEqual, refLine); |
| myStart = myLatestEqual; |
| int32_t removed = 0; |
| int32_t traversed = 0; |
| while(myStart && myStart != myLine) { |
| Line *refStart = refLatestEqual; |
| while(refStart && refStart != refLine) { |
| if(*myStart == *refStart) { |
| if(myStart->cumulativeStrength == refStart->cumulativeStrength) { |
| myStart->isRemoved = TRUE; |
| removed++; |
| } |
| } |
| refStart = refStart->next; |
| } |
| myStart = myStart->next; |
| traversed++; |
| } |
| if(removed < traversed) { |
| myLatestEqual->isReset = TRUE; |
| myLatestEqual->isRemoved = FALSE; |
| } |
| |
| myLatestEqual = myLine; |
| } |
| } |
| } |
| } |
| |
| if(upperFirst) { |
| //swapCase(); |
| } |
| |
| delete seenThis; |
| delete seenReference; |
| |
| } |
| |
| void |
| SortedLines::transferCumulativeStrength(Line *previous, Line *that) { |
| if(that->strength > previous->cumulativeStrength) { |
| that->cumulativeStrength = previous->cumulativeStrength; |
| } else { |
| that->cumulativeStrength = that->strength; |
| } |
| } |
| |
| void |
| SortedLines::calculateCumulativeStrengths(Line *start, Line *end) { |
| // start is a reset - end may be NULL |
| start = start->next; |
| UColAttributeValue cumulativeStrength = UCOL_OFF; |
| while(start && start != end) { |
| if(start->strength < cumulativeStrength) { |
| cumulativeStrength = start->strength; |
| } |
| start->cumulativeStrength = cumulativeStrength; |
| start = start->next; |
| } |
| } |
| |
| |
| void |
| SortedLines::getRepertoire(UnicodeSet &fillIn) { |
| fillIn.clear(); |
| fillIn.addAll(repertoire); |
| } |
| |
| |
| void |
| SortedLines::removeDecompositionsFromRepertoire() { |
| UnicodeSetIterator repertoireIter(repertoire); |
| UErrorCode status = U_ZERO_ERROR; |
| UChar string[256]; |
| UChar composed[256]; |
| int32_t len = 0, compLen = 0; |
| UnicodeString compString; |
| UnicodeSet toRemove; |
| |
| while(repertoireIter.next()) { |
| len = 0; |
| if(repertoireIter.isString()) { // process a string |
| len = repertoireIter.getString().length(); |
| u_memcpy(string, repertoireIter.getString().getBuffer(), len); |
| } else { // process code point |
| UBool isError = FALSE; |
| U16_APPEND(string, len, 25, repertoireIter.getCodepoint(), isError); |
| } |
| string[len] = 0; // zero terminate, for our evil ways |
| compLen = unorm_normalize(string, len, UNORM_NFC, 0, composed, 256, &status); |
| if(compLen != len || u_strcmp(string, composed) != 0) { |
| compString.setTo(composed, compLen); |
| if(repertoire.contains(compString)) { |
| toRemove.add(UnicodeString(string, len)); |
| } |
| } |
| } |
| debug->log("\nRemoving\n"); |
| debug->log(toRemove.toPattern(compString, TRUE), TRUE); |
| repertoire.removeAll(toRemove); |
| } |
| |
| |
| void |
| SortedLines::swapCase() |
| { |
| int32_t i = 0; |
| for(i = 0; i < size; i++) { |
| toSort[i]->swapCase(); |
| } |
| } |
| |
| void |
| SortedLines::calculateSortKey(Line &line) |
| { |
| if(!sortkeys) { |
| sortkeys = new uint8_t[size*1024]; |
| memset(sortkeys, 0, size*1024); |
| } |
| line.sortKey = sortkeys+sortkeyOffset; |
| sortkeyOffset += probe.getSortKey(line, sortkeys+sortkeyOffset, size*256-sortkeyOffset); |
| } |
| |
| |
| void |
| SortedLines::calculateSortKeys() |
| { |
| if(sortkeys) { |
| delete[] sortkeys; |
| } |
| sortkeyOffset = 0; |
| sortkeys = new uint8_t[size*256]; |
| memset(sortkeys, 0, size*256); |
| int32_t i = 0; |
| for(i = 0; i < size; i++) { |
| calculateSortKey(*toSort[i]); |
| } |
| } |