| /* |
| ********************************************************************** |
| * Copyright (C) 2005-2008, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| |
| |
| #include "unicode/utypes.h" |
| #include "unicode/unistr.h" |
| #include "unicode/putil.h" |
| #include "unicode/usearch.h" |
| |
| #include "cmemory.h" |
| #include "unicode/coll.h" |
| #include "unicode/tblcoll.h" |
| #include "unicode/coleitr.h" |
| #include "unicode/ucoleitr.h" |
| |
| #include "unicode/regex.h" // TODO: make conditional on regexp being built. |
| |
| #include "unicode/uniset.h" |
| #include "unicode/uset.h" |
| #include "unicode/ustring.h" |
| #include "hash.h" |
| #include "uhash.h" |
| #include "ucol_imp.h" |
| |
| #include "intltest.h" |
| #include "ssearch.h" |
| |
| #include "xmlparser.h" |
| |
| #include <stdlib.h> |
| #include <string.h> |
| #include <stdio.h> |
| |
// Identifier of the test case currently being processed. The assertion
// macros below include it in their failure messages.
char testId[100];

// NOTE: the assertion macros deliberately expand to a braced block rather
// than the usual do { } while (0); at least one call site in this file
// invokes TEST_ASSERT without a trailing semicolon.

// Reports a failure (file, line, current test ID) if x is false.
// Execution continues in the calling function afterwards.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Reports a failure with the message m if x is false and then returns from
// the calling function, which must therefore return void.
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}

// Reports a failure (file, line, current test ID, error name) if errcode is
// a failing UErrorCode. Execution continues in the calling function.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    errln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
          __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that expressions other than a plain identifier expand
// correctly.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
| |
| //--------------------------------------------------------------------------- |
| // |
| // Test class boilerplate |
| // |
| //--------------------------------------------------------------------------- |
| SSearchTest::SSearchTest() |
| { |
| } |
| |
| SSearchTest::~SSearchTest() |
| { |
| } |
| |
| void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params ) |
| { |
| if (exec) logln("TestSuite SSearchTest: "); |
| switch (index) { |
| case 0: name = "searchTest"; |
| if (exec) searchTest(); |
| break; |
| |
| case 1: name = "offsetTest"; |
| if (exec) offsetTest(); |
| break; |
| |
| case 2: name = "monkeyTest"; |
| if (exec) monkeyTest(params); |
| break; |
| |
| default: name = ""; |
| break; //needed to end loop |
| } |
| } |
| |
| |
| |
| #define PATH_BUFFER_SIZE 2048 |
| const char *SSearchTest::getPath(char buffer[2048], const char *filename) { |
| UErrorCode status = U_ZERO_ERROR; |
| const char *testDataDirectory = IntlTest::getSourceTestData(status); |
| |
| if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) { |
| errln("ERROR: getPath() failed - %s", u_errorName(status)); |
| return NULL; |
| } |
| |
| strcpy(buffer, testDataDirectory); |
| strcat(buffer, filename); |
| return buffer; |
| } |
| |
| |
| void SSearchTest::searchTest() |
| { |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| UErrorCode status = U_ZERO_ERROR; |
| char path[PATH_BUFFER_SIZE]; |
| const char *testFilePath = getPath(path, "ssearch.xml"); |
| |
| if (testFilePath == NULL) { |
| return; /* Couldn't get path: error message already output. */ |
| } |
| |
| UXMLParser *parser = UXMLParser::createParser(status); |
| TEST_ASSERT_SUCCESS(status); |
| UXMLElement *root = parser->parseFile(testFilePath, status); |
| TEST_ASSERT_SUCCESS(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| const UnicodeString *debugTestCase = root->getAttribute("debug"); |
| if (debugTestCase != NULL) { |
| // setenv("USEARCH_DEBUG", "1", 1); |
| } |
| |
| |
| const UXMLElement *testCase; |
| int32_t tc = 0; |
| |
| while((testCase = root->nextChildElement(tc)) != NULL) { |
| |
| if (testCase->getTagName().compare("test-case") != 0) { |
| errln("ssearch, unrecognized XML Element in test file"); |
| continue; |
| } |
| const UnicodeString *id = testCase->getAttribute("id"); |
| *testId = 0; |
| if (id != NULL) { |
| id->extract(0, id->length(), testId, sizeof(testId), US_INV); |
| } |
| |
| // If debugging test case has been specified and this is not it, skip to next. |
| if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { |
| continue; |
| } |
| // |
| // Get the requested collation strength. |
| // Default is tertiary if the XML attribute is missing from the test case. |
| // |
| const UnicodeString *strength = testCase->getAttribute("strength"); |
| UColAttributeValue collatorStrength; |
| if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} |
| else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} |
| else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} |
| else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} |
| else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} |
| else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} |
| else { |
| // Bogus value supplied for strength. Shouldn't happen, even from |
| // typos, if the XML source has been validated. |
| // This assert is a little deceiving in that strength can be |
| // any of the allowed values, not just TERTIARY, but it will |
| // do the job of getting the error output. |
| TEST_ASSERT(*strength=="TERTIARY") |
| } |
| |
| // |
| // Get the collator normalization flag. Default is UCOL_OFF. |
| // |
| UColAttributeValue normalize = UCOL_OFF; |
| const UnicodeString *norm = testCase->getAttribute("norm"); |
| TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); |
| if (norm!=NULL && *norm=="ON") { |
| normalize = UCOL_ON; |
| } |
| |
| const UnicodeString defLocale("en"); |
| char clocale[100]; |
| const UnicodeString *locale = testCase->getAttribute("locale"); |
| if (locale == NULL || locale->length()==0) { |
| locale = &defLocale; |
| }; |
| locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); |
| |
| |
| UnicodeString text; |
| UnicodeString target; |
| UnicodeString pattern; |
| int32_t expectedMatchStart = -1; |
| int32_t expectedMatchLimit = -1; |
| const UXMLElement *n; |
| int nodeCount = 0; |
| |
| n = testCase->getChildElement("pattern"); |
| TEST_ASSERT(n != NULL); |
| if (n==NULL) { |
| continue; |
| } |
| text = n->getText(FALSE); |
| text = text.unescape(); |
| pattern.append(text); |
| nodeCount++; |
| |
| n = testCase->getChildElement("pre"); |
| if (n!=NULL) { |
| text = n->getText(FALSE); |
| text = text.unescape(); |
| target.append(text); |
| nodeCount++; |
| } |
| |
| n = testCase->getChildElement("m"); |
| if (n!=NULL) { |
| expectedMatchStart = target.length(); |
| text = n->getText(FALSE); |
| text = text.unescape(); |
| target.append(text); |
| expectedMatchLimit = target.length(); |
| nodeCount++; |
| } |
| |
| n = testCase->getChildElement("post"); |
| if (n!=NULL) { |
| text = n->getText(FALSE); |
| text = text.unescape(); |
| target.append(text); |
| nodeCount++; |
| } |
| |
| // Check that there weren't extra things in the XML |
| TEST_ASSERT(nodeCount == testCase->countChildren()); |
| |
| // Open a collotor and StringSearch based on the parameters |
| // obtained from the XML. |
| // |
| status = U_ZERO_ERROR; |
| UCollator *collator = ucol_open(clocale, &status); |
| ucol_setStrength(collator, collatorStrength); |
| ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status); |
| UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(), |
| target.getBuffer(), target.length(), |
| collator, |
| NULL, // the break iterator |
| &status); |
| |
| TEST_ASSERT_SUCCESS(status); |
| if (U_FAILURE(status)) { |
| usearch_close(uss); |
| ucol_close(collator); |
| continue; |
| } |
| |
| int32_t foundStart = 0; |
| int32_t foundLimit = 0; |
| UBool foundMatch; |
| |
| // |
| // Do the search, check the match result against the expected results. |
| // |
| foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status); |
| TEST_ASSERT_SUCCESS(status); |
| if (foundMatch && expectedMatchStart<0 || |
| foundStart != expectedMatchStart || |
| foundLimit != expectedMatchLimit) { |
| TEST_ASSERT(FALSE); // ouput generic error position |
| infoln("Found, expected match start = %d, %d \n" |
| "Found, expected match limit = %d, %d", |
| foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); |
| } |
| |
| // In case there are other matches... |
| // (should we only do this if the test case passed?) |
| while (foundMatch) { |
| expectedMatchStart = foundStart; |
| expectedMatchLimit = foundLimit; |
| |
| foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status); |
| } |
| |
| usearch_close(uss); |
| usearch_openFromCollator(pattern.getBuffer(), pattern.length(), |
| target.getBuffer(), target.length(), |
| collator, |
| NULL, |
| &status); |
| |
| // |
| // Do the backwards search, check the match result against the expected results. |
| // |
| foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status); |
| TEST_ASSERT_SUCCESS(status); |
| if (foundMatch && expectedMatchStart<0 || |
| foundStart != expectedMatchStart || |
| foundLimit != expectedMatchLimit) { |
| TEST_ASSERT(FALSE); // ouput generic error position |
| infoln("Found, expected backwards match start = %d, %d \n" |
| "Found, expected backwards match limit = %d, %d", |
| foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); |
| } |
| |
| usearch_close(uss); |
| ucol_close(collator); |
| } |
| |
| delete root; |
| delete parser; |
| #endif |
| } |
| |
// One collation element together with the pair of source-text offsets
// recorded for it by the code that filled in the entry.
struct Order
{
    int32_t order;       // the collation element value
    int32_t lowOffset;   // lower of the two recorded text offsets
    int32_t highOffset;  // higher of the two recorded text offsets
};
| |
| class OrderList |
| { |
| public: |
| OrderList(); |
| OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0); |
| ~OrderList(); |
| |
| int32_t size(void) const; |
| void add(int32_t order, int32_t low, int32_t high); |
| const Order *get(int32_t index) const; |
| int32_t getLowOffset(int32_t index) const; |
| int32_t getHighOffset(int32_t index) const; |
| int32_t getOrder(int32_t index) const; |
| void reverse(void); |
| UBool compare(const OrderList &other) const; |
| UBool matchesAt(int32_t offset, const OrderList &other) const; |
| |
| private: |
| Order *list; |
| int32_t listMax; |
| int32_t listSize; |
| }; |
| |
| OrderList::OrderList() |
| : list(NULL), listSize(0), listMax(16) |
| { |
| list = new Order[listMax]; |
| } |
| |
| OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset) |
| : list(NULL), listMax(16), listSize(0) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); |
| uint32_t strengthMask = 0; |
| int32_t order, low, high; |
| |
| switch (ucol_getStrength(coll)) |
| { |
| default: |
| strengthMask |= UCOL_TERTIARYORDERMASK; |
| /* fall through */ |
| |
| case UCOL_SECONDARY: |
| strengthMask |= UCOL_SECONDARYORDERMASK; |
| /* fall through */ |
| |
| case UCOL_PRIMARY: |
| strengthMask |= UCOL_PRIMARYORDERMASK; |
| } |
| |
| list = new Order[listMax]; |
| |
| ucol_setOffset(elems, stringOffset, &status); |
| |
| do { |
| low = ucol_getOffset(elems); |
| order = ucol_next(elems, &status); |
| high = ucol_getOffset(elems); |
| |
| if (order != UCOL_NULLORDER) { |
| order &= strengthMask; |
| } |
| |
| if (order != UCOL_IGNORABLE) { |
| add(order, low, high); |
| } |
| } while (order != UCOL_NULLORDER); |
| |
| ucol_closeElements(elems); |
| } |
| |
| OrderList::~OrderList() |
| { |
| delete[] list; |
| } |
| |
| void OrderList::add(int32_t order, int32_t low, int32_t high) |
| { |
| if (listSize >= listMax) { |
| listMax *= 2; |
| |
| Order *newList = new Order[listMax]; |
| |
| uprv_memcpy(newList, list, listSize * sizeof(Order)); |
| delete[] list; |
| list = newList; |
| } |
| |
| list[listSize].order = order; |
| list[listSize].lowOffset = low; |
| list[listSize].highOffset = high; |
| |
| listSize += 1; |
| } |
| |
| const Order *OrderList::get(int32_t index) const |
| { |
| if (index >= listSize) { |
| return NULL; |
| } |
| |
| return &list[index]; |
| } |
| |
| int32_t OrderList::getLowOffset(int32_t index) const |
| { |
| const Order *order = get(index); |
| |
| if (order != NULL) { |
| return order->lowOffset; |
| } |
| |
| return -1; |
| } |
| |
| int32_t OrderList::getHighOffset(int32_t index) const |
| { |
| const Order *order = get(index); |
| |
| if (order != NULL) { |
| return order->highOffset; |
| } |
| |
| return -1; |
| } |
| |
| int32_t OrderList::getOrder(int32_t index) const |
| { |
| const Order *order = get(index); |
| |
| if (order != NULL) { |
| return order->order; |
| } |
| |
| return UCOL_NULLORDER; |
| } |
| |
| int32_t OrderList::size() const |
| { |
| return listSize; |
| } |
| |
| void OrderList::reverse() |
| { |
| for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { |
| Order swap = list[b]; |
| |
| list[b] = list[f]; |
| list[f] = swap; |
| } |
| } |
| |
| UBool OrderList::compare(const OrderList &other) const |
| { |
| if (listSize != other.listSize) { |
| return FALSE; |
| } |
| |
| for(int32_t i = 0; i < listSize; i += 1) { |
| if (list[i].order != other.list[i].order || |
| list[i].lowOffset != other.list[i].lowOffset || |
| list[i].highOffset != other.list[i].highOffset) { |
| return FALSE; |
| } |
| } |
| |
| return TRUE; |
| } |
| |
| UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const |
| { |
| // NOTE: sizes include the NULLORDER, which we don't want to compare. |
| int32_t otherSize = other.size() - 1; |
| |
| if (listSize - 1 - offset < otherSize) { |
| return FALSE; |
| } |
| |
| for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { |
| if (getOrder(i) != other.getOrder(j)) { |
| return FALSE; |
| } |
| } |
| |
| return TRUE; |
| } |
| |
| static char *printOffsets(char *buffer, OrderList &list) |
| { |
| int32_t size = list.size(); |
| char *s = buffer; |
| |
| for(int32_t i = 0; i < size; i += 1) { |
| const Order *order = list.get(i); |
| |
| if (i != 0) { |
| s += sprintf(s, ", "); |
| } |
| |
| s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); |
| } |
| |
| return buffer; |
| } |
| |
| static char *printOrders(char *buffer, OrderList &list) |
| { |
| int32_t size = list.size(); |
| char *s = buffer; |
| |
| for(int32_t i = 0; i < size; i += 1) { |
| const Order *order = list.get(i); |
| |
| if (i != 0) { |
| s += sprintf(s, ", "); |
| } |
| |
| s += sprintf(s, "%8.8X", order->order); |
| } |
| |
| return buffer; |
| } |
| |
| void SSearchTest::offsetTest() |
| { |
| UnicodeString test[] = { |
| "\\ua191\\u16ef\\u2036\\u017a", |
| |
| #if 0 |
| // This results in a complex interaction between contraction, |
| // expansion and normalization that confuses the backwards offset fixups. |
| "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", |
| #endif |
| |
| "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", |
| "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", |
| |
| "\\u02FE\\u02FF" |
| "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" |
| "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" |
| "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" |
| "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" |
| "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", |
| |
| "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", |
| "abc\\u0E41\\u0301\\u0316", |
| "abc\\u0E41\\u0316\\u0301", |
| "\\u0E41\\u0301\\u0316", |
| "\\u0E41\\u0316\\u0301", |
| "a\\u0301\\u0316", |
| "a\\u0316\\u0301", |
| "\\uAC52\\uAC53", |
| "\\u34CA\\u34CB", |
| "\\u11ED\\u11EE", |
| "\\u30C3\\u30D0", |
| "p\\u00E9ch\\u00E9", |
| "a\\u0301\\u0325", |
| "a\\u0300\\u0325", |
| "a\\u0325\\u0300", |
| "A\\u0323\\u0300B", |
| "A\\u0300\\u0323B", |
| "A\\u0301\\u0323B", |
| "A\\u0302\\u0301\\u0323B", |
| "abc", |
| "ab\\u0300c", |
| "ab\\u0300\\u0323c", |
| " \\uD800\\uDC00\\uDC00", |
| "a\\uD800\\uDC00\\uDC00", |
| "A\\u0301\\u0301", |
| "A\\u0301\\u0323", |
| "A\\u0301\\u0323B", |
| "B\\u0301\\u0323C", |
| "A\\u0300\\u0323B", |
| "\\u0301A\\u0301\\u0301", |
| "abcd\\r\\u0301", |
| "p\\u00EAche", |
| "pe\\u0302che", |
| }; |
| |
| int32_t testCount = ARRAY_SIZE(test); |
| UErrorCode status = U_ZERO_ERROR; |
| RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status); |
| char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases... |
| // We could allocate one that's the right size by (CE_count * 10) + 2 |
| // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]" |
| |
| col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| |
| for(int32_t i = 0; i < testCount; i += 1) { |
| UnicodeString ts = test[i].unescape(); |
| CollationElementIterator *iter = col->createCollationElementIterator(ts); |
| OrderList forwardList; |
| OrderList backwardList; |
| int32_t order, low, high; |
| |
| do { |
| low = iter->getOffset(); |
| order = iter->next(status); |
| high = iter->getOffset(); |
| |
| forwardList.add(order, low, high); |
| } while (order != CollationElementIterator::NULLORDER); |
| |
| iter->reset(); |
| iter->setOffset(ts.length(), status); |
| |
| backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset()); |
| |
| do { |
| high = iter->getOffset(); |
| order = iter->previous(status); |
| low = iter->getOffset(); |
| |
| if (order == CollationElementIterator::NULLORDER) { |
| break; |
| } |
| |
| backwardList.add(order, low, high); |
| } while (TRUE); |
| |
| backwardList.reverse(); |
| |
| if (forwardList.compare(backwardList)) { |
| logln("Works with \"%S\"", test[i].getTerminatedBuffer()); |
| logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); |
| // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); |
| |
| logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); |
| // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); |
| |
| logln(); |
| } else { |
| errln("Fails with \"%S\"", test[i].getTerminatedBuffer()); |
| infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); |
| infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); |
| |
| infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); |
| infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); |
| |
| infoln(); |
| } |
| } |
| } |
| |
| class CEList |
| { |
| public: |
| CEList(UCollator *coll, const UnicodeString &string); |
| ~CEList(); |
| |
| int32_t size() const; |
| int32_t get(int32_t index) const; |
| UBool matchesAt(int32_t offset, const CEList *other) const; |
| |
| private: |
| void add(int32_t ce); |
| |
| int32_t *ces; |
| int32_t listMax; |
| int32_t listSize; |
| }; |
| |
| CEList::CEList(UCollator *coll, const UnicodeString &string) |
| : ces(NULL), listMax(8), listSize(0) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); |
| uint32_t strengthMask = 0; |
| int32_t order; |
| |
| #if 0 |
| switch (ucol_getStrength(coll)) |
| { |
| default: |
| strengthMask |= UCOL_TERTIARYORDERMASK; |
| /* fall through */ |
| |
| case UCOL_SECONDARY: |
| strengthMask |= UCOL_SECONDARYORDERMASK; |
| /* fall through */ |
| |
| case UCOL_PRIMARY: |
| strengthMask |= UCOL_PRIMARYORDERMASK; |
| } |
| #else |
| strengthMask = UCOL_PRIMARYORDERMASK; |
| #endif |
| |
| ces = new int32_t[listMax]; |
| |
| while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { |
| order &= strengthMask; |
| |
| if (order == UCOL_IGNORABLE) { |
| continue; |
| } |
| |
| add(order); |
| } |
| |
| ucol_closeElements(elems); |
| } |
| |
| CEList::~CEList() |
| { |
| delete[] ces; |
| } |
| |
| void CEList::add(int32_t ce) |
| { |
| if (listSize >= listMax) { |
| listMax *= 2; |
| |
| int32_t *newCEs = new int32_t[listMax]; |
| |
| uprv_memcpy(newCEs, ces, listSize * sizeof(int32_t)); |
| delete[] ces; |
| ces = newCEs; |
| } |
| |
| ces[listSize++] = ce; |
| } |
| |
| int32_t CEList::get(int32_t index) const |
| { |
| if (index >= 0 && index < listSize) { |
| return ces[index]; |
| } |
| |
| return -1; |
| } |
| |
| UBool CEList::matchesAt(int32_t offset, const CEList *other) const |
| { |
| if (listSize - offset < other->size()) { |
| return FALSE; |
| } |
| |
| for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { |
| if (ces[i] != other->get(j)) { |
| return FALSE; |
| } |
| } |
| |
| return TRUE; |
| } |
| |
| int32_t CEList::size() const |
| { |
| return listSize; |
| } |
| |
| class StringList |
| { |
| public: |
| StringList(); |
| ~StringList(); |
| |
| void add(const UnicodeString *string); |
| void add(const UChar *chars, int32_t count); |
| const UnicodeString *get(int32_t index) const; |
| int32_t size() const; |
| |
| private: |
| UnicodeString *strings; |
| int32_t listMax; |
| int32_t listSize; |
| }; |
| |
| StringList::StringList() |
| : strings(NULL), listMax(16), listSize(0) |
| { |
| strings = new UnicodeString [listMax]; |
| } |
| |
| StringList::~StringList() |
| { |
| delete[] strings; |
| } |
| |
| void StringList::add(const UnicodeString *string) |
| { |
| if (listSize >= listMax) { |
| listMax *= 2; |
| |
| UnicodeString *newStrings = new UnicodeString[listMax]; |
| |
| uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString)); |
| |
| delete[] strings; |
| strings = newStrings; |
| } |
| |
| // The ctor initialized all the strings in |
| // the array to empty strings, so this |
| // is the same as copying the source string. |
| strings[listSize++].append(*string); |
| } |
| |
| void StringList::add(const UChar *chars, int32_t count) |
| { |
| const UnicodeString string(chars, count); |
| |
| add(&string); |
| } |
| |
| const UnicodeString *StringList::get(int32_t index) const |
| { |
| if (index >= 0 && index < listSize) { |
| return &strings[index]; |
| } |
| |
| return NULL; |
| } |
| |
| int32_t StringList::size() const |
| { |
| return listSize; |
| } |
| |
| class CEToStringsMap |
| { |
| public: |
| |
| CEToStringsMap(); |
| ~CEToStringsMap(); |
| |
| void put(int32_t ce, UnicodeString *string); |
| StringList *getStringList(int32_t ce) const; |
| |
| private: |
| |
| static void deleteStringList(void *obj); |
| void putStringList(int32_t ce, StringList *stringList); |
| UHashtable *map; |
| }; |
| |
| CEToStringsMap::CEToStringsMap() |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| map = uhash_open(uhash_hashLong, uhash_compareLong, |
| uhash_compareCaselessUnicodeString, |
| &status); |
| |
| uhash_setValueDeleter(map, deleteStringList); |
| } |
| |
| CEToStringsMap::~CEToStringsMap() |
| { |
| uhash_close(map); |
| } |
| |
| void CEToStringsMap::put(int32_t ce, UnicodeString *string) |
| { |
| StringList *strings = getStringList(ce); |
| |
| if (strings == NULL) { |
| strings = new StringList(); |
| putStringList(ce, strings); |
| } |
| |
| strings->add(string); |
| } |
| |
| StringList *CEToStringsMap::getStringList(int32_t ce) const |
| { |
| return (StringList *) uhash_iget(map, ce); |
| } |
| |
| void CEToStringsMap::putStringList(int32_t ce, StringList *stringList) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| uhash_iput(map, ce, (void *) stringList, &status); |
| } |
| |
| void CEToStringsMap::deleteStringList(void *obj) |
| { |
| StringList *strings = (StringList *) obj; |
| |
| delete strings; |
| } |
| |
| class StringToCEsMap |
| { |
| public: |
| StringToCEsMap(); |
| ~StringToCEsMap(); |
| |
| void put(const UnicodeString *string, const CEList *ces); |
| const CEList *get(const UnicodeString *string); |
| |
| private: |
| |
| static void deleteCEList(void *obj); |
| |
| UHashtable *map; |
| }; |
| |
| StringToCEsMap::StringToCEsMap() |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| map = uhash_open(uhash_hashCaselessUnicodeString, |
| uhash_compareCaselessUnicodeString, |
| uhash_compareLong, |
| &status); |
| |
| uhash_setValueDeleter(map, deleteCEList); |
| } |
| |
| StringToCEsMap::~StringToCEsMap() |
| { |
| uhash_close(map); |
| } |
| |
| void StringToCEsMap::put(const UnicodeString *string, const CEList *ces) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| uhash_put(map, (void *) string, (void *) ces, &status); |
| } |
| |
| const CEList *StringToCEsMap::get(const UnicodeString *string) |
| { |
| return (const CEList *) uhash_get(map, string); |
| } |
| |
| void StringToCEsMap::deleteCEList(void *obj) |
| { |
| CEList *list = (CEList *) obj; |
| |
| delete list; |
| } |
| |
| static void buildData(UCollator *coll, USet *charsToTest, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith) |
| { |
| int32_t itemCount = uset_getItemCount(charsToTest); |
| UErrorCode status = U_ZERO_ERROR; |
| |
| for(int32_t item = 0; item < itemCount; item += 1) { |
| UChar32 start = 0, end = 0; |
| UChar buffer[16]; |
| int32_t len = uset_getItem(charsToTest, item, &start, &end, |
| buffer, 16, &status); |
| |
| if (len == 0) { |
| for (UChar32 ch = start; ch <= end; ch += 1) { |
| UnicodeString *st = new UnicodeString(ch); |
| CEList *ceList = new CEList(coll, *st); |
| |
| charsToCEList->put(st, ceList); |
| ceToCharsStartingWith->put(ceList->get(0), st); |
| } |
| } else if (len > 0) { |
| UnicodeString *st = new UnicodeString(buffer, len); |
| CEList *ceList = new CEList(coll, *st); |
| |
| charsToCEList->put(st, ceList); |
| ceToCharsStartingWith->put(ceList->get(0), st); |
| } else { |
| // shouldn't happen... |
| } |
| } |
| } |
| |
| static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer) |
| { |
| for(int32_t i = 0; i < string.length(); i += 1) { |
| UChar32 ch = string.char32At(i); |
| |
| if (ch >= 0x0020 && ch <= 0x007F) { |
| if (ch == 0x005C) { |
| buffer.append("\\\\"); |
| } else { |
| buffer.append(ch); |
| } |
| } else { |
| char cbuffer[12]; |
| |
| if (ch <= 0xFFFFL) { |
| sprintf(cbuffer, "\\u%4.4X", ch); |
| } else { |
| sprintf(cbuffer, "\\U%8.8X", ch); |
| } |
| |
| buffer.append(cbuffer); |
| } |
| |
| if (ch >= 0x10000L) { |
| i += 1; |
| } |
| } |
| |
| return buffer; |
| } |
| |
| static int32_t minLengthInChars(const CEList *ceList, int32_t offset, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith, |
| UnicodeString &debug) |
| { |
| // find out shortest string for the longest sequence of ces. |
| // needs to be refined to use dynamic programming, but will be roughly right |
| int32_t totalStringLength = 0; |
| |
| while (offset < ceList->size()) { |
| int32_t ce = ceList->get(offset); |
| int32_t bestLength = INT32_MIN; |
| const UnicodeString *bestString = NULL; |
| int32_t bestCeLength = 0; |
| const StringList *strings = ceToCharsStartingWith->getStringList(ce); |
| int32_t stringCount = strings->size(); |
| |
| for (int32_t s = 0; s < stringCount; s += 1) { |
| const UnicodeString *string = strings->get(s); |
| const CEList *ceList2 = charsToCEList->get(string); |
| |
| if (ceList->matchesAt(offset, ceList2)) { |
| int32_t length = ceList2->size() - string->length(); |
| |
| if (bestLength < length) { |
| bestLength = length; |
| bestCeLength = ceList2->size(); |
| bestString = string; |
| } |
| } |
| } |
| |
| totalStringLength += bestString->length(); |
| escape(*bestString, debug).append("/"); |
| offset += bestCeLength; |
| } |
| |
| debug.append((UChar)0x0000); |
| return totalStringLength; |
| } |
| |
| static void minLengthTest(UCollator *coll, StringToCEsMap *charsToCEList, CEToStringsMap *ceToCharsStartingWith) |
| { |
| UnicodeString examples[] = {"fuss", "fiss", "affliss", "VII"}; |
| UnicodeString debug; |
| int32_t nExamples = sizeof(examples) / sizeof(examples[0]); |
| |
| for (int32_t s = 0; s < nExamples; s += 1) { |
| CEList *ceList = new CEList(coll, examples[s]); |
| |
| //infoln("%S:", examples[s].getTerminatedBuffer()); |
| |
| for(int32_t i = 0; i < examples[s].length(); i += 1) { |
| debug.remove(); |
| |
| int32_t minLength = minLengthInChars(ceList, i, charsToCEList, ceToCharsStartingWith, debug); |
| //infoln("\t%d\t%S", minLength, debug.getTerminatedBuffer()); |
| } |
| |
| //infoln(); |
| delete ceList; |
| } |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Random Numbers. Similar to standard lib rand() and srand() |
| // Not using library to |
| // 1. Get same results on all platforms. |
| // 2. Get access to current seed, to more easily reproduce failures. |
| // |
| //--------------------------------------------------------------------------------------- |
// Current state of the generator. Kept at file scope so that it can be
// read or set in order to reproduce a failing run.
static uint32_t m_seed = 1;

// Advances the linear congruential generator by one step and returns a
// value in the range 0..32767 taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    return (m_seed >> 16) & 0x7FFF;
}
| |
| class Monkey |
| { |
| public: |
| virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; |
| |
| protected: |
| Monkey(); |
| virtual ~Monkey(); |
| }; |
| |
| Monkey::Monkey() |
| { |
| // ook? |
| } |
| |
| Monkey::~Monkey() |
| { |
| // ook? |
| } |
| |
| class SetMonkey : public Monkey |
| { |
| public: |
| SetMonkey(const USet *theSet); |
| ~SetMonkey(); |
| |
| virtual void append(UnicodeString &test, UnicodeString &alternate); |
| |
| private: |
| const USet *set; |
| }; |
| |
| SetMonkey::SetMonkey(const USet *theSet) |
| : Monkey(), set(theSet) |
| { |
| // ook? |
| } |
| |
| SetMonkey::~SetMonkey() |
| { |
| //ook... |
| } |
| |
| void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) |
| { |
| int32_t size = uset_size(set); |
| int32_t index = m_rand() % size; |
| UChar32 ch = uset_charAt(set, index); |
| UnicodeString str(ch); |
| |
| test.append(str); |
| alternate.append(str); // flip case, or some junk? |
| } |
| |
| class StringSetMonkey : public Monkey |
| { |
| public: |
| StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith); |
| ~StringSetMonkey(); |
| |
| void append(UnicodeString &testCase, UnicodeString &alternate); |
| |
| private: |
| UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate); |
| |
| const USet *set; |
| UCollator *coll; |
| StringToCEsMap *charsToCEList; |
| CEToStringsMap *ceToCharsStartingWith; |
| }; |
| |
| StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, StringToCEsMap *theCharsToCEList, CEToStringsMap *theCeToCharsStartingWith) |
| : Monkey(), set(theSet), coll(theCollator), charsToCEList(theCharsToCEList), ceToCharsStartingWith(theCeToCharsStartingWith) |
| { |
| // ook. |
| } |
| |
| StringSetMonkey::~StringSetMonkey() |
| { |
| // ook? |
| } |
| |
| void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) |
| { |
| int32_t itemCount = uset_getItemCount(set), len = 0; |
| int32_t index = m_rand() % itemCount; |
| UChar32 rangeStart = 0, rangeEnd = 0; |
| UChar buffer[16]; |
| UErrorCode err = U_ZERO_ERROR; |
| |
| len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); |
| |
| if (len == 0) { |
| int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); |
| UChar32 ch = rangeStart + offset; |
| UnicodeString str(ch); |
| |
| testCase.append(str); |
| generateAlternative(str, alternate); |
| } else if (len > 0) { |
| // should check that len < 16... |
| UnicodeString str(buffer, len); |
| |
| testCase.append(str); |
| generateAlternative(str, alternate); |
| } else { |
| // shouldn't happen... |
| } |
| } |
| |
| UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate) |
| { |
| // find out shortest string for the longest sequence of ces. |
| // needs to be refined to use dynamic programming, but will be roughly right |
| CEList ceList(coll, testCase); |
| UnicodeString alt; |
| int32_t offset = 0; |
| |
| if (ceList.size() == 0) { |
| return alternate.append(testCase); |
| } |
| |
| while (offset < ceList.size()) { |
| int32_t ce = ceList.get(offset); |
| const StringList *strings = ceToCharsStartingWith->getStringList(ce); |
| |
| if (strings == NULL) { |
| return alternate.append(testCase); |
| } |
| |
| int32_t stringCount = strings->size(); |
| int32_t tries = 0; |
| |
| // find random string that generates the same CEList |
| const CEList *ceList2; |
| const UnicodeString *string; |
| |
| do { |
| int32_t s = m_rand() % stringCount; |
| |
| if (tries++ > stringCount) { |
| alternate.append(testCase); |
| return alternate; |
| } |
| |
| string = strings->get(s); |
| ceList2 = charsToCEList->get(string); |
| } while (! ceList.matchesAt(offset, ceList2)); |
| |
| alt.append(*string); |
| offset += ceList2->size(); |
| } |
| |
| const CEList altCEs(coll, alt); |
| |
| if (ceList.matchesAt(0, &altCEs)) { |
| return alternate.append(alt); |
| } |
| |
| return alternate.append(testCase); |
| } |
| |
| static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate) |
| { |
| int32_t pieces = (m_rand() % 4) + 1; |
| UBool matches; |
| |
| do { |
| testCase.remove(); |
| alternate.remove(); |
| monkeys[0]->append(testCase, alternate); |
| |
| for(int32_t piece = 0; piece < pieces; piece += 1) { |
| int32_t monkey = m_rand() % monkeyCount; |
| |
| monkeys[monkey]->append(testCase, alternate); |
| } |
| |
| const CEList ceTest(coll, testCase); |
| const CEList ceAlt(coll, alternate); |
| |
| matches = ceTest.matchesAt(0, &ceAlt); |
| } while (! matches); |
| } |
| |
| static inline USet *uset_openEmpty() |
| { |
| return uset_open(1, 0); |
| } |
| |
| // |
| // Find the next acceptable boundary following the specified starting index |
| // in the target text being searched. |
| // TODO: refine what is an acceptable boundary. For the moment, |
| // choose the next position not within a combining sequence. |
| // |
| static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) { |
| const UChar *text = string.getBuffer(); |
| int32_t textLen = string.length(); |
| |
| if (startIndex >= textLen) { |
| return startIndex; |
| } |
| |
| UChar32 c; |
| int32_t i = startIndex; |
| |
| U16_NEXT(text, i, textLen, c); |
| |
| // If we are on a control character, stop without looking for combining marks. |
| // Control characters do not combine. |
| int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
| if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { |
| return i; |
| } |
| |
| // The initial character was not a control, and can thus accept trailing |
| // combining characters. Advance over however many of them there are. |
| int32_t indexOfLastCharChecked; |
| |
| for (;;) { |
| indexOfLastCharChecked = i; |
| |
| if (i>=textLen) { |
| break; |
| } |
| |
| U16_NEXT(text, i, textLen, c); |
| gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
| |
| if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { |
| break; |
| } |
| } |
| |
| return indexOfLastCharChecked; |
| } |
| |
| static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) { |
| const UChar *text = string.getBuffer(); |
| int32_t textLen = string.length(); |
| |
| if (index>=textLen || index<=0) { |
| return FALSE; |
| } |
| |
| // If the character at the current index is not a GRAPHEME_EXTEND |
| // then we can not be within a combining sequence. |
| UChar32 c; |
| U16_GET(text, 0, index, textLen, c); |
| int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
| if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { |
| return FALSE; |
| } |
| |
| // We are at a combining mark. If the preceding character is anything |
| // except a CONTROL, CR or LF, we are in a combining sequence. |
| U16_PREV(text, 0, index, c); |
| gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
| |
| return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); |
| } |
| |
| static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| OrderList targetOrders(coll, target, offset); |
| OrderList patternOrders(coll, pattern); |
| int32_t targetSize = targetOrders.size() - 1; |
| int32_t patternSize = patternOrders.size() - 1; |
| UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale(coll, ULOC_VALID_LOCALE, &status), |
| target.getBuffer(), target.length(), &status); |
| |
| if (patternSize == 0) { |
| matchStart = matchEnd = 0; |
| return FALSE; |
| } |
| |
| matchStart = matchEnd = -1; |
| |
| for(int32_t i = 0; i < targetSize; i += 1) { |
| if (targetOrders.matchesAt(i, patternOrders)) { |
| int32_t start = targetOrders.getLowOffset(i); |
| int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); |
| int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); |
| |
| // if the low and high offsets of the first CE in |
| // the match are the same, it means that the match |
| // starts in the middle of an expansion - all but |
| // the first CE of the expansion will have the offset |
| // of the following character. |
| if (start == targetOrders.getHighOffset(i)) { |
| continue; |
| } |
| |
| // Make sure match starts on a grapheme boundary |
| if (! ubrk_isBoundary(charBreakIterator, start)) { |
| continue; |
| } |
| |
| // If the low and high offsets of the CE after the match |
| // are the same, it means that the match ends in the middle |
| // of an expansion sequence. |
| if (maxLimit == targetOrders.getHighOffset(i + patternSize) && |
| targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { |
| continue; |
| } |
| |
| int32_t mend = maxLimit; |
| |
| // Find the first grapheme break after the character index |
| // of the last CE in the match. If it's after character index |
| // that's after the last CE in the match, use that index |
| // as the end of the match. |
| if (minLimit < maxLimit) { |
| int32_t nba = ubrk_following(charBreakIterator, minLimit); |
| |
| if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) { |
| mend = nba; |
| } |
| } |
| |
| if (mend > maxLimit) { |
| continue; |
| } |
| |
| if (! ubrk_isBoundary(charBreakIterator, mend)) { |
| continue; |
| } |
| |
| matchStart = start; |
| matchEnd = mend; |
| |
| ubrk_close(charBreakIterator); |
| return TRUE; |
| } |
| } |
| |
| ubrk_close(charBreakIterator); |
| return FALSE; |
| } |
| |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { |
| int32_t val = defaultVal; |
| |
| name.append(" *= *(-?\\d+)"); |
| |
| UErrorCode status = U_ZERO_ERROR; |
| RegexMatcher m(name, params, 0, status); |
| |
| if (m.find()) { |
| // The param exists. Convert the string to an int. |
| char valString[100]; |
| int32_t paramLength = m.end(1, status) - m.start(1, status); |
| |
| if (paramLength >= (int32_t)(sizeof(valString)-1)) { |
| paramLength = (int32_t)(sizeof(valString)-2); |
| } |
| |
| params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); |
| val = strtol(valString, NULL, 10); |
| |
| // Delete this parameter from the params string. |
| m.reset(); |
| params = m.replaceFirst("", status); |
| } |
| |
| //U_ASSERT(U_SUCCESS(status)); |
| if (! U_SUCCESS(status)) { |
| val = defaultVal; |
| } |
| |
| return val; |
| } |
| #endif |
| |
| int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern, |
| const char *name, const char *strength, uint32_t seed) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t actualStart = -1, actualEnd = -1; |
| //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length(); |
| int32_t expectedStart = -1, expectedEnd = -1; |
| int32_t notFoundCount = 0; |
| UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(), |
| testCase.getBuffer(), testCase.length(), |
| coll, |
| NULL, // the break iterator |
| &status); |
| |
| // **** TODO: find *all* matches, not just first one **** |
| simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); |
| |
| #if 0 |
| usearch_search(uss, 0, &actualStart, &actualEnd, &status); |
| #else |
| actualStart = usearch_next(uss, &status); |
| actualEnd = actualStart + usearch_getMatchedLength(uss); |
| #endif |
| |
| if (actualStart != expectedStart || actualEnd != expectedEnd) { |
| errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" |
| " strength=%s seed=%d", |
| name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); |
| } |
| |
| if (expectedStart == -1 && actualStart == -1) { |
| notFoundCount += 1; |
| } |
| |
| // **** TODO: find *all* matches, not just first one **** |
| simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); |
| |
| usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status); |
| |
| #if 0 |
| usearch_search(uss, 0, &actualStart, &actualEnd, &status); |
| #else |
| usearch_reset(uss); |
| actualStart = usearch_next(uss, &status); |
| actualEnd = actualStart + usearch_getMatchedLength(uss); |
| #endif |
| |
| if (actualStart != expectedStart || actualEnd != expectedEnd) { |
| errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n" |
| " strength=%s seed=%d", |
| name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed); |
| } |
| |
| if (expectedStart == -1 && actualStart == -1) { |
| notFoundCount += 1; |
| } |
| |
| usearch_close(uss); |
| |
| return notFoundCount; |
| } |
| |
| void SSearchTest::monkeyTest(char *params) |
| { |
| // ook! |
| UErrorCode status = U_ZERO_ERROR; |
| U_STRING_DECL(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47); |
| U_STRING_INIT(test_pattern, "[[:assigned:]-[:ideographic:]-[:hangul:]-[:c:]]", 47); |
| UCollator *coll = ucol_open(NULL, &status); |
| USet *charsToTest = uset_openPattern(test_pattern, 47, &status); |
| USet *expansions = uset_openEmpty(); |
| USet *contractions = uset_openEmpty(); |
| StringToCEsMap *charsToCEList = new StringToCEsMap(); |
| CEToStringsMap *ceToCharsStartingWith = new CEToStringsMap(); |
| |
| ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); |
| |
| uset_addAll(charsToTest, contractions); |
| uset_addAll(charsToTest, expansions); |
| |
| // TODO: set strength to UCOL_PRIMARY, change CEList to use strength? |
| buildData(coll, charsToTest, charsToCEList, ceToCharsStartingWith); |
| |
| U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); |
| U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); |
| USet *letters = uset_openPattern(letter_pattern, 39, &status); |
| SetMonkey letterMonkey(letters); |
| StringSetMonkey contractionMonkey(contractions, coll, charsToCEList, ceToCharsStartingWith); |
| StringSetMonkey expansionMonkey(expansions, coll, charsToCEList, ceToCharsStartingWith); |
| UnicodeString testCase; |
| UnicodeString alternate; |
| UnicodeString pattern, altPattern; |
| UnicodeString prefix, altPrefix; |
| UnicodeString suffix, altSuffix; |
| |
| Monkey *monkeys[] = { |
| &letterMonkey, |
| &contractionMonkey, |
| &expansionMonkey, |
| &contractionMonkey, |
| &expansionMonkey, |
| &contractionMonkey, |
| &expansionMonkey, |
| &contractionMonkey, |
| &expansionMonkey}; |
| int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); |
| int32_t nonMatchCount = 0; |
| |
| UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; |
| const char *strengthNames[] = {"primary", "secondary", "tertiary"}; |
| int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); |
| int32_t loopCount = quick? 1000 : 10000; |
| int32_t firstStrength = 0; |
| int32_t lastStrength = strengthCount - 1; |
| |
| if (params != NULL) { |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| UnicodeString p(params); |
| |
| loopCount = getIntParam("loop", p, loopCount); |
| m_seed = getIntParam("seed", p, m_seed); |
| |
| RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); |
| if (m.find()) { |
| UnicodeString breakType = m.group(1, status); |
| |
| for (int32_t s = 0; s < strengthCount; s += 1) { |
| if (breakType == strengthNames[s]) { |
| firstStrength = lastStrength = s; |
| break; |
| } |
| } |
| |
| m.reset(); |
| p = m.replaceFirst("", status); |
| } |
| |
| if (RegexMatcher("\\S", p, 0, status).find()) { |
| // Each option is stripped out of the option string as it is processed. |
| // All options have been checked. The option string should have been completely emptied.. |
| char buf[100]; |
| p.extract(buf, sizeof(buf), NULL, status); |
| buf[sizeof(buf)-1] = 0; |
| errln("Unrecognized or extra parameter: %s\n", buf); |
| return; |
| } |
| #else |
| infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); |
| #endif |
| } |
| |
| for(int32_t s = firstStrength; s <= lastStrength; s += 1) { |
| int32_t notFoundCount = 0; |
| |
| ucol_setStrength(coll, strengths[s]); |
| |
| // TODO: try alternate prefix and suffix too? |
| // TODO: alterntaes are only equal at primary strength. Is this OK? |
| for(int32_t t = 0; t < 10000; t += 1) { |
| uint32_t seed = m_seed; |
| int32_t nmc = 0; |
| |
| generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); |
| generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); |
| generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); |
| |
| // pattern |
| notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed); |
| |
| testCase.remove(); |
| testCase.append(prefix); |
| testCase.append(/*alt*/pattern); |
| |
| // prefix + pattern |
| notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed); |
| |
| testCase.append(suffix); |
| |
| // prefix + pattern + suffix |
| notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed); |
| |
| testCase.remove(); |
| testCase.append(pattern); |
| testCase.append(suffix); |
| |
| // pattern + suffix |
| notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); |
| } |
| |
| logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); |
| } |
| |
| delete ceToCharsStartingWith; |
| delete charsToCEList; |
| |
| uset_close(contractions); |
| uset_close(expansions); |
| uset_close(charsToTest); |
| |
| ucol_close(coll); |
| } |
| |
| |