| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /******************************************************************** |
| * COPYRIGHT: |
| * Copyright (c) 2002-2016, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ********************************************************************/ |
| |
| // |
| // regextst.cpp |
| // |
| // ICU Regular Expressions test, part of intltest. |
| // |
| |
| /* |
| NOTE!! |
| |
| PLEASE be careful about ASCII assumptions in this test. |
| This test is one of the worst repeat offenders. |
| If you have questions, contact someone on the ICU PMC |
| who has access to an EBCDIC system. |
| |
| */ |
| |
| #include "intltest.h" |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <string.h> |
| |
| #include "unicode/localpointer.h" |
| #include "unicode/regex.h" |
| #include "unicode/stringpiece.h" |
| #include "unicode/uchar.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/uniset.h" |
| #include "unicode/uregex.h" |
| #include "unicode/usetiter.h" |
| #include "unicode/ustring.h" |
| #include "unicode/utext.h" |
| #include "unicode/utf16.h" |
| #include "cstr.h" |
| #include "regextst.h" |
| #include "regexcmp.h" |
| #include "uvector.h" |
| #include "util.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "uinvchar.h" |
| |
| #define SUPPORT_MUTATING_INPUT_STRING 0 |
| |
| //--------------------------------------------------------------------------- |
| // |
| // Test class boilerplate |
| // |
| //--------------------------------------------------------------------------- |
| RegexTest::RegexTest() |
| { |
| } |
| |
| |
| RegexTest::~RegexTest() |
| { |
| } |
| |
| |
| |
| void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) |
| { |
| if (exec) logln("TestSuite RegexTest: "); |
| TESTCASE_AUTO_BEGIN; |
| TESTCASE_AUTO(Basic); |
| TESTCASE_AUTO(API_Match); |
| TESTCASE_AUTO(API_Replace); |
| TESTCASE_AUTO(API_Pattern); |
| #if !UCONFIG_NO_FILE_IO |
| TESTCASE_AUTO(Extended); |
| #endif |
| TESTCASE_AUTO(Errors); |
| TESTCASE_AUTO(PerlTests); |
| TESTCASE_AUTO(Callbacks); |
| TESTCASE_AUTO(FindProgressCallbacks); |
| TESTCASE_AUTO(Bug6149); |
| TESTCASE_AUTO(UTextBasic); |
| TESTCASE_AUTO(API_Match_UTF8); |
| TESTCASE_AUTO(API_Replace_UTF8); |
| TESTCASE_AUTO(API_Pattern_UTF8); |
| TESTCASE_AUTO(PerlTestsUTF8); |
| TESTCASE_AUTO(PreAllocatedUTextCAPI); |
| TESTCASE_AUTO(Bug7651); |
| TESTCASE_AUTO(Bug7740); |
| TESTCASE_AUTO(Bug8479); |
| TESTCASE_AUTO(Bug7029); |
| TESTCASE_AUTO(CheckInvBufSize); |
| TESTCASE_AUTO(Bug9283); |
| TESTCASE_AUTO(Bug10459); |
| TESTCASE_AUTO(TestCaseInsensitiveStarters); |
| TESTCASE_AUTO(TestBug11049); |
| TESTCASE_AUTO(TestBug11371); |
| TESTCASE_AUTO(TestBug11480); |
| TESTCASE_AUTO(NamedCapture); |
| TESTCASE_AUTO(NamedCaptureLimits); |
| TESTCASE_AUTO(TestBug12884); |
| TESTCASE_AUTO(TestBug13631); |
| TESTCASE_AUTO(TestBug13632); |
| TESTCASE_AUTO(TestBug20359); |
| TESTCASE_AUTO(TestBug20863); |
| TESTCASE_AUTO_END; |
| } |
| |
| |
| /** |
| * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage |
| * into ASCII. |
| * @see utext_openUTF8 |
| */ |
| static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status); |
| |
| //--------------------------------------------------------------------------- |
| // |
| // Error Checking / Reporting macros used in all of the tests. |
| // |
| //--------------------------------------------------------------------------- |
| |
| static void utextToPrintable(char *buf, int32_t bufLen, UText *text) { |
| int64_t oldIndex = utext_getNativeIndex(text); |
| utext_setNativeIndex(text, 0); |
| char *bufPtr = buf; |
| UChar32 c = utext_next32From(text, 0); |
| while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) { |
| if (0x000020<=c && c<0x00007e) { |
| *bufPtr = c; |
| } else { |
| #if 0 |
| sprintf(bufPtr,"U+%04X", c); |
| bufPtr+= strlen(bufPtr)-1; |
| #else |
| *bufPtr = '%'; |
| #endif |
| } |
| bufPtr++; |
| c = UTEXT_NEXT32(text); |
| } |
| *bufPtr = 0; |
| #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY) |
| char *ebuf = (char*)malloc(bufLen); |
| uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen); |
| uprv_strncpy(buf, ebuf, bufLen); |
| free((void*)ebuf); |
| #endif |
| utext_setNativeIndex(text, oldIndex); |
| } |
| |
| |
| static char ASSERT_BUF[1024]; |
| |
| const char* RegexTest::extractToAssertBuf(const UnicodeString& message) { |
| if(message.length()==0) { |
| strcpy(ASSERT_BUF, "[[empty UnicodeString]]"); |
| } else { |
| UnicodeString buf; |
| IntlTest::prettify(message,buf); |
| if(buf.length()==0) { |
| strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]"); |
| } else { |
| buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1); |
| if(ASSERT_BUF[0]==0) { |
| ASSERT_BUF[0]=0; |
| for(int32_t i=0;i<buf.length();i++) { |
| UChar ch = buf[i]; |
| sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch); |
| } |
| } |
| } |
| } |
| ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0; |
| return ASSERT_BUF; |
| } |
| |
// Logs the printable form of a UText, tagged with the source location and the
// spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN {                             \
    char buf[200];                                                                    \
    utextToPrintable(buf,UPRV_LENGTHOF(buf),text);                                    \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);                  \
} UPRV_BLOCK_MACRO_END

// Reports a failure and RETURNS from the enclosing (void) function if the local
// variable `status` holds a failure code.
#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN {                                   \
    if (U_FAILURE(status)) {                                                          \
        dataerrln("%s:%d: RegexTest failure.  status=%s",                             \
                  __FILE__, __LINE__, u_errorName(status));                           \
        return;                                                                       \
    }                                                                                 \
} UPRV_BLOCK_MACRO_END

// Reports a failure if expr evaluates to FALSE. Does not return.
#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN {                                   \
    if ((expr)==FALSE) {                                                              \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    }                                                                                 \
} UPRV_BLOCK_MACRO_END

// Evaluates expr against a fresh local `status` (which hides any outer variable of
// that name for the duration of the macro) and reports a failure unless that
// status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN {                     \
    UErrorCode status=U_ZERO_ERROR;                                                   \
    (expr);                                                                           \
    if (status!=errcode) {                                                            \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s",        \
                  __LINE__, u_errorName(errcode), u_errorName(status));               \
    }                                                                                 \
} UPRV_BLOCK_MACRO_END

// Variant of the status check for helpers that are handed the caller's line
// number. Reports, but does not return.
#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN {                           \
    if (U_FAILURE(status)) {                                                          \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    }                                                                                 \
} UPRV_BLOCK_MACRO_END

// Variant of the assertion for helpers that are handed the caller's line number.
// Unlike REGEX_ASSERT, this RETURNS from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN {                           \
    if ((expr)==FALSE) {                                                              \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line));            \
        return;                                                                       \
    }                                                                                 \
} UPRV_BLOCK_MACRO_END

// Reports a failure unless `actual` equals `expected`.
//   expected:  const char * , restricted to invariant characters.
//   actual:    const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN {                \
    if (UnicodeString(expected, -1, US_INV) != (actual)) {                            \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",      \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual));              \
    }                                                                                 \
} UPRV_BLOCK_MACRO_END
| |
| |
| static UBool testUTextEqual(UText *uta, UText *utb) { |
| UChar32 ca = 0; |
| UChar32 cb = 0; |
| utext_setNativeIndex(uta, 0); |
| utext_setNativeIndex(utb, 0); |
| do { |
| ca = utext_next32(uta); |
| cb = utext_next32(utb); |
| if (ca != cb) { |
| break; |
| } |
| } while (ca != U_SENTINEL); |
| return ca == cb; |
| } |
| |
| |
| /** |
| * @param expected expected text in UTF-8 (not platform) codepage |
| */ |
| void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { |
| UErrorCode status = U_ZERO_ERROR; |
| UText expectedText = UTEXT_INITIALIZER; |
| utext_openUTF8(&expectedText, expected, -1, &status); |
| if(U_FAILURE(status)) { |
| errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); |
| return; |
| } |
| if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) { |
| errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected)); |
| return; |
| } |
| utext_setNativeIndex(actual, 0); |
| if (!testUTextEqual(&expectedText, actual)) { |
| char buf[201 /*21*/]; |
| char expectedBuf[201]; |
| utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); |
| utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); |
| errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); |
| } |
| utext_close(&expectedText); |
| } |
| /** |
| * @param expected invariant (platform local text) input |
| */ |
| |
| void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) { |
| UErrorCode status = U_ZERO_ERROR; |
| UText expectedText = UTEXT_INITIALIZER; |
| regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status); |
| if(U_FAILURE(status)) { |
| errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); |
| return; |
| } |
| utext_setNativeIndex(actual, 0); |
| if (!testUTextEqual(&expectedText, actual)) { |
| char buf[201 /*21*/]; |
| char expectedBuf[201]; |
| utextToPrintable(buf, UPRV_LENGTHOF(buf), actual); |
| utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText); |
| errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); |
| } |
| utext_close(&expectedText); |
| } |
| |
/**
 * Asserts that a UText equals the expected string.
 * The expected string is taken to be UTF-8.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Asserts that a UText equals the expected string.
 * The expected string is taken to consist of invariant characters in the
 * platform charset.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
| |
| /** |
| * This buffer ( inv_buf ) is used to hold the UTF-8 strings |
| * passed into utext_openUTF8. An error will be given if |
| * INV_BUFSIZ is too small. It's only used on EBCDIC systems. |
| */ |
| |
| #define INV_BUFSIZ 2048 /* increase this if too small */ |
| |
| static int64_t inv_next=0; |
| |
| #if U_CHARSET_FAMILY!=U_ASCII_FAMILY |
| static char inv_buf[INV_BUFSIZ]; |
| #endif |
| |
| static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) { |
| if(length==-1) length=strlen(inv); |
| #if U_CHARSET_FAMILY==U_ASCII_FAMILY |
| inv_next+=length; |
| return utext_openUTF8(ut, inv, length, status); |
| #else |
| if(inv_next+length+1>INV_BUFSIZ) { |
| fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n", |
| __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1)); |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| |
| unsigned char *buf = (unsigned char*)inv_buf+inv_next; |
| uprv_aestrncpy(buf, (const uint8_t*)inv, length); |
| inv_next+=length; |
| |
| #if 0 |
| fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next); |
| #endif |
| |
| return utext_openUTF8(ut, (const char*)buf, length, status); |
| #endif |
| } |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // REGEX_TESTLM Macro + invocation function to simplify writing quick tests |
| // for the LookingAt() and Match() functions. |
| // |
| // usage: |
| // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); |
| // |
| // The expected results are UBool - TRUE or FALSE. |
| // The input text is unescaped. The pattern is not. |
| // |
| // |
| //--------------------------------------------------------------------------- |
| |
| #define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \ |
| doRegexLMTest(pat, text, looking, match, __LINE__); \ |
| doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \ |
| } UPRV_BLOCK_MACRO_END |
| |
| UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { |
| const UnicodeString pattern(pat, -1, US_INV); |
| const UnicodeString inputText(text, -1, US_INV); |
| UErrorCode status = U_ZERO_ERROR; |
| UParseError pe; |
| RegexPattern *REPattern = NULL; |
| RegexMatcher *REMatcher = NULL; |
| UBool retVal = TRUE; |
| |
| UnicodeString patString(pat, -1, US_INV); |
| REPattern = RegexPattern::compile(patString, 0, pe, status); |
| if (U_FAILURE(status)) { |
| dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", |
| line, u_errorName(status)); |
| return FALSE; |
| } |
| if (line==376) { REPattern->dumpPattern();} |
| |
| UnicodeString inputString(inputText); |
| UnicodeString unEscapedInput = inputString.unescape(); |
| REMatcher = REPattern->matcher(unEscapedInput, status); |
| if (U_FAILURE(status)) { |
| errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", |
| line, u_errorName(status)); |
| return FALSE; |
| } |
| |
| UBool actualmatch; |
| actualmatch = REMatcher->lookingAt(status); |
| if (U_FAILURE(status)) { |
| errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", |
| line, u_errorName(status)); |
| retVal = FALSE; |
| } |
| if (actualmatch != looking) { |
| errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); |
| retVal = FALSE; |
| } |
| |
| status = U_ZERO_ERROR; |
| actualmatch = REMatcher->matches(status); |
| if (U_FAILURE(status)) { |
| errln("RegexTest failure in matches() at line %d. Status = %s\n", |
| line, u_errorName(status)); |
| retVal = FALSE; |
| } |
| if (actualmatch != match) { |
| errln("RegexTest: wrong return from matches() at line %d.\n", line); |
| retVal = FALSE; |
| } |
| |
| if (retVal == FALSE) { |
| REPattern->dumpPattern(); |
| } |
| |
| delete REPattern; |
| delete REMatcher; |
| return retVal; |
| } |
| |
| |
| UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { |
| UText pattern = UTEXT_INITIALIZER; |
| int32_t inputUTF8Length; |
| char *textChars = NULL; |
| UText inputText = UTEXT_INITIALIZER; |
| UErrorCode status = U_ZERO_ERROR; |
| UParseError pe; |
| RegexPattern *REPattern = NULL; |
| RegexMatcher *REMatcher = NULL; |
| UBool retVal = TRUE; |
| |
| regextst_openUTF8FromInvariant(&pattern, pat, -1, &status); |
| REPattern = RegexPattern::compile(&pattern, 0, pe, status); |
| if (U_FAILURE(status)) { |
| dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", |
| line, u_errorName(status)); |
| return FALSE; |
| } |
| |
| UnicodeString inputString(text, -1, US_INV); |
| UnicodeString unEscapedInput = inputString.unescape(); |
| LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); |
| ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); |
| |
| inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); |
| if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { |
| // UTF-8 does not allow unpaired surrogates, so this could actually happen |
| logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status)); |
| return TRUE; // not a failure of the Regex engine |
| } |
| status = U_ZERO_ERROR; // buffer overflow |
| textChars = new char[inputUTF8Length+1]; |
| unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); |
| utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); |
| |
| REMatcher = &REPattern->matcher(status)->reset(&inputText); |
| if (U_FAILURE(status)) { |
| errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", |
| line, u_errorName(status)); |
| return FALSE; |
| } |
| |
| UBool actualmatch; |
| actualmatch = REMatcher->lookingAt(status); |
| if (U_FAILURE(status)) { |
| errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", |
| line, u_errorName(status)); |
| retVal = FALSE; |
| } |
| if (actualmatch != looking) { |
| errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); |
| retVal = FALSE; |
| } |
| |
| status = U_ZERO_ERROR; |
| actualmatch = REMatcher->matches(status); |
| if (U_FAILURE(status)) { |
| errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n", |
| line, u_errorName(status)); |
| retVal = FALSE; |
| } |
| if (actualmatch != match) { |
| errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); |
| retVal = FALSE; |
| } |
| |
| if (retVal == FALSE) { |
| REPattern->dumpPattern(); |
| } |
| |
| delete REPattern; |
| delete REMatcher; |
| utext_close(&inputText); |
| utext_close(&pattern); |
| delete[] textChars; |
| return retVal; |
| } |
| |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // REGEX_ERR Macro + invocation function to simplify writing tests |
| // regex tests for incorrect patterns |
| // |
| // usage: |
| // REGEX_ERR("pattern", expected error line, column, expected status); |
| // |
| //--------------------------------------------------------------------------- |
| #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__) |
| |
| void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, |
| UErrorCode expectedStatus, int32_t line) { |
| UnicodeString pattern(pat); |
| |
| UErrorCode status = U_ZERO_ERROR; |
| UParseError pe; |
| RegexPattern *callerPattern = NULL; |
| |
| // |
| // Compile the caller's pattern |
| // |
| UnicodeString patString(pat); |
| callerPattern = RegexPattern::compile(patString, 0, pe, status); |
| if (status != expectedStatus) { |
| dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); |
| } else { |
| if (status != U_ZERO_ERROR) { |
| if (pe.line != errLine || pe.offset != errCol) { |
| errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", |
| line, errLine, errCol, pe.line, pe.offset); |
| } |
| } |
| } |
| |
| delete callerPattern; |
| |
| // |
| // Compile again, using a UTF-8-based UText |
| // |
| UText patternText = UTEXT_INITIALIZER; |
| regextst_openUTF8FromInvariant(&patternText, pat, -1, &status); |
| callerPattern = RegexPattern::compile(&patternText, 0, pe, status); |
| if (status != expectedStatus) { |
| dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); |
| } else { |
| if (status != U_ZERO_ERROR) { |
| if (pe.line != errLine || pe.offset != errCol) { |
| errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", |
| line, errLine, errCol, pe.line, pe.offset); |
| } |
| } |
| } |
| |
| delete callerPattern; |
| utext_close(&patternText); |
| } |
| |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // Basic Check for basic functionality of regex pattern matching. |
| // Avoid the use of REGEX_FIND test macro, which has |
| // substantial dependencies on basic Regex functionality. |
| // |
| //--------------------------------------------------------------------------- |
| void RegexTest::Basic() { |
| |
| |
| // |
| // Debug - slide failing test cases early |
| // |
| #if 0 |
| { |
| // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE); |
| UParseError pe; |
| UErrorCode status = U_ZERO_ERROR; |
| RegexPattern *pattern; |
| pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status); |
| pattern->dumpPattern(); |
| RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status); |
| UBool result = m->find(); |
| printf("result = %d\n", result); |
| // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd"); |
| // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX===================="); |
| } |
| exit(1); |
| #endif |
| |
| |
| // |
| // Pattern with parentheses |
| // |
| REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE); |
| REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE); |
| REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE); |
| |
| // |
| // Patterns with * |
| // |
| REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE); |
| REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE); |
| REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE); |
| REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE); |
| REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE); |
| |
| REGEX_TESTLM("a*", "", TRUE, TRUE); |
| REGEX_TESTLM("a*", "b", TRUE, FALSE); |
| |
| |
| // |
| // Patterns with "." |
| // |
| REGEX_TESTLM(".", "abc", TRUE, FALSE); |
| REGEX_TESTLM("...", "abc", TRUE, TRUE); |
| REGEX_TESTLM("....", "abc", FALSE, FALSE); |
| REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE); |
| REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE); |
| REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE); |
| REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE); |
| REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE); |
| |
| // |
| // Patterns with * applied to chars at end of literal string |
| // |
| REGEX_TESTLM("abc*", "ab", TRUE, TRUE); |
| REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE); |
| |
| // |
| // Supplemental chars match as single chars, not a pair of surrogates. |
| // |
| REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE); |
| REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE); |
| REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE); |
| |
| |
| // |
| // UnicodeSets in the pattern |
| // |
| REGEX_TESTLM("[1-6]", "1", TRUE, TRUE); |
| REGEX_TESTLM("[1-6]", "3", TRUE, TRUE); |
| REGEX_TESTLM("[1-6]", "7", FALSE, FALSE); |
| REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); |
| REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE); |
| REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE); |
| |
| REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE); |
| REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE); |
| REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE); |
| REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences. |
| REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE); |
| |
| // |
| // OR operator in patterns |
| // |
| REGEX_TESTLM("(a|b)", "a", TRUE, TRUE); |
| REGEX_TESTLM("(a|b)", "b", TRUE, TRUE); |
| REGEX_TESTLM("(a|b)", "c", FALSE, FALSE); |
| REGEX_TESTLM("a|b", "b", TRUE, TRUE); |
| |
| REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE); |
| REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE); |
| REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE); |
| REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE); |
| REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE); |
| REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE); |
| |
| // |
| // + |
| // |
| REGEX_TESTLM("ab+", "abbc", TRUE, FALSE); |
| REGEX_TESTLM("ab+c", "ac", FALSE, FALSE); |
| REGEX_TESTLM("b+", "", FALSE, FALSE); |
| REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE); |
| REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE); |
| REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE); |
| |
| // |
| // ? |
| // |
| REGEX_TESTLM("ab?", "ab", TRUE, TRUE); |
| REGEX_TESTLM("ab?", "a", TRUE, TRUE); |
| REGEX_TESTLM("ab?", "ac", TRUE, FALSE); |
| REGEX_TESTLM("ab?", "abb", TRUE, FALSE); |
| REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE); |
| REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE); |
| REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE); |
| REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE); |
| REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE); |
| |
| // |
| // Escape sequences that become single literal chars, handled internally |
| // by ICU's Unescape. |
| // |
| |
| // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet. |
| REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL |
| REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L |
| REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape |
| REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed |
| REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line |
| REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR |
| REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab |
| REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE); |
| REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE); |
| |
| REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input |
| REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input |
| |
| // Escape of special chars in patterns |
| REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE); |
| } |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // UTextBasic Check for quirks that are specific to the UText |
| // implementation. |
| // |
| //--------------------------------------------------------------------------- |
| void RegexTest::UTextBasic() { |
| const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ |
| UErrorCode status = U_ZERO_ERROR; |
| UText pattern = UTEXT_INITIALIZER; |
| utext_openUTF8(&pattern, str_abc, -1, &status); |
| RegexMatcher matcher(&pattern, 0, status); |
| REGEX_CHECK_STATUS; |
| |
| UText input = UTEXT_INITIALIZER; |
| utext_openUTF8(&input, str_abc, -1, &status); |
| REGEX_CHECK_STATUS; |
| matcher.reset(&input); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); |
| |
| matcher.reset(matcher.inputText()); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText()); |
| |
| utext_close(&pattern); |
| utext_close(&input); |
| } |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // API_Match Test that the API for class RegexMatcher |
| // is present and nominally working, but excluding functions |
| // implementing replace operations. |
| // |
| //--------------------------------------------------------------------------- |
| void RegexTest::API_Match() { |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| int32_t flags = 0; |
| |
| // |
| // Debug - slide failing test cases early |
| // |
| #if 0 |
| { |
| } |
| return; |
| #endif |
| |
| // |
| // Simple pattern compilation |
| // |
| { |
| UnicodeString re("abc"); |
| RegexPattern *pat2; |
| pat2 = RegexPattern::compile(re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| |
| UnicodeString inStr1 = "abcdef this is a test"; |
| UnicodeString instr2 = "not abc"; |
| UnicodeString empty = ""; |
| |
| |
| // |
| // Matcher creation and reset. |
| // |
| RegexMatcher *m1 = pat2->matcher(inStr1, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m1->lookingAt(status) == TRUE); |
| REGEX_ASSERT(m1->input() == inStr1); |
| m1->reset(instr2); |
| REGEX_ASSERT(m1->lookingAt(status) == FALSE); |
| REGEX_ASSERT(m1->input() == instr2); |
| m1->reset(inStr1); |
| REGEX_ASSERT(m1->input() == inStr1); |
| REGEX_ASSERT(m1->lookingAt(status) == TRUE); |
| m1->reset(empty); |
| REGEX_ASSERT(m1->lookingAt(status) == FALSE); |
| REGEX_ASSERT(m1->input() == empty); |
| REGEX_ASSERT(&m1->pattern() == pat2); |
| |
| // |
| // reset(pos, status) |
| // |
| m1->reset(inStr1); |
| m1->reset(4, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m1->input() == inStr1); |
| REGEX_ASSERT(m1->lookingAt(status) == TRUE); |
| |
| m1->reset(-1, status); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| |
| m1->reset(0, status); |
| REGEX_CHECK_STATUS; |
| status = U_ZERO_ERROR; |
| |
| int32_t len = m1->input().length(); |
| m1->reset(len-1, status); |
| REGEX_CHECK_STATUS; |
| status = U_ZERO_ERROR; |
| |
| m1->reset(len, status); |
| REGEX_CHECK_STATUS; |
| status = U_ZERO_ERROR; |
| |
| m1->reset(len+1, status); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| |
| // |
| // match(pos, status) |
| // |
| m1->reset(instr2); |
| REGEX_ASSERT(m1->matches(4, status) == TRUE); |
| m1->reset(); |
| REGEX_ASSERT(m1->matches(3, status) == FALSE); |
| m1->reset(); |
| REGEX_ASSERT(m1->matches(5, status) == FALSE); |
| REGEX_ASSERT(m1->matches(4, status) == TRUE); |
| REGEX_ASSERT(m1->matches(-1, status) == FALSE); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| // Match() at end of string should fail, but should not |
| // be an error. |
| status = U_ZERO_ERROR; |
| len = m1->input().length(); |
| REGEX_ASSERT(m1->matches(len, status) == FALSE); |
| REGEX_CHECK_STATUS; |
| |
| // Match beyond end of string should fail with an error. |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT(m1->matches(len+1, status) == FALSE); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| // Successful match at end of string. |
| { |
| status = U_ZERO_ERROR; |
| RegexMatcher m("A?", 0, status); // will match zero length string. |
| REGEX_CHECK_STATUS; |
| m.reset(inStr1); |
| len = inStr1.length(); |
| REGEX_ASSERT(m.matches(len, status) == TRUE); |
| REGEX_CHECK_STATUS; |
| m.reset(empty); |
| REGEX_ASSERT(m.matches(0, status) == TRUE); |
| REGEX_CHECK_STATUS; |
| } |
| |
| |
| // |
| // lookingAt(pos, status) |
| // |
| status = U_ZERO_ERROR; |
| m1->reset(instr2); // "not abc" |
| REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); |
| REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); |
| REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); |
| REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); |
| REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| len = m1->input().length(); |
| REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| delete m1; |
| delete pat2; |
| } |
| |
| |
| // |
| // Capture Group. |
| // RegexMatcher::start(); |
| // RegexMatcher::end(); |
| // RegexMatcher::groupCount(); |
| // |
| { |
| int32_t flags=0; |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| |
| UnicodeString re("01(23(45)67)(.*)"); |
| RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| UnicodeString data = "0123456789"; |
| |
| RegexMatcher *matcher = pat->matcher(data, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher->lookingAt(status) == TRUE); |
| static const int32_t matchStarts[] = {0, 2, 4, 8}; |
| static const int32_t matchEnds[] = {10, 8, 6, 10}; |
| int32_t i; |
| for (i=0; i<4; i++) { |
| int32_t actualStart = matcher->start(i, status); |
| REGEX_CHECK_STATUS; |
| if (actualStart != matchStarts[i]) { |
| errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", |
| __LINE__, i, matchStarts[i], actualStart); |
| } |
| int32_t actualEnd = matcher->end(i, status); |
| REGEX_CHECK_STATUS; |
| if (actualEnd != matchEnds[i]) { |
| errln("RegexTest failure at line %d index %d. Expected %d, got %d\n", |
| __LINE__, i, matchEnds[i], actualEnd); |
| } |
| } |
| |
| REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); |
| REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); |
| |
| REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| matcher->reset(); |
| REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); |
| |
| matcher->lookingAt(status); |
| REGEX_ASSERT(matcher->group(status) == "0123456789"); |
| REGEX_ASSERT(matcher->group(0, status) == "0123456789"); |
| REGEX_ASSERT(matcher->group(1, status) == "234567" ); |
| REGEX_ASSERT(matcher->group(2, status) == "45" ); |
| REGEX_ASSERT(matcher->group(3, status) == "89" ); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| matcher->reset(); |
| REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); |
| |
| delete matcher; |
| delete pat; |
| |
| } |
| |
| // |
| // find |
| // |
| { |
| int32_t flags=0; |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| |
| UnicodeString re("abc"); |
| RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| UnicodeString data = ".abc..abc...abc.."; |
| // 012345678901234567 |
| |
| RegexMatcher *matcher = pat->matcher(data, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 1); |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 6); |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 12); |
| REGEX_ASSERT(matcher->find() == FALSE); |
| REGEX_ASSERT(matcher->find() == FALSE); |
| |
| matcher->reset(); |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 1); |
| |
| REGEX_ASSERT(matcher->find(0, status)); |
| REGEX_ASSERT(matcher->start(status) == 1); |
| REGEX_ASSERT(matcher->find(1, status)); |
| REGEX_ASSERT(matcher->start(status) == 1); |
| REGEX_ASSERT(matcher->find(2, status)); |
| REGEX_ASSERT(matcher->start(status) == 6); |
| REGEX_ASSERT(matcher->find(12, status)); |
| REGEX_ASSERT(matcher->start(status) == 12); |
| REGEX_ASSERT(matcher->find(13, status) == FALSE); |
| REGEX_ASSERT(matcher->find(16, status) == FALSE); |
| REGEX_ASSERT(matcher->find(17, status) == FALSE); |
| REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); |
| |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| REGEX_ASSERT(matcher->groupCount() == 0); |
| |
| delete matcher; |
| delete pat; |
| } |
| |
| |
| // |
| // find, with \G in pattern (true if at the end of a previous match). |
| // |
| { |
| int32_t flags=0; |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| |
| UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); |
| RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| UnicodeString data = ".abcabc.abc.."; |
| // 012345678901234567 |
| |
| RegexMatcher *matcher = pat->matcher(data, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 0); |
| REGEX_ASSERT(matcher->start(1, status) == -1); |
| REGEX_ASSERT(matcher->start(2, status) == 1); |
| |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 4); |
| REGEX_ASSERT(matcher->start(1, status) == 4); |
| REGEX_ASSERT(matcher->start(2, status) == -1); |
| REGEX_CHECK_STATUS; |
| |
| delete matcher; |
| delete pat; |
| } |
| |
| // |
| // find with zero length matches, match position should bump ahead |
| // to prevent loops. |
| // |
| { |
| int32_t i; |
| UErrorCode status=U_ZERO_ERROR; |
| RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, |
| // using an always-true look-ahead. |
| REGEX_CHECK_STATUS; |
| UnicodeString s(" "); |
| m.reset(s); |
| for (i=0; ; i++) { |
| if (m.find() == FALSE) { |
| break; |
| } |
| REGEX_ASSERT(m.start(status) == i); |
| REGEX_ASSERT(m.end(status) == i); |
| } |
| REGEX_ASSERT(i==5); |
| |
| // Check that the bump goes over surrogate pairs OK |
| s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); |
| s = s.unescape(); |
| m.reset(s); |
| for (i=0; ; i+=2) { |
| if (m.find() == FALSE) { |
| break; |
| } |
| REGEX_ASSERT(m.start(status) == i); |
| REGEX_ASSERT(m.end(status) == i); |
| } |
| REGEX_ASSERT(i==10); |
| } |
| { |
| // find() loop breaking test. |
| // with pattern of /.?/, should see a series of one char matches, then a single |
| // match of zero length at the end of the input string. |
| int32_t i; |
| UErrorCode status=U_ZERO_ERROR; |
| RegexMatcher m(".?", 0, status); |
| REGEX_CHECK_STATUS; |
| UnicodeString s(" "); |
| m.reset(s); |
| for (i=0; ; i++) { |
| if (m.find() == FALSE) { |
| break; |
| } |
| REGEX_ASSERT(m.start(status) == i); |
| REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); |
| } |
| REGEX_ASSERT(i==5); |
| } |
| |
| |
| // |
| // Matchers with no input string behave as if they had an empty input string. |
| // |
| |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| RegexMatcher m(".?", 0, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m.find()); |
| REGEX_ASSERT(m.start(status) == 0); |
| REGEX_ASSERT(m.input() == ""); |
| } |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| RegexPattern *p = RegexPattern::compile(".", 0, status); |
| RegexMatcher *m = p->matcher(status); |
| REGEX_CHECK_STATUS; |
| |
| REGEX_ASSERT(m->find() == FALSE); |
| REGEX_ASSERT(m->input() == ""); |
| delete m; |
| delete p; |
| } |
| |
| // |
| // Regions |
| // |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString testString("This is test data"); |
| RegexMatcher m(".*", testString, 0, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m.regionStart() == 0); |
| REGEX_ASSERT(m.regionEnd() == testString.length()); |
| REGEX_ASSERT(m.hasTransparentBounds() == FALSE); |
| REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); |
| |
| m.region(2,4, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m.matches(status)); |
| REGEX_ASSERT(m.start(status)==2); |
| REGEX_ASSERT(m.end(status)==4); |
| REGEX_CHECK_STATUS; |
| |
| m.reset(); |
| REGEX_ASSERT(m.regionStart() == 0); |
| REGEX_ASSERT(m.regionEnd() == testString.length()); |
| |
| UnicodeString shorterString("short"); |
| m.reset(shorterString); |
| REGEX_ASSERT(m.regionStart() == 0); |
| REGEX_ASSERT(m.regionEnd() == shorterString.length()); |
| |
| REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); |
| REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); |
| REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); |
| REGEX_ASSERT(&m == &m.reset()); |
| REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); |
| |
| REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); |
| REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); |
| REGEX_ASSERT(&m == &m.reset()); |
| REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); |
| |
| REGEX_ASSERT(m.hasTransparentBounds() == FALSE); |
| REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); |
| REGEX_ASSERT(m.hasTransparentBounds() == TRUE); |
| REGEX_ASSERT(&m == &m.reset()); |
| REGEX_ASSERT(m.hasTransparentBounds() == TRUE); |
| |
| REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); |
| REGEX_ASSERT(m.hasTransparentBounds() == FALSE); |
| REGEX_ASSERT(&m == &m.reset()); |
| REGEX_ASSERT(m.hasTransparentBounds() == FALSE); |
| |
| } |
| |
| // |
| // hitEnd() and requireEnd() |
| // |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString testString("aabb"); |
| RegexMatcher m1(".*", testString, 0, status); |
| REGEX_ASSERT(m1.lookingAt(status) == TRUE); |
| REGEX_ASSERT(m1.hitEnd() == TRUE); |
| REGEX_ASSERT(m1.requireEnd() == FALSE); |
| REGEX_CHECK_STATUS; |
| |
| status = U_ZERO_ERROR; |
| RegexMatcher m2("a*", testString, 0, status); |
| REGEX_ASSERT(m2.lookingAt(status) == TRUE); |
| REGEX_ASSERT(m2.hitEnd() == FALSE); |
| REGEX_ASSERT(m2.requireEnd() == FALSE); |
| REGEX_CHECK_STATUS; |
| |
| status = U_ZERO_ERROR; |
| RegexMatcher m3(".*$", testString, 0, status); |
| REGEX_ASSERT(m3.lookingAt(status) == TRUE); |
| REGEX_ASSERT(m3.hitEnd() == TRUE); |
| REGEX_ASSERT(m3.requireEnd() == TRUE); |
| REGEX_CHECK_STATUS; |
| } |
| |
| |
| // |
| // Compilation error on reset with UChar * |
| // These were a hazard that people were stumbling over with runtime errors. |
| // Changed them to compiler errors by adding private methods that more closely |
| // matched the incorrect use of the functions. |
| // |
| #if 0 |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UChar ucharString[20]; |
| RegexMatcher m(".", 0, status); |
| m.reset(ucharString); // should not compile. |
| |
| RegexPattern *p = RegexPattern::compile(".", 0, status); |
| RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. |
| |
| RegexMatcher m3(".", ucharString, 0, status); // Should not compile |
| } |
| #endif |
| |
| // |
| // Time Outs. |
| // Note: These tests will need to be changed when the regexp engine is |
| // able to detect and cut short the exponential time behavior on |
| // this type of match. |
| // |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| // Enough 'a's in the string to cause the match to time out. |
| // (Each additional 'a' doubles the time) |
| UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); |
| RegexMatcher matcher("(a+)+b", testString, 0, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher.getTimeLimit() == 0); |
| matcher.setTimeLimit(100, status); |
| REGEX_ASSERT(matcher.getTimeLimit() == 100); |
| REGEX_ASSERT(matcher.lookingAt(status) == FALSE); |
| REGEX_ASSERT(status == U_REGEX_TIME_OUT); |
| } |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| // Few enough 'a's to slip in under the time limit. |
| UnicodeString testString("aaaaaaaaaaaaaaaaaa"); |
| RegexMatcher matcher("(a+)+b", testString, 0, status); |
| REGEX_CHECK_STATUS; |
| matcher.setTimeLimit(100, status); |
| REGEX_ASSERT(matcher.lookingAt(status) == FALSE); |
| REGEX_CHECK_STATUS; |
| } |
| |
| // |
| // Stack Limits |
| // |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' |
| |
| // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations |
| // of the '+', and makes the stack frames larger. |
| RegexMatcher matcher("(A)+A$", testString, 0, status); |
| |
| // With the default stack, this match should fail to run |
| REGEX_ASSERT(matcher.lookingAt(status) == FALSE); |
| REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); |
| |
| // With unlimited stack, it should run |
| status = U_ZERO_ERROR; |
| matcher.setStackLimit(0, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher.lookingAt(status) == TRUE); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher.getStackLimit() == 0); |
| |
| // With a limited stack, the match should fail |
| status = U_ZERO_ERROR; |
| matcher.setStackLimit(10000, status); |
| REGEX_ASSERT(matcher.lookingAt(status) == FALSE); |
| REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); |
| REGEX_ASSERT(matcher.getStackLimit() == 10000); |
| } |
| |
| // A pattern that doesn't save state should work with |
| // a minimal sized stack |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString testString = "abc"; |
| RegexMatcher matcher("abc", testString, 0, status); |
| REGEX_CHECK_STATUS; |
| matcher.setStackLimit(30, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher.matches(status) == TRUE); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher.getStackLimit() == 30); |
| |
| // Negative stack sizes should fail |
| status = U_ZERO_ERROR; |
| matcher.setStackLimit(1000, status); |
| REGEX_CHECK_STATUS; |
| matcher.setStackLimit(-1, status); |
| REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); |
| REGEX_ASSERT(matcher.getStackLimit() == 1000); |
| } |
| |
| |
| } |
| |
| |
| |
| |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // API_Replace API test for class RegexMatcher, testing the |
| // Replace family of functions. |
| // |
| //--------------------------------------------------------------------------- |
| void RegexTest::API_Replace() { |
|     // Exercises RegexMatcher::replaceFirst() and replaceAll() on plain, |
|     // non-matching, empty and whole-string inputs, capture-group and escape |
|     // handling in replacement strings, and appendReplacement()/appendTail(). |
|     // |
|     // Every heap-allocated pattern and matcher is owned by a LocalPointer so |
|     // that it is released on every exit path out of this function, not only |
|     // when execution reaches the final statement. |
| |
|     // |
|     // Replace |
|     // |
|     int32_t flags=0; |
|     UParseError pe; |
|     UErrorCode status=U_ZERO_ERROR; |
| |
|     UnicodeString re("abc"); |
|     LocalPointer<RegexPattern> pat(RegexPattern::compile(re, flags, pe, status)); |
|     REGEX_CHECK_STATUS; |
|     UnicodeString data = ".abc..abc...abc.."; |
|     // 012345678901234567 |
|     LocalPointer<RegexMatcher> matcher(pat->matcher(data, status)); |
|     // The matcher is dereferenced right below; verify it was actually created. |
|     REGEX_CHECK_STATUS; |
| |
|     // |
|     // Plain vanilla matches. |
|     // |
|     UnicodeString dest; |
|     dest = matcher->replaceFirst("yz", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == ".yz..abc...abc.."); |
| |
|     dest = matcher->replaceAll("yz", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == ".yz..yz...yz.."); |
| |
|     // |
|     // Plain vanilla non-matches. |
|     // |
|     UnicodeString d2 = ".abx..abx...abx.."; |
|     matcher->reset(d2); |
|     dest = matcher->replaceFirst("yz", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == ".abx..abx...abx.."); |
| |
|     dest = matcher->replaceAll("yz", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == ".abx..abx...abx.."); |
| |
|     // |
|     // Empty source string |
|     // |
|     UnicodeString d3 = ""; |
|     matcher->reset(d3); |
|     dest = matcher->replaceFirst("yz", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == ""); |
| |
|     dest = matcher->replaceAll("yz", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == ""); |
| |
|     // |
|     // Empty substitution string |
|     // |
|     matcher->reset(data); // ".abc..abc...abc.." |
|     dest = matcher->replaceFirst("", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == "...abc...abc.."); |
| |
|     dest = matcher->replaceAll("", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == "........"); |
| |
|     // |
|     // match whole string |
|     // |
|     UnicodeString d4 = "abc"; |
|     matcher->reset(d4); |
|     dest = matcher->replaceFirst("xyz", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == "xyz"); |
| |
|     dest = matcher->replaceAll("xyz", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == "xyz"); |
| |
|     // |
|     // Capture Group, simple case |
|     // |
|     UnicodeString re2("a(..)"); |
|     LocalPointer<RegexPattern> pat2(RegexPattern::compile(re2, flags, pe, status)); |
|     REGEX_CHECK_STATUS; |
|     UnicodeString d5 = "abcdefg"; |
|     LocalPointer<RegexMatcher> matcher2(pat2->matcher(d5, status)); |
|     REGEX_CHECK_STATUS; |
|     dest = matcher2->replaceFirst("$1$1", status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == "bcbcdefg"); |
| |
|     // A backslash-escaped '$' is copied literally; an unescaped "$1" is substituted. |
|     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == "The value of $1 is bc.defg"); |
| |
|     // A '$' that is not followed by a group number must be reported as an error. |
|     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); |
|     REGEX_ASSERT(U_FAILURE(status)); |
|     status = U_ZERO_ERROR; |
| |
|     // U+1D7CF follows the '$' here; the expected result shows it being |
|     // treated as the group number 1. |
|     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); |
|     replacement = replacement.unescape(); |
|     dest = matcher2->replaceFirst(replacement, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); |
| |
|     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| |
|     // |
|     // Replacement String with \u hex escapes |
|     // |
|     { |
|         UnicodeString src = "abc 1 abc 2 abc 3"; |
|         UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); |
|         matcher->reset(src); |
|         UnicodeString result = matcher->replaceAll(substitute, status); |
|         REGEX_CHECK_STATUS; |
|         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); |
|     } |
|     { |
|         UnicodeString src = "abc !"; |
|         UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); |
|         matcher->reset(src); |
|         UnicodeString result = matcher->replaceAll(substitute, status); |
|         REGEX_CHECK_STATUS; |
|         UnicodeString expected = UnicodeString("--"); |
|         expected.append((UChar32)0x10000); |
|         expected.append("-- !"); |
|         REGEX_ASSERT(result == expected); |
|     } |
|     // TODO: need more thorough testing of capture substitutions. |
| |
|     // Bug 4057 |
|     // |
|     { |
|         status = U_ZERO_ERROR; |
|         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; |
|         RegexMatcher m("ss(.*?)ee", 0, status); |
|         REGEX_CHECK_STATUS; |
|         UnicodeString result; |
| |
|         // Multiple finds do NOT bump up the previous appendReplacement position. |
|         m.reset(s); |
|         m.find(); |
|         m.find(); |
|         m.appendReplacement(result, "ooh", status); |
|         REGEX_CHECK_STATUS; |
|         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); |
| |
|         // After a reset into the interior of a string, appendReplacement still starts at beginning. |
|         status = U_ZERO_ERROR; |
|         result.truncate(0); |
|         m.reset(10, status); |
|         m.find(); |
|         m.find(); |
|         m.appendReplacement(result, "ooh", status); |
|         REGEX_CHECK_STATUS; |
|         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); |
| |
|         // find() at interior of string, appendReplacement still starts at beginning. |
|         status = U_ZERO_ERROR; |
|         result.truncate(0); |
|         m.reset(); |
|         m.find(10, status); |
|         m.find(); |
|         m.appendReplacement(result, "ooh", status); |
|         REGEX_CHECK_STATUS; |
|         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); |
| |
|         m.appendTail(result); |
|         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); |
| |
|     } |
| |
|     // No explicit deletes: the LocalPointer owners release matcher2, pat2, |
|     // matcher and pat (reverse order of declaration) when they go out of scope. |
| } |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // API_Pattern Test that the API for class RegexPattern is |
| // present and nominally working. |
| // |
| //--------------------------------------------------------------------------- |
| void RegexTest::API_Pattern() { |
|     // Exercises the RegexPattern API: default construction, equality, |
|     // assignment, copy construction, compile() with and without flags, |
|     // clone(), the static matches() convenience function, split(), |
|     // pattern() and the class ID functions. |
|     // |
|     // Heap-allocated patterns and matchers are owned by LocalPointer so that |
|     // they are released on every exit path out of this function. |
|     RegexPattern pata; // Test default constructor to not crash. |
|     RegexPattern patb; |
| |
|     REGEX_ASSERT(pata == patb); |
|     REGEX_ASSERT(pata == pata); |
| |
|     UnicodeString re1("abc[a-l][m-z]"); |
|     UnicodeString re2("def"); |
|     UErrorCode status = U_ZERO_ERROR; |
|     UParseError pe; |
| |
|     LocalPointer<RegexPattern> pat1(RegexPattern::compile(re1, 0, pe, status)); |
|     LocalPointer<RegexPattern> pat2(RegexPattern::compile(re2, 0, pe, status)); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(*pat1 == *pat1); |
|     REGEX_ASSERT(*pat1 != pata); |
| |
|     // Assign |
|     patb = *pat1; |
|     REGEX_ASSERT(patb == *pat1); |
| |
|     // Copy Construct |
|     RegexPattern patc(*pat1); |
|     REGEX_ASSERT(patc == *pat1); |
|     REGEX_ASSERT(patb == patc); |
|     // Compare the pattern objects themselves; comparing the two pointers |
|     // would be trivially true and verify nothing. |
|     REGEX_ASSERT(*pat1 != *pat2); |
|     patb = *pat2; |
|     REGEX_ASSERT(patb != patc); |
|     REGEX_ASSERT(patb == *pat2); |
| |
|     // Compile with no flags. |
|     LocalPointer<RegexPattern> pat1a(RegexPattern::compile(re1, pe, status)); |
|     // pat1a is dereferenced right below; verify the compile succeeded first. |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(*pat1a == *pat1); |
| |
|     REGEX_ASSERT(pat1a->flags() == 0); |
| |
|     // Compile with different flags should be not equal |
|     { |
|         LocalPointer<RegexPattern> pat1b(RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status)); |
|         REGEX_CHECK_STATUS; |
| |
|         REGEX_ASSERT(*pat1b != *pat1a); |
|         REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); |
|         REGEX_ASSERT(pat1a->flags() == 0); |
|     } |
| |
|     // clone |
|     { |
|         LocalPointer<RegexPattern> pat1c(pat1->clone()); |
|         REGEX_ASSERT(*pat1c == *pat1); |
|         REGEX_ASSERT(*pat1c != *pat2); |
|     } |
| |
| |
|     // |
|     // Verify that a matcher created from a cloned pattern works. |
|     // (Jitterbug 3423) |
|     // |
|     { |
|         UErrorCode status = U_ZERO_ERROR; |
|         LocalPointer<RegexPattern> pClone; |
|         { |
|             LocalPointer<RegexPattern> pSource(RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status)); |
|             // pSource is dereferenced right below; verify the compile succeeded first. |
|             REGEX_CHECK_STATUS; |
|             pClone.adoptInstead(pSource->clone()); |
|         }   // The source pattern is deleted here, before the clone is used. |
|         // Declared ahead of the matcher so the matcher is destroyed before its input. |
|         UnicodeString s = "Hello World"; |
|         LocalPointer<RegexMatcher> mFromClone(pClone->matcher(status)); |
|         REGEX_CHECK_STATUS; |
|         mFromClone->reset(s); |
|         REGEX_ASSERT(mFromClone->find() == TRUE); |
|         REGEX_ASSERT(mFromClone->group(status) == "Hello"); |
|         REGEX_ASSERT(mFromClone->find() == TRUE); |
|         REGEX_ASSERT(mFromClone->group(status) == "World"); |
|         REGEX_ASSERT(mFromClone->find() == FALSE); |
|     } |
| |
|     // |
|     // matches convenience API |
|     // |
|     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); |
|     REGEX_CHECK_STATUS; |
|     // With a failure status already set on entry, matches() must return FALSE |
|     // and leave the incoming error code untouched. |
|     status = U_INDEX_OUTOFBOUNDS_ERROR; |
|     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); |
|     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| |
|     // |
|     // Split() |
|     // |
|     status = U_ZERO_ERROR; |
|     pat1.adoptInstead(RegexPattern::compile(" +", pe, status)); |
|     REGEX_CHECK_STATUS; |
|     UnicodeString fields[10]; |
| |
|     int32_t n; |
|     n = pat1->split("Now is the time", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==4); |
|     REGEX_ASSERT(fields[0]=="Now"); |
|     REGEX_ASSERT(fields[1]=="is"); |
|     REGEX_ASSERT(fields[2]=="the"); |
|     REGEX_ASSERT(fields[3]=="time"); |
|     REGEX_ASSERT(fields[4]==""); |
| |
|     n = pat1->split("Now is the time", fields, 2, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==2); |
|     REGEX_ASSERT(fields[0]=="Now"); |
|     REGEX_ASSERT(fields[1]=="is the time"); |
|     REGEX_ASSERT(fields[2]=="the"); // left over from previous test |
| |
|     fields[1] = "*"; |
|     status = U_ZERO_ERROR; |
|     n = pat1->split("Now is the time", fields, 1, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==1); |
|     REGEX_ASSERT(fields[0]=="Now is the time"); |
|     REGEX_ASSERT(fields[1]=="*"); |
|     status = U_ZERO_ERROR; |
| |
|     n = pat1->split(" Now is the time ", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==6); |
|     REGEX_ASSERT(fields[0]==""); |
|     REGEX_ASSERT(fields[1]=="Now"); |
|     REGEX_ASSERT(fields[2]=="is"); |
|     REGEX_ASSERT(fields[3]=="the"); |
|     REGEX_ASSERT(fields[4]=="time"); |
|     REGEX_ASSERT(fields[5]==""); |
| |
|     n = pat1->split(" ", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==2); |
|     REGEX_ASSERT(fields[0]==""); |
|     REGEX_ASSERT(fields[1]==""); |
| |
|     // Splitting empty input yields no fields and must not touch the output array. |
|     fields[0] = "foo"; |
|     n = pat1->split("", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==0); |
|     REGEX_ASSERT(fields[0]=="foo"); |
| |
|     // split, with a pattern with (capture) |
|     pat1.adoptInstead(RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status)); |
|     REGEX_CHECK_STATUS; |
| |
|     status = U_ZERO_ERROR; |
|     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==7); |
|     REGEX_ASSERT(fields[0]==""); |
|     REGEX_ASSERT(fields[1]=="a"); |
|     REGEX_ASSERT(fields[2]=="Now is "); |
|     REGEX_ASSERT(fields[3]=="b"); |
|     REGEX_ASSERT(fields[4]=="the time"); |
|     REGEX_ASSERT(fields[5]=="c"); |
|     REGEX_ASSERT(fields[6]==""); |
|     REGEX_ASSERT(status==U_ZERO_ERROR); |
| |
|     n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==7); |
|     REGEX_ASSERT(fields[0]==" "); |
|     REGEX_ASSERT(fields[1]=="a"); |
|     REGEX_ASSERT(fields[2]=="Now is "); |
|     REGEX_ASSERT(fields[3]=="b"); |
|     REGEX_ASSERT(fields[4]=="the time"); |
|     REGEX_ASSERT(fields[5]=="c"); |
|     REGEX_ASSERT(fields[6]==""); |
| |
|     // The "foo" sentinels below check that split() does not write past the |
|     // requested number of output fields. |
|     status = U_ZERO_ERROR; |
|     fields[6] = "foo"; |
|     n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==6); |
|     REGEX_ASSERT(fields[0]==" "); |
|     REGEX_ASSERT(fields[1]=="a"); |
|     REGEX_ASSERT(fields[2]=="Now is "); |
|     REGEX_ASSERT(fields[3]=="b"); |
|     REGEX_ASSERT(fields[4]=="the time"); |
|     REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter. |
|     REGEX_ASSERT(fields[6]=="foo"); |
| |
|     status = U_ZERO_ERROR; |
|     fields[5] = "foo"; |
|     n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==5); |
|     REGEX_ASSERT(fields[0]==" "); |
|     REGEX_ASSERT(fields[1]=="a"); |
|     REGEX_ASSERT(fields[2]=="Now is "); |
|     REGEX_ASSERT(fields[3]=="b"); |
|     REGEX_ASSERT(fields[4]=="the time<c>"); |
|     REGEX_ASSERT(fields[5]=="foo"); |
| |
|     status = U_ZERO_ERROR; |
|     fields[5] = "foo"; |
|     n = pat1->split(" <a>Now is <b>the time", fields, 5, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==5); |
|     REGEX_ASSERT(fields[0]==" "); |
|     REGEX_ASSERT(fields[1]=="a"); |
|     REGEX_ASSERT(fields[2]=="Now is "); |
|     REGEX_ASSERT(fields[3]=="b"); |
|     REGEX_ASSERT(fields[4]=="the time"); |
|     REGEX_ASSERT(fields[5]=="foo"); |
| |
|     status = U_ZERO_ERROR; |
|     n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==4); |
|     REGEX_ASSERT(fields[0]==" "); |
|     REGEX_ASSERT(fields[1]=="a"); |
|     REGEX_ASSERT(fields[2]=="Now is "); |
|     REGEX_ASSERT(fields[3]=="the time<c>"); |
|     status = U_ZERO_ERROR; |
| |
|     pat1.adoptInstead(RegexPattern::compile("([-,])", pe, status)); |
|     REGEX_CHECK_STATUS; |
|     n = pat1->split("1-10,20", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==5); |
|     REGEX_ASSERT(fields[0]=="1"); |
|     REGEX_ASSERT(fields[1]=="-"); |
|     REGEX_ASSERT(fields[2]=="10"); |
|     REGEX_ASSERT(fields[3]==","); |
|     REGEX_ASSERT(fields[4]=="20"); |
| |
|     // Test split of string with empty trailing fields |
|     pat1.adoptInstead(RegexPattern::compile(",", pe, status)); |
|     REGEX_CHECK_STATUS; |
|     n = pat1->split("a,b,c,", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==4); |
|     REGEX_ASSERT(fields[0]=="a"); |
|     REGEX_ASSERT(fields[1]=="b"); |
|     REGEX_ASSERT(fields[2]=="c"); |
|     REGEX_ASSERT(fields[3]==""); |
| |
|     n = pat1->split("a,,,", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==4); |
|     REGEX_ASSERT(fields[0]=="a"); |
|     REGEX_ASSERT(fields[1]==""); |
|     REGEX_ASSERT(fields[2]==""); |
|     REGEX_ASSERT(fields[3]==""); |
| |
|     // Split Separator with zero length match. |
|     pat1.adoptInstead(RegexPattern::compile(":?", pe, status)); |
|     REGEX_CHECK_STATUS; |
|     n = pat1->split("abc", fields, 10, status); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(n==5); |
|     REGEX_ASSERT(fields[0]==""); |
|     REGEX_ASSERT(fields[1]=="a"); |
|     REGEX_ASSERT(fields[2]=="b"); |
|     REGEX_ASSERT(fields[3]=="c"); |
|     REGEX_ASSERT(fields[4]==""); |
| |
|     // |
|     // RegexPattern::pattern() |
|     // |
|     pat1.adoptInstead(new RegexPattern()); |
|     REGEX_ASSERT(pat1->pattern() == ""); |
| |
|     pat1.adoptInstead(RegexPattern::compile("(Hello, world)*", pe, status)); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); |
| |
| |
|     // |
|     // classID functions |
|     // |
|     pat1.adoptInstead(RegexPattern::compile("(Hello, world)*", pe, status)); |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); |
|     REGEX_ASSERT(pat1->getDynamicClassID() != NULL); |
|     UnicodeString Hello("Hello, world."); |
|     LocalPointer<RegexMatcher> m(pat1->matcher(Hello, status)); |
|     // m is dereferenced right below; verify the matcher was actually created. |
|     REGEX_CHECK_STATUS; |
|     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); |
|     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); |
|     REGEX_ASSERT(m->getDynamicClassID() != NULL); |
| |
|     // No explicit deletes: m is destroyed before pat1 (reverse order of |
|     // declaration), so the matcher never outlives the pattern it came from. |
| } |
| |
| //--------------------------------------------------------------------------- |
| // |
| // API_Match_UTF8 Test that the alternate engine for class RegexMatcher |
| // is present and working, but excluding functions |
| // implementing replace operations. |
| // |
| //--------------------------------------------------------------------------- |
| void RegexTest::API_Match_UTF8() { |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| int32_t flags = 0; |
| |
| // |
| // Debug - slide failing test cases early |
| // |
| #if 0 |
| { |
| } |
| return; |
| #endif |
| |
| // |
| // Simple pattern compilation |
| // |
| { |
| UText re = UTEXT_INITIALIZER; |
| regextst_openUTF8FromInvariant(&re, "abc", -1, &status); |
| REGEX_VERBOSE_TEXT(&re); |
| RegexPattern *pat2; |
| pat2 = RegexPattern::compile(&re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| |
| UText input1 = UTEXT_INITIALIZER; |
| UText input2 = UTEXT_INITIALIZER; |
| UText empty = UTEXT_INITIALIZER; |
| regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status); |
| REGEX_VERBOSE_TEXT(&input1); |
| regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); |
| REGEX_VERBOSE_TEXT(&input2); |
| utext_openUChars(&empty, NULL, 0, &status); |
| |
| int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */ |
| int32_t input2Len = static_cast<int32_t>(strlen("not abc")); |
| |
| |
| // |
| // Matcher creation and reset. |
| // |
| RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m1->lookingAt(status) == TRUE); |
| const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */ |
| REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); |
| m1->reset(&input2); |
| REGEX_ASSERT(m1->lookingAt(status) == FALSE); |
| const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */ |
| REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText()); |
| m1->reset(&input1); |
| REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); |
| REGEX_ASSERT(m1->lookingAt(status) == TRUE); |
| m1->reset(&empty); |
| REGEX_ASSERT(m1->lookingAt(status) == FALSE); |
| REGEX_ASSERT(utext_nativeLength(&empty) == 0); |
| |
| // |
| // reset(pos, status) |
| // |
| m1->reset(&input1); |
| m1->reset(4, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); |
| REGEX_ASSERT(m1->lookingAt(status) == TRUE); |
| |
| m1->reset(-1, status); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| |
| m1->reset(0, status); |
| REGEX_CHECK_STATUS; |
| status = U_ZERO_ERROR; |
| |
| m1->reset(input1Len-1, status); |
| REGEX_CHECK_STATUS; |
| status = U_ZERO_ERROR; |
| |
| m1->reset(input1Len, status); |
| REGEX_CHECK_STATUS; |
| status = U_ZERO_ERROR; |
| |
| m1->reset(input1Len+1, status); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| |
| // |
| // match(pos, status) |
| // |
| m1->reset(&input2); |
| REGEX_ASSERT(m1->matches(4, status) == TRUE); |
| m1->reset(); |
| REGEX_ASSERT(m1->matches(3, status) == FALSE); |
| m1->reset(); |
| REGEX_ASSERT(m1->matches(5, status) == FALSE); |
| REGEX_ASSERT(m1->matches(4, status) == TRUE); |
| REGEX_ASSERT(m1->matches(-1, status) == FALSE); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| // Match() at end of string should fail, but should not |
| // be an error. |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); |
| REGEX_CHECK_STATUS; |
| |
| // Match beyond end of string should fail with an error. |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| // Successful match at end of string. |
| { |
| status = U_ZERO_ERROR; |
| RegexMatcher m("A?", 0, status); // will match zero length string. |
| REGEX_CHECK_STATUS; |
| m.reset(&input1); |
| REGEX_ASSERT(m.matches(input1Len, status) == TRUE); |
| REGEX_CHECK_STATUS; |
| m.reset(&empty); |
| REGEX_ASSERT(m.matches(0, status) == TRUE); |
| REGEX_CHECK_STATUS; |
| } |
| |
| |
| // |
| // lookingAt(pos, status) |
| // |
| status = U_ZERO_ERROR; |
| m1->reset(&input2); // "not abc" |
| REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); |
| REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); |
| REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); |
| REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); |
| REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); |
| REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| delete m1; |
| delete pat2; |
| |
| utext_close(&re); |
| utext_close(&input1); |
| utext_close(&input2); |
| utext_close(&empty); |
| } |
| |
| |
| // |
| // Capture Group. |
| // RegexMatcher::start(); |
| // RegexMatcher::end(); |
| // RegexMatcher::groupCount(); |
| // |
| { |
| int32_t flags=0; |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| UText re=UTEXT_INITIALIZER; |
| const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ |
| utext_openUTF8(&re, str_01234567_pat, -1, &status); |
| |
| RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| |
| UText input = UTEXT_INITIALIZER; |
| const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ |
| utext_openUTF8(&input, str_0123456789, -1, &status); |
| |
| RegexMatcher *matcher = &pat->matcher(status)->reset(&input); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher->lookingAt(status) == TRUE); |
| static const int32_t matchStarts[] = {0, 2, 4, 8}; |
| static const int32_t matchEnds[] = {10, 8, 6, 10}; |
| int32_t i; |
| for (i=0; i<4; i++) { |
| int32_t actualStart = matcher->start(i, status); |
| REGEX_CHECK_STATUS; |
| if (actualStart != matchStarts[i]) { |
| errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n", |
| __FILE__, __LINE__, i, matchStarts[i], actualStart); |
| } |
| int32_t actualEnd = matcher->end(i, status); |
| REGEX_CHECK_STATUS; |
| if (actualEnd != matchEnds[i]) { |
| errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n", |
| __FILE__, __LINE__, i, matchEnds[i], actualEnd); |
| } |
| } |
| |
| REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); |
| REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); |
| |
| REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| matcher->reset(); |
| REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); |
| |
| matcher->lookingAt(status); |
| |
| UnicodeString dest; |
| UText destText = UTEXT_INITIALIZER; |
| utext_openUnicodeString(&destText, &dest, &status); |
| UText *result; |
| //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ |
| // Test shallow-clone API |
| int64_t group_len; |
| result = matcher->group((UText *)NULL, group_len, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); |
| utext_close(result); |
| result = matcher->group(0, &destText, group_len, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); |
| // destText is now immutable, reopen it |
| utext_close(&destText); |
| utext_openUnicodeString(&destText, &dest, &status); |
| |
| int64_t length; |
| result = matcher->group(0, NULL, length, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); |
| utext_close(result); |
| result = matcher->group(0, &destText, length, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT(utext_getNativeIndex(result) == 0); |
| REGEX_ASSERT(length == 10); |
| REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| |
| // Capture Group 1 == "234567" |
| result = matcher->group(1, NULL, length, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(utext_getNativeIndex(result) == 2); |
| REGEX_ASSERT(length == 6); |
| REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| utext_close(result); |
| |
| result = matcher->group(1, &destText, length, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT(utext_getNativeIndex(result) == 2); |
| REGEX_ASSERT(length == 6); |
| REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| utext_close(result); |
| |
| // Capture Group 2 == "45" |
| result = matcher->group(2, NULL, length, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(utext_getNativeIndex(result) == 4); |
| REGEX_ASSERT(length == 2); |
| REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| utext_close(result); |
| |
| result = matcher->group(2, &destText, length, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT(utext_getNativeIndex(result) == 4); |
| REGEX_ASSERT(length == 2); |
| REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| utext_close(result); |
| |
| // Capture Group 3 == "89" |
| result = matcher->group(3, NULL, length, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(utext_getNativeIndex(result) == 8); |
| REGEX_ASSERT(length == 2); |
| REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| utext_close(result); |
| |
| result = matcher->group(3, &destText, length, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT(utext_getNativeIndex(result) == 8); |
| REGEX_ASSERT(length == 2); |
| REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| utext_close(result); |
| |
| // Capture Group number out of range. |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| matcher->reset(); |
| REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); |
| |
| delete matcher; |
| delete pat; |
| |
| utext_close(&destText); |
| utext_close(&input); |
| utext_close(&re); |
| } |
| |
| // |
| // find |
| // |
| { |
| int32_t flags=0; |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| UText re=UTEXT_INITIALIZER; |
| const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ |
| utext_openUTF8(&re, str_abc, -1, &status); |
| |
| RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| UText input = UTEXT_INITIALIZER; |
| const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ |
| utext_openUTF8(&input, str_abcabcabc, -1, &status); |
| // 012345678901234567 |
| |
| RegexMatcher *matcher = &pat->matcher(status)->reset(&input); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 1); |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 6); |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 12); |
| REGEX_ASSERT(matcher->find() == FALSE); |
| REGEX_ASSERT(matcher->find() == FALSE); |
| |
| matcher->reset(); |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 1); |
| |
| REGEX_ASSERT(matcher->find(0, status)); |
| REGEX_ASSERT(matcher->start(status) == 1); |
| REGEX_ASSERT(matcher->find(1, status)); |
| REGEX_ASSERT(matcher->start(status) == 1); |
| REGEX_ASSERT(matcher->find(2, status)); |
| REGEX_ASSERT(matcher->start(status) == 6); |
| REGEX_ASSERT(matcher->find(12, status)); |
| REGEX_ASSERT(matcher->start(status) == 12); |
| REGEX_ASSERT(matcher->find(13, status) == FALSE); |
| REGEX_ASSERT(matcher->find(16, status) == FALSE); |
| REGEX_ASSERT(matcher->find(17, status) == FALSE); |
| REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); |
| |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| status = U_ZERO_ERROR; |
| REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); |
| |
| REGEX_ASSERT(matcher->groupCount() == 0); |
| |
| delete matcher; |
| delete pat; |
| |
| utext_close(&input); |
| utext_close(&re); |
| } |
| |
| |
| // |
| // find, with \G in pattern (true if at the end of a previous match). |
| // |
| { |
| int32_t flags=0; |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| UText re=UTEXT_INITIALIZER; |
| const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */ |
| utext_openUTF8(&re, str_Gabcabc, -1, &status); |
| |
| RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); |
| |
| REGEX_CHECK_STATUS; |
| UText input = UTEXT_INITIALIZER; |
| const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ |
| utext_openUTF8(&input, str_abcabcabc, -1, &status); |
| // 0123456789012 |
| |
| RegexMatcher *matcher = &pat->matcher(status)->reset(&input); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 0); |
| REGEX_ASSERT(matcher->start(1, status) == -1); |
| REGEX_ASSERT(matcher->start(2, status) == 1); |
| |
| REGEX_ASSERT(matcher->find()); |
| REGEX_ASSERT(matcher->start(status) == 4); |
| REGEX_ASSERT(matcher->start(1, status) == 4); |
| REGEX_ASSERT(matcher->start(2, status) == -1); |
| REGEX_CHECK_STATUS; |
| |
| delete matcher; |
| delete pat; |
| |
| utext_close(&input); |
| utext_close(&re); |
| } |
| |
| // |
| // find with zero length matches, match position should bump ahead |
| // to prevent loops. |
| // |
| { |
| int32_t i; |
| UErrorCode status=U_ZERO_ERROR; |
| RegexMatcher m("(?= ?)", 0, status); // This pattern will find zero-length matches anywhere, |
| // using an always-true look-ahead. |
| REGEX_CHECK_STATUS; |
| UText s = UTEXT_INITIALIZER; |
| utext_openUTF8(&s, " ", -1, &status); |
| m.reset(&s); |
| for (i=0; ; i++) { |
| if (m.find() == FALSE) { |
| break; |
| } |
| REGEX_ASSERT(m.start(status) == i); |
| REGEX_ASSERT(m.end(status) == i); |
| } |
| REGEX_ASSERT(i==5); |
| |
| // Check that the bump goes over characters outside the BMP OK |
| // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 |
| unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; |
| utext_openUTF8(&s, (char *)aboveBMP, -1, &status); |
| m.reset(&s); |
| for (i=0; ; i+=4) { |
| if (m.find() == FALSE) { |
| break; |
| } |
| REGEX_ASSERT(m.start(status) == i); |
| REGEX_ASSERT(m.end(status) == i); |
| } |
| REGEX_ASSERT(i==20); |
| |
| utext_close(&s); |
| } |
| { |
| // find() loop breaking test. |
| // with pattern of /.?/, should see a series of one char matches, then a single |
| // match of zero length at the end of the input string. |
| int32_t i; |
| UErrorCode status=U_ZERO_ERROR; |
| RegexMatcher m(".?", 0, status); |
| REGEX_CHECK_STATUS; |
| UText s = UTEXT_INITIALIZER; |
| utext_openUTF8(&s, " ", -1, &status); |
| m.reset(&s); |
| for (i=0; ; i++) { |
| if (m.find() == FALSE) { |
| break; |
| } |
| REGEX_ASSERT(m.start(status) == i); |
| REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); |
| } |
| REGEX_ASSERT(i==5); |
| |
| utext_close(&s); |
| } |
| |
| |
| // |
| // Matchers with no input string behave as if they had an empty input string. |
| // |
| |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| RegexMatcher m(".?", 0, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m.find()); |
| REGEX_ASSERT(m.start(status) == 0); |
| REGEX_ASSERT(m.input() == ""); |
| } |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| RegexPattern *p = RegexPattern::compile(".", 0, status); |
| RegexMatcher *m = p->matcher(status); |
| REGEX_CHECK_STATUS; |
| |
| REGEX_ASSERT(m->find() == FALSE); |
| REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); |
| delete m; |
| delete p; |
| } |
| |
| // |
| // Regions |
| // |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UText testPattern = UTEXT_INITIALIZER; |
| UText testText = UTEXT_INITIALIZER; |
| regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status); |
| REGEX_VERBOSE_TEXT(&testPattern); |
| regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); |
| REGEX_VERBOSE_TEXT(&testText); |
| |
| RegexMatcher m(&testPattern, &testText, 0, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m.regionStart() == 0); |
| REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); |
| REGEX_ASSERT(m.hasTransparentBounds() == FALSE); |
| REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); |
| |
| m.region(2,4, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(m.matches(status)); |
| REGEX_ASSERT(m.start(status)==2); |
| REGEX_ASSERT(m.end(status)==4); |
| REGEX_CHECK_STATUS; |
| |
| m.reset(); |
| REGEX_ASSERT(m.regionStart() == 0); |
| REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); |
| |
| regextst_openUTF8FromInvariant(&testText, "short", -1, &status); |
| REGEX_VERBOSE_TEXT(&testText); |
| m.reset(&testText); |
| REGEX_ASSERT(m.regionStart() == 0); |
| REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); |
| |
| REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); |
| REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); |
| REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); |
| REGEX_ASSERT(&m == &m.reset()); |
| REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); |
| |
| REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); |
| REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); |
| REGEX_ASSERT(&m == &m.reset()); |
| REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); |
| |
| REGEX_ASSERT(m.hasTransparentBounds() == FALSE); |
| REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); |
| REGEX_ASSERT(m.hasTransparentBounds() == TRUE); |
| REGEX_ASSERT(&m == &m.reset()); |
| REGEX_ASSERT(m.hasTransparentBounds() == TRUE); |
| |
| REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); |
| REGEX_ASSERT(m.hasTransparentBounds() == FALSE); |
| REGEX_ASSERT(&m == &m.reset()); |
| REGEX_ASSERT(m.hasTransparentBounds() == FALSE); |
| |
| utext_close(&testText); |
| utext_close(&testPattern); |
| } |
| |
| // |
| // hitEnd() and requireEnd() |
| // |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UText testPattern = UTEXT_INITIALIZER; |
| UText testText = UTEXT_INITIALIZER; |
| const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */ |
| const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ |
| utext_openUTF8(&testPattern, str_, -1, &status); |
| utext_openUTF8(&testText, str_aabb, -1, &status); |
| |
| RegexMatcher m1(&testPattern, &testText, 0, status); |
| REGEX_ASSERT(m1.lookingAt(status) == TRUE); |
| REGEX_ASSERT(m1.hitEnd() == TRUE); |
| REGEX_ASSERT(m1.requireEnd() == FALSE); |
| REGEX_CHECK_STATUS; |
| |
| status = U_ZERO_ERROR; |
| const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ |
| utext_openUTF8(&testPattern, str_a, -1, &status); |
| RegexMatcher m2(&testPattern, &testText, 0, status); |
| REGEX_ASSERT(m2.lookingAt(status) == TRUE); |
| REGEX_ASSERT(m2.hitEnd() == FALSE); |
| REGEX_ASSERT(m2.requireEnd() == FALSE); |
| REGEX_CHECK_STATUS; |
| |
| status = U_ZERO_ERROR; |
| const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */ |
| utext_openUTF8(&testPattern, str_dotstardollar, -1, &status); |
| RegexMatcher m3(&testPattern, &testText, 0, status); |
| REGEX_ASSERT(m3.lookingAt(status) == TRUE); |
| REGEX_ASSERT(m3.hitEnd() == TRUE); |
| REGEX_ASSERT(m3.requireEnd() == TRUE); |
| REGEX_CHECK_STATUS; |
| |
| utext_close(&testText); |
| utext_close(&testPattern); |
| } |
| } |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // API_Replace_UTF8 API test for class RegexMatcher, testing the |
| // Replace family of functions. |
| // |
| //--------------------------------------------------------------------------- |
| void RegexTest::API_Replace_UTF8() { |
| // |
| // Replace |
| // |
| int32_t flags=0; |
| UParseError pe; |
| UErrorCode status=U_ZERO_ERROR; |
| |
| UText re=UTEXT_INITIALIZER; |
| regextst_openUTF8FromInvariant(&re, "abc", -1, &status); |
| REGEX_VERBOSE_TEXT(&re); |
| RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| |
| char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ |
| // 012345678901234567 |
| UText dataText = UTEXT_INITIALIZER; |
| utext_openUTF8(&dataText, data, -1, &status); |
| REGEX_CHECK_STATUS; |
| REGEX_VERBOSE_TEXT(&dataText); |
| RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText); |
| |
| // |
| // Plain vanilla matches. |
| // |
| UnicodeString dest; |
| UText destText = UTEXT_INITIALIZER; |
| utext_openUnicodeString(&destText, &dest, &status); |
| UText *result; |
| |
| UText replText = UTEXT_INITIALIZER; |
| |
| const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */ |
| utext_openUTF8(&replText, str_yz, -1, &status); |
| REGEX_VERBOSE_TEXT(&replText); |
| result = matcher->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */ |
| REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); |
| utext_close(result); |
| result = matcher->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); |
| |
| result = matcher->replaceAll(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */ |
| REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); |
| utext_close(result); |
| |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher->replaceAll(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); |
| |
| // |
| // Plain vanilla non-matches. |
| // |
| const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */ |
| utext_openUTF8(&dataText, str_abxabxabx, -1, &status); |
| matcher->reset(&dataText); |
| |
| result = matcher->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); |
| utext_close(result); |
| result = matcher->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); |
| |
| result = matcher->replaceAll(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher->replaceAll(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); |
| |
| // |
| // Empty source string |
| // |
| utext_openUTF8(&dataText, NULL, 0, &status); |
| matcher->reset(&dataText); |
| |
| result = matcher->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8("", result); |
| utext_close(result); |
| result = matcher->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8("", result); |
| |
| result = matcher->replaceAll(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8("", result); |
| utext_close(result); |
| result = matcher->replaceAll(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8("", result); |
| |
| // |
| // Empty substitution string |
| // |
| utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." |
| matcher->reset(&dataText); |
| |
| utext_openUTF8(&replText, NULL, 0, &status); |
| result = matcher->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */ |
| REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); |
| utext_close(result); |
| result = matcher->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); |
| |
| result = matcher->replaceAll(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */ |
| REGEX_ASSERT_UTEXT_UTF8(str_dots, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher->replaceAll(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_dots, result); |
| |
| // |
| // match whole string |
| // |
| const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ |
| utext_openUTF8(&dataText, str_abc, -1, &status); |
| matcher->reset(&dataText); |
| |
| const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */ |
| utext_openUTF8(&replText, str_xyz, -1, &status); |
| result = matcher->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); |
| |
| result = matcher->replaceAll(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher->replaceAll(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); |
| |
| // |
| // Capture Group, simple case |
| // |
| const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */ |
| utext_openUTF8(&re, str_add, -1, &status); |
| RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status); |
| REGEX_CHECK_STATUS; |
| |
| const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */ |
| utext_openUTF8(&dataText, str_abcdefg, -1, &status); |
| RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText); |
| REGEX_CHECK_STATUS; |
| |
| const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */ |
| utext_openUTF8(&replText, str_11, -1, &status); |
| result = matcher2->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */ |
| REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher2->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); |
| |
| const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */ |
| utext_openUTF8(&replText, str_v, -1, &status); |
| REGEX_VERBOSE_TEXT(&replText); |
| result = matcher2->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */ |
| REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher2->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); |
| |
| const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, |
| 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, |
| 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */ |
| utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); |
| result = matcher2->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ |
| REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher2->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); |
| |
| unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */ |
| //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE |
| // 012345678901234567890123456 |
| supplDigitChars[22] = 0xF0; |
| supplDigitChars[23] = 0x9D; |
| supplDigitChars[24] = 0x9F; |
| supplDigitChars[25] = 0x8F; |
| utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); |
| |
| result = matcher2->replaceFirst(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */ |
| REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher2->replaceFirst(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); |
| const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */ |
| utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status); |
| REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR); |
| // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR); |
| REGEX_ASSERT(result == &destText); |
| // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); |
| |
| // |
| // Replacement String with \u hex escapes |
| // |
| { |
| const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */ |
| const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */ |
| utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status); |
| utext_openUTF8(&replText, str_u0043, -1, &status); |
| matcher->reset(&dataText); |
| |
| result = matcher->replaceAll(&replText, NULL, status); |
| REGEX_CHECK_STATUS; |
| const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */ |
| REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); |
| utext_close(result); |
| utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); |
| result = matcher->replaceAll(&replText, &destText, status); |
| REGEX_CHECK_STATUS; |
| REGEX_ASSERT(result == &destText); |
| REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); |
| } |
|