ICU-21492 Fix regex compile assertion failure. A regex pattern containing nested look-behind blocks could trigger an assertion failure during pattern compilation. The problem was caused by an off-by-one error in the code that computes an upper bound on the match length, which is needed because look-behind expressions are required to have a bounded match length. Nested look-behind blocks come into play because, when computing the maximum match length of an outer block, any inner look-behind blocks are skipped over - they do not directly contribute to the length matched by the outer block. The off-by-one error was in the loop that skips over these nested look-behind blocks: it treated the inclusive end of the range being checked as if it were exclusive.
diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index b75d80f..ec8654d 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp
@@ -3475,6 +3475,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { // value may be longer than the actual maximum; it must // never be shorter. // +// start, end: the range of the pattern to check. +// end is inclusive. +// //------------------------------------------------------------------------------ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { if (U_FAILURE(*fStatus)) { @@ -3720,14 +3723,14 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { // Look-behind. Scan forward until the matching look-around end, // without processing the look-behind block. int32_t dataLoc = URX_VAL(op); - for (loc = loc + 1; loc < end; ++loc) { + for (loc = loc + 1; loc <= end; ++loc) { op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); int32_t opType = URX_TYPE(op); if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) { break; } } - U_ASSERT(loc < end); + U_ASSERT(loc <= end); } break;
diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 4609ee9..12146bc 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt
@@ -1497,6 +1497,11 @@ # "(?w)\b" v2 "äää<0></0> äää" +# Bug ICU-21492 Assertion failure with nested look-around expressions. +# +"(?<=(?:(?<=(?:(?<=(?:(?<=)){2})){3})){4}" E "<0></0>" # orig failure from bug report, w mismatched parens. +"(?:(?<=(?:(?<=)){2}))" "<0></0>" # Simplified case, with a valid pattern. + # Random debugging, Temporary #