ICU-21492 Fix regex compile assertion failure.
A regex pattern containing nested look-behind blocks could trigger an assertion
failure during pattern compilation. The problem was caused by an off-by-one
error in the code that computes an upper bound on the match length. That bound
is needed because look-behind expressions are not allowed to have an unbounded
match length.
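Nested look-behind blocks come into play because, when computing the maximum
match length of an outer block, any inner look-behind blocks are skipped over -
they do not directly contribute to the length matched by the outer block. The
problem was in the code that skips over these nested look-behind blocks: its
scan loop treated the end index of the range as exclusive, although it is
inclusive, so the scan could stop one operation short and miss a matching
look-behind end located at the last position of the range.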
diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp
index b75d80f..ec8654d 100644
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@@ -3475,6 +3475,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
// value may be longer than the actual maximum; it must
// never be shorter.
//
+// start, end: the range of the pattern to check.
+// end is inclusive.
+//
//------------------------------------------------------------------------------
int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
if (U_FAILURE(*fStatus)) {
@@ -3720,14 +3723,14 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
// Look-behind. Scan forward until the matching look-around end,
// without processing the look-behind block.
int32_t dataLoc = URX_VAL(op);
- for (loc = loc + 1; loc < end; ++loc) {
+ for (loc = loc + 1; loc <= end; ++loc) {
op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
int32_t opType = URX_TYPE(op);
if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
break;
}
}
- U_ASSERT(loc < end);
+ U_ASSERT(loc <= end);
}
break;
diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt
index 4609ee9..12146bc 100644
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@@ -1497,6 +1497,11 @@
#
"(?w)\b" v2 "äää<0></0> äää"
+# Bug ICU-21492 Assertion failure with nested look-around expressions.
+#
+"(?<=(?:(?<=(?:(?<=(?:(?<=)){2})){3})){4}" E "<0></0>" # orig failure from bug report, w mismatched parens.
+"(?:(?<=(?:(?<=)){2}))" "<0></0>" # Simplified case, with a valid pattern.
+
# Random debugging, Temporary
#