If the unicode flag is set but an unpaired UTF-16 surrogate is found in the RegExp interpreter,

we should not apply the UTF-16 surrogate-pair rule to the input.

Signed-off-by: Seonghyun Kim <sh8281.kim@samsung.com>
This commit is contained in:
Seonghyun Kim 2024-07-18 17:14:07 +09:00 committed by Hyukwoo Park
commit eda2f8d4fa

View file

@ -1661,6 +1661,15 @@ public:
BACKTRACK();
}
MATCH_NEXT();
#if defined(ENABLE_ICU)
} else if (U16_IS_SURROGATE(currentTerm().atom.patternCharacter)) {
// Escargot update
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
if (!checkCharacter(currentTerm(), currentTerm().inputPosition - matchAmount))
BACKTRACK();
}
MATCH_NEXT();
#endif
}
}
@ -1685,6 +1694,18 @@ public:
BACKTRACK();
}
MATCH_NEXT();
#if defined(ENABLE_ICU)
} else if (U16_IS_SURROGATE(currentTerm().atom.patternCharacter)) {
// Escargot update
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
auto inputPosition = term.inputPosition + 2 * matchAmount;
if (input.getPos() < inputPosition)
BACKTRACK();
if (!checkCharacter(term, inputPosition))
BACKTRACK();
}
MATCH_NEXT();
#endif
}
}
@ -2197,6 +2218,7 @@ public:
lo = tolower(ch);
hi = toupper(ch);
} else {
// Escargot update
// if ch is ALPHABETIC like latin or greek, we should not apply u_tolower or u_toupper (print('iI\u0130'.replace(/\u0130/gi, '#')))
auto v = u_getIntPropertyValue(ch, UProperty::UCHAR_ALPHABETIC);
if (v) {