mirror of
https://github.com/Samsung/escargot.git
synced 2026-06-22 10:01:50 +00:00
2276 lines
80 KiB
C++
2276 lines
80 KiB
C++
/*
|
|
* Copyright (C) 2009-2020 Apple Inc. All rights reserved.
|
|
* Copyright (C) 2020 Alexey Shvayka <shvaikalesh@gmail.com>.
|
|
* Copyright (C) 2025 Tetsuharu Ohzeki <tetsuharu.ohzeki@gmail.com>.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
|
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
|
|
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
|
* THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "Yarr.h"
|
|
#include "YarrPattern.h"
|
|
#include "YarrUnicodeProperties.h"
|
|
|
|
WTF_ALLOW_UNSAFE_BUFFER_USAGE_BEGIN
|
|
|
|
namespace JSC { namespace Yarr {
|
|
|
|
// Tells the code that creates a disjunction whether it is being created for the next
// alternative or not.
// NOTE(review): the consumers of this enum are outside this section — confirm the exact
// meaning of each value at the call sites.
enum class CreateDisjunctionPurpose : uint8_t { NotForNextAlternative, ForNextAlternative };
|
|
|
|
// Operator combining the operands of a class set expression.
// ClassSetParserDelegate starts every (nested) class in Default and treats Default the
// same as Union when it checks whether a union is active.
enum class CharacterClassSetOp : uint8_t {
    Default, // No operator selected yet.
    Union, // Operands are accumulated.
    Intersection, // Selected by ClassSetParserDelegate::setIntersectionOp().
    Subtraction // Selected by ClassSetParserDelegate::setSubtractOp().
};
|
|
|
|
// The Parser class should not be used directly - only via the Yarr::parse() method.
|
|
template<class Delegate, typename CharType>
|
|
class Parser {
|
|
public:
|
|
// Binds the parser to its delegate and to the pattern text. Nothing is parsed here;
// call parse() to run the parser.
//
// delegate                        receives the parse events.
// pattern                         pattern text; its characters<CharType>() pointer and
//                                 length() are stored, the text itself is not copied here.
// compileMode                     stored in m_compileMode.
// backReferenceLimit              stored in m_backReferenceLimit.
// isNamedForwardReferenceAllowed  stored in m_isNamedForwardReferenceAllowed.
Parser(Delegate& delegate, StringView pattern, CompileMode compileMode, unsigned backReferenceLimit, bool isNamedForwardReferenceAllowed)
    : m_delegate(delegate)
    , m_data(pattern.characters<CharType>())
    , m_size(pattern.length())
    , m_compileMode(compileMode)
    , m_backReferenceLimit(backReferenceLimit)
    , m_isNamedForwardReferenceAllowed(isNamedForwardReferenceAllowed)
{ }
|
|
|
|
/*
|
|
* parse():
|
|
*
|
|
* This method calls parseTokens() to parse over the input and returns error code for a result.
|
|
*/
|
|
ErrorCode parse()
|
|
{
|
|
if (m_size > MAX_PATTERN_SIZE)
|
|
return ErrorCode::PatternTooLarge;
|
|
|
|
parseTokens();
|
|
|
|
if (!hasError(m_errorCode)) {
|
|
ASSERT(atEndOfPattern());
|
|
handleIllegalReferences();
|
|
ASSERT(atEndOfPattern());
|
|
}
|
|
|
|
return m_errorCode;
|
|
}
|
|
|
|
private:
|
|
// Sentinel char32_t value (0xFFFFFFFF, which is not a valid Unicode code point).
// NOTE(review): presumably returned by code point parsing helpers on failure — its
// users are outside this section, confirm there.
static constexpr char32_t errorCodePoint = 0xFFFFFFFFu;
|
|
|
|
template<typename FriendDelegate>
|
|
friend ErrorCode parse(FriendDelegate&, StringView pattern, CompileMode, unsigned backReferenceLimit, bool isNamedForwardReferenceAllowed);
|
|
|
|
// Distinguishes the two contexts named by its values: a pattern code point or a group name.
// NOTE(review): the functions taking this enum are outside this section — confirm how
// each context changes parsing.
enum class UnicodeParseContext : uint8_t { PatternCodePoint, GroupName };
|
|
|
|
// Template argument of parseEscape() and isIdentityEscapeAnError() selecting where the
// escape occurs: Normal (outside of any class), CharacterClass, ClassSet or
// ClassStringDisjunction (\q{...}).
enum class ParseEscapeMode : uint8_t { Normal, CharacterClass, ClassSet, ClassStringDisjunction };
|
|
|
|
// Kind of token that was just parsed; this is the return type of parseEscape().
// parseEscape() returns NotAtom for word boundary assertions and for an unterminated escape.
// NOTE(review): the producers of the remaining values are outside this section.
enum class TokenType : uint8_t {
    NotAtom = 0,
    Atom = 1,
    Lookbehind = 2,
    SetDisjunction = 3,
    SetDisjunctionMayContainStrings = 4
};
|
|
|
|
class NamedCaptureGroups {
|
|
typedef GCHashSet<String> GroupNameHashSet;
|
|
|
|
public:
|
|
NamedCaptureGroups()
|
|
{
|
|
m_nestedCaptureGroupNames.grow(1);
|
|
m_activeCaptureGroupNames.grow(1);
|
|
}
|
|
|
|
bool contains(String name)
|
|
{
|
|
return m_captureGroupNames.contains(name);
|
|
}
|
|
|
|
bool isEmpty()
|
|
{
|
|
return m_captureGroupNames.isEmpty();
|
|
}
|
|
|
|
void reset()
|
|
{
|
|
m_captureGroupNames.clear();
|
|
m_nestedCaptureGroupNames.clear();
|
|
m_nestedCaptureGroupNames.grow(1);
|
|
m_activeCaptureGroupNames.clear();
|
|
m_activeCaptureGroupNames.grow(1);
|
|
}
|
|
|
|
void nextAlternative()
|
|
{
|
|
m_nestedCaptureGroupNames.last().formUnion(m_activeCaptureGroupNames.last());
|
|
m_activeCaptureGroupNames.last().clear();
|
|
|
|
// For nested parenthesis, we need to seed the new alternative with the already seen
|
|
// named captures from the containing alternative.
|
|
if (m_activeCaptureGroupNames.size() > 1)
|
|
m_activeCaptureGroupNames.last().formUnion(m_activeCaptureGroupNames[m_activeCaptureGroupNames.size() - 2]);
|
|
}
|
|
|
|
void pushParenthesis()
|
|
{
|
|
auto currentTop = m_activeCaptureGroupNames.last();
|
|
m_nestedCaptureGroupNames.append(GroupNameHashSet());
|
|
m_activeCaptureGroupNames.append(currentTop);
|
|
}
|
|
|
|
void popParenthesis()
|
|
{
|
|
ASSERT(m_nestedCaptureGroupNames.size() > 1);
|
|
ASSERT(m_activeCaptureGroupNames.size() > 1);
|
|
m_nestedCaptureGroupNames.last().formUnion(m_activeCaptureGroupNames.last());
|
|
|
|
// Add all the names seen in this parenthesis to the containing alternative.
|
|
m_activeCaptureGroupNames[m_activeCaptureGroupNames.size() - 2].formUnion(m_nestedCaptureGroupNames.last());
|
|
|
|
m_nestedCaptureGroupNames.removeLast();
|
|
m_activeCaptureGroupNames.removeLast();
|
|
}
|
|
|
|
GroupNameHashSet::AddResult add(String name)
|
|
{
|
|
m_captureGroupNames.add(name);
|
|
|
|
// If the name is not new, the caller should flag a syntax error.
|
|
return m_activeCaptureGroupNames.last().add(name);
|
|
}
|
|
|
|
private:
|
|
// Names seen in the whole expression up to this point.
|
|
GroupNameHashSet m_captureGroupNames;
|
|
// All active names from prior alternatives at this nesting level.
|
|
GCVector<GroupNameHashSet> m_nestedCaptureGroupNames;
|
|
// Names seen in containing disjunction / alternative and the current alternative.
|
|
GCVector<GroupNameHashSet> m_activeCaptureGroupNames;
|
|
};
|
|
|
|
/*
|
|
* CharacterClassParserDelegate:
|
|
*
|
|
* The class CharacterClassParserDelegate is used in the parsing of character
|
|
* classes. This class handles detection of character ranges. This class
|
|
* implements enough of the delegate interface such that it can be passed to
|
|
* parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
|
|
* to perform the parsing of escape characters in character sets.
|
|
*/
|
|
class CharacterClassParserDelegate {
|
|
public:
|
|
CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err, CompileMode compileMode)
|
|
: m_delegate(delegate)
|
|
, m_errorCode(err)
|
|
, m_isUnicode(compileMode == CompileMode::Unicode)
|
|
, m_state(CharacterClassConstructionState::Empty)
|
|
, m_character(0)
|
|
{
|
|
}
|
|
|
|
/*
|
|
* begin():
|
|
*
|
|
* Called at beginning of construction.
|
|
*/
|
|
void begin(bool invert)
|
|
{
|
|
m_delegate.atomCharacterClassBegin(invert);
|
|
}
|
|
|
|
/*
|
|
* atomPatternCharacter():
|
|
*
|
|
* This method is called either from parseCharacterClass() (for an unescaped
|
|
* character in a character class), or from parseEscape(). In the former case
|
|
* the value true will be passed for the argument 'hyphenIsRange', and in this
|
|
* mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
|
|
* is different to /[a\-z]/).
|
|
*/
|
|
void atomPatternCharacter(char32_t ch, bool hyphenIsRange = false)
|
|
{
|
|
switch (m_state) {
|
|
case CharacterClassConstructionState::AfterCharacterClass:
|
|
// Following a built-in character class we need look out for a hyphen.
|
|
// We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
|
|
// If we see a hyphen following a character class then unlike usual
|
|
// we'll report it to the delegate immediately, and put ourself into
|
|
// a poisoned state. In a unicode pattern, any following calls to add
|
|
// another character or character class will result in syntax error.
|
|
// A hypen following a character class is itself valid, but only at
|
|
// the end of a regex.
|
|
if (hyphenIsRange && ch == '-') {
|
|
m_delegate.atomCharacterClassAtom('-');
|
|
m_state = CharacterClassConstructionState::AfterCharacterClassHyphen;
|
|
return;
|
|
}
|
|
// Otherwise just fall through - cached character so treat this as CharacterClassConstructionState::Empty.
|
|
FALLTHROUGH;
|
|
|
|
case CharacterClassConstructionState::Empty:
|
|
m_character = ch;
|
|
m_state = CharacterClassConstructionState::CachedCharacter;
|
|
return;
|
|
|
|
case CharacterClassConstructionState::CachedCharacter:
|
|
if (hyphenIsRange && ch == '-')
|
|
m_state = CharacterClassConstructionState::CachedCharacterHyphen;
|
|
else {
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
m_character = ch;
|
|
}
|
|
return;
|
|
|
|
case CharacterClassConstructionState::CachedCharacterHyphen:
|
|
if (ch < m_character) {
|
|
m_errorCode = ErrorCode::CharacterClassRangeOutOfOrder;
|
|
return;
|
|
}
|
|
m_delegate.atomCharacterClassRange(m_character, ch);
|
|
m_state = CharacterClassConstructionState::Empty;
|
|
return;
|
|
|
|
// If we hit this case, we have an invalid range like /[\d-a]/.
|
|
// See coment in atomBuiltInCharacterClass() below.
|
|
case CharacterClassConstructionState::AfterCharacterClassHyphen:
|
|
if (m_isUnicode) {
|
|
m_errorCode = ErrorCode::CharacterClassRangeInvalid;
|
|
return;
|
|
}
|
|
m_delegate.atomCharacterClassAtom(ch);
|
|
m_state = CharacterClassConstructionState::Empty;
|
|
return;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* atomBuiltInCharacterClass():
|
|
*
|
|
* Adds a built-in character class, called by parseEscape().
|
|
*/
|
|
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
|
|
{
|
|
switch (m_state) {
|
|
case CharacterClassConstructionState::CachedCharacter:
|
|
// Flush the currently cached character, then fall through.
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
FALLTHROUGH;
|
|
case CharacterClassConstructionState::Empty:
|
|
case CharacterClassConstructionState::AfterCharacterClass:
|
|
m_delegate.atomCharacterClassBuiltIn(classID, invert);
|
|
m_state = CharacterClassConstructionState::AfterCharacterClass;
|
|
return;
|
|
|
|
// If we hit either of these cases, we have an invalid range that
|
|
// looks something like /[a-\d]/ or /[\d-\d]/.
|
|
// Since ES2015, this should be syntax error in a unicode pattern,
|
|
// yet gracefully handled in a regular regex to avoid breaking the web.
|
|
// Effectively we handle the hyphen as if it was (implicitly) escaped,
|
|
// e.g. /[\d-a-z]/ is treated as /[\d\-a\-z]/.
|
|
// See usages of CharacterRangeOrUnion abstract op in
|
|
// https://tc39.es/ecma262/#sec-regular-expression-patterns-semantics
|
|
case CharacterClassConstructionState::CachedCharacterHyphen:
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
m_delegate.atomCharacterClassAtom('-');
|
|
FALLTHROUGH;
|
|
case CharacterClassConstructionState::AfterCharacterClassHyphen:
|
|
if (m_isUnicode) {
|
|
m_errorCode = ErrorCode::CharacterClassRangeInvalid;
|
|
return;
|
|
}
|
|
m_delegate.atomCharacterClassBuiltIn(classID, invert);
|
|
m_state = CharacterClassConstructionState::Empty;
|
|
return;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* end():
|
|
*
|
|
* Called at end of construction.
|
|
*/
|
|
void end()
|
|
{
|
|
if (m_state == CharacterClassConstructionState::CachedCharacter)
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
else if (m_state == CharacterClassConstructionState::CachedCharacterHyphen) {
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
m_delegate.atomCharacterClassAtom('-');
|
|
}
|
|
m_delegate.atomCharacterClassEnd();
|
|
}
|
|
|
|
// parseEscape() should never call these delegate methods when
|
|
// invoked with inCharacterClass set.
|
|
NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
|
|
private:
|
|
Delegate& m_delegate;
|
|
ErrorCode& m_errorCode;
|
|
bool m_isUnicode;
|
|
enum class CharacterClassConstructionState {
|
|
Empty,
|
|
CachedCharacter,
|
|
CachedCharacterHyphen,
|
|
AfterCharacterClass,
|
|
AfterCharacterClassHyphen,
|
|
};
|
|
CharacterClassConstructionState m_state;
|
|
char32_t m_character;
|
|
};
|
|
|
|
/*
|
|
* ClassSetParserDelegate:
|
|
*
|
|
* The class ClassSetParserDelegate is used in the parsing of class sets
|
|
* This class handles detection of class set ops and character ranges.
|
|
* This class implements enough of the delegate interface such that it can be passed to
|
|
* parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
|
|
* to perform the parsing of escape characters in character sets.
|
|
*/
|
|
class ClassSetParserDelegate {
|
|
private:
|
|
struct NestingState {
|
|
public:
|
|
NestingState(CharacterClassSetOp setOp, bool mayContainStrings, bool inverted)
|
|
: m_setOp(setOp)
|
|
, m_mayContainStrings(mayContainStrings)
|
|
, m_inverted(inverted)
|
|
{ }
|
|
|
|
CharacterClassSetOp m_setOp;
|
|
bool m_mayContainStrings;
|
|
bool m_inverted;
|
|
};
|
|
|
|
public:
|
|
ClassSetParserDelegate(Delegate& delegate, ErrorCode& err)
|
|
: m_delegate(delegate)
|
|
, m_errorCode(err)
|
|
, m_state(ClassSetConstructionState::Empty)
|
|
, m_setOp(CharacterClassSetOp::Default)
|
|
, m_mayContainStrings(false)
|
|
, m_inverted(false)
|
|
, m_processingEscape(false)
|
|
, m_character(0)
|
|
{
|
|
}
|
|
|
|
/*
|
|
* begin():
|
|
*
|
|
* Called at beginning of construction.
|
|
*/
|
|
void begin(bool invert)
|
|
{
|
|
m_inverted = invert;
|
|
m_delegate.atomCharacterClassBegin(invert);
|
|
}
|
|
|
|
void nestedClassBegin(bool invert)
|
|
{
|
|
m_delegate.atomCharacterClassPushNested();
|
|
nestedParseState.append(NestingState(m_setOp, m_mayContainStrings, m_inverted));
|
|
m_setOp = CharacterClassSetOp::Default;
|
|
m_mayContainStrings = false;
|
|
m_inverted = invert;
|
|
}
|
|
|
|
bool nestedClassEnd()
|
|
{
|
|
flushCachedCharacterIfNeeded();
|
|
|
|
if (m_inverted && m_mayContainStrings)
|
|
m_errorCode = ErrorCode::NegatedClassSetMayContainStrings;
|
|
|
|
if (nestedParseState.isEmpty()) {
|
|
end();
|
|
return true;
|
|
}
|
|
|
|
bool rhsMayContainStrings = m_mayContainStrings;
|
|
|
|
NestingState lastState = nestedParseState.takeLast();
|
|
m_setOp = lastState.m_setOp;
|
|
m_inverted = lastState.m_inverted;
|
|
m_mayContainStrings = lastState.m_mayContainStrings;
|
|
|
|
m_delegate.atomCharacterClassPopNested();
|
|
m_state = ClassSetConstructionState::AfterSetOperand;
|
|
computeMayContainStrings(rhsMayContainStrings);
|
|
return false;
|
|
}
|
|
|
|
void setUnionOp()
|
|
{
|
|
if (m_setOp != CharacterClassSetOp::Default && m_setOp != CharacterClassSetOp::Union) {
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
return;
|
|
}
|
|
|
|
flushCachedCharacterIfNeeded();
|
|
m_setOp = CharacterClassSetOp::Union;
|
|
m_delegate.atomCharacterClassSetOp(m_setOp);
|
|
}
|
|
|
|
void switchFromDefaultOpToUnionOpIfNeeded()
|
|
{
|
|
if (m_setOp == CharacterClassSetOp::Default) {
|
|
m_setOp = CharacterClassSetOp::Union;
|
|
m_delegate.atomCharacterClassSetOp(m_setOp);
|
|
}
|
|
}
|
|
|
|
void setSubtractOp()
|
|
{
|
|
if (m_state == ClassSetConstructionState::Empty || (m_setOp != CharacterClassSetOp::Default && m_setOp != CharacterClassSetOp::Subtraction)) {
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
return;
|
|
}
|
|
|
|
flushCachedCharacterIfNeeded();
|
|
m_setOp = CharacterClassSetOp::Subtraction;
|
|
m_delegate.atomCharacterClassSetOp(m_setOp);
|
|
m_state = ClassSetConstructionState::AfterSetOperator;
|
|
}
|
|
|
|
void setIntersectionOp()
|
|
{
|
|
if (m_state == ClassSetConstructionState::Empty || (m_setOp != CharacterClassSetOp::Default && m_setOp != CharacterClassSetOp::Intersection)) {
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
return;
|
|
}
|
|
|
|
flushCachedCharacterIfNeeded();
|
|
m_setOp = CharacterClassSetOp::Intersection;
|
|
m_delegate.atomCharacterClassSetOp(m_setOp);
|
|
m_state = ClassSetConstructionState::AfterSetOperator;
|
|
}
|
|
|
|
void computeMayContainStrings(bool rhsMayContainStrings)
|
|
{
|
|
switch (m_setOp) {
|
|
case CharacterClassSetOp::Default:
|
|
case CharacterClassSetOp::Union:
|
|
m_mayContainStrings |= rhsMayContainStrings;
|
|
break;
|
|
|
|
case CharacterClassSetOp::Intersection:
|
|
m_mayContainStrings = m_mayContainStrings && rhsMayContainStrings;
|
|
break;
|
|
|
|
case CharacterClassSetOp::Subtraction:
|
|
// Result is the value of the LHS
|
|
break;
|
|
}
|
|
}
|
|
|
|
void flushCachedCharacterIfNeeded()
|
|
{
|
|
if (m_state == ClassSetConstructionState::CachedCharacter) {
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
m_state = ClassSetConstructionState::Empty;
|
|
}
|
|
}
|
|
|
|
void afterSetOperand()
|
|
{
|
|
flushCachedCharacterIfNeeded();
|
|
m_state = ClassSetConstructionState::AfterSetOperand;
|
|
}
|
|
|
|
bool canTakeSetOperand()
|
|
{
|
|
bool unionOpActive = m_setOp == CharacterClassSetOp::Default || m_setOp == CharacterClassSetOp::Union;
|
|
|
|
switch (m_state) {
|
|
case ClassSetConstructionState::Empty:
|
|
case ClassSetConstructionState::AfterSetOperator:
|
|
return true;
|
|
|
|
case ClassSetConstructionState::CachedCharacter:
|
|
if (!unionOpActive)
|
|
return false;
|
|
|
|
flushCachedCharacterIfNeeded();
|
|
return true;
|
|
|
|
case ClassSetConstructionState::CachedCharacterHyphen:
|
|
case ClassSetConstructionState::AfterCharacterClassHyphen:
|
|
case ClassSetConstructionState::AfterCharacterClass:
|
|
case ClassSetConstructionState::AfterSetRange:
|
|
case ClassSetConstructionState::AfterSetOperand:
|
|
return unionOpActive;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void setProcessingEscape()
|
|
{
|
|
m_processingEscape = true;
|
|
}
|
|
|
|
/*
|
|
* atomPatternCharacter():
|
|
*
|
|
* This method is called either from parseCharacterClass() (for an unescaped
|
|
* character in a character class), or from parseEscape(). In the former case
|
|
* the value true will be passed for the argument 'hyphenIsRange', and in this
|
|
* mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
|
|
* is different to /[a\-z]/).
|
|
*/
|
|
void atomPatternCharacter(char32_t ch)
|
|
{
|
|
bool unionOpActive = m_setOp == CharacterClassSetOp::Default || m_setOp == CharacterClassSetOp::Union;
|
|
bool processingEscape = m_processingEscape;
|
|
m_processingEscape = false;
|
|
|
|
auto processCharacter = [&] () {
|
|
m_character = ch;
|
|
m_state = ClassSetConstructionState::CachedCharacter;
|
|
return;
|
|
};
|
|
|
|
switch (m_state) {
|
|
case ClassSetConstructionState::AfterCharacterClass:
|
|
// Following a built-in character class we need look out for a hyphen.
|
|
// We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
|
|
// If we see a hyphen following a character class then unlike usual
|
|
// we'll report it to the delegate immediately, and put ourself into
|
|
// a poisoned state. In a unicode pattern, any following calls to add
|
|
// another character or character class will result in syntax error.
|
|
// A hypen following a character class is itself valid, but only at
|
|
// the end of a regex.
|
|
if (unionOpActive && ch == '-') {
|
|
m_delegate.atomCharacterClassAtom('-');
|
|
m_state = ClassSetConstructionState::AfterCharacterClassHyphen;
|
|
return;
|
|
}
|
|
// Otherwise just fall through - cached character so treat this as ClassSetConstructionState::Empty.
|
|
FALLTHROUGH;
|
|
|
|
case ClassSetConstructionState::AfterSetRange:
|
|
switchFromDefaultOpToUnionOpIfNeeded();
|
|
|
|
// Continue processing the current character.
|
|
FALLTHROUGH;
|
|
|
|
case ClassSetConstructionState::Empty:
|
|
case ClassSetConstructionState::AfterSetOperator:
|
|
if (!processingEscape && ch == '-') {
|
|
m_errorCode = ErrorCode::InvalidClassSetCharacter;
|
|
return;
|
|
}
|
|
|
|
processCharacter();
|
|
return;
|
|
|
|
case ClassSetConstructionState::CachedCharacter:
|
|
if (!unionOpActive) {
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
return;
|
|
}
|
|
|
|
if (ch == '-')
|
|
m_state = ClassSetConstructionState::CachedCharacterHyphen;
|
|
else {
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
switchFromDefaultOpToUnionOpIfNeeded();
|
|
processCharacter();
|
|
}
|
|
return;
|
|
|
|
case ClassSetConstructionState::CachedCharacterHyphen:
|
|
if (ch < m_character) {
|
|
m_errorCode = ErrorCode::CharacterClassRangeOutOfOrder;
|
|
return;
|
|
}
|
|
|
|
m_delegate.atomCharacterClassRange(m_character, ch);
|
|
switchFromDefaultOpToUnionOpIfNeeded();
|
|
m_state = ClassSetConstructionState::AfterSetRange;
|
|
return;
|
|
|
|
// If we hit this case, we have an invalid range like /[\d-a]/.
|
|
// See coment in atomBuiltInCharacterClass() below.
|
|
case ClassSetConstructionState::AfterCharacterClassHyphen:
|
|
m_errorCode = ErrorCode::CharacterClassRangeInvalid;
|
|
return;
|
|
|
|
case ClassSetConstructionState::AfterSetOperand:
|
|
if (!unionOpActive)
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
|
|
if (ch == '-')
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
else {
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
switchFromDefaultOpToUnionOpIfNeeded();
|
|
processCharacter();
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* atomBuiltInCharacterClass():
|
|
*
|
|
* Adds a built-in character class, called by parseEscape().
|
|
*/
|
|
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
|
|
{
|
|
bool unionOpActive = m_setOp == CharacterClassSetOp::Default || m_setOp == CharacterClassSetOp::Union;
|
|
|
|
auto processBuiltInCharacterClass = [&] () {
|
|
computeMayContainStrings(characterClassMayContainStrings(classID));
|
|
|
|
m_delegate.atomCharacterClassBuiltIn(classID, invert);
|
|
m_state = ClassSetConstructionState::AfterCharacterClass;
|
|
return;
|
|
};
|
|
|
|
switch (m_state) {
|
|
case ClassSetConstructionState::CachedCharacter:
|
|
if (!unionOpActive) {
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
return;
|
|
}
|
|
|
|
// Flush the currently cached character, then fall through.
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
|
|
// Yes, we really want to fall through to the AfterSetRange case to switch from Default to Union op
|
|
// and then handle the built in class by falling through again.
|
|
FALLTHROUGH;
|
|
|
|
case ClassSetConstructionState::AfterSetRange:
|
|
switchFromDefaultOpToUnionOpIfNeeded();
|
|
|
|
// Continue processing the current character.
|
|
FALLTHROUGH;
|
|
|
|
case ClassSetConstructionState::Empty:
|
|
case ClassSetConstructionState::AfterCharacterClass:
|
|
case ClassSetConstructionState::AfterSetOperator:
|
|
processBuiltInCharacterClass();
|
|
return;
|
|
|
|
// If we hit either of these cases, we have an invalid range that
|
|
// looks something like /[a-\d]/ or /[\d-\d]/.
|
|
// Since ES2015, this should be syntax error in a unicode pattern,
|
|
// yet gracefully handled in a regular regex to avoid breaking the web.
|
|
// Effectively we handle the hyphen as if it was (implicitly) escaped,
|
|
// e.g. /[\d-a-z]/ is treated as /[\d\-a\-z]/.
|
|
// See usages of CharacterRangeOrUnion abstract op in
|
|
// https://tc39.es/ecma262/#sec-regular-expression-patterns-semantics
|
|
case ClassSetConstructionState::CachedCharacterHyphen:
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
m_delegate.atomCharacterClassAtom('-');
|
|
FALLTHROUGH;
|
|
case ClassSetConstructionState::AfterCharacterClassHyphen:
|
|
m_errorCode = ErrorCode::CharacterClassRangeInvalid;
|
|
return;
|
|
|
|
case ClassSetConstructionState::AfterSetOperand:
|
|
if (!unionOpActive)
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
|
|
processBuiltInCharacterClass();
|
|
return;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* end():
|
|
*
|
|
* Called at end of construction.
|
|
*/
|
|
void end()
|
|
{
|
|
if (m_state == ClassSetConstructionState::CachedCharacter)
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
else if (m_state == ClassSetConstructionState::CachedCharacterHyphen) {
|
|
m_delegate.atomCharacterClassAtom(m_character);
|
|
m_delegate.atomCharacterClassAtom('-');
|
|
} else if (m_state == ClassSetConstructionState::AfterSetOperator)
|
|
m_errorCode = ErrorCode::InvalidClassSetCharacter;
|
|
|
|
if (isInverted() && m_mayContainStrings)
|
|
m_errorCode = ErrorCode::NegatedClassSetMayContainStrings;
|
|
|
|
m_delegate.atomCharacterClassEnd();
|
|
}
|
|
|
|
bool isInverted() { return m_inverted; }
|
|
|
|
ErrorCode error() { return m_errorCode; }
|
|
|
|
// parseEscape() should never call these delegate methods when
|
|
// invoked with inCharacterClass set.
|
|
NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
|
|
private:
|
|
Delegate& m_delegate;
|
|
ErrorCode& m_errorCode;
|
|
enum class ClassSetConstructionState {
|
|
Empty,
|
|
CachedCharacter,
|
|
CachedCharacterHyphen,
|
|
AfterCharacterClass,
|
|
AfterCharacterClassHyphen,
|
|
AfterSetRange,
|
|
AfterSetOperand,
|
|
AfterSetOperator,
|
|
};
|
|
ClassSetConstructionState m_state;
|
|
CharacterClassSetOp m_setOp;
|
|
bool m_mayContainStrings;
|
|
bool m_inverted;
|
|
bool m_processingEscape;
|
|
char32_t m_character;
|
|
Vector<NestingState> nestedParseState;
|
|
};
|
|
|
|
/*
|
|
* ClassStringDisjunctionParserDelegate:
|
|
*
|
|
* The class ClassStringDisjunctionParserDelegate is used in the parsing of class string disjunctions,
|
|
* e.g \q{...}. This class builds strings from the alternatives and passes them on to
|
|
* character class delegate.
|
|
*/
|
|
class ClassStringDisjunctionParserDelegate {
|
|
public:
|
|
ClassStringDisjunctionParserDelegate(Delegate& delegate, ErrorCode& err)
|
|
: m_delegate(delegate)
|
|
, m_mayContainStrings(false)
|
|
, m_errorCode(err)
|
|
{
|
|
}
|
|
|
|
void atomPatternCharacter(char32_t ch, bool = false)
|
|
{
|
|
m_stringInProgress.append(ch);
|
|
if (m_stringInProgress.size() > 1)
|
|
m_mayContainStrings = true;
|
|
}
|
|
|
|
void newAlternative()
|
|
{
|
|
m_strings.append(m_stringInProgress);
|
|
m_stringInProgress.clear();
|
|
}
|
|
|
|
/*
|
|
* end():
|
|
*
|
|
* Called at end of construction.
|
|
*/
|
|
void end()
|
|
{
|
|
newAlternative();
|
|
m_delegate.atomClassStringDisjunction(m_strings);
|
|
}
|
|
|
|
bool mayContainStrings() { return m_mayContainStrings; }
|
|
|
|
// parseEscape() should never call these delegate methods when parsing a class string disjunction.
|
|
NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
NO_RETURN_DUE_TO_ASSERT void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) { RELEASE_ASSERT_NOT_REACHED(); }
|
|
|
|
private:
|
|
Delegate& m_delegate;
|
|
bool m_mayContainStrings;
|
|
ErrorCode& m_errorCode;
|
|
Vector<char32_t> m_stringInProgress;
|
|
Vector<Vector<char32_t>> m_strings;
|
|
};
|
|
|
|
// The handling of IdentityEscapes is different depending on which unicode flag if any is active.
|
|
// For both Unicode and UnicodeSet patterns, IdentityEscapes only include SyntaxCharacters or '/'.
|
|
// For UnicodeSet patterns when parsing ClassSet expressions and ClassStringDisjunctions, escapes include SyntaxCharacters, '/'
|
|
// and ClassSetReservedPunctionation, which is any of &-!#%,:;<=>@`~
|
|
// For non-unicode patterns, most any character can be escaped.
|
|
template<ParseEscapeMode parseEscapeMode>
|
|
bool isIdentityEscapeAnError(char32_t ch)
|
|
{
|
|
if (isEitherUnicodeCompilation()
|
|
&& ((isASCII(ch) && !strchr((parseEscapeMode == ParseEscapeMode::ClassSet || parseEscapeMode == ParseEscapeMode::ClassStringDisjunction) ? "^$\\.*+?()[]{}|/&-!#%,:;<=>@`~" : "^$\\.*+?()[]{}|/", ch)) || !ch)) {
|
|
m_errorCode = ErrorCode::InvalidIdentityEscape;
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* parseEscape():
|
|
*
|
|
* Helper for parseTokens(), parseAtomEscape(), parseCharacterClassEscape(),
|
|
* parseClassSetEscape() and parseClassStringDisjunctionEscape().
|
|
*
|
|
* Unlike the other parser methods, this function does not report tokens
|
|
* directly to the member delegate (m_delegate), instead tokens are
|
|
* emitted to the delegate provided as an argument. In the case of atom
|
|
* escapes, parseTokens() will call parseEscape() passing m_delegate as
|
|
* an argument, and as such the escape will be reported to the delegate.
|
|
*
|
|
* However this method may also be used by parseCharacterClass(), parseClassSet(), or
|
|
* parseClassStringDisjunctionEscape() in which case a CharacterClassParserDelegate,
|
|
* ClassSetParserDelegate or ClassStringDisjunctionParserDelegate respectively will be
|
|
* passed as the delegate that tokens should be added to. Delegate should have the
|
|
* following methods:
|
|
*
|
|
* Required methods:
|
|
* void atomPatternCharacter(char32_t ch);
|
|
*
|
|
* Optional methods based on parseEscapeMode:
|
|
* void assertionWordBoundary(bool invert);
|
|
* void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
|
|
* void atomBackReference(unsigned subpatternId);
|
|
* void atomNamedBackReference(const String& subpatternName);
|
|
* void atomNamedForwardReference(const String& subpatternName);
|
|
*/
|
|
template<ParseEscapeMode parseEscapeMode, class EscapeDelegate>
|
|
TokenType parseEscape(EscapeDelegate& delegate)
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
ASSERT(peek() == '\\');
|
|
consume();
|
|
|
|
if (atEndOfPattern()) {
|
|
m_errorCode = ErrorCode::EscapeUnterminated;
|
|
return TokenType::NotAtom;
|
|
}
|
|
|
|
switch (peek()) {
|
|
// Assertions
|
|
case 'b':
|
|
consume();
|
|
if (parseEscapeMode != ParseEscapeMode::Normal)
|
|
delegate.atomPatternCharacter('\b');
|
|
else {
|
|
delegate.assertionWordBoundary(false);
|
|
return TokenType::NotAtom;
|
|
}
|
|
break;
|
|
case 'B':
|
|
consume();
|
|
if (parseEscapeMode != ParseEscapeMode::Normal) {
|
|
if (isIdentityEscapeAnError<parseEscapeMode>('B'))
|
|
break;
|
|
|
|
delegate.atomPatternCharacter('B');
|
|
} else {
|
|
delegate.assertionWordBoundary(true);
|
|
return TokenType::NotAtom;
|
|
}
|
|
break;
|
|
|
|
// CharacterClassEscape
|
|
case 'd':
|
|
consume();
|
|
if (parseEscapeMode == ParseEscapeMode::ClassStringDisjunction) {
|
|
delegate.atomPatternCharacter('d');
|
|
break;
|
|
}
|
|
delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false);
|
|
break;
|
|
case 's':
|
|
consume();
|
|
if (parseEscapeMode == ParseEscapeMode::ClassStringDisjunction) {
|
|
delegate.atomPatternCharacter('s');
|
|
break;
|
|
}
|
|
delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false);
|
|
break;
|
|
case 'w':
|
|
consume();
|
|
if (parseEscapeMode == ParseEscapeMode::ClassStringDisjunction) {
|
|
delegate.atomPatternCharacter('w');
|
|
break;
|
|
}
|
|
delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false);
|
|
break;
|
|
case 'D':
|
|
consume();
|
|
if (parseEscapeMode == ParseEscapeMode::ClassStringDisjunction) {
|
|
delegate.atomPatternCharacter('D');
|
|
break;
|
|
}
|
|
delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true);
|
|
break;
|
|
case 'S':
|
|
consume();
|
|
if (parseEscapeMode == ParseEscapeMode::ClassStringDisjunction) {
|
|
delegate.atomPatternCharacter('S');
|
|
break;
|
|
}
|
|
delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true);
|
|
break;
|
|
case 'W':
|
|
consume();
|
|
if (parseEscapeMode == ParseEscapeMode::ClassStringDisjunction) {
|
|
delegate.atomPatternCharacter('W');
|
|
break;
|
|
}
|
|
delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true);
|
|
break;
|
|
|
|
case '0': {
|
|
consume();
|
|
|
|
if (!peekIsDigit()) {
|
|
delegate.atomPatternCharacter(0);
|
|
break;
|
|
}
|
|
|
|
if (isEitherUnicodeCompilation()) {
|
|
m_errorCode = ErrorCode::InvalidOctalEscape;
|
|
break;
|
|
}
|
|
|
|
delegate.atomPatternCharacter(consumeOctal(2));
|
|
break;
|
|
}
|
|
|
|
// DecimalEscape
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
case '8':
|
|
case '9': {
|
|
// For non-Unicode patterns, invalid backreferences are parsed as octal or decimal escapes.
|
|
// First, try to parse this as backreference.
|
|
if (parseEscapeMode == ParseEscapeMode::Normal) {
|
|
ParseState state = saveState();
|
|
|
|
unsigned backReference = consumeNumber();
|
|
if (backReference <= m_backReferenceLimit) {
|
|
m_maxSeenBackReference = std::max(m_maxSeenBackReference, backReference);
|
|
delegate.atomBackReference(backReference);
|
|
break;
|
|
}
|
|
|
|
restoreState(state);
|
|
if (isEitherUnicodeCompilation()) {
|
|
m_errorCode = ErrorCode::InvalidBackreference;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (isEitherUnicodeCompilation()) {
|
|
m_errorCode = ErrorCode::InvalidOctalEscape;
|
|
break;
|
|
}
|
|
|
|
delegate.atomPatternCharacter(peek() < '8' ? consumeOctal(3) : consume());
|
|
break;
|
|
}
|
|
|
|
// ControlEscape
|
|
case 'f':
|
|
consume();
|
|
delegate.atomPatternCharacter('\f');
|
|
break;
|
|
case 'n':
|
|
consume();
|
|
delegate.atomPatternCharacter('\n');
|
|
break;
|
|
case 'r':
|
|
consume();
|
|
delegate.atomPatternCharacter('\r');
|
|
break;
|
|
case 't':
|
|
consume();
|
|
delegate.atomPatternCharacter('\t');
|
|
break;
|
|
case 'v':
|
|
consume();
|
|
delegate.atomPatternCharacter('\v');
|
|
break;
|
|
|
|
// ControlLetter
|
|
case 'c': {
|
|
ParseState state = saveState();
|
|
consume();
|
|
if (!atEndOfPattern()) {
|
|
char32_t control = consume();
|
|
|
|
if (WTF::isASCIIAlpha(control)) {
|
|
delegate.atomPatternCharacter(control & 0x1f);
|
|
break;
|
|
}
|
|
|
|
if (isEitherUnicodeCompilation()) {
|
|
m_errorCode = ErrorCode::InvalidControlLetterEscape;
|
|
break;
|
|
}
|
|
|
|
// https://tc39.es/ecma262/#prod-annexB-ClassControlLetter
|
|
if (parseEscapeMode != ParseEscapeMode::Normal && (WTF::isASCIIDigit(control) || control == '_')) {
|
|
delegate.atomPatternCharacter(control & 0x1f);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (isEitherUnicodeCompilation()) {
|
|
m_errorCode = ErrorCode::InvalidIdentityEscape;
|
|
break;
|
|
}
|
|
|
|
restoreState(state);
|
|
delegate.atomPatternCharacter('\\');
|
|
break;
|
|
}
|
|
|
|
// HexEscape
|
|
case 'x': {
|
|
consume();
|
|
char32_t x = tryConsumeHex(2);
|
|
if (x == errorCodePoint) {
|
|
if (isIdentityEscapeAnError<parseEscapeMode>('x'))
|
|
break;
|
|
|
|
delegate.atomPatternCharacter('x');
|
|
} else
|
|
delegate.atomPatternCharacter(x);
|
|
break;
|
|
}
|
|
|
|
// Named backreference
|
|
case 'k': {
|
|
consume();
|
|
ParseState state = saveState();
|
|
if (parseEscapeMode == ParseEscapeMode::Normal && tryConsume('<')) {
|
|
auto groupName = tryConsumeGroupName();
|
|
if (hasError(m_errorCode))
|
|
break;
|
|
|
|
if (groupName) {
|
|
if (m_namedCaptureGroups.contains(groupName.value())) {
|
|
delegate.atomNamedBackReference(groupName.value());
|
|
break;
|
|
}
|
|
|
|
if (m_isNamedForwardReferenceAllowed) {
|
|
m_forwardReferenceNames.add(groupName.value());
|
|
delegate.atomNamedForwardReference(groupName.value());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
restoreState(state);
|
|
if (!isIdentityEscapeAnError<parseEscapeMode>('k')) {
|
|
delegate.atomPatternCharacter('k');
|
|
m_kIdentityEscapeSeen = true;
|
|
}
|
|
break;
|
|
}
|
|
|
|
// Unicode property escapes
|
|
case 'p':
|
|
case 'P': {
|
|
char32_t escapeChar = consume();
|
|
|
|
if (isLegacyCompilation() || parseEscapeMode == ParseEscapeMode::ClassStringDisjunction) {
|
|
if (isIdentityEscapeAnError<parseEscapeMode>(escapeChar))
|
|
break;
|
|
delegate.atomPatternCharacter(escapeChar);
|
|
break;
|
|
}
|
|
|
|
if (!atEndOfPattern() && peek() == '{') {
|
|
consume();
|
|
auto optClassID = tryConsumeUnicodePropertyExpression();
|
|
if (!optClassID) {
|
|
// tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression
|
|
break;
|
|
}
|
|
|
|
if (escapeChar == 'P' && characterClassMayContainStrings(optClassID.value())) {
|
|
m_errorCode = ErrorCode::NegatedClassSetMayContainStrings;
|
|
break;
|
|
}
|
|
|
|
delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P');
|
|
} else
|
|
m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
|
|
break;
|
|
}
|
|
|
|
// Class String Disjunction
|
|
case 'q': {
|
|
char32_t escapeChar = consume();
|
|
|
|
if (parseEscapeMode == ParseEscapeMode::ClassSet) {
|
|
if (!atEndOfPattern() && peek() == '{') {
|
|
bool disjunctionMayContainStrings = false;
|
|
parseClassStringDisjunction(disjunctionMayContainStrings);
|
|
|
|
return disjunctionMayContainStrings ? TokenType::SetDisjunctionMayContainStrings : TokenType::SetDisjunction;
|
|
}
|
|
|
|
m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
|
|
}
|
|
|
|
if (isIdentityEscapeAnError<parseEscapeMode>(escapeChar))
|
|
break;
|
|
|
|
delegate.atomPatternCharacter(escapeChar);
|
|
break;
|
|
}
|
|
|
|
// UnicodeEscape
|
|
case 'u': {
|
|
char32_t codePoint = tryConsumeUnicodeEscape<UnicodeParseContext::PatternCodePoint>();
|
|
if (hasError(m_errorCode))
|
|
break;
|
|
|
|
delegate.atomPatternCharacter(codePoint == errorCodePoint ? 'u' : codePoint);
|
|
break;
|
|
}
|
|
|
|
// IdentityEscape
|
|
default:
|
|
char32_t ch = peek();
|
|
|
|
if (ch == '-' && isEitherUnicodeCompilation() && parseEscapeMode != ParseEscapeMode::Normal) {
|
|
// \- is allowed for ClassEscape with unicode flag.
|
|
delegate.atomPatternCharacter(consume());
|
|
break;
|
|
}
|
|
|
|
if (isIdentityEscapeAnError<parseEscapeMode>(ch))
|
|
break;
|
|
|
|
delegate.atomPatternCharacter(consume());
|
|
}
|
|
|
|
return TokenType::Atom;
|
|
}
|
|
|
|
template<UnicodeParseContext context>
|
|
char32_t consumePossibleSurrogatePair()
|
|
{
|
|
bool unicodePatternOrGroupName = isEitherUnicodeCompilation() || context == UnicodeParseContext::GroupName;
|
|
|
|
char32_t ch = consume();
|
|
if (U16_IS_LEAD(ch) && unicodePatternOrGroupName && !atEndOfPattern()) {
|
|
ParseState state = saveState();
|
|
|
|
char32_t surrogate2 = consume();
|
|
if (U16_IS_TRAIL(surrogate2))
|
|
ch = U16_GET_SUPPLEMENTARY(ch, surrogate2);
|
|
else
|
|
restoreState(state);
|
|
}
|
|
|
|
return ch;
|
|
}
|
|
|
|
inline char32_t consumeAndCheckIfValidClassSetCharacter()
|
|
{
|
|
char32_t ch = consumePossibleSurrogatePair<UnicodeParseContext::PatternCodePoint>();
|
|
|
|
if (!ch) {
|
|
m_errorCode = ErrorCode::InvalidClassSetCharacter;
|
|
return errorCodePoint;
|
|
}
|
|
|
|
if (isASCII(ch)) {
|
|
// Check if the character is part of ClassSetSyntaxCharacter.
|
|
// We leave handling of - and \ to the caller.
|
|
if (strchr("()[]{}/|)", ch)) {
|
|
m_errorCode = ErrorCode::InvalidClassSetCharacter;
|
|
return errorCodePoint;
|
|
}
|
|
|
|
// Check if the current character and the next are part of ClassSetReservedDoublePunctuator.
|
|
if (!atEndOfPattern()) {
|
|
char32_t nextCh = peek();
|
|
if (ch == nextCh && strchr("&!#$%*+,.:;<=>?@^`~", ch)) {
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
return errorCodePoint;
|
|
}
|
|
}
|
|
}
|
|
|
|
return ch;
|
|
}
|
|
|
|
/*
|
|
* parseAtomEscape(), parseCharacterClassEscape(), parseClassSetEscape() and parseClassStringDisjunctionEscape():
|
|
*
|
|
* These methods alias to parseEscape().
|
|
*/
|
|
TokenType parseAtomEscape()
|
|
{
|
|
return parseEscape<ParseEscapeMode::Normal>(m_delegate);
|
|
}
|
|
|
|
// Parses an escape inside a legacy [...] character class, reporting it to the
// given delegate. The token type produced by parseEscape() is discarded.
void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
{
    (void)parseEscape<ParseEscapeMode::CharacterClass>(delegate);
}
|
|
|
|
// Parses an escape inside a class set, reporting it to the given delegate and
// returning the token type produced by parseEscape().
TokenType parseClassSetEscape(ClassSetParserDelegate& delegate)
{
    TokenType tokenType = parseEscape<ParseEscapeMode::ClassSet>(delegate);
    return tokenType;
}
|
|
|
|
// Parses an escape inside \q{...}, reporting it to the given delegate.
// The token type produced by parseEscape() is discarded.
void parseClassStringDisjunctionEscape(ClassStringDisjunctionParserDelegate& delegate)
{
    (void)parseEscape<ParseEscapeMode::ClassStringDisjunction>(delegate);
}
|
|
|
|
/*
|
|
* parseCharacterClass():
|
|
*
|
|
* Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
|
|
* to an instance of CharacterClassParserDelegate, to describe the character class to the
|
|
* delegate.
|
|
*/
|
|
void parseCharacterClass()
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
ASSERT(peek() == '[');
|
|
consume();
|
|
|
|
CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode, m_compileMode);
|
|
|
|
characterClassConstructor.begin(tryConsume('^'));
|
|
|
|
while (!atEndOfPattern()) {
|
|
switch (peek()) {
|
|
case ']':
|
|
consume();
|
|
characterClassConstructor.end();
|
|
return;
|
|
|
|
case '\\':
|
|
parseCharacterClassEscape(characterClassConstructor);
|
|
break;
|
|
|
|
default:
|
|
characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair<UnicodeParseContext::PatternCodePoint>(), true);
|
|
}
|
|
|
|
if (hasError(m_errorCode))
|
|
return;
|
|
}
|
|
|
|
m_errorCode = ErrorCode::CharacterClassUnmatched;
|
|
}
|
|
|
|
/*
|
|
* parseClassSet():
|
|
*
|
|
* Helper for parseTokens() calls directly and indirectly (via parseClassSetEscape)
|
|
* to an instance of CharacterClassParserDelegate, to describe the character class to the
|
|
* delegate.
|
|
*/
|
|
void parseClassSet()
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
ASSERT(peek() == '[');
|
|
consume();
|
|
|
|
ClassSetParserDelegate classSetConstructor(m_delegate, m_errorCode);
|
|
|
|
classSetConstructor.begin(tryConsume('^'));
|
|
|
|
auto processCharacterNormally = [&] () {
|
|
char32_t ch = consumeAndCheckIfValidClassSetCharacter();
|
|
if (ch == errorCodePoint)
|
|
return;
|
|
|
|
classSetConstructor.atomPatternCharacter(static_cast<char32_t>(ch));
|
|
};
|
|
|
|
while (!atEndOfPattern()) {
|
|
switch (peek()) {
|
|
case ']':
|
|
consume();
|
|
if (classSetConstructor.nestedClassEnd())
|
|
return;
|
|
break;
|
|
|
|
case '[': {
|
|
consume();
|
|
classSetConstructor.nestedClassBegin(tryConsume('^'));
|
|
break;
|
|
}
|
|
|
|
case '\\': {
|
|
if (!classSetConstructor.canTakeSetOperand()) {
|
|
m_errorCode = ErrorCode::InvalidClassSetOperation;
|
|
return;
|
|
}
|
|
|
|
classSetConstructor.setProcessingEscape();
|
|
|
|
TokenType tokenType = parseClassSetEscape(classSetConstructor);
|
|
|
|
classSetConstructor.computeMayContainStrings(tokenType == TokenType::SetDisjunctionMayContainStrings);
|
|
|
|
if (tokenType == TokenType::SetDisjunction || tokenType == TokenType::SetDisjunctionMayContainStrings)
|
|
classSetConstructor.afterSetOperand();
|
|
|
|
break;
|
|
}
|
|
|
|
case '-': {
|
|
ParseState state = saveState();
|
|
consume();
|
|
if (atEndOfPattern()) {
|
|
m_errorCode = ErrorCode::CharacterClassUnmatched;
|
|
return;
|
|
}
|
|
if (peek() == '-') {
|
|
consume();
|
|
if (atEndOfPattern() || peek() == '-') {
|
|
m_errorCode = ErrorCode::InvalidClassSetCharacter;
|
|
return;
|
|
}
|
|
classSetConstructor.setSubtractOp();
|
|
break;
|
|
}
|
|
restoreState(state);
|
|
processCharacterNormally();
|
|
break;
|
|
}
|
|
|
|
case '&': {
|
|
ParseState state = saveState();
|
|
consume();
|
|
if (atEndOfPattern()) {
|
|
m_errorCode = ErrorCode::CharacterClassUnmatched;
|
|
return;
|
|
}
|
|
if (peek() == '&') {
|
|
consume();
|
|
if (atEndOfPattern() || peek() == '&') {
|
|
m_errorCode = ErrorCode::InvalidClassSetCharacter;
|
|
return;
|
|
}
|
|
classSetConstructor.setIntersectionOp();
|
|
break;
|
|
}
|
|
restoreState(state);
|
|
processCharacterNormally();
|
|
break;
|
|
}
|
|
|
|
default:
|
|
processCharacterNormally();
|
|
break;
|
|
}
|
|
|
|
if (hasError(m_errorCode))
|
|
return;
|
|
}
|
|
|
|
m_errorCode = ErrorCode::CharacterClassUnmatched;
|
|
}
|
|
|
|
/*
|
|
* parseClassStringDisjunction():
|
|
*
|
|
* Helper for parseTokens() calls directly and indirectly (via parseClassStringDisjunctionEscape)
|
|
* to an instance of ClassStringDisjunctionParserDelegate, to describe the Class String Disjunction to the
|
|
* delegate.
|
|
*/
|
|
void parseClassStringDisjunction(bool &disjunctionMayContainStrings)
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
ASSERT(peek() == '{');
|
|
consume();
|
|
|
|
ClassStringDisjunctionParserDelegate stringDisjunctionDelegate(m_delegate, m_errorCode);
|
|
|
|
while (!atEndOfPattern()) {
|
|
switch (peek()) {
|
|
case '}':
|
|
consume();
|
|
stringDisjunctionDelegate.end();
|
|
disjunctionMayContainStrings = stringDisjunctionDelegate.mayContainStrings();
|
|
return;
|
|
|
|
case '\\':
|
|
parseClassStringDisjunctionEscape(stringDisjunctionDelegate);
|
|
break;
|
|
|
|
case '|':
|
|
consume();
|
|
stringDisjunctionDelegate.newAlternative();
|
|
break;
|
|
|
|
case '-':
|
|
consume();
|
|
m_errorCode = ErrorCode::InvalidClassSetCharacter;
|
|
return;
|
|
|
|
default: {
|
|
char32_t ch = consumeAndCheckIfValidClassSetCharacter();
|
|
|
|
if (ch == errorCodePoint)
|
|
return;
|
|
|
|
stringDisjunctionDelegate.atomPatternCharacter(static_cast<char32_t>(ch));
|
|
}
|
|
}
|
|
|
|
if (hasError(m_errorCode))
|
|
return;
|
|
}
|
|
|
|
m_errorCode = ErrorCode::ClassStringDisjunctionUnmatched;
|
|
}
|
|
|
|
/*
|
|
* parseParenthesesBegin():
|
|
*
|
|
* Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
|
|
*/
|
|
void parseParenthesesBegin()
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
ASSERT(peek() == '(');
|
|
consume();
|
|
|
|
auto type = ParenthesesType::Subpattern;
|
|
|
|
if (tryConsume('?')) {
|
|
if (atEndOfPattern()) {
|
|
m_errorCode = ErrorCode::ParenthesesTypeInvalid;
|
|
return;
|
|
}
|
|
|
|
switch (peek()) {
|
|
case ':':
|
|
consume();
|
|
m_delegate.atomParenthesesSubpatternBegin(false);
|
|
break;
|
|
|
|
case '=':
|
|
consume();
|
|
m_delegate.atomParentheticalAssertionBegin(false, Forward);
|
|
type = ParenthesesType::Assertion;
|
|
break;
|
|
|
|
case '!':
|
|
consume();
|
|
m_delegate.atomParentheticalAssertionBegin(true, Forward);
|
|
type = ParenthesesType::Assertion;
|
|
break;
|
|
|
|
case '<': {
|
|
consume();
|
|
auto groupName = tryConsumeGroupName();
|
|
if (hasError(m_errorCode))
|
|
break;
|
|
|
|
if (groupName) {
|
|
if (m_kIdentityEscapeSeen) {
|
|
m_errorCode = ErrorCode::InvalidNamedBackReference;
|
|
break;
|
|
}
|
|
|
|
auto setAddResult = m_namedCaptureGroups.add(groupName.value());
|
|
if (setAddResult.isNewEntry)
|
|
m_delegate.atomParenthesesSubpatternBegin(true, groupName);
|
|
else
|
|
m_errorCode = ErrorCode::DuplicateGroupName;
|
|
} else {
|
|
if (tryConsume('=')) {
|
|
m_delegate.atomParentheticalAssertionBegin(false, Backward);
|
|
type = ParenthesesType::LookbehindAssertion;
|
|
break;
|
|
}
|
|
|
|
if (tryConsume('!')) {
|
|
m_delegate.atomParentheticalAssertionBegin(true, Backward);
|
|
type = ParenthesesType::LookbehindAssertion;
|
|
break;
|
|
}
|
|
m_errorCode = ErrorCode::InvalidGroupName;
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
#define REGEXP_MOD_CASE(key, name, lowerCaseName) \
|
|
case key:
|
|
|
|
// Valid RegularExpressionFlags for regexp modifiers
|
|
case '-':
|
|
JSC_REGEXP_MOD_FLAGS(REGEXP_MOD_CASE)
|
|
|
|
#undef REGEXP_MOD_CASE
|
|
{
|
|
// consume characters until :
|
|
OptionSet<Flags> set;
|
|
OptionSet<Flags> unset;
|
|
bool hasHitNegation = false;
|
|
char32_t c;
|
|
while (!atEndOfPattern() && (c = consume()) != ':') {
|
|
switch (c) {
|
|
case '-':
|
|
if (hasHitNegation)
|
|
m_errorCode = ErrorCode::InvalidRegularExpressionModifier;
|
|
hasHitNegation = true;
|
|
break;
|
|
|
|
// It is a Syntax Error if the source text matched by RegularExpressionModifiers contains the same code point more than once
|
|
#define HANDLE_REGEXP_MOD_FLAG(key, name, lowerCaseName) \
|
|
case key: \
|
|
if (hasHitNegation) { \
|
|
if (unset.contains(Flags::name)) \
|
|
m_errorCode = ErrorCode::InvalidRegularExpressionModifier; \
|
|
unset.add(Flags::name); \
|
|
} else { \
|
|
if (set.contains(Flags::name)) \
|
|
m_errorCode = ErrorCode::InvalidRegularExpressionModifier; \
|
|
set.add(Flags::name); \
|
|
} \
|
|
break;
|
|
|
|
JSC_REGEXP_MOD_FLAGS(HANDLE_REGEXP_MOD_FLAG)
|
|
#undef HANDLE_REGEXP_MOD_FLAG
|
|
|
|
default:
|
|
m_errorCode = ErrorCode::ParenthesesTypeInvalid;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (hasError(m_errorCode))
|
|
break;
|
|
|
|
// we've consumed (?<flags>:
|
|
|
|
// It is a Syntax Error if any code point in the source text matched by the first RegularExpressionModifiers is also contained in the source text matched by the second RegularExpressionModifiers.
|
|
if (set.containsAny(unset))
|
|
m_errorCode = ErrorCode::InvalidRegularExpressionModifier;
|
|
// It is a Syntax Error if the source text matched by the first RegularExpressionModifiers and the source text matched by the second RegularExpressionModifiers are both empty.
|
|
if (set.isEmpty() && unset.isEmpty())
|
|
m_errorCode = ErrorCode::InvalidRegularExpressionModifier;
|
|
m_delegate.atomParentheticalModifierBegin(set, unset);
|
|
|
|
break;
|
|
}
|
|
|
|
default:
|
|
m_errorCode = ErrorCode::ParenthesesTypeInvalid;
|
|
}
|
|
} else
|
|
m_delegate.atomParenthesesSubpatternBegin();
|
|
|
|
if (type == ParenthesesType::Subpattern)
|
|
++m_numSubpatterns;
|
|
|
|
m_parenthesesStack.append(type);
|
|
m_namedCaptureGroups.pushParenthesis();
|
|
}
|
|
|
|
/*
|
|
* parseParenthesesEnd():
|
|
*
|
|
* Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
|
|
*
|
|
* The boolean value returned by this method indicates whether the token parsed
|
|
* was either an Atom or, for web compatibility reasons, QuantifiableAssertion
|
|
* in non-Unicode pattern.
|
|
*/
|
|
TokenType parseParenthesesEnd()
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
ASSERT(peek() == ')');
|
|
consume();
|
|
|
|
if (m_parenthesesStack.isEmpty()) {
|
|
m_errorCode = ErrorCode::ParenthesesUnmatched;
|
|
return TokenType::NotAtom;
|
|
}
|
|
|
|
m_delegate.atomParenthesesEnd();
|
|
|
|
m_namedCaptureGroups.popParenthesis();
|
|
|
|
auto type = m_parenthesesStack.takeLast();
|
|
if (type == ParenthesesType::LookbehindAssertion)
|
|
return TokenType::Lookbehind;
|
|
|
|
if (type == ParenthesesType::Subpattern || isLegacyCompilation())
|
|
return TokenType::Atom;
|
|
|
|
return TokenType::NotAtom;
|
|
}
|
|
|
|
/*
|
|
* parseQuantifier():
|
|
*
|
|
* Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
|
|
*/
|
|
void parseQuantifier(TokenType lastTokenType, unsigned min, unsigned max)
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
ASSERT(min <= max);
|
|
|
|
if (lastTokenType == TokenType::Atom)
|
|
m_delegate.quantifyAtom(min, max, !tryConsume('?'));
|
|
else if (lastTokenType == TokenType::Lookbehind)
|
|
m_errorCode = ErrorCode::CantQuantifyAtom;
|
|
else
|
|
m_errorCode = ErrorCode::QuantifierWithoutAtom;
|
|
}
|
|
|
|
/*
|
|
* parseTokens():
|
|
*
|
|
* This method loops over the input pattern reporting tokens to the delegate.
|
|
* The method returns when a parse error is detected, or the end of the pattern
|
|
* is reached. One piece of state is tracked around the loop, which is whether
|
|
* the last token passed to the delegate was an atom (this is necessary to detect
|
|
* a parse error when a quantifier provided without an atom to quantify).
|
|
*/
|
|
void parseTokens()
|
|
{
|
|
TokenType lastTokenType = TokenType::NotAtom;
|
|
|
|
while (!atEndOfPattern()) {
|
|
switch (peek()) {
|
|
case '|':
|
|
consume();
|
|
m_delegate.disjunction(CreateDisjunctionPurpose::ForNextAlternative);
|
|
lastTokenType = TokenType::NotAtom;
|
|
m_namedCaptureGroups.nextAlternative();
|
|
break;
|
|
|
|
case '(':
|
|
parseParenthesesBegin();
|
|
lastTokenType = TokenType::NotAtom;
|
|
break;
|
|
|
|
case ')':
|
|
lastTokenType = parseParenthesesEnd();
|
|
break;
|
|
|
|
case '^':
|
|
consume();
|
|
m_delegate.assertionBOL();
|
|
lastTokenType = TokenType::NotAtom;
|
|
break;
|
|
|
|
case '$':
|
|
consume();
|
|
m_delegate.assertionEOL();
|
|
lastTokenType = TokenType::NotAtom;
|
|
break;
|
|
|
|
case '.':
|
|
consume();
|
|
m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false);
|
|
lastTokenType = TokenType::Atom;
|
|
break;
|
|
|
|
case '[':
|
|
if (isUnicodeSetsCompilation())
|
|
parseClassSet();
|
|
else
|
|
parseCharacterClass();
|
|
lastTokenType = TokenType::Atom;
|
|
break;
|
|
|
|
case ']':
|
|
case '}':
|
|
if (isEitherUnicodeCompilation()) {
|
|
m_errorCode = ErrorCode::BracketUnmatched;
|
|
break;
|
|
}
|
|
|
|
m_delegate.atomPatternCharacter(consume());
|
|
lastTokenType = TokenType::Atom;
|
|
break;
|
|
|
|
case '\\':
|
|
lastTokenType = parseAtomEscape();
|
|
break;
|
|
|
|
case '*':
|
|
consume();
|
|
parseQuantifier(lastTokenType, 0, quantifyInfinite);
|
|
lastTokenType = TokenType::NotAtom;
|
|
break;
|
|
|
|
case '+':
|
|
consume();
|
|
parseQuantifier(lastTokenType, 1, quantifyInfinite);
|
|
lastTokenType = TokenType::NotAtom;
|
|
break;
|
|
|
|
case '?':
|
|
consume();
|
|
parseQuantifier(lastTokenType, 0, 1);
|
|
lastTokenType = TokenType::NotAtom;
|
|
break;
|
|
|
|
case '{': {
|
|
ParseState state = saveState();
|
|
|
|
consume();
|
|
if (peekIsDigit()) {
|
|
uint64_t min = consumeNumber64();
|
|
uint64_t max = min;
|
|
|
|
if (tryConsume(','))
|
|
max = peekIsDigit() ? consumeNumber64() : quantifyInfinite64;
|
|
|
|
if (tryConsume('}')) {
|
|
if (min == quantifyInfinite64) {
|
|
m_errorCode = ErrorCode::QuantifierTooLarge;
|
|
} else if (min <= max) {
|
|
min = std::min<uint64_t>(min, quantifyInfinite);
|
|
max = std::min<uint64_t>(max, quantifyInfinite);
|
|
parseQuantifier(lastTokenType, static_cast<unsigned>(min), static_cast<unsigned>(max));
|
|
} else
|
|
m_errorCode = ErrorCode::QuantifierOutOfOrder;
|
|
lastTokenType = TokenType::NotAtom;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (isEitherUnicodeCompilation()) {
|
|
m_errorCode = ErrorCode::QuantifierIncomplete;
|
|
break;
|
|
}
|
|
|
|
restoreState(state);
|
|
// if we did not find a complete quantifer, fall through to the default case.
|
|
FALLTHROUGH;
|
|
}
|
|
|
|
default:
|
|
m_delegate.atomPatternCharacter(consumePossibleSurrogatePair<UnicodeParseContext::PatternCodePoint>());
|
|
lastTokenType = TokenType::Atom;
|
|
}
|
|
|
|
if (hasError(m_errorCode))
|
|
return;
|
|
|
|
if (m_delegate.abortedDueToError()) {
|
|
m_errorCode = m_delegate.abortErrorCode();
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (!m_parenthesesStack.isEmpty())
|
|
m_errorCode = ErrorCode::MissingParentheses;
|
|
}
|
|
|
|
void handleIllegalReferences()
|
|
{
|
|
bool shouldReparse = false;
|
|
|
|
if (m_maxSeenBackReference > m_numSubpatterns) {
|
|
// Contains illegal numeric backreference. See https://tc39.es/ecma262/#prod-annexB-AtomEscape
|
|
if (isEitherUnicodeCompilation()) {
|
|
m_errorCode = ErrorCode::InvalidBackreference;
|
|
return;
|
|
}
|
|
|
|
m_backReferenceLimit = m_numSubpatterns;
|
|
shouldReparse = true;
|
|
}
|
|
|
|
if (m_kIdentityEscapeSeen && !m_namedCaptureGroups.isEmpty()) {
|
|
m_errorCode = ErrorCode::InvalidNamedBackReference;
|
|
return;
|
|
}
|
|
|
|
if (containsIllegalNamedForwardReference()) {
|
|
// \k<a> is parsed as named reference in Unicode patterns because of strict IdentityEscape grammar.
|
|
// See https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors
|
|
if (isEitherUnicodeCompilation() || !m_namedCaptureGroups.isEmpty()) {
|
|
m_errorCode = ErrorCode::InvalidNamedBackReference;
|
|
return;
|
|
}
|
|
|
|
m_isNamedForwardReferenceAllowed = false;
|
|
shouldReparse = true;
|
|
}
|
|
|
|
if (shouldReparse) {
|
|
resetForReparsing();
|
|
parseTokens();
|
|
}
|
|
}
|
|
|
|
bool containsIllegalNamedForwardReference()
|
|
{
|
|
if (m_forwardReferenceNames.isEmpty())
|
|
return false;
|
|
|
|
if (m_namedCaptureGroups.isEmpty())
|
|
return true;
|
|
|
|
for (auto& entry : m_forwardReferenceNames) {
|
|
if (!m_namedCaptureGroups.contains(entry))
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void resetForReparsing()
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
|
|
m_delegate.resetForReparsing();
|
|
m_index = 0;
|
|
m_numSubpatterns = 0;
|
|
m_maxSeenBackReference = 0;
|
|
m_kIdentityEscapeSeen = false;
|
|
m_parenthesesStack.clear();
|
|
m_namedCaptureGroups.reset();
|
|
m_forwardReferenceNames.clear();
|
|
}
|
|
|
|
// Misc helper functions:
|
|
|
|
typedef unsigned ParseState;
|
|
|
|
// Captures the current read position so it can be restored with restoreState().
ParseState saveState()
{
    ParseState snapshot = m_index;
    return snapshot;
}
|
|
|
|
// Rewinds (or advances) the read position to one obtained from saveState().
void restoreState(ParseState state)
{
    m_index = state;
}
|
|
|
|
bool atEndOfPattern()
|
|
{
|
|
ASSERT(m_index <= m_size);
|
|
return m_index == m_size;
|
|
}
|
|
|
|
unsigned patternRemaining()
|
|
{
|
|
ASSERT(m_index <= m_size);
|
|
return m_size - m_index;
|
|
}
|
|
|
|
char32_t peek()
|
|
{
|
|
ASSERT(m_index < m_size);
|
|
return m_data[m_index];
|
|
}
|
|
|
|
bool peekIsDigit()
|
|
{
|
|
return !atEndOfPattern() && WTF::isASCIIDigit(peek());
|
|
}
|
|
|
|
unsigned peekDigit()
|
|
{
|
|
ASSERT(peekIsDigit());
|
|
return peek() - '0';
|
|
}
|
|
|
|
template<UnicodeParseContext context>
|
|
char32_t tryConsumeUnicodeEscape()
|
|
{
|
|
ASSERT(!hasError(m_errorCode));
|
|
|
|
bool unicodePatternOrGroupName = isEitherUnicodeCompilation() || context == UnicodeParseContext::GroupName;
|
|
|
|
if (!tryConsume('u') || atEndOfPattern()) {
|
|
if (unicodePatternOrGroupName)
|
|
m_errorCode = ErrorCode::InvalidUnicodeEscape;
|
|
return errorCodePoint;
|
|
}
|
|
|
|
if (unicodePatternOrGroupName && tryConsume('{')) {
|
|
char32_t codePoint = 0;
|
|
do {
|
|
if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
|
|
m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape;
|
|
return errorCodePoint;
|
|
}
|
|
|
|
codePoint = (codePoint << 4) | toASCIIHexValue(consume());
|
|
|
|
if (codePoint > UCHAR_MAX_VALUE) {
|
|
m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape;
|
|
return errorCodePoint;
|
|
}
|
|
} while (!atEndOfPattern() && peek() != '}');
|
|
|
|
if (!tryConsume('}')) {
|
|
m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape;
|
|
return errorCodePoint;
|
|
}
|
|
|
|
return codePoint;
|
|
}
|
|
|
|
char32_t codeUnit = tryConsumeHex(4);
|
|
if (codeUnit == errorCodePoint) {
|
|
if (unicodePatternOrGroupName)
|
|
m_errorCode = ErrorCode::InvalidUnicodeEscape;
|
|
return errorCodePoint;
|
|
}
|
|
|
|
// If we have the first of a surrogate pair, look for the second.
|
|
if (U16_IS_LEAD(codeUnit) && unicodePatternOrGroupName && patternRemaining() >= 6 && peek() == '\\') {
|
|
ParseState state = saveState();
|
|
consume();
|
|
|
|
if (tryConsume('u')) {
|
|
char32_t surrogate2 = tryConsumeHex(4);
|
|
if (U16_IS_TRAIL(surrogate2))
|
|
return U16_GET_SUPPLEMENTARY(codeUnit, surrogate2);
|
|
}
|
|
|
|
restoreState(state);
|
|
}
|
|
|
|
return codeUnit;
|
|
}
|
|
|
|
char32_t tryConsumeIdentifierCharacter()
|
|
{
|
|
if (tryConsume('\\'))
|
|
return tryConsumeUnicodeEscape<UnicodeParseContext::GroupName>();
|
|
|
|
return consumePossibleSurrogatePair<UnicodeParseContext::GroupName>();
|
|
}
|
|
|
|
bool isIdentifierStart(char32_t ch)
|
|
{
|
|
#if defined(ENABLE_ICU)
|
|
return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK);
|
|
#else
|
|
return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$'));
|
|
#endif
|
|
}
|
|
|
|
bool isIdentifierPart(char32_t ch)
|
|
{
|
|
#if defined(ENABLE_ICU)
|
|
return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D;
|
|
#else
|
|
return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || ch == 0x200C || ch == 0x200D;
|
|
#endif
|
|
}
|
|
|
|
bool isUnicodePropertyValueExpressionChar(char32_t ch)
|
|
{
|
|
return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '=';
|
|
}
|
|
|
|
char32_t consume()
|
|
{
|
|
ASSERT(m_index < m_size);
|
|
return m_data[m_index++];
|
|
}
|
|
|
|
unsigned consumeDigit()
|
|
{
|
|
ASSERT(peekIsDigit());
|
|
return consume() - '0';
|
|
}
|
|
|
|
unsigned consumeNumber()
|
|
{
|
|
CheckedUint32 n = consumeDigit();
|
|
while (peekIsDigit())
|
|
n = n * 10 + consumeDigit();
|
|
return n.hasOverflowed() ? quantifyInfinite : n.value();
|
|
}
|
|
|
|
uint64_t consumeNumber64()
|
|
{
|
|
CheckedUint64 n = consumeDigit();
|
|
while (peekIsDigit())
|
|
n = n * static_cast<uint64_t>(10) + consumeDigit();
|
|
return n.hasOverflowed() ? quantifyInfinite64 : n.value();
|
|
}
|
|
|
|
// https://tc39.es/ecma262/#prod-annexB-LegacyOctalEscapeSequence
|
|
unsigned consumeOctal(unsigned count)
|
|
{
|
|
unsigned octal = 0;
|
|
while (count-- && octal < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
|
|
octal = octal * 8 + consumeDigit();
|
|
return octal;
|
|
}
|
|
|
|
// Consumes the next code unit only if it equals ch; returns whether it did.
bool tryConsume(UChar ch)
{
    const bool matches = !atEndOfPattern() && m_data[m_index] == ch;
    if (matches)
        ++m_index;
    return matches;
}
|
|
|
|
char32_t tryConsumeHex(char32_t count)
|
|
{
|
|
ParseState state = saveState();
|
|
|
|
char32_t n = 0;
|
|
while (count--) {
|
|
if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
|
|
restoreState(state);
|
|
return errorCodePoint;
|
|
}
|
|
n = (n << 4) | WTF::toASCIIHexValue(consume());
|
|
}
|
|
return n;
|
|
}
|
|
|
|
std::optional<String> tryConsumeGroupName()
|
|
{
|
|
if (atEndOfPattern())
|
|
return std::nullopt;
|
|
|
|
ParseState state = saveState();
|
|
|
|
char32_t ch = tryConsumeIdentifierCharacter();
|
|
|
|
if (isIdentifierStart(ch)) {
|
|
StringBuilder identifierBuilder;
|
|
identifierBuilder.append(ch);
|
|
|
|
while (!atEndOfPattern()) {
|
|
ch = tryConsumeIdentifierCharacter();
|
|
if (ch == '>')
|
|
return identifierBuilder.toString();
|
|
|
|
if (!isIdentifierPart(ch))
|
|
break;
|
|
|
|
identifierBuilder.append(ch);
|
|
}
|
|
}
|
|
|
|
restoreState(state);
|
|
|
|
return std::nullopt;
|
|
}
|
|
|
|
std::optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression()
|
|
{
|
|
if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) {
|
|
m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
|
|
return std::nullopt;
|
|
}
|
|
|
|
StringBuilder expressionBuilder;
|
|
String unicodePropertyName;
|
|
bool foundEquals = false;
|
|
unsigned errors = 0;
|
|
|
|
expressionBuilder.append(consume());
|
|
|
|
while (!atEndOfPattern()) {
|
|
char32_t ch = peek();
|
|
if (ch == '}') {
|
|
consume();
|
|
if (errors) {
|
|
m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
|
|
return std::nullopt;
|
|
}
|
|
|
|
if (foundEquals) {
|
|
auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString());
|
|
if (!result)
|
|
m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
|
|
return result;
|
|
}
|
|
|
|
auto result = unicodeMatchProperty(expressionBuilder.toString(), m_compileMode);
|
|
if (!result)
|
|
m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
|
|
return result;
|
|
}
|
|
|
|
consume();
|
|
if (ch == '=') {
|
|
if (!foundEquals) {
|
|
foundEquals = true;
|
|
unicodePropertyName = expressionBuilder.toString();
|
|
expressionBuilder.clear();
|
|
} else
|
|
errors++;
|
|
} else if (!isUnicodePropertyValueExpressionChar(ch))
|
|
errors++;
|
|
else
|
|
expressionBuilder.append(ch);
|
|
}
|
|
|
|
m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
|
|
return std::nullopt;
|
|
}
|
|
|
|
bool isLegacyCompilation() const { return m_compileMode == CompileMode::Legacy; }
|
|
bool isUnicodeCompilation() const { return m_compileMode == CompileMode::Unicode; }
|
|
bool isUnicodeSetsCompilation() const { return m_compileMode == CompileMode::UnicodeSets; }
|
|
bool isEitherUnicodeCompilation() const { return isUnicodeCompilation() || isUnicodeSetsCompilation(); }
|
|
|
|
// Kind of an open parenthesized construct; the element type of m_parenthesesStack.
enum class ParenthesesType : uint8_t {
    Subpattern,
    Assertion,
    LookbehindAssertion
};
|
|
|
|
// Parser state. Members without a default initializer here are presumably set
// by the constructor, which is outside this excerpt.

// Delegate that receives the assertion*/atom*/quantifyAtom/disjunction callbacks.
Delegate& m_delegate;
|
|
// First error recorded while parsing; stays ErrorCode::NoError until a failure is noted.
ErrorCode m_errorCode { ErrorCode::NoError };
|
|
// Pattern characters; read at m_index (see tryConsume()).
const CharType* m_data;
|
|
// NOTE(review): presumably the number of characters in m_data - confirm against the constructor.
unsigned m_size;
|
|
// Current read position within m_data.
unsigned m_index { 0 };
|
|
// Mode tested by isLegacyCompilation()/isUnicodeCompilation()/isUnicodeSetsCompilation().
CompileMode m_compileMode;
|
|
// NOTE(review): presumably the backReferenceLimit argument given to Yarr::parse() - confirm.
unsigned m_backReferenceLimit;
|
|
// NOTE(review): presumably counts capturing subpatterns seen so far - confirm.
unsigned m_numSubpatterns { 0 };
|
|
// NOTE(review): presumably the largest numeric back reference encountered - confirm.
unsigned m_maxSeenBackReference { 0 };
|
|
// NOTE(review): presumably the isNamedForwardReferenceAllowed argument given to Yarr::parse() - confirm.
bool m_isNamedForwardReferenceAllowed;
|
|
// NOTE(review): presumably records that a `\k` identity escape was parsed - confirm.
bool m_kIdentityEscapeSeen { false };
|
|
// Stack of ParenthesesType entries with inline capacity 16; presumably one per currently open group.
Vector<ParenthesesType, 16> m_parenthesesStack;
|
|
// NOTE(review): presumably the named capture groups defined so far - confirm.
NamedCaptureGroups m_namedCaptureGroups;
|
|
// NOTE(review): presumably names referenced before their group definition - confirm.
GCHashSet<String> m_forwardReferenceNames;
|
|
|
|
// Derived by empirical testing of compile time in PCRE and WREC.
|
|
// Upper bound on pattern size: 1024 * 1024 (units not visible in this excerpt).
static constexpr unsigned MAX_PATTERN_SIZE = 1024 * 1024;
|
|
};
|
|
|
|
/*
|
|
* Yarr::parse():
|
|
*
|
|
* The parse method is passed a pattern to be parsed and a delegate upon which
|
|
* callbacks will be made to record the parsed tokens forming the regex.
|
|
* Yarr::parse() returns an ErrorCode: ErrorCode::NoError on success, or the
|
|
* code identifying the failure where a parse error occurs.
|
|
*
|
|
* The Delegate must implement the following interface:
|
|
*
|
|
* void assertionBOL();
|
|
* void assertionEOL();
|
|
* void assertionWordBoundary(bool invert);
|
|
*
|
|
* void atomPatternCharacter(char32_t ch);
|
|
* void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
|
|
* void atomCharacterClassBegin(bool invert)
|
|
* void atomCharacterClassAtom(char32_t ch)
|
|
* void atomCharacterClassRange(char32_t begin, char32_t end)
|
|
* void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
|
|
* void atomClassStringDisjunction(Vector<Vector<char32_t>>&)
|
|
* void atomCharacterClassSetOp(CharacterClassSetOp setOp)
|
|
* void atomCharacterClassPushNested()
|
|
* void atomCharacterClassPopNested()
|
|
* void atomCharacterClassEnd()
|
|
* void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> groupName);
|
|
* void atomParentheticalAssertionBegin(bool invert, MatchDirection matchDirection);
|
|
* void atomParenthesesEnd();
|
|
* void atomBackReference(unsigned subpatternId);
|
|
* void atomNamedBackReference(const String& subpatternName);
|
|
* void atomNamedForwardReference(const String& subpatternName);
|
|
*
|
|
* void quantifyAtom(unsigned min, unsigned max, bool greedy);
|
|
*
|
|
* void disjunction(CreateDisjunctionPurpose purpose);
|
|
*
|
|
* bool abortedDueToError() const;
|
|
* ErrorCode abortErrorCode() const;
|
|
*
|
|
* void resetForReparsing();
|
|
*
|
|
* The regular expression is described by a sequence of assertion*() and atom*()
|
|
* callbacks to the delegate, describing the terms in the regular expression.
|
|
* Following an atom a quantifyAtom() call may occur to indicate that the previous
|
|
* atom should be quantified. In the case of atoms described across multiple
|
|
* calls (parentheses and character classes) the call to quantifyAtom() will come
|
|
* after the call to the atom*End() method, never after atom*Begin().
|
|
*
|
|
* Character classes may either be described by a single call to
|
|
* atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
|
|
* In the latter case, ...Begin() will be called, followed by a sequence of
|
|
* calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
|
|
*
|
|
* Sequences of atoms and assertions are broken into alternatives via calls to
|
|
* disjunction(). Assertions, atoms, and disjunctions emitted between calls to
|
|
* atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
|
|
* atomParenthesesBegin() is passed a subpatternId. In the case of a regular
|
|
* capturing subpattern, this will be the subpatternId associated with these
|
|
* parentheses, and will also by definition be the lowest subpatternId of these
|
|
* parentheses and of any nested parentheses. The atomParenthesesEnd() method
|
|
* is passed the subpatternId of the last capturing subexpression nested within
|
|
* these parentheses. In the case of a capturing subpattern with no nested
|
|
* capturing subpatterns, the same subpatternId will be passed to the begin and
|
|
* end functions. In the case of non-capturing subpatterns the subpatternId
|
|
* passed to the begin method is also the first possible subpatternId that might
|
|
* be nested within these parentheses. If a set of non-capturing parentheses does
|
|
* not contain any capturing subpatterns, then the subpatternId passed to begin
|
|
* will be greater than the subpatternId passed to end.
|
|
*/
|
|
|
|
// Maps a parsed set of regular-expression flags to the parser's CompileMode.
//
// Returns CompileMode::Unicode when Flags::Unicode is present, otherwise
// CompileMode::UnicodeSets when Flags::UnicodeSets is present, otherwise
// CompileMode::Legacy. An empty optional (no flags available) also yields
// CompileMode::Legacy.
inline CompileMode compileMode(std::optional<OptionSet<Flags>> flags)
{
    // operator-> on an empty std::optional is undefined behavior, so handle
    // the "no flags" case explicitly instead of dereferencing it.
    if (!flags)
        return CompileMode::Legacy;

    if (flags->contains(Flags::Unicode))
        return CompileMode::Unicode;

    if (flags->contains(Flags::UnicodeSets))
        return CompileMode::UnicodeSets;

    return CompileMode::Legacy;
}
|
|
|
|
template<typename Delegate>
|
|
ErrorCode parse(Delegate& delegate, StringView pattern, CompileMode compileMode, unsigned backReferenceLimit = quantifyInfinite, bool isNamedForwardReferenceAllowed = true)
|
|
{
|
|
if (pattern.is8Bit())
|
|
return Parser<Delegate, LChar>(delegate, pattern, compileMode, backReferenceLimit, isNamedForwardReferenceAllowed).parse();
|
|
return Parser<Delegate, UChar>(delegate, pattern, compileMode, backReferenceLimit, isNamedForwardReferenceAllowed).parse();
|
|
}
|
|
|
|
} } // namespace JSC::Yarr
|
|
|
|
WTF_ALLOW_UNSAFE_BUFFER_USAGE_END
|