escargot/src/runtime/String.cpp
HyukWoo Park d1535b2088 Update measurement of source code size for the code compression
Signed-off-by: HyukWoo Park <hyukwoo.park@samsung.com>
2022-01-27 21:18:18 +09:00

1119 lines
35 KiB
C++

// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
* Copyright (c) 2016-present Samsung Electronics Co., Ltd
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
* USA
*/
#include "Escargot.h"
#include "String.h"
#include "CompressibleString.h"
#include "Value.h"
#include "parser/Lexer.h"
#include "fast-dtoa.h"
#include "bignum-dtoa.h"
#include "runtime/VMInstance.h"
namespace Escargot {
// Out-of-class definition (storage) for the static member String::emptyString.
// No initializer is given here, so the pointer is assigned somewhere else.
// NOTE(review): MAY_THREAD_LOCAL presumably makes this per-thread in
// multi-threaded builds - confirm against the macro definition.
MAY_THREAD_LOCAL String* String::emptyString;
// Splits `s` at every occurrence of the character `seperator`.
// Empty fields are kept, so the result always holds exactly
// (number of separators + 1) entries; an empty input yields one empty string.
std::vector<std::string> split(const std::string& s, char seperator)
{
    std::vector<std::string> pieces;
    std::string::size_type fieldStart = 0;
    for (std::string::size_type hit = s.find(seperator, 0);
         hit != std::string::npos;
         hit = s.find(seperator, hit + 1)) {
        pieces.push_back(s.substr(fieldStart, hit - fieldStart));
        fieldStart = hit + 1;
    }
    // Whatever follows the last separator (the whole string when none was found).
    pieces.push_back(s.substr(fieldStart));
    return pieces;
}
// Splits `str` at every occurrence of the string `seperator`.
// Unlike the single-character overload, empty tokens are dropped.
// An empty separator cannot delimit anything: the whole (non-empty) input is
// returned as a single token. Previously this case never terminated because
// the search position could not advance.
std::vector<std::string> split(const std::string& str, const std::string& seperator)
{
    std::vector<std::string> tokens;
    if (seperator.empty()) {
        if (!str.empty()) {
            tokens.push_back(str);
        }
        return tokens;
    }

    size_t prev = 0;
    while (prev < str.length()) {
        size_t pos = str.find(seperator, prev);
        if (pos == std::string::npos) {
            pos = str.length();
        }
        if (pos > prev) {
            tokens.push_back(str.substr(prev, pos - prev));
        }
        // Continue right after the separator that was just consumed.
        prev = pos + seperator.length();
    }
    return tokens;
}
// Returns true only for the ASCII letters 'A'-'Z' and 'a'-'z'.
// Implemented as explicit range checks so the answer does not depend on the
// current locale and so that bytes >= 0x80 (negative when `char` is signed)
// are never handed to a <cctype> classifier, which is undefined behaviour.
bool isASCIIAlpha(char ch)
{
    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}
// Returns true only for the ASCII digits '0'-'9'.
// Uses an explicit range check so the result is locale independent and well
// defined for every `char` value, including bytes >= 0x80.
bool isASCIIDigit(char ch)
{
    return ch >= '0' && ch <= '9';
}
// Returns true when `ch` is accepted by isASCIIAlpha() or by isASCIIDigit().
bool isASCIIAlphanumeric(char ch)
{
    if (isASCIIAlpha(ch)) {
        return true;
    }
    return isASCIIDigit(ch);
}
// Returns true when the predicate `fn` accepts every character of `s`.
// Evaluation stops at the first rejected character; an empty string yields true.
bool isAllSpecialCharacters(const std::string& s, bool (*fn)(char))
{
    for (const char ch : s) {
        if (!fn(ch)) {
            return false;
        }
    }
    return true;
}
// Returns true when none of the `len` bytes at `buf` has its high bit set,
// i.e. every byte is in the 7-bit ASCII range. An empty range yields true.
// The index is a size_t so that it cannot wrap before reaching `len` on
// buffers larger than UINT_MAX bytes.
bool isAllASCII(const char* buf, const size_t len)
{
    for (size_t i = 0; i < len; i++) {
        if (static_cast<unsigned char>(buf[i]) > 0x7F) {
            return false;
        }
    }
    return true;
}
// Returns true when each of the `len` UTF-16 code units at `buf` is below 128.
// An empty range yields true. The index is a size_t so that it cannot wrap
// before reaching `len` on very large buffers.
bool isAllASCII(const char16_t* buf, const size_t len)
{
    for (size_t i = 0; i < len; i++) {
        if (buf[i] >= 128) {
            return false;
        }
    }
    return true;
}
// Returns true when each of the `len` UTF-16 code units at `buf` is below 256
// and therefore representable in a single Latin-1 byte.
// An empty range yields true. The index is a size_t so that it cannot wrap
// before reaching `len` on very large buffers.
bool isAllLatin1(const char16_t* buf, const size_t len)
{
    for (size_t i = 0; i < len; i++) {
        if (buf[i] >= 256) {
            return false;
        }
    }
    return true;
}
// Returns true when `str` is a canonical decimal index: either the single
// digit "0".."9", or a sequence of digits whose first digit is not '0'.
// The empty string is rejected up front; previously it fell through to
// charAt(0) on a string that has no characters.
bool isIndexString(String* str)
{
    const size_t len = str->length();
    if (len == 0) {
        return false;
    }

    const char16_t first = str->charAt(0);
    if (first < '0' || first > '9') {
        return false;
    }
    if (len == 1) {
        return true;
    }
    // Multi-digit indices must not carry a leading zero.
    if (first == '0') {
        return false;
    }
    for (size_t i = 1; i < len; i++) {
        const char16_t c = str->charAt(i);
        if (c < '0' || c > '9') {
            return false;
        }
    }
    return true;
}
// Accumulated lead/continuation marker bits for a sequence of (index + 1)
// bytes; subtracting the entry from the raw byte sum leaves the code point.
static const char32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<char32_t>(0xFA082080UL), static_cast<char32_t>(0x82082080UL) };

// Decodes one UTF-8 sequence (1 to 4 bytes) starting at `sequence`.
//
// On success: `valid` is true, `charlen` receives the number of bytes consumed,
// `sequence` is advanced past them and the decoded value is returned. Overlong
// forms and surrogate values are not rejected here; callers filter the result.
// On failure: `valid` is false, `sequence` is advanced by exactly one byte,
// `charlen` is left untouched and char32_t(-1) is returned.
//
// The expected length is derived from the lead byte first and continuation
// bytes are then inspected one at a time, stopping at the first byte that is
// not of the form 10xxxxxx. As a result no byte beyond the first offending one
// is read (in particular nothing past a NUL terminator), whereas the previous
// implementation could read up to three bytes after the lead byte regardless.
char32_t readUTF8Sequence(const char*& sequence, bool& valid, int& charlen)
{
    const unsigned char lead = static_cast<unsigned char>(*sequence);

    unsigned length;
    if ((lead & 0x80) == 0) {
        length = 1;
    } else if ((lead & 0xE0) == 0xC0) {
        length = 2;
    } else if ((lead & 0xF0) == 0xE0) {
        length = 3;
    } else if ((lead & 0xF8) == 0xF0) {
        length = 4;
    } else {
        length = 0; // stray continuation byte or unsupported lead byte
    }

    bool wellFormed = (length != 0);
    for (unsigned i = 1; wellFormed && i < length; i++) {
        wellFormed = (static_cast<unsigned char>(sequence[i]) & 0xC0) == 0x80;
    }

    if (!wellFormed) {
        valid = false;
        sequence++;
        return static_cast<char32_t>(-1);
    }

    valid = true;
    charlen = length;

    // Sum the raw bytes six bits apart; the marker bits are removed afterwards.
    char32_t ch = 0;
    for (unsigned i = 0; i < length; i++) {
        ch <<= 6;
        ch += static_cast<unsigned char>(*sequence++);
    }
    return ch - offsetsFromUTF8[length - 1];
}
// Converts `len` bytes of UTF-8 at `buf` into UTF-16 code units.
// Every undecodable byte becomes U+FFFD. Sequences that decode to a surrogate
// value or to something above U+10FFFF also produce U+FFFD, after which
// decoding resumes at the second byte of that sequence.
UTF16StringDataNonGCStd utf8StringToUTF16StringNonGC(const char* buf, const size_t len)
{
    UTF16StringDataNonGCStd utf16;
    const char* const end = buf + len;
    const char* cursor = buf;
    bool valid;
    int charlen;

    while (cursor < end) {
        const char32_t ch = readUTF8Sequence(cursor, valid, charlen);
        if (!valid) {
            // The reader has already skipped one byte.
            utf16 += 0xFFFD;
            continue;
        }

        if ((uint32_t)(ch) <= 0xffff) {
            if (((ch)&0xfffff800) != 0xd800) {
                utf16 += ch; // ordinary BMP code point
                continue;
            }
            // encoded surrogate: rejected below
        } else if ((uint32_t)((ch)-0x10000) <= 0xfffff) {
            utf16 += (char16_t)(((ch) >> 10) + 0xd7c0); // lead surrogate
            utf16 += (char16_t)(((ch)&0x3ff) | 0xdc00); // trail surrogate
            continue;
        }

        // Rejected value: emit a replacement character and rewind so that only
        // the first byte of the sequence counts as consumed.
        utf16 += 0xFFFD;
        cursor -= (charlen - 1);
    }
    return utf16;
}
// Same conversion as utf8StringToUTF16StringNonGC(); the resulting code units
// are copied into a UTF16StringData that is returned to the caller.
UTF16StringData utf8StringToUTF16String(const char* buf, const size_t len)
{
    auto converted = utf8StringToUTF16StringNonGC(buf, len);
    const auto* units = converted.data();
    const auto unitCount = converted.length();
    return UTF16StringData(units, unitCount);
}
// Narrows `len` UTF-16 code units to single bytes.
// The input is expected to be pure ASCII; this is only checked by ASSERT.
// The loop index is a size_t so that it cannot wrap before reaching `len`.
ASCIIStringData utf16StringToASCIIString(const char16_t* buf, const size_t len)
{
    ASCIIStringData str;
    str.resizeWithUninitializedValues(len);
    for (size_t i = 0; i < len; i++) {
        ASSERT(buf[i] < 128);
        str[i] = buf[i];
    }
    return ASCIIStringData(std::move(str));
}
// Encodes `uc` as UTF-8.
//
// Returns the number of bytes in the encoding. When `UTF8` is non-null the
// bytes are written to it followed by a terminating '\0', so the buffer must
// have room for (return value + 1) bytes, i.e. up to 7.
//
// Values up to 0x7fffffff are encoded with the historical 5 and 6 byte forms
// (kept for backward compatibility; RFC 3629 stops at 4 bytes / U+10FFFF).
// Anything larger is replaced by U+FFFD.
//
// Each continuation byte carries exactly six payload bits. The previous
// implementation reduced them with the wrong modulus (e.g. `% (1 << 12)`),
// which produced corrupt bytes for code points in U+40000..U+FFFFF.
size_t utf32ToUtf8(char32_t uc, char* UTF8)
{
    size_t length;
    unsigned leadMarker;
    if (uc <= 0x7f) {
        length = 1;
        leadMarker = 0x00;
    } else if (uc <= 0x7ff) {
        length = 2;
        leadMarker = 0xc0;
    } else if (uc <= 0xffff) {
        length = 3;
        leadMarker = 0xe0;
    } else if (uc <= 0x1fffff) {
        length = 4;
        leadMarker = 0xf0;
    } else if (uc <= 0x3ffffff) {
        length = 5;
        leadMarker = 0xf8;
    } else if (uc <= 0x7fffffff) {
        length = 6;
        leadMarker = 0xfc;
    } else {
        return utf32ToUtf8(0xFFFD, UTF8);
    }

    if (UTF8 != nullptr) {
        char32_t remaining = uc;
        // Fill continuation bytes from the last one backwards, six bits each.
        for (size_t i = length - 1; i > 0; i--) {
            UTF8[i] = static_cast<char>(0x80 | (remaining & 0x3f));
            remaining >>= 6;
        }
        // What is left fits next to the length marker of the lead byte.
        UTF8[0] = static_cast<char>(leadMarker | remaining);
        UTF8[length] = '\0';
    }
    return length;
}
// Encodes the code point `i` as UTF-16 into `u` and returns the number of
// code units written (1 or 2).
// Surrogate values (0xD800..0xDFFF) and values above 0x10FFFF cannot be
// represented and are written as the single replacement character U+FFFD.
size_t utf32ToUtf16(char32_t i, char16_t* u)
{
    const bool isSurrogate = (i >= 0xd800 && i <= 0xdfff);
    if (isSurrogate || i > 0x10ffff) {
        *u = 0xFFFD;
        return 1;
    }

    if (i <= 0xffff) {
        // Basic Multilingual Plane: stored as is.
        *u = (char16_t)i;
        return 1;
    }

    // Supplementary plane: split the 20-bit offset into a surrogate pair.
    // https://en.wikipedia.org/wiki/UTF-16
    const char32_t offset = i - 0x10000;
    u[0] = (char16_t)(0xd800 | (offset >> 10));
    u[1] = (char16_t)(0xdc00 | (offset & 0x3ff));
    return 2;
}
// Compares `len` UTF-16 code units at `c1` with `len` chars at `c2`,
// element by element, and returns true when all of them are equal.
// NOTE(review): `c2` is read as plain `char`; where `char` is signed, bytes
// >= 0x80 become negative and never equal a char16_t - confirm that callers
// only pass ASCII data.
bool StringBufferAccessData::equals16Bit(const char16_t* c1, const char* c2, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        if (c1[i] != c2[i]) {
            return false;
        }
    }
    return true;
}
// Returns a copy of this string in which each character is stored as one
// UTF-16 code unit.
UTF16StringData ASCIIString::toUTF16StringData() const
{
    UTF16StringData widened;
    const size_t count = length();
    widened.resizeWithUninitializedValues(count);
    size_t idx = 0;
    while (idx < count) {
        widened[idx] = charAt(idx);
        ++idx;
    }
    return widened;
}
// Returns the bytes of this string as UTF-8 data. The 8-bit buffer is copied
// verbatim, without any transcoding.
UTF8StringData ASCIIString::toUTF8StringData() const
{
    const char* bytes = (const char*)ASCIIString::characters8();
    const auto byteCount = ASCIIString::length();
    return UTF8StringData(bytes, byteCount);
}
// Returns the bytes of this string as a UTF8StringDataNonGCStd. The 8-bit
// buffer is copied verbatim; `options` is not consulted.
UTF8StringDataNonGCStd ASCIIString::toNonGCUTF8StringData(int options) const
{
    const char* bytes = (const char*)ASCIIString::characters8();
    const auto byteCount = ASCIIString::length();
    return UTF8StringDataNonGCStd(bytes, byteCount);
}
// Returns a copy of this string in which each Latin-1 character is stored as
// one UTF-16 code unit.
UTF16StringData Latin1String::toUTF16StringData() const
{
    UTF16StringData widened;
    const size_t count = length();
    widened.resizeWithUninitializedValues(count);
    size_t idx = 0;
    while (idx < count) {
        widened[idx] = charAt(idx);
        ++idx;
    }
    return widened;
}
// Transcodes this Latin-1 string to UTF-8: bytes below 0x80 are copied
// unchanged, the others are encoded through utf32ToUtf8().
UTF8StringData Latin1String::toUTF8StringData() const
{
    UTF8StringData out;
    const size_t count = length();
    for (size_t i = 0; i < count; i++) {
        // Latin-1 is an 8-bit encoding, so a single byte holds the code point.
        uint8_t ch = m_bufferData.uncheckedCharAtFor8Bit(i);
        if (ch >= 0x80) {
            char encoded[8];
            auto encodedLen = utf32ToUtf8(ch, encoded);
            out.append(encoded, encodedLen);
            continue;
        }
        out.append((char*)&ch, 1);
    }
    return out;
}
// Transcodes this Latin-1 string to UTF-8 in a UTF8StringDataNonGCStd: bytes
// below 0x80 are copied unchanged, the others are encoded through
// utf32ToUtf8(). `options` is not consulted.
UTF8StringDataNonGCStd Latin1String::toNonGCUTF8StringData(int options) const
{
    UTF8StringDataNonGCStd out;
    const size_t count = length();
    for (size_t i = 0; i < count; i++) {
        // Latin-1 is an 8-bit encoding, so a single byte holds the code point.
        uint8_t ch = m_bufferData.uncheckedCharAtFor8Bit(i);
        if (ch >= 0x80) {
            char encoded[8];
            auto encodedLen = utf32ToUtf8(ch, encoded);
            out.append(encoded, encodedLen);
            continue;
        }
        out.append((char*)&ch, 1);
    }
    return out;
}
// Returns a copy of this string's UTF-16 code units.
UTF16StringData UTF16String::toUTF16StringData() const
{
    const auto* units = UTF16String::characters16();
    const auto unitCount = UTF16String::length();
    return UTF16StringData(units, unitCount);
}
// Returns this string converted to UTF-8; the conversion itself is delegated
// to the buffer access data's toUTF8String().
UTF8StringData UTF16String::toUTF8StringData() const
{
    auto&& accessData = bufferAccessData();
    return accessData.toUTF8String<UTF8StringData, UTF8StringDataNonGCStd>();
}
// Returns this string converted to UTF-8 as a UTF8StringDataNonGCStd; the
// conversion is delegated to the buffer access data's toUTF8String(), which
// receives `options` unchanged.
UTF8StringDataNonGCStd UTF16String::toNonGCUTF8StringData(int options) const
{
    auto&& accessData = bufferAccessData();
    return accessData.toUTF8String<UTF8StringDataNonGCStd>(options);
}
// Bit flags passed as `flags_` to the number formatting helpers below.
enum Flags : unsigned {
    NO_FLAGS = 0,
    EMIT_POSITIVE_EXPONENT_SIGN = 1u << 0, // write "e+N" rather than "eN"
    EMIT_TRAILING_DECIMAL_POINT = 1u << 1, // append '.' when no fraction digits are emitted
    EMIT_TRAILING_ZERO_AFTER_POINT = 1u << 2, // append '0' when no fraction digits are emitted
    UNIQUE_ZERO = 1u << 3
};
// Writes the plain (non-exponential) decimal form of a number into
// `result_builder`.
//   decimal_digits     - significant digits, `length` of them
//   decimal_point      - position of the decimal point relative to the digits
//   digits_after_point - number of fraction digits to emit (zero padded)
//   flags_             - Flags bits; only the two EMIT_TRAILING_* bits are
//                        consulted, and only when no fraction digits are emitted
void CreateDecimalRepresentation(
    int flags_,
    const char* decimal_digits,
    int length,
    int decimal_point,
    int digits_after_point,
    double_conversion::StringBuilder* result_builder)
{
    const bool hasFraction = digits_after_point > 0;

    if (decimal_point <= 0) {
        // "0.00000decimal_rep": every digit sits behind the point.
        const int leadingZeros = -decimal_point;
        result_builder->AddCharacter('0');
        if (hasFraction) {
            result_builder->AddCharacter('.');
            result_builder->AddPadding('0', leadingZeros);
            ASSERT(length <= digits_after_point - leadingZeros);
            result_builder->AddSubstring(decimal_digits, length);
            result_builder->AddPadding('0', digits_after_point - leadingZeros - length);
        }
    } else if (decimal_point >= length) {
        // "decimal_rep0000.00000" or "decimal_rep.0000": every digit sits in
        // front of the point.
        result_builder->AddSubstring(decimal_digits, length);
        result_builder->AddPadding('0', decimal_point - length);
        if (hasFraction) {
            result_builder->AddCharacter('.');
            result_builder->AddPadding('0', digits_after_point);
        }
    } else {
        // "decima.l_rep000": the point falls inside the digit run.
        ASSERT(digits_after_point > 0);
        const int fractionDigits = length - decimal_point;
        result_builder->AddSubstring(decimal_digits, decimal_point);
        result_builder->AddCharacter('.');
        ASSERT(fractionDigits <= digits_after_point);
        result_builder->AddSubstring(&decimal_digits[decimal_point], fractionDigits);
        result_builder->AddPadding('0', digits_after_point - fractionDigits);
    }

    if (digits_after_point != 0) {
        return;
    }
    if ((flags_ & EMIT_TRAILING_DECIMAL_POINT) != 0) {
        result_builder->AddCharacter('.');
    }
    if ((flags_ & EMIT_TRAILING_ZERO_AFTER_POINT) != 0) {
        result_builder->AddCharacter('0');
    }
}
// Writes a number in exponential form ("d.ddde[+|-]N") into `result_builder`.
//   decimal_digits - significant digits, `length` of them (at least one)
//   exponent       - decimal exponent applying to the first digit
//   flags_         - EMIT_POSITIVE_EXPONENT_SIGN adds '+' to non-negative
//                    exponents
void CreateExponentialRepresentation(
    int flags_,
    const char* decimal_digits,
    int length,
    int exponent,
    double_conversion::StringBuilder* result_builder)
{
    ASSERT(length != 0);

    // Mantissa: first digit, then the remaining digits behind a point.
    result_builder->AddCharacter(decimal_digits[0]);
    if (length != 1) {
        result_builder->AddCharacter('.');
        result_builder->AddSubstring(&decimal_digits[1], length - 1);
    }

    // Exponent marker and sign.
    result_builder->AddCharacter('e');
    const bool negativeExponent = exponent < 0;
    if (negativeExponent) {
        result_builder->AddCharacter('-');
        exponent = -exponent;
    } else if ((flags_ & EMIT_POSITIVE_EXPONENT_SIGN) != 0) {
        result_builder->AddCharacter('+');
    }

    if (exponent == 0) {
        result_builder->AddCharacter('0');
        return;
    }

    // Exponent magnitude, produced right to left in a small scratch buffer.
    ASSERT(exponent < 1e4);
    const int kMaxExponentLength = 5;
    char buffer[kMaxExponentLength + 1];
    buffer[kMaxExponentLength] = '\0';
    int first_char_pos = kMaxExponentLength;
    for (; exponent > 0; exponent /= 10) {
        buffer[--first_char_pos] = '0' + (exponent % 10);
    }
    result_builder->AddSubstring(&buffer[first_char_pos],
                                 kMaxExponentLength - first_char_pos);
}
// Formats `number` as its shortest round-tripping decimal string.
// Zero (either sign) yields "0". Plain decimal notation is used while the
// decimal exponent lies in [-6, 21); exponential notation (with an explicit
// '+' on non-negative exponents) is used otherwise.
// NOTE(review): NaN and infinities are not special-cased here - presumably
// callers filter them out before calling; confirm.
ASCIIStringData dtoa(double number)
{
    if (number == 0) {
        return ASCIIStringData("0", 1);
    }

    const int flags = UNIQUE_ZERO | EMIT_POSITIVE_EXPONENT_SIGN;

    // The digit generators work on the magnitude; the sign is re-attached last.
    const bool negative = number < 0;
    if (negative) {
        number = -number;
    }

    // The shortest accurate base-10 representation of a double never needs
    // more than kBase10MaximalLength digits. One extra slot is reserved for
    // the terminating NUL.
    const int kBase10MaximalLength = 17;
    const int kDecimalRepCapacity = kBase10MaximalLength + 1;
    char decimal_rep[kDecimalRepCapacity];
    int decimal_rep_length;
    int decimal_point;
    double_conversion::Vector<char> vector(decimal_rep, kDecimalRepCapacity);

    // Try the fast path first and fall back to the bignum algorithm.
    if (!FastDtoa(number, double_conversion::FAST_DTOA_SHORTEST, 0, vector, &decimal_rep_length, &decimal_point)) {
        BignumDtoa(number, double_conversion::BIGNUM_DTOA_SHORTEST, 0, vector, &decimal_rep_length, &decimal_point);
        vector[decimal_rep_length] = '\0';
    }

    const int bufferLength = 128;
    char buffer[bufferLength];
    double_conversion::StringBuilder builder(buffer, bufferLength);

    const int exponent = decimal_point - 1;
    const int decimal_in_shortest_low_ = -6;
    const int decimal_in_shortest_high_ = 21;
    const bool useDecimalNotation = (decimal_in_shortest_low_ <= exponent)
        && (exponent < decimal_in_shortest_high_);
    if (!useDecimalNotation) {
        CreateExponentialRepresentation(flags, decimal_rep, decimal_rep_length, exponent,
                                        &builder);
    } else {
        CreateDecimalRepresentation(flags, decimal_rep, decimal_rep_length,
                                    decimal_point,
                                    double_conversion::Max(0, decimal_rep_length - decimal_point),
                                    &builder);
    }

    ASCIIStringDataNonGCStd str;
    if (negative) {
        str += '-';
    }
    for (const char* cursor = builder.Finalize(); *cursor; ++cursor) {
        str += *cursor;
    }
    return ASCIIStringData(str.data(), str.length());
}
// Creates a new ASCIIString from the `len` bytes starting at `src`.
String* String::fromASCII(const char* src, size_t len)
{
    String* created = new ASCIIString(src, len);
    return created;
}
// Creates a new ASCIIString holding the text produced by dtoa(v).
String* String::fromDouble(double v)
{
    ASCIIStringData digits = dtoa(v);
    String* created = new ASCIIString(std::move(digits));
    return created;
}
// Takes ownership of `src` and reports length() * 2 (two bytes per UTF-16 code
// unit) to the VM instance's source-size counter. A GC finalizer is registered
// that reports the same amount back through decreaseSourceSize().
UTF16StringFinalizer::UTF16StringFinalizer(Context* context, UTF16StringData&& src)
    : UTF16String(std::move(src))
    , m_context(context)
{
    context->vmInstance()->increaseSourceSize(length() * 2);

    GC_REGISTER_FINALIZER_NO_ORDER(this, [](void* obj, void*) {
        UTF16StringFinalizer* dying = (UTF16StringFinalizer*)obj;
        dying->m_context->vmInstance()->decreaseSourceSize((dying->length()) * 2);
    },
                                   nullptr, nullptr, nullptr);
}
void* UTF16StringFinalizer::operator new(size_t size)
{
static MAY_THREAD_LOCAL bool typeInited = false;
static MAY_THREAD_LOCAL GC_descr descr;
if (!typeInited) {
GC_word obj_bitmap[GC_BITMAP_SIZE(UTF16StringFinalizer)] = { 0 };
GC_set_bit(obj_bitmap, GC_WORD_OFFSET(UTF16StringFinalizer, m_bufferData.buffer));
GC_set_bit(obj_bitmap, GC_WORD_OFFSET(UTF16StringFinalizer, m_context));
descr = GC_make_descriptor(obj_bitmap, GC_WORD_LEN(UTF16StringFinalizer));
typeInited = true;
}
return GC_MALLOC_EXPLICITLY_TYPED(size, descr);
}
// Creates a string from `len` bytes of UTF-8 at `src`.
// When `maybeASCII` is set and the bytes turn out to be pure ASCII an
// ASCIIString is created directly; otherwise the input is converted to UTF-16
// and stored in a UTF16String. The ASCII scan is skipped when `maybeASCII` is
// false.
String* String::fromUTF8(const char* src, size_t len, bool maybeASCII)
{
    const bool asciiOnly = maybeASCII && isAllASCII(src, len);
    if (asciiOnly) {
        return new ASCIIString(src, len);
    }

    auto utf16 = utf8StringToUTF16String(src, len);
    return new UTF16String(std::move(utf16));
}
#if defined(ENABLE_COMPRESSIBLE_STRING)
// Creates a CompressibleString from `len` bytes of UTF-8 at `src`.
// Input that is not (or is not checked to be) pure ASCII is converted to
// UTF-16 first.
// NOTE(review): the pure-ASCII path hits RELEASE_ASSERT_NOT_REACHED() before
// its return statement, so it looks intentionally disabled - confirm whether
// callers are expected never to take it.
String* String::fromUTF8ToCompressibleString(VMInstance* instance, const char* src, size_t len, bool maybeASCII)
{
    const bool asciiOnly = maybeASCII && isAllASCII(src, len);
    if (!asciiOnly) {
        auto utf16 = utf8StringToUTF16StringNonGC(src, len);
        return new CompressibleString(instance, utf16.data(), utf16.length());
    }

    RELEASE_ASSERT_NOT_REACHED();
    return new CompressibleString(instance, src, len);
}
#endif
// Three-way comparison of the first `l1` code units of `c1` with the first
// `l2` code units of `c2`, by code unit value.
// Returns -1, 0 or 1. When one range is a prefix of the other, the shorter
// one orders first.
// The common length is kept as a size_t (it used to be narrowed to `unsigned`)
// and a single index drives the scan.
int String::stringCompare(size_t l1, size_t l2, const String* c1, const String* c2)
{
    const size_t common = l1 < l2 ? l1 : l2;
    for (size_t i = 0; i < common; i++) {
        const auto left = c1->charAt(i);
        const auto right = c2->charAt(i);
        if (left != right) {
            return (left > right) ? 1 : -1;
        }
    }

    if (l1 == l2) {
        return 0;
    }
    return (l1 > l2) ? 1 : -1;
}
// Returns true when this string and `src` hold the same sequence of code
// units, whatever the width (8 or 16 bit) of their underlying buffers.
bool String::equals(const String* src) const
{
    if (length() != src->length()) {
        return false;
    }

    const auto& lhs = bufferAccessData();
    const auto& rhs = src->bufferAccessData();
    const bool lhsNarrow = lhs.has8BitContent;
    const bool rhsNarrow = rhs.has8BitContent;

    if (LIKELY(lhsNarrow && rhsNarrow)) {
        return stringEqual((const LChar*)lhs.buffer, (const LChar*)rhs.buffer, lhs.length);
    }
    if (!lhsNarrow && !rhsNarrow) {
        return stringEqual((const char16_t*)lhs.buffer, (const char16_t*)rhs.buffer, lhs.length);
    }

    // Mixed widths: the 16-bit buffer is always passed first.
    const auto& wide = lhsNarrow ? rhs : lhs;
    const auto& narrow = lhsNarrow ? lhs : rhs;
    return stringEqual((const char16_t*)wide.buffer, (const LChar*)narrow.buffer, lhs.length);
}
static uint64_t convertToIndexNumber(const String* str, uint64_t maxLimit)
{
uint64_t number = 0;
const size_t& len = str->length();
if (UNLIKELY(len == 0)) {
return maxLimit;
}
const auto& data = str->bufferAccessData();
char16_t first;
if (LIKELY(data.has8BitContent)) {
first = ((LChar*)data.buffer)[0];
} else {
first = ((char16_t*)data.buffer)[0];
}
if (len > 1 && first == '0') {
return maxLimit;
}
for (unsigned i = 0; i < len; i++) {
char16_t c;
if (LIKELY(data.has8BitContent)) {
c = ((LChar*)data.buffer)[i];
} else {
c = ((char16_t*)data.buffer)[i];
}
if (c < '0' || c > '9') {
return maxLimit;
} else {
uint64_t cnum = c - '0';
if (UNLIKELY(number > (maxLimit - cnum) / 10)) {
return maxLimit;
}
number = number * 10 + cnum;
}
}
return number;
}
// Parses this string as an index; Value::InvalidIndexValue is returned when it
// is not a canonical decimal index below that value.
uint64_t String::tryToUseAsIndex() const
{
    const uint64_t invalidMarker = Value::InvalidIndexValue;
    return convertToIndexNumber(this, invalidMarker);
}
// Parses this string as a 32-bit index; Value::InvalidIndex32Value is returned
// when it is not a canonical decimal index below that value.
uint32_t String::tryToUseAsIndex32() const
{
    const uint64_t invalidMarker = static_cast<uint64_t>(Value::InvalidIndex32Value);
    return convertToIndexNumber(this, invalidMarker);
}
// Currently identical to tryToUseAsIndex32(): the call is simply forwarded.
uint32_t String::tryToUseAsIndexProperty() const
{
    const uint32_t index = tryToUseAsIndex32();
    return index;
}
size_t String::find(String* str, size_t pos)
{
const size_t srcStrLen = str->length();
const size_t size = length();
if (srcStrLen == 0)
return pos <= size ? pos : SIZE_MAX;
if (srcStrLen <= size) {
char32_t src0 = str->charAt(0);
const auto& data = bufferAccessData();
if (data.has8BitContent) {
for (; pos <= size - srcStrLen; ++pos) {
if (((const LChar*)data.buffer)[pos] == src0) {
bool same = true;
for (size_t k = 1; k < srcStrLen; k++) {
if (((const LChar*)data.buffer)[pos + k] != str->charAt(k)) {
same = false;
break;
}
}
if (same)
return pos;
}
}
} else {
for (; pos <= size - srcStrLen; ++pos) {
if (((const char16_t*)data.buffer)[pos] == src0) {
bool same = true;
for (size_t k = 1; k < srcStrLen; k++) {
if (((const char16_t*)data.buffer)[pos + k] != str->charAt(k)) {
same = false;
break;
}
}
if (same)
return pos;
}
}
}
}
return SIZE_MAX;
}
// Returns the position of the last occurrence of `str` that starts at or
// before `pos`, or SIZE_MAX when there is none. An empty `str` matches at
// `pos` itself as long as `pos` does not exceed the length of this string.
//
// The starting position is clamped to the last index at which a full match
// still fits. Previously any position below length() was probed, so
// charAt(pos + k) could be asked for characters past the end of this string
// whenever a prefix of `str` matched near the end.
size_t String::rfind(String* str, size_t pos)
{
    const size_t srcStrLen = str->length();
    const size_t size = length();
    if (srcStrLen == 0) {
        return pos <= size ? pos : SIZE_MAX;
    }
    if (srcStrLen > size) {
        return SIZE_MAX;
    }

    const size_t lastStart = size - srcStrLen;
    size_t start = pos < lastStart ? pos : lastStart;
    for (;;) {
        bool same = true;
        for (size_t k = 0; k < srcStrLen; k++) {
            if (charAt(start + k) != str->charAt(k)) {
                same = false;
                break;
            }
        }
        if (same) {
            return start;
        }
        if (start == 0) {
            break;
        }
        --start;
    }
    return SIZE_MAX;
}
// Returns the characters in [from, to) as a string.
// Ranges longer than STRING_SUB_STRING_MIN_VIEW_LENGTH are returned as a
// StringView over this string; shorter ones are built through a StringBuilder.
// Callers are expected to pass from <= to (the difference is computed
// unsigned).
String* String::substring(size_t from, size_t to)
{
    const bool useView = (to - from > STRING_SUB_STRING_MIN_VIEW_LENGTH);
    if (!useView) {
        StringBuilder builder;
        builder.appendSubString(this, from, to);
        return builder.finalize();
    }
    return new StringView(this, from, to);
}
// AdvanceStringIndex(S, index, unicode)
// https://www.ecma-international.org/ecma-262/6.0/#sec-advancestringindex
// Returns the index that follows `index`: index + 2 when `unicode` is set and
// a complete surrogate pair starts at `index`, index + 1 in every other case.
uint64_t String::advanceStringIndex(uint64_t index, bool unicode)
{
    ASSERT(isString());
    ASSERT(index <= (1ULL << 53) - 1);

    const uint64_t next = index + 1;

    // Without the unicode flag the string is walked one code unit at a time.
    if (!unicode) {
        return next;
    }

    // No room for a second code unit after `index`.
    const size_t codeUnitCount = this->length();
    if (next >= codeUnitCount) {
        return next;
    }

    // A pair needs a lead surrogate (0xD800..0xDBFF) at `index` ...
    const char16_t first = this->charAt(index);
    const bool isLead = (first >= 0xD800 && first <= 0xDBFF);
    if (!isLead) {
        return next;
    }

    // ... followed by a trail surrogate (0xDC00..0xDFFF).
    const char16_t second = this->charAt(next);
    const bool isTrail = (second >= 0xDC00 && second <= 0xDFFF);
    return isTrail ? index + 2 : next;
}
// Returns true when every character of this string is at most 127 and is
// accepted by the predicate `fn`. An empty string yields true.
bool String::isAllSpecialCharacters(bool (*fn)(char))
{
    auto data = bufferAccessData();
    size_t i = 0;
    while (i < data.length) {
        const char32_t c = data.charAt(i);
        // Characters above 127 are rejected without consulting `fn`.
        if (c > 127 || !fn((char)c)) {
            return false;
        }
        ++i;
    }
    return true;
}
// Strips white space and line terminators from the start, the end, or both
// ends of this string, as selected by `where`.
// Returns `this` when nothing has to be removed, otherwise a StringView over
// the remaining range.
String* String::trim(String::StringTrimWhere where)
{
    const auto& data = bufferAccessData();
    const int64_t total = (int64_t)data.length;
    const bool trimFront = (where == TrimStart || where == TrimBoth);
    const bool trimBack = (where == TrimEnd || where == TrimBoth);

    // Signed indices so that `last` may drop below `first` (down to -1).
    int64_t first = 0;
    int64_t last = total - 1;

    if (trimFront) {
        while (first < total && EscargotLexer::isWhiteSpaceOrLineTerminator(data.charAt(first))) {
            first++;
        }
    }
    if (trimBack) {
        while (last >= first && EscargotLexer::isWhiteSpaceOrLineTerminator(data.charAt(last))) {
            last--;
        }
    }

    const bool untouched = (first == 0 && last == total - 1);
    if (untouched) {
        return this;
    }
    return new StringView(this, first, last + 1);
}
void* ASCIIString::operator new(size_t size)
{
static MAY_THREAD_LOCAL bool typeInited = false;
static MAY_THREAD_LOCAL GC_descr descr;
if (!typeInited) {
GC_word obj_bitmap[GC_BITMAP_SIZE(ASCIIString)] = { 0 };
GC_set_bit(obj_bitmap, GC_WORD_OFFSET(ASCIIString, m_bufferData.buffer));
descr = GC_make_descriptor(obj_bitmap, GC_WORD_LEN(ASCIIString));
typeInited = true;
}
return GC_MALLOC_EXPLICITLY_TYPED(size, descr);
}
void* Latin1String::operator new(size_t size)
{
static MAY_THREAD_LOCAL bool typeInited = false;
static MAY_THREAD_LOCAL GC_descr descr;
if (!typeInited) {
GC_word obj_bitmap[GC_BITMAP_SIZE(Latin1String)] = { 0 };
GC_set_bit(obj_bitmap, GC_WORD_OFFSET(Latin1String, m_bufferData.buffer));
descr = GC_make_descriptor(obj_bitmap, GC_WORD_LEN(Latin1String));
typeInited = true;
}
return GC_MALLOC_EXPLICITLY_TYPED(size, descr);
}
// Builds a Latin1String from `len` characters at `str` and reports length()
// (one byte per character) to the VM instance's source-size counter. A GC
// finalizer is registered that reports the same amount back through
// decreaseSourceSize().
Latin1StringFinalizer::Latin1StringFinalizer(Context* context, const LChar* str, size_t len)
    : Latin1String(str, len)
    , m_context(context)
{
    context->vmInstance()->increaseSourceSize(length());

    GC_REGISTER_FINALIZER_NO_ORDER(this, [](void* obj, void*) {
        Latin1StringFinalizer* dying = (Latin1StringFinalizer*)obj;
        dying->m_context->vmInstance()->decreaseSourceSize(dying->length());
    },
                                   nullptr, nullptr, nullptr);
}
void* Latin1StringFinalizer::operator new(size_t size)
{
static MAY_THREAD_LOCAL bool typeInited = false;
static MAY_THREAD_LOCAL GC_descr descr;
if (!typeInited) {
GC_word obj_bitmap[GC_BITMAP_SIZE(Latin1StringFinalizer)] = { 0 };
GC_set_bit(obj_bitmap, GC_WORD_OFFSET(Latin1StringFinalizer, m_bufferData.buffer));
GC_set_bit(obj_bitmap, GC_WORD_OFFSET(Latin1StringFinalizer, m_context));
descr = GC_make_descriptor(obj_bitmap, GC_WORD_LEN(Latin1StringFinalizer));
typeInited = true;
}
return GC_MALLOC_EXPLICITLY_TYPED(size, descr);
}
void* UTF16String::operator new(size_t size)
{
static MAY_THREAD_LOCAL bool typeInited = false;
static MAY_THREAD_LOCAL GC_descr descr;
if (!typeInited) {
GC_word obj_bitmap[GC_BITMAP_SIZE(UTF16String)] = { 0 };
GC_set_bit(obj_bitmap, GC_WORD_OFFSET(UTF16String, m_bufferData.buffer));
descr = GC_make_descriptor(obj_bitmap, GC_WORD_LEN(UTF16String));
typeInited = true;
}
return GC_MALLOC_EXPLICITLY_TYPED(size, descr);
}
// Expands the `$` patterns of `replacement` for one match (the GetSubstitution
// operation used by String.prototype.replace).
//   matched      - the matched substring
//   str          - the subject string
//   position     - index in `str` at which the match starts
//   captures     - capture groups; "$1" refers to captures[0]
//   namedCapture - object holding named groups, or undefined
//   replacement  - template text
// Supported patterns: $$, $&, $`, $', $n, $nn and $<name>. Anything else
// starting with '$' is copied literally.
//
// The scan for the closing '>' of "$<name>" now checks the bound before every
// read; it used to call charAt(replacement->length()) when no '>' followed.
String* String::getSubstitution(ExecutionState& state, String* matched, String* str, size_t position, StringVector& captures, Value namedCapture, String* replacement)
{
    ASSERT(matched != nullptr);
    ASSERT(str != nullptr);
    ASSERT(replacement != nullptr);
    ASSERT(matched->isString() == true);

    const size_t matchLength = matched->length();
    const size_t stringLength = str->length();
    ASSERT(position <= stringLength);
    size_t tailPos = position + matchLength;
    const size_t m = captures.size();
    const size_t replacementLength = replacement->length();

    StringBuilder builder;
    Object* namedCaptureObj = nullptr;
    if (!namedCapture.isUndefined()) {
        namedCaptureObj = namedCapture.toObject(state);
    }

    for (size_t i = 0; i < replacementLength; i++) {
        // Ordinary characters, and a '$' that ends the template, are copied.
        if (replacement->charAt(i) != '$' || (i + 1) >= replacementLength) {
            builder.appendChar(replacement->charAt(i));
            continue;
        }

        char16_t temp = replacement->charAt(i + 1);
        if (temp == '$') {
            builder.appendChar('$');
        } else if (temp == '&') {
            builder.appendSubString(matched, 0, matchLength);
        } else if (temp == '`') {
            builder.appendSubString(str, 0, position);
        } else if (temp == '\'') {
            if (tailPos >= stringLength) {
                tailPos = stringLength;
            }
            builder.appendSubString(str, tailPos, stringLength);
        } else if (temp >= '0' && temp <= '9') {
            // Capture numbers are 1-based. "$0" wraps around to SIZE_MAX here,
            // which never satisfies n < m and is therefore emitted literally.
            size_t n = temp - '0' - 1;
            bool twodigit = false;
            if (i + 2 < replacementLength) {
                char16_t temp1 = replacement->charAt(i + 2);
                if (temp1 >= '0' && temp1 <= '9') {
                    // Prefer the two-digit reading when such a group exists.
                    size_t candidate = (temp - '0') * 10 + (temp1 - '0') - 1;
                    if (candidate < m) {
                        n = candidate;
                        twodigit = true;
                    }
                }
            }
            if (n < m) {
                String* capturesN = captures[n];
                builder.appendString(capturesN);
                if (twodigit) {
                    i++; // skip the second digit as well
                }
            } else {
                builder.appendChar('$');
                builder.appendChar(temp);
            }
        } else if (temp == '<' && !namedCapture.isUndefined()) {
            // Locate the closing '>' without reading past the template.
            size_t namedCaptureEnd = i + 2;
            while (namedCaptureEnd < replacementLength && replacement->charAt(namedCaptureEnd) != '>') {
                namedCaptureEnd++;
            }
            const bool validNamedCaptureGroup = namedCaptureEnd < replacementLength;

            if (validNamedCaptureGroup && namedCaptureObj) {
                String* groupName = replacement->substring((i + 2), (namedCaptureEnd));
                Value capture = namedCaptureObj->get(state, ObjectPropertyName(state, groupName)).value(state, Value(0));
                if (!capture.isUndefined()) {
                    builder.appendString(capture.toString(state));
                }
                // Together with the increments below this resumes right after '>'.
                i = namedCaptureEnd - 1;
            } else {
                builder.appendChar('$');
                builder.appendChar(temp);
            }
        } else {
            builder.appendChar('$');
            builder.appendChar(temp);
        }
        i++; // skip the character that followed '$'
    }
    return builder.finalize(&state);
}
// Returns true when `ch` is one of the ASCII upper-case letters 'A'-'Z'.
bool isupper(char32_t ch)
{
    if (ch < 'A') {
        return false;
    }
    return ch <= 'Z';
}
// Returns true when `ch` is one of the ASCII lower-case letters 'a'-'z'.
bool islower(char32_t ch)
{
    if (ch < 'a') {
        return false;
    }
    return ch <= 'z';
}
// Maps the ASCII upper-case letters 'A'-'Z' to lower case; every other value
// is returned unchanged.
char32_t tolower(char32_t c)
{
    const bool asciiUpper = (c >= 'A' && c <= 'Z');
    if (!asciiUpper) {
        return c;
    }
    return c - 'A' + 'a';
}
// Maps the ASCII lower-case letters 'a'-'z' to upper case; every other value
// is returned unchanged.
char32_t toupper(char32_t c)
{
    const bool asciiLower = (c >= 'a' && c <= 'z');
    if (!asciiLower) {
        return c;
    }
    return c - 'a' + 'A';
}
// Returns true for the six ASCII white-space characters: space, tab, line
// feed, carriage return, form feed and vertical tab.
bool isspace(char32_t c)
{
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
    case '\v':
        return true;
    default:
        return false;
    }
}
// Returns true when `ch` is one of the ASCII digits '0'-'9'.
bool isdigit(char32_t ch)
{
    if (ch < '0') {
        return false;
    }
    return ch <= '9';
}
} // namespace Escargot