escargot/tools/gen_unicode.py
Seonghyun Kim 15db6c3c2e Update Identifier parsing rule regarding to unicode 14 & fix parser bug
Signed-off-by: Seonghyun Kim <sh8281.kim@samsung.com>
2022-09-29 10:43:21 +09:00

244 lines
11 KiB
Python
Executable file

#!/usr/bin/env python
# Copyright 2020-present Samsung Electronics Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
import sys
import os
from argparse import ArgumentParser
# Marker lines used to locate sections while reading the
# DerivedCoreProperties file (compared against whole lines, newline included).
PARAGRAPH_SEP = "# ================================================\n"
ID_START_TITLE = "# Derived Property: ID_Start\n"
ID_CONTINUE_TITLE = "# Derived Property: ID_Continue\n"

# Module-level tables filled in by the processing functions below.
# The dictionaries map the first code point of a range to its length value.
BASIC_PLANE_START = {}
# Real lengths of the ranges whose length value is 200 or more.
LONG_RANGE_LENGHT = []
BASIC_PLANE_CONTINUE = {}
SUPPLEMENTARY_PLANE = {}
MERGED_ID_START = {}

# ANSI escape sequences used to colour the progress messages.
TERM_RED = "\033[1;31m"
TERM_GREEN = "\033[1;32m"
TERM_YELLOW = "\033[1;33m"
TERM_EMPTY = "\033[0m"
class UnicodeTable:
    """Collects the text of the generated C++ identifier-table source file.

    The output is held as three lists of lines (``header``, ``data`` and
    ``footer``); ``generate`` joins each list with newlines and writes the
    three parts to the output file in that order.
    """

    def __init__(self):
        self.header = []
        self.data = []
        self.footer = []

    def add_header(self, header_text):
        """Append ``header_text`` and an empty line to the header part."""
        self.header.extend([header_text, ""])

    def add_footer(self, footer_text):
        """Append ``footer_text`` and an empty line to the footer part."""
        self.footer.extend([footer_text, ""])

    def add_table(self, table, table_name, table_type, table_descr):
        """Append a C array definition named ``identRange<table_name>``.

        ``table`` supplies the elements (converted with ``str``),
        ``table_type`` is the C element type and ``table_descr`` becomes the
        comment placed above the array.
        """
        elements = ', '.join(str(item) for item in table)
        declaration = "const %s identRange%s[%s] = {" % (table_type, table_name, len(table))
        self.data.extend([
            "/* %s */" % (table_descr),
            declaration,
            ' ' + elements,
            "};",
            "",
        ])

    def generate(self):
        """Write header, data and footer to UnicodeIdentifierTables.cpp.

        The target path is resolved relative to the directory of this script.
        """
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target_path = script_dir + "/../src/parser/UnicodeIdentifierTables.cpp"
        with open(target_path, 'w') as unicode_table_file:
            for part in (self.header, self.data, self.footer):
                unicode_table_file.write("\n".join([str(i) for i in part]))
def create_basic_plane_table():
    """Build MERGED_ID_START and LONG_RANGE_LENGHT from the basic-plane tables.

    All work is done on the module-level tables:

    1. U+200C and U+200D are registered in BASIC_PLANE_START with the
       length value 1.
    2. BASIC_PLANE_START is copied into MERGED_ID_START, then every
       BASIC_PLANE_CONTINUE entry whose start is not yet present is added.
    3. Entries that directly follow each other
       (``next_start == start + length + 1``) are collapsed into one entry.
    4. Every length value of 200 or more is replaced by ``200 + index`` and
       the real length is stored in LONG_RANGE_LENGHT at that index.

    Returns nothing; the results are left in the module-level tables.
    """
    # zero_width_non_joiner and zero_width_joiner need to be added here
    # http://www.ecma-international.org/ecma-262/10.0/#prod-IdentifierName
    # $ and _ are handled directly in the Escargot Lexer
    other_starters = [0x200C, 0x200D]
    for key in other_starters:
        BASIC_PLANE_START[key] = 1

    # Create the merged dict: starters first, then the continue entries
    # that are not already present.  ``in`` / ``items()`` are used instead
    # of ``has_key`` / ``iteritems`` so that this also runs on Python 3.
    MERGED_ID_START.update(BASIC_PLANE_START)
    for continue_key, continue_length in BASIC_PLANE_CONTINUE.items():
        if continue_key not in MERGED_ID_START:
            MERGED_ID_START[continue_key] = continue_length

    # Merge ranges if possible.  Walk the sorted starts from the last one
    # down to index 1 so that every neighbouring pair, including the first
    # two entries, is examined.
    ordered_keys = sorted(MERGED_ID_START)
    for index in range(len(ordered_keys) - 1, 0, -1):
        current = ordered_keys[index]
        previous = ordered_keys[index - 1]
        if current == previous + MERGED_ID_START[previous] + 1:
            # The merged entry keeps the earlier start; its length is the
            # sum of both lengths plus one, and the later entry is dropped.
            absorbed_length = MERGED_ID_START.pop(current)
            MERGED_ID_START[previous] = absorbed_length + MERGED_ID_START[previous] + 1

    # Create the long range table.  Iterating in ascending start order
    # makes the index assignment independent of dict iteration order.
    for key in sorted(MERGED_ID_START):
        value = MERGED_ID_START[key]
        if value >= 200:
            MERGED_ID_START[key] = 200 + len(LONG_RANGE_LENGHT)
            LONG_RANGE_LENGHT.append(value)
def unify_non_basic_plane():
    """Collapse directly adjacent entries of SUPPLEMENTARY_PLANE in place.

    SUPPLEMENTARY_PLANE maps a range start to its length value.  Whenever an
    entry starts exactly at ``previous_start + previous_length + 1`` it is
    removed and its length (plus one) is added to the previous entry.

    Returns nothing; SUPPLEMENTARY_PLANE is modified in place.
    """
    # Walk the sorted starts from the last one down to index 1 so that
    # every neighbouring pair, including the first two entries, is examined.
    ordered_keys = sorted(SUPPLEMENTARY_PLANE)
    for index in range(len(ordered_keys) - 1, 0, -1):
        current = ordered_keys[index]
        previous = ordered_keys[index - 1]
        if current == previous + SUPPLEMENTARY_PLANE[previous] + 1:
            # Combine the lengths of the two entries, store the sum (plus
            # one) on the earlier entry and drop the later one.
            absorbed_length = SUPPLEMENTARY_PLANE.pop(current)
            SUPPLEMENTARY_PLANE[previous] = absorbed_length + SUPPLEMENTARY_PLANE[previous] + 1
def generate_ranges(file):
    """Collect the ID_Start and ID_Continue ranges from a properties file.

    The file is scanned for PARAGRAPH_SEP lines.  When the section title
    that follows is ID_START_TITLE or ID_CONTINUE_TITLE, the data lines of
    that section are parsed and stored in the module-level tables:

    * starts in 129..65535 go to BASIC_PLANE_START (ID_Start section) or
      BASIC_PLANE_CONTINUE (ID_Continue section); an already recorded start
      is left untouched,
    * starts above 65535 go to SUPPLEMENTARY_PLANE,
    * starts of 128 and below are ignored.

    A section ends at the first blank line or at the end of the file.

    Args:
        file: path of the DerivedCoreProperties text file to read.
    """
    # Number of lines between a section title and its first data line:
    # 8 lines contain the definition of ID_Start, 9 lines that of ID_Continue.
    id_start_header_lines = 8
    id_continue_header_lines = 9

    with open(file, "r") as fp:
        lines = iter(fp)
        for line in lines:
            if line != PARAGRAPH_SEP:
                continue
            # The separator is followed by an empty line, then the title.
            next(lines, "")
            paragraph_title = next(lines, "")
            if paragraph_title == ID_START_TITLE:
                is_id_start = True
                header_lines = id_start_header_lines
            elif paragraph_title == ID_CONTINUE_TITLE:
                is_id_start = False
                header_lines = id_continue_header_lines
            else:
                continue

            for _ in range(header_lines):
                next(lines, "")

            unicode_range_line = next(lines, "")
            # An empty string means end of file, "\n" means end of section.
            while unicode_range_line.strip():
                # Cut out the "XXXX" or "XXXX..YYYY" field before the ';'
                # and convert its hexadecimal bounds into integers.
                bounds = unicode_range_line.split(';')[0].split('..')
                start = int(bounds[0], 16)
                # NOTE(review): a range gets ``end - start`` while a single
                # code point gets 1 - confirm against the lexer that reads
                # the generated tables that both conventions are intended.
                if len(bounds) == 2:
                    length = int(bounds[1], 16) - start
                else:
                    length = 1

                # Basic plane for Escargot's uint16_t
                # Skip the ascii part as it is handled directly
                # in the Lexer.cpp
                if 128 < start <= 65535:
                    if is_id_start:
                        target = BASIC_PLANE_START
                    else:
                        target = BASIC_PLANE_CONTINUE
                    if start not in target:
                        target[start] = length
                # Supplementary Plane data
                elif start > 65535:
                    SUPPLEMENTARY_PLANE[start] = length

                unicode_range_line = next(lines, "")
def main():
    """Command-line entry point of the identifier table generator.

    Parses ``--derived_core_properties FILE`` (required; the parser reports
    an error when FILE does not exist), fills the module-level range tables
    from that file, and writes the resulting C++ tables through
    ``UnicodeTable.generate``.  Progress messages are printed to stdout.
    """
    print("\n%sAutomated Unicode Identifier Table Generator for Escargot %s\n" % (TERM_GREEN, TERM_EMPTY))

    parser = ArgumentParser(description='Unicode identifier table generator for the Escargot engine')
    parser.add_argument('--derived_core_properties', metavar='FILE', action='store', required=True, help='specify the unicode data file')
    args = parser.parse_args()

    if not os.path.isfile(args.derived_core_properties):
        parser.error("\n\n%sArgument file is non-existent!%s\nGiven: %s\n" % (TERM_RED, TERM_EMPTY, args.derived_core_properties))

    # Process Derived File
    print("%s..Processing Derived Core Properties file %s\n" % (TERM_YELLOW, TERM_EMPTY))
    generate_ranges(args.derived_core_properties)

    # Process data for the basic plane
    # and unify the supplementary
    print("%s..Calculating proper ranges %s\n" % (TERM_YELLOW, TERM_EMPTY))
    create_basic_plane_table()
    unify_non_basic_plane()

    # Start collecting output data
    print("%s..Collecting output data %s\n" % (TERM_YELLOW, TERM_EMPTY))
    generated_data = UnicodeTable()

    licence = "/* * Copyright (c) 2020-present Samsung Electronics Co., Ltd\n *\n * This library is free software; you can redistribute it and/or\n * modify it under the terms of the GNU Lesser General Public\n * License as published by the Free Software Foundation; either\n * version 2.1 of the License, or (at your option) any later version.\n *\n * This library is distributed in the hope that it will be useful,\n * but WITHOUT ANY WARRANTY; without even the implied warranty of\n * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n * Lesser General Public License for more details.\n *\n * You should have received a copy of the GNU Lesser General Public\n * License along with this library; if not, write to the Free Software\n * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301\n * USA\n */\n"
    # NOTE(review): the URL written into the generated file is fixed to
    # Unicode 13.0.0 regardless of the input file - confirm it matches the
    # data actually being processed.
    header = "/* This file is automatically generated by the %s script\n* from https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt\n* DO NOT EDIT!\n*/" % (os.path.basename(__file__))
    ifdef_namespace ="#include \"UnicodeIdentifierTables.h\"\n\nnamespace Escargot {\nnamespace EscargotLexer {\n"
    footer_namespace = "const uint16_t basic_plane_length = sizeof(identRangeStart);\nconst uint16_t supplementary_plane_length = sizeof(identRangeStartSupplementaryPlane);\n} // namespace EscargotLexer\n} // namespace Escargot"

    # Append licence, generated-file warning and include/namespace opening
    generated_data.add_header(licence)
    generated_data.add_header(header)
    generated_data.add_header(ifdef_namespace)

    # Fetch data for the basic plane: starts in ascending order and the
    # length value belonging to each start, in the same order
    keys = sorted(MERGED_ID_START.keys())
    values = [MERGED_ID_START[key] for key in keys]

    # Append basic plane values
    generated_data.add_table(keys, "Start", "uint16_t", "Starting codepoints of identifier ranges.")
    generated_data.add_table(values, "Length", "uint16_t", "Lengths of identifier ranges.")
    generated_data.add_table(LONG_RANGE_LENGHT, "LongLength", "uint16_t", "Lengths of identifier ranges greater than LEXER IDENT_RANGE_LONG.")

    # Fetch data for the supplementary plane
    keys = sorted(SUPPLEMENTARY_PLANE.keys())
    values = [SUPPLEMENTARY_PLANE[key] for key in keys]

    # Append supplementary values
    generated_data.add_table(keys, "StartSupplementaryPlane", "uint32_t", "Identifier starting codepoints for the supplementary plane")
    generated_data.add_table(values, "LengthSupplementaryPlane", "uint16_t", "Lengths of identifier ranges for the supplementary plane")

    # Append footer
    generated_data.add_footer(footer_namespace)

    # Write out data
    print("%s..Writing out data %s\n" % (TERM_YELLOW, TERM_EMPTY))
    generated_data.generate()
    print("\n%sTables generated into src/parser/UnicodeIdentifierTables.cpp %s\n" % (TERM_GREEN, TERM_EMPTY))
# Run the generator only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()