mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-26 21:46:05 +00:00
327 lines
11 KiB
Python
327 lines
11 KiB
Python
![]() |
#!/usr/bin/env python
|
||
|
# ===----------------------------------------------------------------------===##
|
||
|
#
|
||
|
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
|
# See https://llvm.org/LICENSE.txt for license information.
|
||
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
#
|
||
|
# ===----------------------------------------------------------------------===##
|
||
|
|
||
|
# The code is based on
|
||
|
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
|
||
|
#
|
||
|
# Copyright (c) Microsoft Corporation.
|
||
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
|
||
|
from io import StringIO
|
||
|
from pathlib import Path
|
||
|
from dataclasses import dataclass, field
|
||
|
from typing import Optional
|
||
|
import re
|
||
|
|
||
|
|
||
|
@dataclass
class PropertyRange:
    """An inclusive range of code points that share one property value.

    Instances start out with placeholder values and are filled in by the
    line parser.
    """

    # First code point of the range; -1 until assigned.
    lower: int = -1
    # Last code point of the range (inclusive); -1 until assigned.
    upper: int = -1
    # Property name as captured from the data file; None until assigned.
    prop: Optional[str] = None
|
||
|
|
||
|
|
||
|
@dataclass
class Entry:
    """One row of the generated lookup table, before it is bit-packed."""

    # Lower-bound code point of the row.
    lower: int = -1
    # Distance from the lower bound to the upper bound of the row.
    offset: int = -1
    # Index of the row's property in the list of property names.
    prop: int = -1
|
||
|
|
||
|
|
||
|
# Matches one data line of a Unicode property file:
#   <lower>[..<upper>] ; <prop>
# where <lower>/<upper> are upper-case hexadecimal code points and <prop> is
# the property name. Code points may have 4 to 6 hex digits (the largest code
# point is 10FFFF); accepting only 5 would silently skip 6-digit lines.
# Comment lines and blank lines do not match.
LINE_REGEX = re.compile(
    r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
)
|
||
|
|
||
|
|
||
|
def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
    """Parse one line of a property data file.

    Returns a PropertyRange when the line matches LINE_REGEX, otherwise None.
    A line naming a single code point yields a range whose lower and upper
    bounds are equal.
    """
    match = LINE_REGEX.match(inputLine)
    if match is None:
        return None

    first = int(match["lower"], base=16)
    last_text = match["upper"]
    # The ".." part is optional; without it the range is a single code point.
    last = first if last_text is None else int(last_text, base=16)
    return PropertyRange(first, last, match["prop"])
|
||
|
|
||
|
|
||
|
def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
    """
    Merge runs of directly adjacent ranges that carry the same property.

    Two neighbouring elements are merged when they have the same property and
    the second starts exactly one past the end of the first. Fewer ranges mean
    a smaller output table and faster lookups.

    Note: merging extends the ``upper`` bound of the earlier element in
    place, so elements of ``input`` may be modified; the returned list holds
    those same objects.
    """
    merged: list[PropertyRange] = []
    for current in input:
        previous = merged[-1] if merged else None
        if (
            previous is not None
            and previous.prop == current.prop
            and previous.upper + 1 == current.lower
        ):
            # Contiguous with the previous range: grow it instead of adding.
            previous.upper = current.upper
        else:
            merged.append(current)
    return merged
|
||
|
|
||
|
|
||
|
# Format string that turns a property value name into its enumerator name by
# prefixing it with two underscores.
PROP_VALUE_ENUMERATOR_TEMPLATE = "__{}"
|
||
|
# Text of the C++ `__property` enum. It is filled in with str.format, so the
# only placeholder is {enumerators} and literal braces are written doubled.
PROP_VALUE_ENUM_TEMPLATE = """
|
||
|
enum class __property : uint8_t {{
|
||
|
// Values generated from the data files.
|
||
|
{enumerators},
|
||
|
|
||
|
// The properies below aren't stored in the "database".
|
||
|
|
||
|
// Text position properties.
|
||
|
__sot,
|
||
|
__eot,
|
||
|
|
||
|
// The code unit has none of above properties.
|
||
|
__none
|
||
|
}};
|
||
|
"""
|
||
|
|
||
|
# Text of the C++ `__entries` table and the `__get_property` lookup function.
# It is filled in with str.format: {size} is the number of table entries and
# {entries} the comma-separated entry values; literal braces are doubled.
DATA_ARRAY_TEMPLATE = """
|
||
|
/// The entries of the extended grapheme cluster bondary property table.
|
||
|
///
|
||
|
/// The data is generated from
|
||
|
/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
|
||
|
/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
|
||
|
///
|
||
|
/// The data has 3 values
|
||
|
/// - bits [0, 3] The property. One of the values generated form the datafiles
|
||
|
/// of \\ref __property
|
||
|
/// - bits [4, 10] The size of the range.
|
||
|
/// - bits [11, 31] The lower bound code point of the range. The upper bound of
|
||
|
/// the range is lower bound + size.
|
||
|
///
|
||
|
/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
|
||
|
/// in the Unicode tables are larger. They are stored in multiple consecutive
|
||
|
/// ranges in the data table. An alternative would be to store the sizes in a
|
||
|
/// separate 16-bit value. The original MSVC STL code had such an approach, but
|
||
|
/// this approach uses less space for the data and is about 4% faster in the
|
||
|
/// following benchmark.
|
||
|
/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
|
||
|
inline constexpr uint32_t __entries[{size}] = {{{entries}}};
|
||
|
|
||
|
/// Returns the extended grapheme cluster bondary property of a code point.
|
||
|
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
|
||
|
// TODO FMT use std::ranges::upper_bound.
|
||
|
|
||
|
// The algorithm searches for the upper bound of the range and, when found,
|
||
|
// steps back one entry. This algorithm is used since the code point can be
|
||
|
// anywhere in the range. After a lower bound is found the next step is to
|
||
|
// compare whether the code unit is indeed in the range.
|
||
|
//
|
||
|
// Since the entry contains a code unit, size, and property the code point
|
||
|
// being sought needs to be adjusted. Just shifting the code point to the
|
||
|
// proper position doesn't work; suppose an entry has property 0, size 1,
|
||
|
// and lower bound 3. This results in the entry 0x1810.
|
||
|
// When searching for code point 3 it will search for 0x1800, find 0x1810
|
||
|
// and moves to the previous entry. Thus the lower bound value will never
|
||
|
// be found.
|
||
|
// The simple solution is to set the bits belonging to the property and
|
||
|
// size. Then the upper bound for code point 3 will return the entry after
|
||
|
// 0x1810. After moving to the previous entry the algorithm arrives at the
|
||
|
// correct entry.
|
||
|
ptrdiff_t __i = std::upper_bound(__entries, std::end(__entries), (__code_point << 11) | 0x7ffu) - __entries;
|
||
|
if (__i == 0)
|
||
|
return __property::__none;
|
||
|
|
||
|
--__i;
|
||
|
uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
|
||
|
if (__code_point <= __upper_bound)
|
||
|
return static_cast<__property>(__entries[__i] & 0xf);
|
||
|
|
||
|
return __property::__none;
|
||
|
}}
|
||
|
"""
|
||
|
|
||
|
# Skeleton of the generated C++ header: licence text, include guard, includes
# and the enclosing namespace. It is filled in with str.format; {content} is
# the only placeholder and literal braces are written doubled.
MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
|
||
|
// -*- C++ -*-
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
//
|
||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
//
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
|
||
|
// WARNING, this entire header is generated by
|
||
|
// utiles/generate_extended_grapheme_cluster_table.py
|
||
|
// DO NOT MODIFY!
|
||
|
|
||
|
// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||
|
//
|
||
|
// See Terms of Use <https://www.unicode.org/copyright.html>
|
||
|
// for definitions of Unicode Inc.'s Data Files and Software.
|
||
|
//
|
||
|
// NOTICE TO USER: Carefully read the following legal agreement.
|
||
|
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||
|
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||
|
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||
|
// TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||
|
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||
|
// THE DATA FILES OR SOFTWARE.
|
||
|
//
|
||
|
// COPYRIGHT AND PERMISSION NOTICE
|
||
|
//
|
||
|
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
|
||
|
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||
|
//
|
||
|
// Permission is hereby granted, free of charge, to any person obtaining
|
||
|
// a copy of the Unicode data files and any associated documentation
|
||
|
// (the "Data Files") or Unicode software and any associated documentation
|
||
|
// (the "Software") to deal in the Data Files or Software
|
||
|
// without restriction, including without limitation the rights to use,
|
||
|
// copy, modify, merge, publish, distribute, and/or sell copies of
|
||
|
// the Data Files or Software, and to permit persons to whom the Data Files
|
||
|
// or Software are furnished to do so, provided that either
|
||
|
// (a) this copyright and permission notice appear with all copies
|
||
|
// of the Data Files or Software, or
|
||
|
// (b) this copyright and permission notice appear in associated
|
||
|
// Documentation.
|
||
|
//
|
||
|
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||
|
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||
|
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
|
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||
|
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||
|
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||
|
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||
|
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||
|
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||
|
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||
|
//
|
||
|
// Except as contained in this notice, the name of a copyright holder
|
||
|
// shall not be used in advertising or otherwise to promote the sale,
|
||
|
// use or other dealings in these Data Files or Software without prior
|
||
|
// written authorization of the copyright holder.
|
||
|
|
||
|
#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
||
|
#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
||
|
|
||
|
#include <__algorithm/upper_bound.h>
|
||
|
#include <__config>
|
||
|
#include <__iterator/access.h>
|
||
|
#include <cstddef>
|
||
|
#include <cstdint>
|
||
|
|
||
|
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
||
|
# pragma GCC system_header
|
||
|
#endif
|
||
|
|
||
|
_LIBCPP_BEGIN_NAMESPACE_STD
|
||
|
|
||
|
#if _LIBCPP_STD_VER > 17
|
||
|
|
||
|
namespace __extended_grapheme_custer_property_boundary {{
|
||
|
|
||
|
{content}
|
||
|
|
||
|
}} // __extended_grapheme_custer_property_boundary
|
||
|
|
||
|
#endif //_LIBCPP_STD_VER > 17
|
||
|
|
||
|
_LIBCPP_END_NAMESPACE_STD
|
||
|
|
||
|
#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
||
|
"""
|
||
|
|
||
|
|
||
|
def property_ranges_to_table(
    ranges: list[PropertyRange], props: list[str]
) -> list[Entry]:
    """Convert property ranges into table entries ordered by lower bound.

    Each entry stores the lower bound, the offset to the upper bound and the
    index of the range's property in ``props``. The offset is limited to 127,
    so a range spanning more than 128 code points is emitted as several
    consecutive entries.

    The elements of ``ranges`` are only read; they are not modified.

    Raises:
        AssertionError: if ``props`` has 16 or more elements, or if two
            ranges overlap.
        ValueError: if a range's property is not present in ``props``.
    """
    assert len(props) < 16
    result: list[Entry] = []
    high = -1
    for prop_range in sorted(ranges, key=lambda x: x.lower):
        # Ranges are visited in ascending order, so each must start after the
        # end of the previous one; anything else is an overlap.
        assert prop_range.lower > high
        high = prop_range.upper

        prop = props.index(prop_range.prop)
        # Walk a local cursor instead of advancing prop_range.lower, so the
        # caller's objects keep their original bounds.
        lower = prop_range.lower
        while prop_range.upper - lower > 127:
            # A full chunk: offset 127 covers 128 code points.
            result.append(Entry(lower, 127, prop))
            lower += 128
        result.append(Entry(lower, prop_range.upper - lower, prop))
    return result
|
||
|
|
||
|
|
||
|
# Format string for one packed table entry: "0x" followed by the value as at
# least eight zero-padded lower-case hexadecimal digits.
cpp_entrytemplate = "0x{:08x}"
|
||
|
|
||
|
|
||
|
def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
    """Render the C++ enum and lookup table for ``ranges``.

    The enumerators are the distinct property names found in ``ranges``, in
    sorted order; a property's position in that order is the value packed
    into the table entries. ``prop_name`` is handed to the table template's
    ``format`` call as a keyword argument.

    Returns the enum text followed by the table text.
    """
    prop_values = sorted({r.prop for r in ranges})
    table = property_ranges_to_table(ranges, prop_values)

    enumerators = ",".join(
        PROP_VALUE_ENUMERATOR_TEMPLATE.format(value) for value in prop_values
    )
    enum_text = PROP_VALUE_ENUM_TEMPLATE.format(enumerators=enumerators)

    # Pack each row into one integer: lower bound from bit 11 upwards, offset
    # from bit 4, property index in the lowest bits.
    entries = ",".join(
        cpp_entrytemplate.format(row.lower << 11 | row.offset << 4 | row.prop)
        for row in table
    )
    array_text = DATA_ARRAY_TEMPLATE.format(
        prop_name=prop_name,
        size=len(table),
        entries=entries,
    )

    return enum_text + array_text
|
||
|
|
||
|
|
||
|
def _read_property_ranges(path: Path) -> list[PropertyRange]:
    """Parse the property lines of ``path`` and merge adjacent equal ranges."""
    with path.open(encoding="utf-8") as f:
        # Lines that do not parse (parsePropertyLine returns None) are skipped.
        return compactPropertyRanges(
            [x for line in f if (x := parsePropertyLine(line))]
        )


def generate_data_tables() -> str:
    """
    Generate Unicode data for inclusion into <format> from
    GraphemeBreakProperty.txt and emoji-data.txt.

    GraphemeBreakProperty.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

    emoji-data.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt

    Both files are expected to be in the same directory as this script.

    Only the "Extended_Pictographic" ranges of emoji-data.txt are used; they
    are appended to the ranges read from GraphemeBreakProperty.txt.

    Returns the generated C++ text.
    """
    script_path = Path(__file__).absolute()
    gbp_ranges = _read_property_ranges(
        script_path.with_name("GraphemeBreakProperty.txt")
    )
    emoji_ranges = _read_property_ranges(script_path.with_name("emoji-data.txt"))

    gbp_ranges.extend(
        x for x in emoji_ranges if x.prop == "Extended_Pictographic"
    )
    return generate_cpp_data("Grapheme_Break", gbp_ranges)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Strip the template's leading whitespace so the output begins with the
    # first header line, then substitute the generated C++ text and print it.
    header_template = MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip()
    print(header_template.format(content=generate_data_tables()))
|