Cassie Jones 7213ae8423
[include-mapping] Python fixes
- Move the multiprocessing.Pool initializer to a top-level function, it
  was previously causing a pickle failure with my machine's python.
- Change the `env python` to `env python3` for convenience
2023-09-12 11:24:35 -07:00

295 lines
9.9 KiB
Python
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# ===- gen_std.py - ------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#
"""gen_std.py is a tool to generate a lookup table (from qualified names to
include headers) for C/C++ Standard Library symbols by parsing archived HTML
files from cppreference.
The generated files are located in clang/include/Tooling/Inclusions.
Caveats and FIXMEs:
- only symbols directly in "std" namespace are added, we should also add std's
subnamespace symbols (e.g. chrono).
- symbols with multiple variants or defined in multiple headers aren't added,
e.g. std::move, std::swap
Usage:
1. Install BeautifulSoup dependency, see instruction:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
2. Download cppreference offline HTML files (html_book_20220730.zip in Unofficial Release) at
https://en.cppreference.com/w/Cppreference:Archives
3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You should
get a "cppreference/reference" directory.
4. Run the command:
// Generate C++ symbols
python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc
// Generate C symbols
python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc
"""
import cppreference_parser
import argparse
import datetime
import os
import sys
import re
CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""
def ParseArg():
parser = argparse.ArgumentParser(description="Generate StdGen file")
parser.add_argument(
"-cppreference",
metavar="PATH",
default="",
help="path to the cppreference offline HTML directory",
required=True,
)
parser.add_argument(
"-symbols",
default="cpp",
help="Generate c or cpp (removed) symbols. One of {cpp, c, cpp_removed}.",
required=True,
)
return parser.parse_args()
def AdditionalHeadersForIOSymbols(symbol):
# IO-related symbols declared in the <iosfwd> header, per C++
# [iosfwd.syn 31.3.1]:
iosfwd_symbols = [
"basic_ios",
"basic_streambuf",
"basic_istream",
"basic_ostream",
"basic_iostream",
"basic_stringbuf",
"basic_istringstream",
"basic_ostringstream",
"basic_stringstream",
"basic_spanbuf",
"basic_ispanstream",
"basic_ospanstream",
"basic_spanstream",
"basic_filebuf",
"basic_ifstream",
"basic_ofstream",
"basic_fstream",
"basic_syncbuf",
"basic_osyncstream",
"istreambuf_iterator",
"ostreambuf_iterator",
"ios",
"wios",
"streambuf",
"istream",
"ostream",
"iostream",
"stringbuf",
"istringstream",
"ostringstream",
"stringstream",
"spanbuf",
"ispanstream",
"ospanstream",
"spanstream",
"filebuf",
"ifstream",
"ofstream",
"fstream",
"syncbuf",
"osyncstream",
"wstreambuf",
"wistream",
"wostream",
"wiostream",
"wstringbuf",
"wistringstream",
"wostringstream",
"wstringstream",
"wspanbuf",
"wispanstream",
"wospanstream",
"wspanstream",
"wfilebuf",
"wifstream",
"wofstream",
"wfstream",
"wsyncbuf",
"wosyncstream",
"fpos",
"streampos",
"wstreampos",
"u8streampos",
"u16streampos",
"u32streampos",
]
assert len(symbol.headers) == 1
sym_header = symbol.headers[0]
headers = []
# <iostream> is preferred than <iosfwd>
# <iostream> is an alternative of <streambuf>, <istream>, <ostream>, <ios>.
# per C++ [iostream.syn 31.4.1]
if sym_header in ["<ios>", "<istream>", "<ostream>", "<streambuf>"]:
headers.append("<iostream>")
if symbol.name in iosfwd_symbols:
headers.append("<iosfwd>")
return headers
def GetCCompatibilitySymbols(symbol):
# C++ form of the C standard headers.
c_compat_headers = {
"<cassert>",
"<cctype>",
"<cerrno>",
"<cfenv>",
"<cfloat>",
"<cinttypes>",
"<climits>",
"<clocale>",
"<cmath>",
"<csetjmp>",
"<csignal>",
"<cstdarg>",
"<cstddef>",
"<cstdint>",
"<cstdio>",
"<cstdlib>",
"<cstring>",
"<ctime>",
"<cuchar>",
"<cwchar>",
"<cwctype>",
}
# C++ [support.c.headers.other] 17.14.7
# ..., behaves as if each name placed in the standard library namespace by
# the corresponding <cname> header is placed within the global namespace
# scope, except for the functions described in [sf.cmath], the
# std::lerp function overloads ([c.math.lerp]), the declaration of
# std::byte ([cstddef.syn]), and the functions and function templates
# described in [support.types.byteops].
exception_symbols = {
"(assoc_)?laguerre[f|l]?",
"(assoc_|sph_)?legendre[f|l]?",
"beta[f|l]?",
"(comp_)?ellint_[1-3][f|l]?",
"(cyl_|sph_)?bessel_[i-k][f|l]?",
"(cyl_|sph_)?neumann[f|l]?",
"expint[f|l]?",
"hermite[f|l]?",
"riemann_zeta[f|l]?",
"lerp",
"byte",
}
assert len(symbol.headers) == 1
header = symbol.headers[0]
if header not in c_compat_headers:
return []
if any(re.fullmatch(x, symbol.name) for x in exception_symbols):
return []
# Introduce two more entries, both in the global namespace, one using the
# C++-compat header and another using the C header.
results = []
if symbol.namespace != None:
# avoid printing duplicated entries, for C macros!
results.append(cppreference_parser.Symbol(symbol.name, None, [header]))
c_header = "<" + header[2:-1] + ".h>" # <cstdio> => <stdio.h>
results.append(cppreference_parser.Symbol(symbol.name, None, [c_header]))
return results
def main():
args = ParseArg()
if args.symbols == "cpp":
page_root = os.path.join(args.cppreference, "en", "cpp")
symbol_index_root = os.path.join(page_root, "symbol_index")
parse_pages = [
(page_root, "symbol_index.html", "std::"),
# std sub-namespace symbols have separated pages.
# We don't index std literal operators (e.g.
# std::literals::chrono_literals::operator""d), these symbols can't be
# accessed by std::<symbol_name>.
#
# std::placeholders symbols are handled manually in StdSpecialSymbolMap.inc
(symbol_index_root, "chrono.html", "std::chrono::"),
(symbol_index_root, "execution.html", "std::execution::"),
(symbol_index_root, "numbers.html", "std::numbers::"),
(symbol_index_root, "filesystem.html", "std::filesystem::"),
(symbol_index_root, "pmr.html", "std::pmr::"),
(symbol_index_root, "ranges.html", "std::ranges::"),
(symbol_index_root, "views.html", "std::ranges::views::"),
# std::ranges::views can be accessed as std::views.
(symbol_index_root, "views.html", "std::views::"),
(symbol_index_root, "regex_constants.html", "std::regex_constants::"),
(symbol_index_root, "this_thread.html", "std::this_thread::"),
# Zombie symbols that were available from the Standard Library, but are
# removed in the following standards.
(symbol_index_root, "zombie_names.html", "std::"),
(symbol_index_root, "macro.html", None),
]
elif args.symbols == "c":
page_root = os.path.join(args.cppreference, "en", "c")
symbol_index_root = page_root
parse_pages = [(page_root, "index.html", None)]
if not os.path.exists(symbol_index_root):
exit("Path %s doesn't exist!" % symbol_index_root)
symbols = cppreference_parser.GetSymbols(parse_pages)
# We don't have version information from the unzipped offline HTML files.
# so we use the modified time of the symbol_index.html as the version.
index_page_path = os.path.join(page_root, "index.html")
cppreference_modified_date = datetime.datetime.fromtimestamp(
os.stat(index_page_path).st_mtime
).strftime("%Y-%m-%d")
print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))
for symbol in symbols:
if len(symbol.headers) == 1:
augmented_symbols = [symbol]
augmented_symbols.extend(GetCCompatibilitySymbols(symbol))
for s in augmented_symbols:
s.headers.extend(AdditionalHeadersForIOSymbols(s))
for header in s.headers:
# SYMBOL(unqualified_name, namespace, header)
print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace, header))
elif len(symbol.headers) == 0:
sys.stderr.write("No header found for symbol %s\n" % symbol.name)
else:
# FIXME: support symbols with multiple headers (e.g. std::move).
sys.stderr.write(
"Ambiguous header for symbol %s: %s\n"
% (symbol.name, ", ".join(symbol.headers))
)
if __name__ == "__main__":
main()