[libc++] Rewrite the transitive header checking machinery (#110554)

Since we don't generate a full dependency graph of headers, we can
greatly simplify the script that parses the result of --trace-includes.

At the same time, we also unify the mechanism for detecting whether a
header is a public/C compat/internal/etc header with the existing
mechanism in header_information.py.

As a drive-by this fixes the headers_in_modulemap.sh.py test which had
been disabled by mistake because it used its own way of determining
the list of libc++ headers. By consistently using header_information.py
to get that information, problems like this shouldn't happen anymore.

This should also unblock #110303, which was blocked because of
a brittle implementation of the transitive includes check which broke
when the repository was cloned at a path like /path/__something/more.
This commit is contained in:
Louis Dionne 2024-10-22 09:18:58 -04:00 committed by GitHub
parent c623df38c9
commit d6e714b10e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 297 additions and 298 deletions

View File

@ -16,7 +16,7 @@ sys.path.append(sys.argv[1])
from libcxx.header_information import lit_header_restrictions, public_headers, mandatory_inclusions
for header in public_headers:
header_guard = lambda h: f"_LIBCPP_{h.upper().replace('.', '_').replace('/', '_')}"
header_guard = lambda h: f"_LIBCPP_{str(h).upper().replace('.', '_').replace('/', '_')}"
# <cassert> has no header guards
if header == 'cassert':

View File

@ -1,25 +1,15 @@
# RUN: %{python} %s %{libcxx-dir}/utils %{include-dir}
# RUN: %{python} %s %{libcxx-dir}/utils
import sys
sys.path.append(sys.argv[1])
from libcxx.header_information import all_headers, libcxx_include
import pathlib
import sys
from libcxx.header_information import is_modulemap_header, is_header
headers = list(pathlib.Path(sys.argv[2]).rglob("*"))
modulemap = open(f"{sys.argv[2]}/module.modulemap").read()
with open(libcxx_include / "module.modulemap") as f:
modulemap = f.read()
isHeaderMissing = False
for header in headers:
if not is_header(header):
continue
header = header.relative_to(pathlib.Path(sys.argv[2])).as_posix()
if not is_modulemap_header(header):
for header in all_headers:
if not header.is_in_modulemap():
continue
if not str(header) in modulemap:

View File

@ -42,10 +42,10 @@ if regenerate_expected_results:
all_traces = []
for header in sorted(public_headers):
if header.endswith(".h"): # Skip C compatibility or detail headers
if header.is_C_compatibility() or header.is_internal():
continue
normalized_header = re.sub("/", "_", header)
normalized_header = re.sub("/", "_", str(header))
print(
f"""\
// RUN: echo "#include <{header}>" | %{{cxx}} -xc++ - %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.{normalized_header}.txt
@ -55,17 +55,17 @@ if regenerate_expected_results:
print(
f"""\
// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv
// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes/to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv
"""
)
else:
for header in public_headers:
if header.endswith(".h"): # Skip C compatibility or detail headers
if header.is_C_compatibility() or header.is_internal():
continue
# Escape slashes for the awk command below
escaped_header = header.replace("/", "\\/")
escaped_header = str(header).replace("/", "\\/")
print(
f"""\
@ -92,7 +92,7 @@ else:
// RUN: mkdir %t
// RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt
// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes/to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
// RUN: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv
// RUN: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv
#include <{header}>

View File

@ -0,0 +1,120 @@
#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
from typing import List, Tuple, Optional
import argparse
import io
import itertools
import os
import pathlib
import re
import sys
libcxx_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.join(libcxx_root, "utils"))
from libcxx.header_information import Header
def parse_line(line: str) -> Tuple[int, str]:
"""
Parse a single line of --trace-includes output.
Returns the inclusion level and the raw file name being included.
"""
match = re.match(r"(\.+) (.+)", line)
if not match:
raise ArgumentError(f"Line {line} contains invalid data.")
# The number of periods in front of the header name is the nesting level of
# that header.
return (len(match.group(1)), match.group(2))
def make_cxx_v1_relative(header: str) -> Optional[str]:
"""
Returns the path of the header as relative to <whatever>/c++/v1, or None if the path
doesn't contain c++/v1.
We use that heuristic to figure out which headers are libc++ headers.
"""
# On Windows, the path separators can either be forward slash or backslash.
# If it is a backslash, Clang prints it escaped as two consecutive
# backslashes, and they need to be escaped in the RE. (Use a raw string for
# the pattern to avoid needing another level of escaping on the Python string
# literal level.)
pathsep = r"(?:/|\\\\)"
CXX_V1_REGEX = r"^.*c\+\+" + pathsep + r"v[0-9]+" + pathsep + r"(.+)$"
match = re.match(CXX_V1_REGEX, header)
if not match:
return None
else:
return match.group(1)
def parse_file(file: io.TextIOBase) -> List[Tuple[Header, Header]]:
"""
Parse a file containing --trace-includes output to generate a list of the
transitive includes contained in it.
"""
result = []
includer = None
for line in file.readlines():
(level, header) = parse_line(line)
relative = make_cxx_v1_relative(header)
# Not a libc++ header
if relative is None:
continue
# If we're at the first level, remember this header as being the one who includes other headers.
# There's usually exactly one, except if the compiler is passed a file with `-include`.
if level == 1:
includer = Header(relative)
continue
# Otherwise, take note that this header is being included by the top-level includer.
else:
assert includer is not None
result.append((includer, Header(relative)))
return result
def print_csv(includes: List[Tuple[Header, Header]]) -> None:
"""
Print the transitive includes as space-delimited CSV.
This function only prints public libc++ headers that are not C compatibility headers.
"""
# Sort and group by includer
by_includer = lambda t: t[0]
includes = itertools.groupby(sorted(includes, key=by_includer), key=by_includer)
for (includer, includees) in includes:
includees = map(lambda t: t[1], includees)
for h in sorted(set(includees)):
if h.is_public() and not h.is_C_compatibility():
print(f"{includer} {h}")
def main(argv):
parser = argparse.ArgumentParser(
description="""
Given a list of headers produced by --trace-includes, produce a list of libc++ headers in that output.
Note that -fshow-skipped-includes must also be passed to the compiler in order to get sufficient
information for this script to run.
The output of this script is provided in space-delimited CSV format where each line contains:
<header performing inclusion> <header being included>
""")
parser.add_argument("inputs", type=argparse.FileType("r"), nargs='+', default=None,
help="One or more files containing the result of --trace-includes")
args = parser.parse_args(argv)
includes = [line for file in args.inputs for line in parse_file(file)]
print_csv(includes)
if __name__ == "__main__":
main(sys.argv[1:])

View File

@ -1,147 +0,0 @@
#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
from dataclasses import dataclass
from typing import List # Needed for python 3.8 compatibility.
import argparse
import pathlib
import re
import sys
@dataclass
class header:
name: str = None
level: int = -1
def parse_line(line: str) -> header:
"""
Parse an output line from --trace-includes into a `header`.
"""
match = re.match(r"(\.+) (.+)", line)
if not match:
sys.exit(f"Line {line} contains invalid data.")
# The number of periods in front of the header name is the nesting level of
# that header.
return header(match.group(2), len(match.group(1)))
# On Windows, the path separators can either be forward slash or backslash.
# If it is a backslash, Clang prints it escaped as two consecutive
# backslashes, and they need to be escaped in the RE. (Use a raw string for
# the pattern to avoid needing another level of escaping on the Python string
# literal level.)
LIBCXX_HEADER_REGEX = r".*c\+\+(?:/|\\\\)v[0-9]+(?:/|\\\\)(.+)"
def is_libcxx_header(header: str) -> bool:
"""
Returns whether a header is a libc++ header, excluding the C-compatibility headers.
"""
# Only keep files in the c++/vN directory.
match = re.match(LIBCXX_HEADER_REGEX, header)
if not match:
return False
# Skip C compatibility headers (in particular, make sure not to skip libc++ detail headers).
relative = match.group(1)
if relative.endswith(".h") and not (
relative.startswith("__") or re.search(r"(/|\\\\)__", relative)
):
return False
return True
def parse_file(file: pathlib.Path) -> List[str]:
"""
Parse a file containing --trace-includes output to generate a list of the top-level C++ includes
contained in it.
This effectively generates the list of public libc++ headers of the header whose --trace-includes it is.
In order to get the expected result of --trace-includes, the -fshow-skipped-includes flag also needs to
be passed.
"""
result = list()
with file.open(encoding="utf-8") as f:
for line in f.readlines():
header = parse_line(line)
# Skip non-libc++ headers
if not is_libcxx_header(header.name):
continue
# Include top-level headers in the output. There's usually exactly one,
# except if the compiler is passed a file with `-include`. Top-level
# headers are transparent, in the sense that we want to go look at
# transitive includes underneath.
if header.level == 1:
level = 999
result.append(header)
continue
# Detail headers are transparent too: we attribute all includes of public libc++
# headers under a detail header to the last public libc++ header that included it.
if header.name.startswith("__") or re.search(r"(/|\\\\)__", header.name):
level = 999
continue
# Add the non-detail libc++ header to the list.
level = header.level
result.append(header)
return result
def create_include_graph(trace_includes: List[pathlib.Path]) -> List[str]:
result = list()
for file in trace_includes:
headers = parse_file(file)
# Get actual filenames relative to libc++'s installation directory instead of full paths
relative = lambda h: re.match(LIBCXX_HEADER_REGEX, h).group(1)
top_level = relative(
next(h.name for h in headers if h.level == 1)
) # There should be only one top-level header
includes = [relative(h.name) for h in headers if h.level != 1]
# Remove duplicates in all includes.
includes = list(set(includes))
if len(includes) != 0:
result.append([top_level] + includes)
return result
def print_csv(graph: List[str]) -> None:
for includes in graph:
header = includes[0]
for include in sorted(includes[1:]):
if header == include:
sys.exit(f"Cycle detected: header {header} includes itself.")
print(f"{header} {include}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Produce a dependency graph of libc++ headers, in CSV format.
This script is normally executed by libcxx/test/libcxx/transitive_includes.gen.py""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"inputs",
default=None,
metavar="FILE",
nargs='+',
help="One or more files containing the result of --trace-includes on the headers one wishes to graph.",
)
options = parser.parse_args()
print_csv(create_include_graph(map(pathlib.Path, options.inputs)))

View File

@ -71,7 +71,7 @@ def main(argv: typing.List[str]):
mappings = [] # Pairs of (header, public_header)
for header in libcxx.header_information.all_headers:
public_headers = IWYU_mapping(header)
public_headers = IWYU_mapping(str(header))
if public_headers is not None:
mappings.extend((header, public) for public in public_headers)

View File

@ -9,19 +9,11 @@
import os.path
import sys
from libcxx.header_information import module_c_headers
from libcxx.header_information import module_headers
from libcxx.header_information import header_restrictions
from libcxx.header_information import headers_not_available
from libcxx.header_information import module_c_headers, module_headers, header_restrictions, headers_not_available, libcxx_root
def write_file(module):
libcxx_module_directory = os.path.join(
os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "modules"
)
with open(
os.path.join(libcxx_module_directory, f"{module}.cppm.in"), "w"
) as module_cpp_in:
with open(libcxx_root / "modules" / f"{module}.cppm.in", "w") as module_cpp_in:
module_cpp_in.write(
"""\
// -*- C++ -*-
@ -45,7 +37,7 @@ module;
// and the headers of Table 25: C++ headers for C library facilities[tab:headers.cpp.c]
"""
)
for header in module_headers if module == "std" else module_c_headers:
for header in sorted(module_headers if module == "std" else module_c_headers):
if header in header_restrictions:
module_cpp_in.write(
f"""\

View File

@ -6,7 +6,166 @@
#
# ===----------------------------------------------------------------------===##
import os, pathlib
import os, pathlib, functools
libcxx_root = pathlib.Path(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
libcxx_include = libcxx_root / "include"
assert libcxx_root.exists()
def _is_header_file(file):
"""Returns whether the given file is a header file, i.e. not a directory or the modulemap file."""
return not file.is_dir() and not file.name in [
"module.modulemap",
"CMakeLists.txt",
"libcxx.imp",
"__config_site.in",
]
@functools.total_ordering
class Header:
_name: str
"""Relative path from the root of libcxx/include"""
def __init__(self, name: str):
"""Create a Header.
name: The path of the header relative to libc++'s include directory.
For example '__algorithm/find.h' or 'coroutine'.
"""
self._name = name
def is_public(self) -> bool:
"""Returns whether the header is a public libc++ API header."""
return "__" not in self._name and not self._name.startswith("ext/")
def is_internal(self) -> bool:
"""Returns whether the header is an internal implementation detail of the library."""
return not self.is_public()
def is_C_compatibility(self) -> bool:
"""
Returns whether the header is a C compatibility header (headers ending in .h like stdlib.h).
Note that headers like <cstdlib> are not considered C compatibility headers.
"""
return self.is_public() and self._name.endswith(".h")
def is_cstd(self) -> bool:
"""Returns whether the header is a C 'std' header, like <cstddef>, <cerrno>, etc."""
return self._name in [
"cassert",
"ccomplex",
"cctype",
"cerrno",
"cfenv",
"cfloat",
"cinttypes",
"ciso646",
"climits",
"clocale",
"cmath",
"csetjmp",
"csignal",
"cstdarg",
"cstdbool",
"cstddef",
"cstdint",
"cstdio",
"cstdlib",
"cstring",
"ctgmath",
"ctime",
"cuchar",
"cwchar",
"cwctype",
]
def is_experimental(self) -> bool:
"""Returns whether the header is a public experimental header."""
return self.is_public() and self._name.startswith("experimental/")
def has_cxx20_module(self) -> bool:
"""
Returns whether the header is in the std and std.compat C++20 modules.
These headers are all C++23-and-later headers, excluding C compatibility headers and
experimental headers.
"""
# These headers have been removed in C++20 so are never part of a module.
removed_in_20 = ["ccomplex", "ciso646", "cstdbool", "ctgmath"]
return self.is_public() and not self.is_experimental() and not self.is_C_compatibility() and not self._name in removed_in_20
def is_in_modulemap(self) -> bool:
"""Returns whether a header should be listed in the modulemap."""
# TODO: Should `__config_site` be in the modulemap?
if self._name == "__config_site":
return False
if self._name == "__assertion_handler":
return False
# exclude libc++abi files
if self._name in ["cxxabi.h", "__cxxabi_config.h"]:
return False
# exclude headers in __support/ - these aren't supposed to work everywhere,
# so they shouldn't be included in general
if self._name.startswith("__support/"):
return False
# exclude ext/ headers - these are non-standard extensions and are barely
# maintained. People should migrate away from these and we don't need to
# burden ourself with maintaining them in any way.
if self._name.startswith("ext/"):
return False
return True
def __str__(self) -> str:
return self._name
def __repr__(self) -> str:
return repr(self._name)
def __eq__(self, other) -> bool:
if isinstance(other, str):
return self._name == other
return self._name == other._name
def __lt__(self, other) -> bool:
if isinstance(other, str):
return self._name < other
return self._name < other._name
def __hash__(self) -> int:
return hash(self._name)
# Commonly-used sets of headers
all_headers = [Header(p.relative_to(libcxx_include).as_posix()) for p in libcxx_include.rglob("[_a-z]*") if _is_header_file(p)]
all_headers += [Header("__config_site"), Header("__assertion_handler")] # Headers generated during the build process
public_headers = [h for h in all_headers if h.is_public()]
module_headers = [h for h in all_headers if h.has_cxx20_module()]
module_c_headers = [h for h in all_headers if h.has_cxx20_module() and h.is_cstd()]
# These headers are not yet implemented in libc++
#
# These headers are required by the latest (draft) Standard but have not been
# implemented yet. They are used in the generated module input. The C++23 standard
# modules will fail to build if a header is added but this list is not updated.
headers_not_available = list(map(Header, [
"debugging",
"flat_map",
"flat_set",
"generator",
"hazard_pointer",
"inplace_vector",
"linalg",
"rcu",
"spanstream",
"stacktrace",
"stdfloat",
"text_encoding",
]))
header_restrictions = {
# headers with #error directives
@ -112,118 +271,3 @@ mandatory_inclusions = {
"variant": ["compare"],
"vector": ["compare", "initializer_list"],
}
# These headers are not yet implemented in libc++
#
# These headers are required by the latest (draft) Standard but have not been
# implemented yet. They are used in the generated module input. The C++23 standard
# modules will fail to build if a header is added but this list is not updated.
headers_not_available = [
"debugging",
"flat_map",
"flat_set",
"generator",
"hazard_pointer",
"inplace_vector",
"linalg",
"rcu",
"spanstream",
"stacktrace",
"stdfloat",
"text_encoding",
]
def is_header(file):
"""Returns whether the given file is a header (i.e. not a directory or the modulemap file)."""
return not file.is_dir() and not file.name in [
"module.modulemap",
"CMakeLists.txt",
"libcxx.imp",
]
def is_public_header(header):
return "__" not in header and not header.startswith("ext/")
def is_modulemap_header(header):
"""Returns whether a header should be listed in the modulemap"""
# TODO: Should `__config_site` be in the modulemap?
if header == "__config_site":
return False
if header == "__assertion_handler":
return False
# exclude libc++abi files
if header in ["cxxabi.h", "__cxxabi_config.h"]:
return False
# exclude headers in __support/ - these aren't supposed to work everywhere,
# so they shouldn't be included in general
if header.startswith("__support/"):
return False
# exclude ext/ headers - these are non-standard extensions and are barely
# maintained. People should migrate away from these and we don't need to
# burden ourself with maintaining them in any way.
if header.startswith("ext/"):
return False
return True
libcxx_root = pathlib.Path(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
include = pathlib.Path(os.path.join(libcxx_root, "include"))
test = pathlib.Path(os.path.join(libcxx_root, "test"))
assert libcxx_root.exists()
all_headers = sorted(
p.relative_to(include).as_posix() for p in include.rglob("[_a-z]*") if is_header(p)
)
toplevel_headers = sorted(
p.relative_to(include).as_posix() for p in include.glob("[_a-z]*") if is_header(p)
)
experimental_headers = sorted(
p.relative_to(include).as_posix()
for p in include.glob("experimental/[a-z]*")
if is_header(p)
)
public_headers = [p for p in all_headers if is_public_header(p)]
# The headers used in the std and std.compat modules.
#
# This is the set of all C++23-and-later headers, excluding C compatibility headers.
module_headers = [
header
for header in toplevel_headers
if not header.endswith(".h") and is_public_header(header)
# These headers have been removed in C++20 so are never part of a module.
and not header in ["ccomplex", "ciso646", "cstdbool", "ctgmath"]
]
# The C headers used in the std and std.compat modules.
module_c_headers = [
"cassert",
"cctype",
"cerrno",
"cfenv",
"cfloat",
"cinttypes",
"climits",
"clocale",
"cmath",
"csetjmp",
"csignal",
"cstdarg",
"cstddef",
"cstdint",
"cstdio",
"cstdlib",
"cstring",
"ctime",
"cuchar",
"cwchar",
"cwctype",
]