mirror of
https://github.com/llvm/llvm-project.git
synced 2025-05-03 10:26:06 +00:00

This is the first commit in a series that will reformat all the python files in the LLVM repository. Reformatting is done with `black`. See more information here: https://discourse.llvm.org/t/rfc-document-and-standardize-python-code-style Reviewed By: jhenderson, JDevlieghere, MatzeB Differential Revision: https://reviews.llvm.org/D150545
272 lines
8.7 KiB
Python
272 lines
8.7 KiB
Python
# ===----------------------------------------------------------------------===##
|
|
#
|
|
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
# See https://llvm.org/LICENSE.txt for license information.
|
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
#
|
|
# ===----------------------------------------------------------------------===##
|
|
"""A linter that detects potential typos in FileCheck directive names.
|
|
|
|
Consider a broken test foo.cpp:
|
|
|
|
// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
|
|
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
|
|
auto x = 42;
|
|
// NEWW: auto is a c++11 extension
|
|
// ODL-NOT: auto is a c++11 extension
|
|
|
|
We first detect the locally valid FileCheck directive prefixes by parsing the
|
|
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
|
|
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
|
|
|
|
Then we look for lines that look like directives. These are of the form 'FOO:',
|
|
usually at the beginning of a line or a comment. If any of these are a
|
|
"near-miss" for a directive name, then we suspect this is a typo and report it.
|
|
|
|
Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
|
|
"""
|
|
|
|
import itertools
|
|
import logging
|
|
import pathlib
|
|
import re
|
|
import sys
|
|
from typing import Generator, Sequence, Tuple
|
|
|
|
# Maximum (inclusive) edit distance at which a potential directive is still
# considered a near-miss of a real one and reported as a typo.
_distance_threshold = 3
# FileCheck's built-in check prefix; per-file custom prefixes are added to
# this set by parsing --check-prefix(es) flags.
_prefixes = {"CHECK"}
# Directive suffixes that may follow any valid check prefix.
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
# Names that look like directives but must never be reported as typos.
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})
|
|
|
|
|
|
def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.
    """
    # Trivial cases: against an empty string, the distance is the length of
    # the other string.
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    # Dynamic programming over a single rolling row: previous_row[k] holds
    # the edit distance between the processed prefix of s1 and s2[:k].
    previous_row = list(range(len(s2) + 1))
    for row, char1 in enumerate(s1, start=1):
        current_row = [row]
        for col, char2 in enumerate(s2, start=1):
            substitution = previous_row[col - 1] + (char1 != char2)
            deletion = previous_row[col] + 1
            insertion = current_row[-1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row
    return previous_row[-1]
|
|
|
|
|
|
class FileRange:
    """Stores the coordinates of a span on a single line within a file.

    Attributes:
      line: the line number
      start_column: the (inclusive) column where the span starts
      end_column: the (inclusive) column where the span ends
    """

    line: int
    start_column: int
    end_column: int

    def __init__(
        self, content: str, start_byte: int, end_byte: int
    ):  # pylint: disable=g-doc-args
        """Derives a span's coordinates based on a string and start/end bytes.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        prefix = content[:start_byte]
        # Lines are 1-indexed; every newline before the span bumps the count.
        self.line = 1 + prefix.count("\n")
        # Columns are 1-indexed relative to the most recent newline. When no
        # newline precedes the span, rfind returns -1, which still yields the
        # correct 1-based column on the first line.
        last_newline = prefix.rfind("\n")
        self.start_column = start_byte - last_newline
        self.end_column = self.start_column + (end_byte - start_byte - 1)

    def __str__(self) -> str:
        return f"{self.line}:{self.start_column}-{self.end_column}"
|
|
|
|
|
|
class Diagnostic:
    """Stores information about one typo and a suggested fix.

    Attributes:
      filepath: the path to the file in which the typo was found
      filerange: the position at which the typo was found in the file
      typo: the typo
      fix: a suggested fix
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        # Format: <path>:<line>:<start>-<end>: <summary>
        location = f"{self.filepath}:{self.filerange}"
        return f"{location}: {self.summary()}"

    def summary(self) -> str:
        return (
            f'Found potentially misspelled directive "{self.typo}". Did you mean '
            f'"{self.fix}"?'
        )
|
|
|
|
|
|
def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the side
    of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A potential directive is a colon-terminated word appearing at the start
    # of a line or after a common comment marker (//, ;, #).
    pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in pattern.finditer(content):
        start, end = match.span(1)
        yield FileRange(content, start, end), match.group(1)
|
|
|
|
|
|
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # A prefix argument is either single-quoted, double-quoted, or a bare
    # token running up to the next whitespace.
    arg_pattern = "|".join((r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'))
    flag_pattern = re.compile(
        r"-check-prefix(?:es)?(?:\s+|=)({})".format(arg_pattern)
    )
    for match in flag_pattern.finditer(content):
        arg = match.group(1)
        # Strip surrounding quotes, if any.
        if arg[0] in "'\"":
            arg = arg[1:-1]
        yield from arg.split(",")
|
|
|
|
|
|
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    # Every name we accept as valid: each prefix+suffix combination, the
    # lit/comment names we deliberately ignore, and the bare prefixes.
    all_directives = (
        [
            f"{prefix}{suffix}"
            for prefix, suffix in itertools.product(all_prefixes, _suffixes)
        ]
        + list(_ignore)
        + list(all_prefixes)
    )

    def find_best_match(typo):
        # Seed the candidates with a sentinel one past the threshold so that,
        # when nothing is close enough, the typo maps to itself.
        candidates = [(threshold + 1, typo)]
        for directive in all_directives:
            # A length gap beyond the threshold can never be within it.
            if abs(len(directive) - len(typo)) <= threshold:
                candidates.append((levenshtein(typo, directive), directive))
        return min(candidates, key=lambda pair: pair[0])

    for filerange, potential_directive in find_potential_directives(content):
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if any(
            potential_directive.startswith(f"{prefix}-COUNT-")
            for prefix in all_prefixes
        ):
            continue

        # Ignoring potential typos that will not be matched later due to a too low
        # threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max(map(len, all_directives)) + threshold:
            continue

        score, best_match = find_best_match(potential_directive)
        if score == 0:  # This is an actual directive, ignore.
            continue
        if score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)
|
|
|
|
|
|
def main(argv: Sequence[str]):
    """Lints each file named on the command line, printing diagnostics.

    Args:
      argv: the full command line; argv[0] is the program name and argv[1:]
        are the paths of the files to check.

    Raises:
      SystemExit: with code 1 when no file paths are given.
    """
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        # sys.exit instead of the builtin exit(): the latter is injected by
        # the optional 'site' module and is not guaranteed to exist (e.g.
        # under 'python -S'); sys.exit is the supported way to exit a script.
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        # Pin the encoding rather than relying on the platform's locale
        # default; test files are expected to be ASCII/UTF-8.
        with open(filepath, "rt", encoding="utf-8") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)
|
|
|
|
|
|
# Script entry point: lint each file passed on the command line.
if __name__ == "__main__":
    main(sys.argv)
|