mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-25 11:56:08 +00:00

e2ba1b6ffde4ec607342b1b746d1b57f0f04390a references that it reverts a commit that's not a parent of e2ba1b6ffde4ec607342b1b746d1b57f0f04390a. Functionally, this can (and demonstrably does) work(*), but from the standpoint of the revert checker, it's nonsense. Print a `logging.error` when it's detected. Tested by running the revert checker against a commit range that includes the aforementioned commit; the logging.error was fired appropriately. (*) - the specifics here are: - the _SHA_ that was referenced was on a non-main branch, but - the commit from the non-main branch was merged into the non-main branch from main - ...so the _functional_ commit being reverted was originally landed on main, but the _SHA_ referenced from main was from a branch that was cut before the reverted-commit was landed on main
426 lines
14 KiB
Python
Executable File
426 lines
14 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
# ===----------------------------------------------------------------------===##
|
|
#
|
|
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
# See https://llvm.org/LICENSE.txt for license information.
|
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
#
|
|
# ===----------------------------------------------------------------------===##
|
|
"""Checks for reverts of commits across a given git commit.
|
|
|
|
To clarify the meaning of 'across' with an example, if we had the following
|
|
commit history (where `a -> b` notes that `b` is a direct child of `a`):
|
|
|
|
123abc -> 223abc -> 323abc -> 423abc -> 523abc
|
|
|
|
And where 423abc is a revert of 223abc, this revert is considered to be 'across'
|
|
323abc. More generally, a revert A of a parent commit B is considered to be
|
|
'across' a commit C if C is a parent of A and B is a parent of C.
|
|
|
|
Please note that revert detection in general is really difficult, since merge
|
|
conflicts/etc always introduce _some_ amount of fuzziness. This script just
|
|
uses a bundle of heuristics, and is bound to ignore / incorrectly flag some
|
|
reverts. The hope is that it'll easily catch the vast majority (>90%) of them,
|
|
though.
|
|
|
|
This is designed to be used in one of two ways: an import in Python, or run
|
|
directly from a shell. If you want to import this, the `find_reverts`
|
|
function is the thing to look at. If you'd rather use this from a shell, have a
|
|
usage example:
|
|
|
|
```
|
|
./revert_checker.py c47f97169 origin/main origin/release/12.x
|
|
```
|
|
|
|
This checks for all reverts from the tip of origin/main to c47f97169, which are
|
|
across the latter. It then does the same for origin/release/12.x to c47f97169.
|
|
Duplicate reverts discovered when walking both roots (origin/main and
|
|
origin/release/12.x) are deduplicated in output.
|
|
"""
|
|
|
|
import argparse
|
|
import collections
|
|
import logging
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from typing import Dict, Generator, Iterable, List, NamedTuple, Optional, Tuple
|
|
|
|
assert sys.version_info >= (3, 6), "Only Python 3.6+ is supported."
|
|
|
|
# People are creative with their reverts, and heuristics are a bit difficult.
|
|
# At a glance, most reverts have "This reverts commit ${full_sha}". Many others
|
|
# have `Reverts llvm/llvm-project#${PR_NUMBER}`.
|
|
#
|
|
# By their powers combined, we should be able to automatically catch something
|
|
# like 80% of reverts with reasonable confidence. At some point, human
|
|
# intervention will always be required (e.g., I saw
|
|
# ```
|
|
# This reverts commit ${commit_sha_1} and
|
|
# also ${commit_sha_2_shorthand}
|
|
# ```
|
|
# during my sample)
|
|
|
|
_CommitMessageReverts = NamedTuple(
|
|
"_CommitMessageReverts",
|
|
[
|
|
("potential_shas", List[str]),
|
|
("potential_pr_numbers", List[int]),
|
|
],
|
|
)
|
|
|
|
|
|
def _try_parse_reverts_from_commit_message(
|
|
commit_message: str,
|
|
) -> _CommitMessageReverts:
|
|
"""Tries to parse revert SHAs and LLVM PR numbers form the commit message.
|
|
|
|
Returns:
|
|
A namedtuple containing:
|
|
- A list of potentially reverted SHAs
|
|
- A list of potentially reverted LLVM PR numbers
|
|
"""
|
|
if not commit_message:
|
|
return _CommitMessageReverts([], [])
|
|
|
|
sha_reverts = re.findall(
|
|
r"This reverts commit ([a-f0-9]{40})\b",
|
|
commit_message,
|
|
)
|
|
|
|
first_line = commit_message.splitlines()[0]
|
|
initial_revert = re.match(r'Revert ([a-f0-9]{6,}) "', first_line)
|
|
if initial_revert:
|
|
sha_reverts.append(initial_revert.group(1))
|
|
|
|
pr_numbers = [
|
|
int(x)
|
|
for x in re.findall(
|
|
r"Reverts llvm/llvm-project#(\d+)",
|
|
commit_message,
|
|
)
|
|
]
|
|
|
|
return _CommitMessageReverts(
|
|
potential_shas=sha_reverts,
|
|
potential_pr_numbers=pr_numbers,
|
|
)
|
|
|
|
|
|
def _stream_stdout(
|
|
command: List[str], cwd: Optional[str] = None
|
|
) -> Generator[str, None, None]:
|
|
with subprocess.Popen(
|
|
command,
|
|
cwd=cwd,
|
|
stdout=subprocess.PIPE,
|
|
encoding="utf-8",
|
|
errors="replace",
|
|
) as p:
|
|
assert p.stdout is not None # for mypy's happiness.
|
|
yield from p.stdout
|
|
|
|
|
|
def _resolve_sha(git_dir: str, sha: str) -> str:
|
|
if len(sha) == 40:
|
|
return sha
|
|
|
|
return subprocess.check_output(
|
|
["git", "-C", git_dir, "rev-parse", sha],
|
|
encoding="utf-8",
|
|
stderr=subprocess.DEVNULL,
|
|
).strip()
|
|
|
|
|
|
_LogEntry = NamedTuple(
|
|
"_LogEntry",
|
|
[
|
|
("sha", str),
|
|
("commit_message", str),
|
|
],
|
|
)
|
|
|
|
|
|
def _log_stream(git_dir: str, root_sha: str, end_at_sha: str) -> Iterable[_LogEntry]:
|
|
sep = 50 * "<>"
|
|
log_command = [
|
|
"git",
|
|
"-C",
|
|
git_dir,
|
|
"log",
|
|
"^" + end_at_sha,
|
|
root_sha,
|
|
"--format=" + sep + "%n%H%n%B%n",
|
|
]
|
|
|
|
stdout_stream = iter(_stream_stdout(log_command))
|
|
|
|
# Find the next separator line. If there's nothing to log, it may not exist.
|
|
# It might not be the first line if git feels complainy.
|
|
found_commit_header = False
|
|
for line in stdout_stream:
|
|
if line.rstrip() == sep:
|
|
found_commit_header = True
|
|
break
|
|
|
|
while found_commit_header:
|
|
sha = next(stdout_stream, None)
|
|
assert sha is not None, "git died?"
|
|
sha = sha.rstrip()
|
|
|
|
commit_message = []
|
|
|
|
found_commit_header = False
|
|
for line in stdout_stream:
|
|
line = line.rstrip()
|
|
if line.rstrip() == sep:
|
|
found_commit_header = True
|
|
break
|
|
commit_message.append(line)
|
|
|
|
yield _LogEntry(sha, "\n".join(commit_message).rstrip())
|
|
|
|
|
|
def _shas_between(git_dir: str, base_ref: str, head_ref: str) -> Iterable[str]:
|
|
rev_list = [
|
|
"git",
|
|
"-C",
|
|
git_dir,
|
|
"rev-list",
|
|
"--first-parent",
|
|
f"{base_ref}..{head_ref}",
|
|
]
|
|
return (x.strip() for x in _stream_stdout(rev_list))
|
|
|
|
|
|
def _rev_parse(git_dir: str, ref: str) -> str:
|
|
return subprocess.check_output(
|
|
["git", "-C", git_dir, "rev-parse", ref],
|
|
encoding="utf-8",
|
|
).strip()
|
|
|
|
|
|
Revert = NamedTuple(
|
|
"Revert",
|
|
[
|
|
("sha", str),
|
|
("reverted_sha", str),
|
|
],
|
|
)
|
|
|
|
|
|
def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
|
|
"""Finds the closest common parent commit between `ref_a` and `ref_b`.
|
|
|
|
Returns:
|
|
A SHA. Note that `ref_a` will be returned if `ref_a` is a parent of
|
|
`ref_b`, and vice-versa.
|
|
"""
|
|
return subprocess.check_output(
|
|
["git", "-C", git_dir, "merge-base", ref_a, ref_b],
|
|
encoding="utf-8",
|
|
).strip()
|
|
|
|
|
|
def _load_pr_commit_mappings(
|
|
git_dir: str, root: str, min_ref: str
|
|
) -> Dict[int, List[str]]:
|
|
git_log = ["git", "log", "--format=%H %s", f"{min_ref}..{root}"]
|
|
results = collections.defaultdict(list)
|
|
pr_regex = re.compile(r"\s\(#(\d+)\)$")
|
|
for line in _stream_stdout(git_log, cwd=git_dir):
|
|
m = pr_regex.search(line)
|
|
if not m:
|
|
continue
|
|
|
|
pr_number = int(m.group(1))
|
|
sha = line.split(None, 1)[0]
|
|
# N.B., these are kept in log (read: reverse chronological) order,
|
|
# which is what's expected by `find_reverts`.
|
|
results[pr_number].append(sha)
|
|
return results
|
|
|
|
|
|
# N.B., max_pr_lookback's default of 20K commits is arbitrary, but should be
|
|
# enough for the 99% case of reverts: rarely should someone land a cleanish
|
|
# revert of a >6 month old change...
|
|
def find_reverts(
|
|
git_dir: str, across_ref: str, root: str, max_pr_lookback: int = 20000
|
|
) -> List[Revert]:
|
|
"""Finds reverts across `across_ref` in `git_dir`, starting from `root`.
|
|
|
|
These reverts are returned in order of oldest reverts first.
|
|
|
|
Args:
|
|
git_dir: git directory to find reverts in.
|
|
across_ref: the ref to find reverts across.
|
|
root: the 'main' ref to look for reverts on.
|
|
max_pr_lookback: this function uses heuristics to map PR numbers to
|
|
SHAs. These heuristics require that commit history from `root` to
|
|
`some_parent_of_root` is loaded in memory. `max_pr_lookback` is how
|
|
many commits behind `across_ref` should be loaded in memory.
|
|
"""
|
|
across_sha = _rev_parse(git_dir, across_ref)
|
|
root_sha = _rev_parse(git_dir, root)
|
|
|
|
common_ancestor = _find_common_parent_commit(git_dir, across_sha, root_sha)
|
|
if common_ancestor != across_sha:
|
|
raise ValueError(
|
|
f"{across_sha} isn't an ancestor of {root_sha} "
|
|
"(common ancestor: {common_ancestor})"
|
|
)
|
|
|
|
intermediate_commits = set(_shas_between(git_dir, across_sha, root_sha))
|
|
assert across_sha not in intermediate_commits
|
|
|
|
logging.debug(
|
|
"%d commits appear between %s and %s",
|
|
len(intermediate_commits),
|
|
across_sha,
|
|
root_sha,
|
|
)
|
|
|
|
all_reverts = []
|
|
# Lazily load PR <-> commit mappings, since it can be expensive.
|
|
pr_commit_mappings = None
|
|
for sha, commit_message in _log_stream(git_dir, root_sha, across_sha):
|
|
reverts, pr_reverts = _try_parse_reverts_from_commit_message(
|
|
commit_message,
|
|
)
|
|
if pr_reverts:
|
|
if pr_commit_mappings is None:
|
|
logging.info(
|
|
"Loading PR <-> commit mappings. This may take a moment..."
|
|
)
|
|
pr_commit_mappings = _load_pr_commit_mappings(
|
|
git_dir, root_sha, f"{across_sha}~{max_pr_lookback}"
|
|
)
|
|
logging.info(
|
|
"Loaded %d PR <-> commit mappings", len(pr_commit_mappings)
|
|
)
|
|
|
|
for reverted_pr_number in pr_reverts:
|
|
reverted_shas = pr_commit_mappings.get(reverted_pr_number)
|
|
if not reverted_shas:
|
|
logging.warning(
|
|
"No SHAs for reverted PR %d (commit %s)",
|
|
reverted_pr_number,
|
|
sha,
|
|
)
|
|
continue
|
|
logging.debug(
|
|
"Inferred SHAs %s for reverted PR %d (commit %s)",
|
|
reverted_shas,
|
|
reverted_pr_number,
|
|
sha,
|
|
)
|
|
reverts.extend(reverted_shas)
|
|
|
|
if not reverts:
|
|
continue
|
|
|
|
resolved_reverts = sorted(set(_resolve_sha(git_dir, x) for x in reverts))
|
|
for reverted_sha in resolved_reverts:
|
|
if reverted_sha in intermediate_commits:
|
|
logging.debug(
|
|
"Commit %s reverts %s, which happened after %s",
|
|
sha,
|
|
reverted_sha,
|
|
across_sha,
|
|
)
|
|
continue
|
|
|
|
try:
|
|
object_type = subprocess.check_output(
|
|
["git", "-C", git_dir, "cat-file", "-t", reverted_sha],
|
|
encoding="utf-8",
|
|
stderr=subprocess.DEVNULL,
|
|
).strip()
|
|
except subprocess.CalledProcessError:
|
|
logging.warning(
|
|
"Failed to resolve reverted object %s (claimed to be reverted "
|
|
"by sha %s)",
|
|
reverted_sha,
|
|
sha,
|
|
)
|
|
continue
|
|
|
|
if object_type != "commit":
|
|
logging.error(
|
|
"%s claims to revert the %s %s, which isn't a commit",
|
|
sha,
|
|
object_type,
|
|
reverted_sha,
|
|
)
|
|
continue
|
|
|
|
# Rarely, reverts will cite SHAs on other branches (e.g., revert
|
|
# commit says it reverts a commit with SHA ${X}, but ${X} is not a
|
|
# parent of the revert). This can happen if e.g., the revert has
|
|
# been mirrored to another branch. Treat them the same as
|
|
# reverts of non-commits.
|
|
if _find_common_parent_commit(git_dir, sha, reverted_sha) != reverted_sha:
|
|
logging.error(
|
|
"%s claims to revert %s, which is a commit that is not "
|
|
"a parent of the revert",
|
|
sha,
|
|
reverted_sha,
|
|
)
|
|
continue
|
|
|
|
all_reverts.append(Revert(sha, reverted_sha))
|
|
|
|
|
|
# Since `all_reverts` contains reverts in log order (e.g., newer comes before
|
|
# older), we need to reverse this to keep with our guarantee of older =
|
|
# earlier in the result.
|
|
all_reverts.reverse()
|
|
return all_reverts
|
|
|
|
|
|
def _main() -> None:
|
|
parser = argparse.ArgumentParser(
|
|
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
|
|
)
|
|
parser.add_argument("base_ref", help="Git ref or sha to check for reverts around.")
|
|
parser.add_argument("-C", "--git_dir", default=".", help="Git directory to use.")
|
|
parser.add_argument("root", nargs="+", help="Root(s) to search for commits from.")
|
|
parser.add_argument("--debug", action="store_true")
|
|
parser.add_argument(
|
|
"-u",
|
|
"--review_url",
|
|
action="store_true",
|
|
help="Format SHAs as llvm review URLs",
|
|
)
|
|
opts = parser.parse_args()
|
|
|
|
logging.basicConfig(
|
|
format="%(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: %(message)s",
|
|
level=logging.DEBUG if opts.debug else logging.INFO,
|
|
)
|
|
|
|
# `root`s can have related history, so we want to filter duplicate commits
|
|
# out. The overwhelmingly common case is also to have one root, and it's way
|
|
# easier to reason about output that comes in an order that's meaningful to
|
|
# git.
|
|
seen_reverts = set()
|
|
all_reverts = []
|
|
for root in opts.root:
|
|
for revert in find_reverts(opts.git_dir, opts.base_ref, root):
|
|
if revert not in seen_reverts:
|
|
seen_reverts.add(revert)
|
|
all_reverts.append(revert)
|
|
|
|
sha_prefix = (
|
|
"https://github.com/llvm/llvm-project/commit/" if opts.review_url else ""
|
|
)
|
|
for revert in all_reverts:
|
|
sha_fmt = f"{sha_prefix}{revert.sha}"
|
|
reverted_sha_fmt = f"{sha_prefix}{revert.reverted_sha}"
|
|
print(f"{sha_fmt} claims to revert {reverted_sha_fmt}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
_main()
|