llvm-project/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp

//===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "BenchmarkRunner.h"
#include "Assembler.h"
#include "Error.h"
#include "MCInstrDescView.h"
#include "MmapUtils.h"
#include "PerfHelper.h"
#include "SubprocessMemory.h"
#include "Target.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SystemZ/zOSSupport.h"
#include <cmath>
#include <memory>
#include <string>

#ifdef __linux__
#ifdef HAVE_LIBPFM
#include <perfmon/perf_event.h>
#endif
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#if defined(__GLIBC__) && __has_include(<sys/rseq.h>) && defined(HAVE_BUILTIN_THREAD_POINTER)
#include <sys/rseq.h>
#if defined(RSEQ_SIG) && defined(SYS_rseq)
#define GLIBC_INITS_RSEQ
#endif
#endif
#endif // __linux__

namespace llvm {
namespace exegesis {

// Constructs a benchmark runner.
//
// Each argument initializes the member of the same purpose (State, Mode,
// BenchmarkPhaseSelector, ExecutionMode, ValidationCounters). In addition a
// fresh ScratchSpace is allocated and owned by the runner through Scratch.
BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode,
                                 BenchmarkPhaseSelectorE BenchmarkPhaseSelector,
                                 ExecutionModeE ExecutionMode,
                                 ArrayRef<ValidationEvent> ValCounters)
    : State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector),
      ExecutionMode(ExecutionMode), ValidationCounters(ValCounters),
      Scratch(std::make_unique<ScratchSpace>()) {}

// Out-of-line defaulted destructor.
// NOTE(review): presumably defined in this file so that members such as the
// std::unique_ptr<ScratchSpace> are destroyed where their types are complete;
// confirm against BenchmarkRunner.h.
BenchmarkRunner::~BenchmarkRunner() = default;

void BenchmarkRunner::FunctionExecutor::accumulateCounterValues(
    const SmallVectorImpl<int64_t> &NewValues,
    SmallVectorImpl<int64_t> *Result) {
  const size_t NumValues = std::max(NewValues.size(), Result->size());
  if (NumValues > Result->size())
    Result->resize(NumValues, 0);
  for (size_t I = 0, End = NewValues.size(); I < End; ++I)
    (*Result)[I] += NewValues[I];
}

// Runs the benchmark once per counter named in Counters and returns the
// element-wise sum of the sampled values.
//
// Counters is a '+'-separated list of counter names; surrounding whitespace
// of each name is ignored. Several counters are summed when a single ProcRes
// is covered by more than one counter (e.g. P23 on SandyBridge). The first
// failing run aborts the whole sampling and its error is returned.
Expected<SmallVector<int64_t, 4>>
BenchmarkRunner::FunctionExecutor::runAndSample(
    const char *Counters, ArrayRef<const char *> ValidationCounters,
    SmallVectorImpl<int64_t> &ValidationCounterValues) const {
  SmallVector<StringRef, 2> Names;
  StringRef(Counters).split(Names, '+');

  SmallVector<int64_t, 4> Totals;
  for (const StringRef RawName : Names) {
    Expected<SmallVector<int64_t, 4>> SampleOrErr = runWithCounter(
        RawName.trim(), ValidationCounters, ValidationCounterValues);
    if (Error E = SampleOrErr.takeError())
      return std::move(E);
    accumulateCounterValues(*SampleOrErr, &Totals);
  }
  return Totals;
}

namespace {
// Function executor that runs the benchmarked snippet directly inside the
// llvm-exegesis process. The call is wrapped in a CrashRecoveryContext so that
// a crashing snippet is reported as a SnippetSignal error.
class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
public:
  // Creates an executor for the object file Obj. Scratch is the memory area
  // whose pointer is handed to the snippet on every run. Returns the error
  // from ExecutableFunction::create if the function cannot be materialized.
  // BenchmarkProcessCPU is not used by this executor.
  static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>>
  create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
         BenchmarkRunner::ScratchSpace *Scratch,
         std::optional<int> BenchmarkProcessCPU) {
    Expected<ExecutableFunction> EF =
        ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));

    if (!EF)
      return EF.takeError();

    // The constructor is private, so the object is created with a plain new
    // and immediately handed to a unique_ptr.
    return std::unique_ptr<InProcessFunctionExecutorImpl>(
        new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch));
  }

private:
  InProcessFunctionExecutorImpl(const LLVMState &State,
                                ExecutableFunction Function,
                                BenchmarkRunner::ScratchSpace *Scratch)
      : State(State), Function(std::move(Function)), Scratch(Scratch) {}

  // Runs the snippet once while measuring CounterName.
  //
  // On success returns the values read from the counter and copies the
  // validation counter readings into ValidationCounterValues (index by
  // index). Returns an error if the counter cannot be created, if the snippet
  // crashes, or if reading the counters fails.
  Expected<SmallVector<int64_t, 4>> runWithCounter(
      StringRef CounterName, ArrayRef<const char *> ValidationCounters,
      SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
    const ExegesisTarget &ET = State.getExegesisTarget();
    char *const ScratchPtr = Scratch->ptr();
    auto CounterOrError =
        ET.createCounter(CounterName, State, ValidationCounters);

    if (!CounterOrError)
      return CounterOrError.takeError();

    pfm::CounterGroup *Counter = CounterOrError.get().get();
    // Reset the scratch area before every run.
    Scratch->clear();
    {
      auto PS = ET.withSavedState();
      CrashRecoveryContext CRC;
      CrashRecoveryContext::Enable();
      // Only the snippet call itself is bracketed by start()/stop().
      const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() {
        Counter->start();
        this->Function(ScratchPtr);
        Counter->stop();
      });
      CrashRecoveryContext::Disable();
      PS.reset();
      if (Crashed) {
#ifdef LLVM_ON_UNIX
        // See "Exit Status for Commands":
        // https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html
        constexpr const int kSigOffset = 128;
        return make_error<SnippetSignal>(CRC.RetCode - kSigOffset);
#else
        // The exit code of the process on windows is not meaningful as a
        // signal, so simply pass in -1 as the signal into the error.
        return make_error<SnippetSignal>(-1);
#endif // LLVM_ON_UNIX
      }
    }

    auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
    if (!ValidationValuesOrErr)
      return ValidationValuesOrErr.takeError();

    // The caller's vector is written in place; it must already have at least
    // as many elements as there are validation counter readings.
    ArrayRef RealValidationValues = *ValidationValuesOrErr;
    for (size_t I = 0; I < RealValidationValues.size(); ++I)
      ValidationCounterValues[I] = RealValidationValues[I];

    return Counter->readOrError(Function.getFunctionBytes());
  }

  const LLVMState &State;
  const ExecutableFunction Function;
  BenchmarkRunner::ScratchSpace *const Scratch;
};

#ifdef __linux__
// The following class implements a function executor that executes the
// benchmark code within a subprocess rather than within the main llvm-exegesis
// process. This allows for much more control over the execution context of the
// snippet, particularly with regard to memory. This class performs all the
// necessary functions to create the subprocess, execute the snippet in the
// subprocess, and report results/handle errors.
class SubProcessFunctionExecutorImpl
    : public BenchmarkRunner::FunctionExecutor {
public:
  // Creates a subprocess executor for the object file Obj. Key and
  // BenchmarkProcessCPU are forwarded to the constructor unchanged. Returns
  // the error from ExecutableFunction::create if the function cannot be
  // materialized.
  static Expected<std::unique_ptr<SubProcessFunctionExecutorImpl>>
  create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
         const BenchmarkKey &Key, std::optional<int> BenchmarkProcessCPU) {
    Expected<ExecutableFunction> FunctionOrErr =
        ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));
    if (Error E = FunctionOrErr.takeError())
      return std::move(E);

    // The constructor is private, so construct with new and wrap right away.
    std::unique_ptr<SubProcessFunctionExecutorImpl> Executor(
        new SubProcessFunctionExecutorImpl(State, std::move(*FunctionOrErr),
                                           Key, BenchmarkProcessCPU));
    return std::move(Executor);
  }

private:
  // Stores the state, takes ownership of Function, and remembers the
  // benchmark key and the optional CPU to pin the child process to.
  // Key is held by reference, so it must outlive this executor.
  SubProcessFunctionExecutorImpl(const LLVMState &State,
                                 ExecutableFunction Function,
                                 const BenchmarkKey &Key,
                                 std::optional<int> BenchmarkCPU)
      : State(State), Function(std::move(Function)), Key(Key),
        BenchmarkProcessCPU(BenchmarkCPU) {}

  // Exit codes used by the benchmarking child process to tell the parent
  // which setup step failed. Values start at 1 because the parent treats an
  // exit code of 0 as success.
  enum ChildProcessExitCodeE {
    // Receiving the counter file descriptor over the socket failed.
    CounterFDReadFailed = 1,
    // The rseq unregister syscall returned a non-zero value.
    RSeqDisableFailed,
    // Mapping the memory for the assembled snippet failed.
    FunctionDataMappingFailed,
    // Setting up the auxiliary memory in the subprocess returned an error.
    AuxiliaryMemorySetupFailed,
    // Pinning the process to the requested CPU failed, or pinning is not
    // compiled in for this platform.
    SetCPUAffinityFailed
  };

  // Maps an exit code of the benchmarking child process to a human-readable
  // description. Codes that are not part of ChildProcessExitCodeE yield a
  // generic "unknown exit code" message.
  StringRef childProcessExitCodeToString(int ExitCode) const {
    if (ExitCode == ChildProcessExitCodeE::CounterFDReadFailed)
      return "Counter file descriptor read failed";
    if (ExitCode == ChildProcessExitCodeE::RSeqDisableFailed)
      return "Disabling restartable sequences failed";
    if (ExitCode == ChildProcessExitCodeE::FunctionDataMappingFailed)
      return "Failed to map memory for assembled snippet";
    if (ExitCode == ChildProcessExitCodeE::AuxiliaryMemorySetupFailed)
      return "Failed to setup auxiliary memory";
    if (ExitCode == ChildProcessExitCodeE::SetCPUAffinityFailed)
      return "Failed to set CPU affinity of the benchmarking process";
    return "Child process returned with unknown exit code";
  }

  // Sends the file descriptor FD to the peer of SocketFD as SCM_RIGHTS
  // ancillary data. No regular payload is sent; the message consists of the
  // control data only. Returns a Failure carrying strerror(errno) if sendmsg
  // fails.
  Error sendFileDescriptorThroughSocket(int SocketFD, int FD) const {
    struct msghdr Message = {};
    // The control buffer is accessed through struct cmsghdr, so it must be
    // suitably aligned for that type; a plain char array gives no such
    // guarantee.
    alignas(struct cmsghdr) char Buffer[CMSG_SPACE(sizeof(FD))];
    memset(Buffer, 0, sizeof(Buffer));
    Message.msg_control = Buffer;
    // sizeof(Buffer) is exactly CMSG_SPACE(sizeof(FD)), i.e. room for one
    // control message carrying a single descriptor.
    Message.msg_controllen = sizeof(Buffer);

    struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
    ControlMessage->cmsg_level = SOL_SOCKET;
    ControlMessage->cmsg_type = SCM_RIGHTS;
    ControlMessage->cmsg_len = CMSG_LEN(sizeof(FD));

    // CMSG_DATA is not guaranteed to be aligned for int, so copy the bytes.
    memcpy(CMSG_DATA(ControlMessage), &FD, sizeof(FD));

    ssize_t BytesWritten = sendmsg(SocketFD, &Message, 0);

    if (BytesWritten < 0)
      return make_error<Failure>("Failed to write FD to socket: " +
                                 Twine(strerror(errno)));

    return Error::success();
  }

  // Receives one file descriptor sent as SCM_RIGHTS ancillary data on
  // SocketFD and returns it.
  //
  // Returns a Failure if recvmsg fails, if the received message carries no
  // control message at all (for example because the peer closed the socket
  // without sending anything), or if the control message is not a single
  // SCM_RIGHTS descriptor.
  Expected<int> getFileDescriptorFromSocket(int SocketFD) const {
    struct msghdr Message = {};

    // The control buffer is accessed through struct cmsghdr, so it must be
    // suitably aligned for that type.
    alignas(struct cmsghdr) char ControlBuffer[256];
    Message.msg_control = ControlBuffer;
    Message.msg_controllen = sizeof(ControlBuffer);

    ssize_t BytesRead = recvmsg(SocketFD, &Message, 0);

    if (BytesRead < 0)
      return make_error<Failure>("Failed to read FD from socket: " +
                                 Twine(strerror(errno)));

    // CMSG_FIRSTHDR yields a null pointer when no ancillary data was
    // received; dereferencing it unchecked would crash the caller.
    struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
    if (ControlMessage == nullptr)
      return make_error<Failure>(
          "Failed to get a control message from socket.");

    if (ControlMessage->cmsg_level != SOL_SOCKET ||
        ControlMessage->cmsg_type != SCM_RIGHTS)
      return make_error<Failure>("Control message received from socket does "
                                 "not carry a file descriptor.");

    int FD;

    if (ControlMessage->cmsg_len != CMSG_LEN(sizeof(FD)))
      return make_error<Failure>("Failed to get correct number of bytes for "
                                 "file descriptor from socket.");

    // CMSG_DATA is not guaranteed to be aligned for int, so copy the bytes.
    memcpy(&FD, CMSG_DATA(ControlMessage), sizeof(FD));

    return FD;
  }

  // Parent-side half of a subprocess benchmark run.
  //
  // Creates the counter for CounterName attached to ChildPID, attaches to the
  // child with ptrace, sends the counter file descriptor over WriteFD and then
  // waits for the child to finish.
  //
  // On a clean child exit (exit code 0) the counter readings are stored in
  // CounterValues and the validation counter readings are copied into
  // ValidationCounterValues. A non-zero exit code is reported as a Failure.
  // If the child stopped on a signal instead of exiting, the child is killed
  // and a SnippetSegmentationFault (for SIGSEGV) or SnippetSignal is returned.
  //
  // WriteFD is always closed before returning. On every return path the child
  // process is terminated and reaped, so a failure does not leave a zombie or
  // a child blocked on the socket.
  Error
  runParentProcess(pid_t ChildPID, int WriteFD, StringRef CounterName,
                   SmallVectorImpl<int64_t> &CounterValues,
                   ArrayRef<const char *> ValidationCounters,
                   SmallVectorImpl<int64_t> &ValidationCounterValues) const {
    auto WriteFDClose = make_scope_exit([WriteFD]() { close(WriteFD); });

    // Whether the child still exists (running, stopped or zombie) from the
    // point of view of this process. Cleared as soon as a waitpid call
    // reports that the child terminated, so that the guard below never
    // signals a PID that has already been reaped.
    bool ChildNeedsReaping = true;
    auto ChildReaper = make_scope_exit([&ChildNeedsReaping, ChildPID]() {
      if (!ChildNeedsReaping)
        return;
      // SIGKILL rather than SIGTERM: it cannot be intercepted by ptrace.
      kill(ChildPID, SIGKILL);
      // A traced child may still report a pending stop before its death, so
      // keep waiting until it has actually terminated (or waitpid fails).
      int Status = 0;
      pid_t Waited;
      do {
        Waited = waitpid(ChildPID, &Status, 0);
      } while (Waited != -1 && !WIFEXITED(Status) && !WIFSIGNALED(Status));
    });
    auto NoteIfTerminated = [&ChildNeedsReaping](int Status) {
      if (WIFEXITED(Status) || WIFSIGNALED(Status))
        ChildNeedsReaping = false;
    };

    const ExegesisTarget &ET = State.getExegesisTarget();
    auto CounterOrError =
        ET.createCounter(CounterName, State, ValidationCounters, ChildPID);

    if (!CounterOrError)
      return CounterOrError.takeError();

    pfm::CounterGroup *Counter = CounterOrError.get().get();

    // Make sure to attach to the process (and wait for the sigstop to be
    // delivered and for the process to continue) before we write to the counter
    // file descriptor. Attaching to the process before writing to the socket
    // ensures that the subprocess at most has blocked on the read call. If we
    // attach afterwards, the subprocess might exit before we get to the attach
    // call due to effects like scheduler contention, introducing transient
    // failures.
    if (ptrace(PTRACE_ATTACH, ChildPID, NULL, NULL) != 0)
      return make_error<Failure>("Failed to attach to the child process: " +
                                 Twine(strerror(errno)));

    int AttachStatus = 0;
    if (waitpid(ChildPID, &AttachStatus, 0) == -1) {
      return make_error<Failure>(
          "Failed to wait for child process to stop after attaching: " +
          Twine(strerror(errno)));
    }
    // If the child terminated instead of stopping, it has just been reaped;
    // the PTRACE_CONT below then fails and reports the error.
    NoteIfTerminated(AttachStatus);

    if (ptrace(PTRACE_CONT, ChildPID, NULL, NULL) != 0)
      return make_error<Failure>(
          "Failed to continue execution of the child process: " +
          Twine(strerror(errno)));

    int CounterFileDescriptor = Counter->getFileDescriptor();
    Error SendError =
        sendFileDescriptorThroughSocket(WriteFD, CounterFileDescriptor);

    if (SendError)
      return SendError;

    int ChildStatus = 0;
    if (waitpid(ChildPID, &ChildStatus, 0) == -1) {
      return make_error<Failure>(
          "Waiting for the child process to complete failed: " +
          Twine(strerror(errno)));
    }
    NoteIfTerminated(ChildStatus);

    if (WIFEXITED(ChildStatus)) {
      int ChildExitCode = WEXITSTATUS(ChildStatus);
      if (ChildExitCode == 0) {
        // The child exited successfully, read counter values and return
        // success.
        auto CounterValueOrErr = Counter->readOrError();
        if (!CounterValueOrErr)
          return CounterValueOrErr.takeError();
        CounterValues = std::move(*CounterValueOrErr);

        auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
        if (!ValidationValuesOrErr)
          return ValidationValuesOrErr.takeError();

        // The caller's vector is written in place; it must already have at
        // least as many elements as there are validation counter readings.
        ArrayRef RealValidationValues = *ValidationValuesOrErr;
        for (size_t I = 0; I < RealValidationValues.size(); ++I)
          ValidationCounterValues[I] = RealValidationValues[I];

        return Error::success();
      }
      // The child exited, but not successfully.
      return make_error<Failure>(
          "Child benchmarking process exited with non-zero exit code: " +
          childProcessExitCodeToString(ChildExitCode));
    }

    // An error was encountered running the snippet, process it
    siginfo_t ChildSignalInfo;
    if (ptrace(PTRACE_GETSIGINFO, ChildPID, NULL, &ChildSignalInfo) == -1) {
      return make_error<Failure>("Getting signal info from the child failed: " +
                                 Twine(strerror(errno)));
    }

    // Send SIGKILL rather than SIGTERM as the child process has no SIGTERM
    // handlers to run, and calling SIGTERM would mean that ptrace will force
    // it to block in the signal-delivery-stop for the SIGSEGV/other signals,
    // and upon exit.
    if (kill(ChildPID, SIGKILL) == -1)
      return make_error<Failure>("Failed to kill child benchmarking process: " +
                                 Twine(strerror(errno)));

    // Wait for the process to exit so that there are no zombie processes left
    // around.
    int KillStatus = 0;
    if (waitpid(ChildPID, &KillStatus, 0) == -1)
      return make_error<Failure>("Failed to wait for process to die: " +
                                 Twine(strerror(errno)));
    NoteIfTerminated(KillStatus);

    if (ChildSignalInfo.si_signo == SIGSEGV)
      return make_error<SnippetSegmentationFault>(
          reinterpret_cast<uintptr_t>(ChildSignalInfo.si_addr));

    return make_error<SnippetSignal>(ChildSignalInfo.si_signo);
  }

  // Pins the calling process to the CPU with index CPUToUse.
  //
  // Exits the process with SetCPUAffinityFailed if the affinity cannot be
  // set, or if CPU pinning is not compiled in for the current platform.
  static void setCPUAffinityIfRequested(int CPUToUse) {
// Special case this function for x86_64 for now as certain more esoteric
// platforms have different definitions for some of the libc functions that
// cause buildtime failures. Additionally, the subprocess executor mode (the
// sole mode where this is supported) currently only supports x86_64.

// Also check that we have the SYS_getcpu macro defined, meaning the syscall
// actually exists within the build environment. We manually use the syscall
// rather than the libc wrapper given the wrapper for getcpu is only available
// in glibc 2.29 and later.
#if defined(__x86_64__) && defined(SYS_getcpu)
    // Build an affinity mask that contains nothing but the requested CPU and
    // apply it to the calling process (pid 0), so that the benchmarking
    // process runs on the CPU the user asked for.
    cpu_set_t RequestedCPUs;
    CPU_ZERO(&RequestedCPUs);
    CPU_SET(CPUToUse, &RequestedCPUs);
    // TODO(boomanaiden154): Rewrite this to use LLVM primitives once they
    // are available.
    if (sched_setaffinity(0, sizeof(RequestedCPUs), &RequestedCPUs) == -1)
      exit(ChildProcessExitCodeE::SetCPUAffinityFailed);

    // In builds with assertions enabled, verify that the process is really
    // running on the requested CPU. The getcpu syscall lives inside the
    // assert on purpose, so it is not issued at all in release builds.
    [[maybe_unused]] unsigned int ObservedCPU;
    assert(syscall(SYS_getcpu, &ObservedCPU, nullptr) == 0 &&
           "Expected getcpu call to succeed.");
    assert(static_cast<int>(ObservedCPU) == CPUToUse &&
           "Expected current CPU to equal the CPU requested by the user");
#else
    exit(ChildProcessExitCodeE::SetCPUAffinityFailed);
#endif // defined(__x86_64__) && defined(SYS_getcpu)
  }

  // Forks a benchmarking subprocess and measures CounterName around the
  // snippet executed in it.
  //
  // A socket pair is created for passing the counter file descriptor to the
  // child, the subprocess memory described by Key.MemoryValues is set up, and
  // the process is forked. The child runs runChildSubprocess (which never
  // returns); the parent continues in runParentProcess, whose result is
  // returned. Both socket descriptors are closed on every error path in this
  // function so that repeated failures do not leak descriptors.
  Error createSubProcessAndRunBenchmark(
      StringRef CounterName, SmallVectorImpl<int64_t> &CounterValues,
      ArrayRef<const char *> ValidationCounters,
      SmallVectorImpl<int64_t> &ValidationCounterValues) const {
    int PipeFiles[2];
    int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, 0, PipeFiles);
    if (PipeSuccessOrErr != 0) {
      return make_error<Failure>(
          "Failed to create a pipe for interprocess communication between "
          "llvm-exegesis and the benchmarking subprocess: " +
          Twine(strerror(errno)));
    }

    // Closes both socket ends. Used on the error paths that bail out before
    // the descriptors have been handed to the parent/child code below.
    auto ClosePipeFiles = [&PipeFiles]() {
      close(PipeFiles[0]);
      close(PipeFiles[1]);
    };

    SubprocessMemory SPMemory;
    Error MemoryInitError = SPMemory.initializeSubprocessMemory(getpid());
    if (MemoryInitError) {
      ClosePipeFiles();
      return MemoryInitError;
    }

    Error AddMemDefError =
        SPMemory.addMemoryDefinition(Key.MemoryValues, getpid());
    if (AddMemDefError) {
      ClosePipeFiles();
      return AddMemDefError;
    }

    long ParentTID = SubprocessMemory::getCurrentTID();
    pid_t ParentOrChildPID = fork();

    if (ParentOrChildPID == -1) {
      // Build the message first: close() may overwrite errno.
      Error ForkError = make_error<Failure>(
          "Failed to create child process: " + Twine(strerror(errno)));
      ClosePipeFiles();
      return ForkError;
    }

    if (ParentOrChildPID == 0) {
      if (BenchmarkProcessCPU.has_value()) {
        setCPUAffinityIfRequested(*BenchmarkProcessCPU);
      }

      // We are in the child process, close the write end of the pipe.
      close(PipeFiles[1]);
      // Unregister handlers, signal handling is now handled through ptrace in
      // the host process.
      sys::unregisterHandlers();
      runChildSubprocess(PipeFiles[0], Key, ParentTID);
      // The child process terminates in the above function, so we should never
      // get to this point.
      llvm_unreachable("Child process didn't exit when expected.");
    }

    // Close the read end of the pipe as we only need to write to the subprocess
    // from the parent process. The write end is closed by runParentProcess.
    close(PipeFiles[0]);
    return runParentProcess(ParentOrChildPID, PipeFiles[1], CounterName,
                            CounterValues, ValidationCounters,
                            ValidationCounterValues);
  }

  // Sets the soft RLIMIT_CORE limit of the calling process to zero so that no
  // core file is written when the snippet crashes. Failures are ignored; this
  // is a best-effort convenience.
  void disableCoreDumps() const {
    struct rlimit rlim;

    // setrlimit reads both fields of the struct, so rlim_max must hold a
    // defined value. Keep the current hard limit; if it cannot be queried,
    // fall back to zero, which is always a permitted hard limit.
    if (getrlimit(RLIMIT_CORE, &rlim) != 0)
      rlim.rlim_max = 0;

    rlim.rlim_cur = 0;
    setrlimit(RLIMIT_CORE, &rlim);
  }

  // Body of the benchmarking child process; never returns.
  //
  // Receives the counter file descriptor from the parent over Pipe, copies the
  // assembled snippet into freshly mapped memory (at Key.SnippetAddress if
  // that is non-zero), makes it executable, sets up the auxiliary memory and
  // finally calls the snippet. The process exits with 0 after the snippet
  // returns, or with one of the ChildProcessExitCodeE values if a setup step
  // fails.
  [[noreturn]] void runChildSubprocess(int Pipe, const BenchmarkKey &Key,
                                       long ParentTID) const {
    // Disable core dumps in the child process as otherwise everytime we
    // encounter an execution failure like a segmentation fault, we will create
    // a core dump. We report the information directly rather than require the
    // user inspect a core dump.
    disableCoreDumps();

    // The following occurs within the benchmarking subprocess.
    pid_t ParentPID = getppid();

    Expected<int> CounterFileDescriptorOrError =
        getFileDescriptorFromSocket(Pipe);

    if (!CounterFileDescriptorOrError)
      exit(ChildProcessExitCodeE::CounterFDReadFailed);

    int CounterFileDescriptor = *CounterFileDescriptorOrError;

// Glibc versions greater than 2.35 automatically call rseq during
// initialization. Unmapping the region that glibc sets up for this causes
// segfaults in the program. Unregister the rseq region so that we can safely
// unmap it later
#ifdef GLIBC_INITS_RSEQ
    unsigned int RseqStructSize = __rseq_size;

    // Glibc v2.40 (the change is also expected to be backported to v2.35)
    // changes the definition of __rseq_size to be the usable area of the struct
    // rather than the actual size of the struct. v2.35 uses only 20 bytes of
    // the 32 byte struct. For now, it should be safe to assume that if the
    // usable size is less than 32, the actual size of the struct will be 32
    // bytes given alignment requirements.
    if (__rseq_size < 32)
      RseqStructSize = 32;

    long RseqDisableOutput = syscall(
        SYS_rseq,
        reinterpret_cast<uintptr_t>(__builtin_thread_pointer()) + __rseq_offset,
        RseqStructSize, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
    if (RseqDisableOutput != 0)
      exit(ChildProcessExitCodeE::RSeqDisableFailed);
#endif // GLIBC_INITS_RSEQ

    // The frontend that generates the memory annotation structures should
    // validate that the address to map the snippet in at is a multiple of
    // the page size. Assert that this is true here.
    assert(Key.SnippetAddress % getpagesize() == 0 &&
           "The snippet address needs to be aligned to a page boundary.");

    size_t FunctionDataCopySize = this->Function.FunctionBytes.size();
    void *MapAddress = NULL;
    int MapFlags = MAP_PRIVATE | MAP_ANONYMOUS;

    // A non-zero snippet address is a hard requirement, not a hint: fail
    // rather than silently map somewhere else or replace an existing mapping.
    if (Key.SnippetAddress != 0) {
      MapAddress = reinterpret_cast<void *>(Key.SnippetAddress);
      MapFlags |= MAP_FIXED_NOREPLACE;
    }

    // Anonymous mappings take no file; -1 is the portable descriptor value.
    void *Mapping = mmap(MapAddress, FunctionDataCopySize,
                         PROT_READ | PROT_WRITE, MapFlags, -1, 0);
    if (Mapping == MAP_FAILED)
      exit(ChildProcessExitCodeE::FunctionDataMappingFailed);
    char *FunctionDataCopy = static_cast<char *>(Mapping);

    memcpy(FunctionDataCopy, this->Function.FunctionBytes.data(),
           this->Function.FunctionBytes.size());
    // The copy is written through a writable mapping and only then switched
    // to read+execute. If that switch fails the snippet cannot be run, so
    // report it instead of faulting on the call below.
    if (mprotect(FunctionDataCopy, FunctionDataCopySize,
                 PROT_READ | PROT_EXEC) != 0)
      exit(ChildProcessExitCodeE::FunctionDataMappingFailed);

    Expected<int> AuxMemFDOrError =
        SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
            Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor);
    if (!AuxMemFDOrError)
      exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);

    // Call the copied snippet, passing the size of the copy and the auxiliary
    // memory file descriptor as its two arguments.
    ((void (*)(size_t, int))(uintptr_t)FunctionDataCopy)(FunctionDataCopySize,
                                                         *AuxMemFDOrError);

    exit(0);
  }

  // Runs the snippet once in a fresh subprocess while measuring CounterName.
  // Returns the counter values filled in by the run, or the error reported by
  // createSubProcessAndRunBenchmark.
  Expected<SmallVector<int64_t, 4>> runWithCounter(
      StringRef CounterName, ArrayRef<const char *> ValidationCounters,
      SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
    // Starts out as a single zero; overwritten by a successful run.
    SmallVector<int64_t, 4> Samples(1, 0);
    if (Error BenchmarkError = createSubProcessAndRunBenchmark(
            CounterName, Samples, ValidationCounters, ValidationCounterValues))
      return std::move(BenchmarkError);
    return Samples;
  }

  const LLVMState &State;
  const ExecutableFunction Function;
  const BenchmarkKey &Key;
  const std::optional<int> BenchmarkProcessCPU;
};
#endif // __linux__
} // namespace

// Assembles the instructions of BC, repeated by Repetitor according to
// MinInstructions and LoopBodySize, and returns the bytes produced by
// assembleToStream. Returns the assembler's error on failure.
Expected<SmallString<0>> BenchmarkRunner::assembleSnippet(
    const BenchmarkCode &BC, const SnippetRepetitor &Repetitor,
    unsigned MinInstructions, unsigned LoopBodySize,
    bool GenerateMemoryInstructions) const {
  const std::vector<MCInst> &SnippetInsts = BC.Key.Instructions;

  SmallString<0> AssembledBytes;
  raw_svector_ostream Stream(AssembledBytes);
  Error AssembleError = assembleToStream(
      State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns,
      Repetitor.Repeat(SnippetInsts, MinInstructions, LoopBodySize,
                       GenerateMemoryInstructions),
      Stream, BC.Key, GenerateMemoryInstructions);
  if (AssembleError)
    return std::move(AssembleError);
  return AssembledBytes;
}

// Prepares everything that is needed to run the benchmark described by BC.
//
// The returned configuration carries a Benchmark result pre-filled with the
// mode, CPU name, target triple, MinInstructions, Info and Key. Depending on
// BenchmarkPhaseSelector it additionally contains a short assembled snippet
// for display purposes and the object file with the full benchmark code.
Expected<BenchmarkRunner::RunnableConfiguration>
BenchmarkRunner::getRunnableConfiguration(
    const BenchmarkCode &BC, unsigned MinInstructions, unsigned LoopBodySize,
    const SnippetRepetitor &Repetitor) const {
  // Memory instructions are generated only for the subprocess execution mode.
  const bool UseMemoryInstructions =
      ExecutionMode == ExecutionModeE::SubProcess;
  const std::vector<MCInst> &SnippetInsts = BC.Key.Instructions;

  RunnableConfiguration RC;
  Benchmark &Result = RC.BenchmarkResult;
  const auto &TM = State.getTargetMachine();
  Result.Mode = Mode;
  Result.CpuName = std::string(TM.getTargetCPU());
  Result.LLVMTriple = TM.getTargetTriple().normalize();
  Result.MinInstructions = MinInstructions;
  Result.Info = BC.Info;
  Result.Key = BC.Key;

  // For debug/analysis output, assemble a short version of the snippet:
  // four times the instructions, with a loop body of twice the instructions.
  // This makes it obvious to the user that the inner instructions are
  // repeated.
  if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) {
    const int MinInstructionsForSnippet = 4 * SnippetInsts.size();
    const int LoopBodySizeForSnippet = 2 * SnippetInsts.size();
    Expected<SmallString<0>> DisplaySnippet =
        assembleSnippet(BC, Repetitor, MinInstructionsForSnippet,
                        LoopBodySizeForSnippet, UseMemoryInstructions);
    if (!DisplaySnippet)
      return DisplaySnippet.takeError();

    if (auto Err = getBenchmarkFunctionBytes(*DisplaySnippet,
                                             Result.AssembledSnippet))
      return std::move(Err);
  }

  // Assemble the code that is actually measured: enough repetitions of the
  // snippet to reach at least MinInstructions instructions.
  if (BenchmarkPhaseSelector >
      BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
    Expected<SmallString<0>> MeasuredSnippet =
        assembleSnippet(BC, Repetitor, Result.MinInstructions, LoopBodySize,
                        UseMemoryInstructions);
    if (!MeasuredSnippet)
      return MeasuredSnippet.takeError();
    RC.ObjectFile = getObjectFromBuffer(*MeasuredSnippet);
  }

  return std::move(RC);
}

// Creates the function executor that matches the runner's ExecutionMode.
//
// The in-process mode rejects a requested benchmark CPU. The subprocess mode
// is only available when building for Linux; elsewhere a Failure is returned.
Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>>
BenchmarkRunner::createFunctionExecutor(
    object::OwningBinary<object::ObjectFile> ObjectFile,
    const BenchmarkKey &Key, std::optional<int> BenchmarkProcessCPU) const {
  switch (ExecutionMode) {
  case ExecutionModeE::InProcess: {
    if (BenchmarkProcessCPU)
      return make_error<Failure>("The inprocess execution mode does not "
                                 "support benchmark core pinning.");

    auto ExecutorOrErr = InProcessFunctionExecutorImpl::create(
        State, std::move(ObjectFile), Scratch.get(), BenchmarkProcessCPU);
    if (Error E = ExecutorOrErr.takeError())
      return std::move(E);
    return std::move(*ExecutorOrErr);
  }
  case ExecutionModeE::SubProcess: {
#ifdef __linux__
    auto ExecutorOrErr = SubProcessFunctionExecutorImpl::create(
        State, std::move(ObjectFile), Key, BenchmarkProcessCPU);
    if (Error E = ExecutorOrErr.takeError())
      return std::move(E);
    return std::move(*ExecutorOrErr);
#else
    return make_error<Failure>(
        "The subprocess execution mode is only supported on Linux");
#endif
  }
  }
  llvm_unreachable("ExecutionMode is outside expected range");
}

// Runs a prepared configuration and returns the (possibly partial) benchmark
// result together with an error status.
//
// If DumpFile is set and an object file has been assembled, the object file
// is written out first. If the selected phase stops before measuring, the
// result is returned with its Error field explaining that measurements were
// skipped. Otherwise the measurements are taken and normalized per
// instruction and per snippet repetition.
std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
    RunnableConfiguration &&RC, const std::optional<StringRef> &DumpFile,
    std::optional<int> BenchmarkProcessCPU) const {
  Benchmark &Result = RC.BenchmarkResult;
  object::OwningBinary<object::ObjectFile> &Object = RC.ObjectFile;

  const bool HasAssembledObject =
      BenchmarkPhaseSelector >
      BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet;
  if (DumpFile && HasAssembledObject) {
    Expected<std::string> DumpedPath =
        writeObjectFile(Object.getBinary()->getData(), *DumpFile);
    if (!DumpedPath)
      return {DumpedPath.takeError(), std::move(Result)};
    outs() << "Check generated assembly with: /usr/bin/objdump -d "
           << *DumpedPath << "\n";
  }

  if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
    Result.Error = "actual measurements skipped.";
    return {Error::success(), std::move(Result)};
  }

  Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor =
      createFunctionExecutor(std::move(Object), RC.BenchmarkResult.Key,
                             BenchmarkProcessCPU);
  if (!Executor)
    return {Executor.takeError(), std::move(Result)};

  auto Measured = runMeasurements(**Executor);
  if (!Measured)
    return {Measured.takeError(), std::move(Result)};

  assert(Result.MinInstructions > 0 && "invalid MinInstructions");
  // Number of times the whole snippet is repeated to reach MinInstructions.
  const double SnippetRepetitions =
      std::ceil(Result.MinInstructions /
                static_cast<double>(Result.Key.Instructions.size()));
  for (BenchmarkMeasure &Measure : *Measured) {
    // Normalize by the number of instructions ...
    Measure.PerInstructionValue /= Result.MinInstructions;
    // ... and by the number of snippet repetitions.
    Measure.PerSnippetValue /= SnippetRepetitions;
  }
  Result.Measurements = std::move(*Measured);

  return {Error::success(), std::move(Result)};
}

// Writes Buffer to a file and returns the path that was written.
//
// With an empty FileName a temporary file named after the "snippet"/"o"
// prefix/suffix pair is created; otherwise FileName is created (or
// truncated) and used. Returns the file system error if the file cannot be
// opened.
Expected<std::string>
BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const {
  int FD = 0;
  SmallString<256> Path = FileName;

  std::error_code OpenError;
  if (FileName.empty())
    OpenError = sys::fs::createTemporaryFile("snippet", "o", FD, Path);
  else
    OpenError = sys::fs::openFileForReadWrite(
        FileName, FD, sys::fs::CD_CreateAlways, sys::fs::OF_None);
  if (OpenError)
    return errorCodeToError(OpenError);

  // The stream takes ownership of the descriptor and closes it on scope exit.
  raw_fd_ostream Out(FD, true /*ShouldClose*/);
  Out.write(Buffer.data(), Buffer.size());
  Out.flush();
  return std::string(Path);
}

// Ordering predicate for binary searches over (event, counter name) pairs:
// compares the event of LHS with RHS by their integer values.
static bool EventLessThan(const std::pair<ValidationEvent, const char *> LHS,
                          const ValidationEvent RHS) {
  const int LHSOrdinal = static_cast<int>(LHS.first);
  const int RHSOrdinal = static_cast<int>(RHS);
  return LHSOrdinal < RHSOrdinal;
}

// Resolves every requested validation event to the counter name the target
// provides for it.
//
// One counter name per entry of ValidationCounters is appended to
// ValCountersToRun, in request order. Returns a Failure as soon as a
// requested event has no entry in the target's PfmCountersInfo.
Error BenchmarkRunner::getValidationCountersToRun(
    SmallVector<const char *> &ValCountersToRun) const {
  const PfmCountersInfo &PCI = State.getPfmCounters();
  ValCountersToRun.reserve(ValidationCounters.size());

  // The lookup below is a binary search, so the array of validation events
  // from the target is expected to be sorted by event.
  ArrayRef TargetValidationEvents(PCI.ValidationEvents,
                                  PCI.NumValidationEvents);
  for (const ValidationEvent RequestedValEvent : ValidationCounters) {
    auto ValCounterIt =
        lower_bound(TargetValidationEvents, RequestedValEvent, EventLessThan);
    if (ValCounterIt == TargetValidationEvents.end() ||
        ValCounterIt->first != RequestedValEvent)
      return make_error<Failure>("Cannot create validation counter");

    ValCountersToRun.push_back(ValCounterIt->second);
  }

  return Error::success();
}

// Out-of-line destructor definition for the executor interface; it has
// nothing to release itself.
BenchmarkRunner::FunctionExecutor::~FunctionExecutor() = default;

} // namespace exegesis
} // namespace llvm