llvm-project/compiler-rt/lib/xray/xray_x86_64.cpp
Sebastian Kreutzer e738a5d8e3
Reapply " [XRay] Add support for instrumentation of DSOs on x86_64 (#90959)" (#113548)
This fixes remaining issues in my previous PR #90959.

Changes:
- Removed dependency on LLVM header in `xray_interface.cpp`
- Fixed XRay patching for some targets due to missing changes in
architecture-specific patching functions
- Addressed some remaining compiler warnings that I missed in the
previous patch
- Formatting

I have tested these changes on `x86_64` (natively), as well as
`ppc64le`, `aarch64` and `arm32` (cross-compiled and emulated using
qemu).

**Original description:**

This PR introduces shared library (DSO) support for XRay based on a
revised version of the implementation outlined in [this
RFC](https://discourse.llvm.org/t/rfc-upstreaming-dso-instrumentation-support-for-xray/73000).
The feature enables the patching and handling of events from DSOs,
supporting both libraries linked at startup or explicitly loaded, e.g.
via `dlopen`.
This patch adds the following:
- The `-fxray-shared` flag to enable the feature (turned off by default)
- A small runtime library that is linked into every instrumented DSO,
providing position-independent trampolines and code to register with the
main XRay runtime
- Changes to the XRay runtime to support management and patching of
multiple objects

These changes are fully backward compatible, i.e. running without
instrumented DSOs will produce identical traces (in terms of recorded
function IDs) to the previous implementation.

Due to my limited ability to test on other architectures, this feature
is only implemented and tested with x86_64. Extending support to other
architectures is fairly straightforward, requiring only a
position-independent implementation of the architecture-specific
trampoline implementation (see
`compiler-rt/lib/xray/xray_trampoline_x86_64.S` for reference).

This patch does not include any functionality to resolve function IDs
from DSOs for the provided logging/tracing modes. These modes still work
and will record calls from DSOs, but symbol resolution for these
functions in not available. Getting this to work properly requires
recording information about the loaded DSOs and should IMO be discussed
in a separate RFC, as there are mulitple feasible approaches.

---------

Co-authored-by: Sebastian Kreutzer <sebastian.kreutzer@tu-darmstadt.de>
2024-10-25 10:15:25 +02:00

341 lines
11 KiB
C++

#include "cpuid.h"
#include "sanitizer_common/sanitizer_common.h"
#if !SANITIZER_FUCHSIA
#include "sanitizer_common/sanitizer_posix.h"
#endif
#include "xray_defs.h"
#include "xray_interface_internal.h"
#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_APPLE
#include <sys/types.h>
#include <sys/sysctl.h>
#elif SANITIZER_FUCHSIA
#include <zircon/syscalls.h>
#endif
#include <atomic>
#include <cstdint>
#include <errno.h>
#include <fcntl.h>
#include <iterator>
#include <limits>
#include <tuple>
#include <unistd.h>
namespace __xray {
#if SANITIZER_LINUX
static std::pair<ssize_t, bool>
retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
auto BytesToRead = std::distance(Begin, End);
ssize_t BytesRead;
ssize_t TotalBytesRead = 0;
while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) {
if (BytesRead == -1) {
if (errno == EINTR)
continue;
Report("Read error; errno = %d\n", errno);
return std::make_pair(TotalBytesRead, false);
}
TotalBytesRead += BytesRead;
BytesToRead -= BytesRead;
Begin += BytesRead;
}
return std::make_pair(TotalBytesRead, true);
}
static bool readValueFromFile(const char *Filename,
long long *Value) XRAY_NEVER_INSTRUMENT {
int Fd = open(Filename, O_RDONLY | O_CLOEXEC);
if (Fd == -1)
return false;
static constexpr size_t BufSize = 256;
char Line[BufSize] = {};
ssize_t BytesRead;
bool Success;
std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize);
close(Fd);
if (!Success)
return false;
const char *End = nullptr;
long long Tmp = internal_simple_strtoll(Line, &End, 10);
bool Result = false;
if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
*Value = Tmp;
Result = true;
}
return Result;
}
uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
long long TSCFrequency = -1;
if (readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz",
&TSCFrequency)) {
TSCFrequency *= 1000;
} else if (readValueFromFile(
"/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
&TSCFrequency)) {
TSCFrequency *= 1000;
} else {
Report("Unable to determine CPU frequency for TSC accounting.\n");
}
return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency);
}
#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_APPLE
uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
long long TSCFrequency = -1;
size_t tscfreqsz = sizeof(TSCFrequency);
#if SANITIZER_APPLE
if (internal_sysctlbyname("machdep.tsc.frequency", &TSCFrequency,
&tscfreqsz, NULL, 0) != -1) {
#else
if (internal_sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
NULL, 0) != -1) {
#endif
return static_cast<uint64_t>(TSCFrequency);
} else {
Report("Unable to determine CPU frequency for TSC accounting.\n");
}
return 0;
}
#elif !SANITIZER_FUCHSIA
uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
/* Not supported */
return 0;
}
#endif
static constexpr uint8_t CallOpCode = 0xe8;
static constexpr uint16_t MovR10Seq = 0xba41;
static constexpr uint16_t Jmp9Seq = 0x09eb;
static constexpr uint16_t Jmp20Seq = 0x14eb;
static constexpr uint16_t Jmp15Seq = 0x0feb;
static constexpr uint8_t JmpOpCode = 0xe9;
static constexpr uint8_t RetOpCode = 0xc3;
static constexpr uint16_t NopwSeq = 0x9066;
static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled,
const XRayTrampolines &Trampolines,
bool LogArgs) XRAY_NEVER_INSTRUMENT {
// Here we do the dance of replacing the following sled:
//
// xray_sled_n:
// jmp +9
// <9 byte nop>
//
// With the following:
//
// mov r10d, <function id>
// call <relative 32bit offset to entry trampoline>
//
// We need to do this in the following order:
//
// 1. Put the function id first, 2 bytes from the start of the sled (just
// after the 2-byte jmp instruction).
// 2. Put the call opcode 6 bytes from the start of the sled.
// 3. Put the relative offset 7 bytes from the start of the sled.
// 4. Do an atomic write over the jmp instruction for the "mov r10d"
// opcode and first operand.
//
// Prerequisite is to compute the relative offset to the trampoline's address.
auto Trampoline =
LogArgs ? Trampolines.LogArgsTrampoline : Trampolines.EntryTrampoline;
const uint64_t Address = Sled.address();
int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) -
(static_cast<int64_t>(Address) + 11);
if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
Report("XRay Entry trampoline (%p) too far from sled (%p)\n",
reinterpret_cast<void *>(Trampoline),
reinterpret_cast<void *>(Address));
return false;
}
if (Enable) {
*reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
*reinterpret_cast<uint8_t *>(Address + 6) = CallOpCode;
*reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
std::memory_order_release);
} else {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp9Seq,
std::memory_order_release);
// FIXME: Write out the nops still?
}
return true;
}
bool patchFunctionExit(
const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled,
const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT {
// Here we do the dance of replacing the following sled:
//
// xray_sled_n:
// ret
// <10 byte nop>
//
// With the following:
//
// mov r10d, <function id>
// jmp <relative 32bit offset to exit trampoline>
//
// 1. Put the function id first, 2 bytes from the start of the sled (just
// after the 1-byte ret instruction).
// 2. Put the jmp opcode 6 bytes from the start of the sled.
// 3. Put the relative offset 7 bytes from the start of the sled.
// 4. Do an atomic write over the jmp instruction for the "mov r10d"
// opcode and first operand.
//
// Prerequisite is to compute the relative offset fo the
// __xray_FunctionExit function's address.
auto Trampoline = Trampolines.ExitTrampoline;
const uint64_t Address = Sled.address();
int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) -
(static_cast<int64_t>(Address) + 11);
if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
Report("XRay Exit trampoline (%p) too far from sled (%p)\n",
reinterpret_cast<void *>(Trampoline),
reinterpret_cast<void *>(Address));
return false;
}
if (Enable) {
*reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
*reinterpret_cast<uint8_t *>(Address + 6) = JmpOpCode;
*reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
std::memory_order_release);
} else {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint8_t> *>(Address), RetOpCode,
std::memory_order_release);
// FIXME: Write out the nops still?
}
return true;
}
bool patchFunctionTailExit(
const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled,
const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT {
// Here we do the dance of replacing the tail call sled with a similar
// sequence as the entry sled, but calls the tail exit sled instead.
auto Trampoline = Trampolines.TailExitTrampoline;
const uint64_t Address = Sled.address();
int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) -
(static_cast<int64_t>(Address) + 11);
if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n",
reinterpret_cast<void *>(Trampoline),
reinterpret_cast<void *>(Address));
return false;
}
if (Enable) {
*reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
*reinterpret_cast<uint8_t *>(Address + 6) = CallOpCode;
*reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
std::memory_order_release);
} else {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp9Seq,
std::memory_order_release);
// FIXME: Write out the nops still?
}
return true;
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// Here we do the dance of replacing the following sled:
//
// xray_sled_n:
// jmp +15 // 2 bytes
// ...
//
// With the following:
//
// nopw // 2 bytes*
// ...
//
//
// The "unpatch" should just turn the 'nopw' back to a 'jmp +15'.
const uint64_t Address = Sled.address();
if (Enable) {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), NopwSeq,
std::memory_order_release);
} else {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp15Seq,
std::memory_order_release);
}
return false;
}
bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// Here we do the dance of replacing the following sled:
//
// xray_sled_n:
// jmp +20 // 2 byte instruction
// ...
//
// With the following:
//
// nopw // 2 bytes
// ...
//
//
// The "unpatch" should just turn the 'nopw' back to a 'jmp +20'.
// The 20 byte sled stashes three argument registers, calls the trampoline,
// unstashes the registers and returns. If the arguments are already in
// the correct registers, the stashing and unstashing become equivalently
// sized nops.
const uint64_t Address = Sled.address();
if (Enable) {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), NopwSeq,
std::memory_order_release);
} else {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp20Seq,
std::memory_order_release);
}
return false;
}
#if !SANITIZER_FUCHSIA
// We determine whether the CPU we're running on has the correct features we
// need. In x86_64 this will be rdtscp support.
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
unsigned int EAX, EBX, ECX, EDX;
// We check whether rdtscp support is enabled. According to the x86_64 manual,
// level should be set at 0x80000001, and we should have a look at bit 27 in
// EDX. That's 0x8000000 (or 1u << 27).
__asm__ __volatile__("cpuid" : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
: "0"(0x80000001));
if (!(EDX & (1u << 27))) {
Report("Missing rdtscp support.\n");
return false;
}
// Also check whether we can determine the CPU frequency, since if we cannot,
// we should use the emulated TSC instead.
if (!getTSCFrequency()) {
Report("Unable to determine CPU frequency.\n");
return false;
}
return true;
}
#endif
} // namespace __xray