mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-16 23:26:32 +00:00

This fixes remaining issues in my previous PR #90959. Changes: - Removed dependency on LLVM header in `xray_interface.cpp` - Fixed XRay patching for some targets due to missing changes in architecture-specific patching functions - Addressed some remaining compiler warnings that I missed in the previous patch - Formatting I have tested these changes on `x86_64` (natively), as well as `ppc64le`, `aarch64` and `arm32` (cross-compiled and emulated using qemu). **Original description:** This PR introduces shared library (DSO) support for XRay based on a revised version of the implementation outlined in [this RFC](https://discourse.llvm.org/t/rfc-upstreaming-dso-instrumentation-support-for-xray/73000). The feature enables the patching and handling of events from DSOs, supporting both libraries linked at startup and those explicitly loaded, e.g. via `dlopen`. This patch adds the following: - The `-fxray-shared` flag to enable the feature (turned off by default) - A small runtime library that is linked into every instrumented DSO, providing position-independent trampolines and code to register with the main XRay runtime - Changes to the XRay runtime to support management and patching of multiple objects These changes are fully backward compatible, i.e. running without instrumented DSOs will produce identical traces (in terms of recorded function IDs) to the previous implementation. Due to my limited ability to test on other architectures, this feature is only implemented and tested with x86_64. Extending support to other architectures is fairly straightforward, requiring only a position-independent implementation of the architecture-specific trampoline implementation (see `compiler-rt/lib/xray/xray_trampoline_x86_64.S` for reference). This patch does not include any functionality to resolve function IDs from DSOs for the provided logging/tracing modes. These modes still work and will record calls from DSOs, but symbol resolution for these functions is not available.
Getting this to work properly requires recording information about the loaded DSOs and should IMO be discussed in a separate RFC, as there are multiple feasible approaches. --------- Co-authored-by: Sebastian Kreutzer <sebastian.kreutzer@tu-darmstadt.de>
341 lines
11 KiB
C++
341 lines
11 KiB
C++
#include "cpuid.h"
|
|
#include "sanitizer_common/sanitizer_common.h"
|
|
#if !SANITIZER_FUCHSIA
|
|
#include "sanitizer_common/sanitizer_posix.h"
|
|
#endif
|
|
#include "xray_defs.h"
|
|
#include "xray_interface_internal.h"
|
|
|
|
#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_APPLE
|
|
#include <sys/types.h>
|
|
#include <sys/sysctl.h>
|
|
#elif SANITIZER_FUCHSIA
|
|
#include <zircon/syscalls.h>
|
|
#endif
|
|
|
|
#include <atomic>
|
|
#include <cstdint>
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <iterator>
|
|
#include <limits>
|
|
#include <tuple>
|
|
#include <unistd.h>
|
|
|
|
namespace __xray {
|
|
|
|
#if SANITIZER_LINUX
|
|
static std::pair<ssize_t, bool>
|
|
retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
|
|
auto BytesToRead = std::distance(Begin, End);
|
|
ssize_t BytesRead;
|
|
ssize_t TotalBytesRead = 0;
|
|
while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) {
|
|
if (BytesRead == -1) {
|
|
if (errno == EINTR)
|
|
continue;
|
|
Report("Read error; errno = %d\n", errno);
|
|
return std::make_pair(TotalBytesRead, false);
|
|
}
|
|
|
|
TotalBytesRead += BytesRead;
|
|
BytesToRead -= BytesRead;
|
|
Begin += BytesRead;
|
|
}
|
|
return std::make_pair(TotalBytesRead, true);
|
|
}
|
|
|
|
// Reads a base-10 integer from the start of the file named Filename.
//
// The file is opened read-only (with O_CLOEXEC) and its leading bytes are
// read into a fixed-size, zero-initialised buffer which is then parsed with
// internal_simple_strtoll. The parse is accepted only if the buffer is
// non-empty and the number is immediately followed by a newline or by the end
// of the data.
//
// On success, stores the parsed number in *Value and returns true. On any
// failure (open error, read error, unexpected trailing characters) returns
// false and leaves *Value untouched.
static bool readValueFromFile(const char *Filename,
                              long long *Value) XRAY_NEVER_INSTRUMENT {
  int Fd = open(Filename, O_RDONLY | O_CLOEXEC);
  if (Fd == -1)
    return false;
  static constexpr size_t BufSize = 256;
  char Line[BufSize] = {};
  ssize_t BytesRead;
  bool Success;
  // Read at most BufSize - 1 bytes so that the last, zero-initialised byte of
  // Line always survives as a NUL terminator. Filling the whole buffer would
  // let the parser and the *End check below run past the end of the array.
  std::tie(BytesRead, Success) =
      retryingReadSome(Fd, Line, Line + BufSize - 1);
  close(Fd);
  if (!Success)
    return false;
  const char *End = nullptr;
  long long Tmp = internal_simple_strtoll(Line, &End, 10);
  bool Result = false;
  if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
    *Value = Tmp;
    Result = true;
  }
  return Result;
}
|
|
|
|
// Determines the TSC frequency from sysfs.
//
// The per-CPU "tsc_freq_khz" file is tried first, then cpufreq's
// "cpuinfo_max_freq". Whichever value is read first is multiplied by 1000
// before being returned. If neither file yields a value, a diagnostic is
// reported and 0 is returned.
uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
  long long Frequency = -1;
  // Short-circuit evaluation keeps the lookup order: the fallback file is only
  // consulted when the first one could not be read.
  const bool Found =
      readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz",
                        &Frequency) ||
      readValueFromFile(
          "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
          &Frequency);
  if (Found)
    Frequency *= 1000;
  else
    Report("Unable to determine CPU frequency for TSC accounting.\n");

  if (Frequency == -1)
    return 0;
  return static_cast<uint64_t>(Frequency);
}
|
|
#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_APPLE
|
|
// Determines the TSC frequency by querying a sysctl: "machdep.tsc.frequency"
// when SANITIZER_APPLE is set, "machdep.tsc_freq" otherwise. Returns the
// value reported by the sysctl, or 0 (after reporting a diagnostic) when the
// query fails.
uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
#if SANITIZER_APPLE
  const char *const SysctlName = "machdep.tsc.frequency";
#else
  const char *const SysctlName = "machdep.tsc_freq";
#endif
  long long Frequency = -1;
  size_t FrequencySize = sizeof(Frequency);
  if (internal_sysctlbyname(SysctlName, &Frequency, &FrequencySize, nullptr,
                            0) == -1) {
    Report("Unable to determine CPU frequency for TSC accounting.\n");
    return 0;
  }
  return static_cast<uint64_t>(Frequency);
}
|
|
#elif !SANITIZER_FUCHSIA
|
|
// Fallback for platforms without a supported way of querying the TSC
// frequency: always reports 0.
uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
  // Not supported on this platform.
  return uint64_t{0};
}
|
|
#endif
|
|
|
|
// Instruction bytes used when patching sleds. The 16-bit sequences are written
// to memory as single 16-bit stores, so on this little-endian target the low
// byte of each constant is the first byte of the instruction.

// Opcode of the call emitted into entry and tail-exit sleds; it is followed by
// a 32-bit relative offset.
static constexpr uint8_t CallOpCode{0xe8};
// First two bytes of "mov r10d, <imm32>" (bytes 0x41 0xba).
static constexpr uint16_t MovR10Seq{0xba41};
// "jmp +9" (bytes 0xeb 0x09): head of an unpatched entry/tail-exit sled.
static constexpr uint16_t Jmp9Seq{0x09eb};
// "jmp +20" (bytes 0xeb 0x14): head of an unpatched typed-event sled.
static constexpr uint16_t Jmp20Seq{0x14eb};
// "jmp +15" (bytes 0xeb 0x0f): head of an unpatched custom-event sled.
static constexpr uint16_t Jmp15Seq{0x0feb};
// Opcode of the jmp emitted into exit sleds; it is followed by a 32-bit
// relative offset.
static constexpr uint8_t JmpOpCode{0xe9};
// "ret": head of an unpatched exit sled.
static constexpr uint8_t RetOpCode{0xc3};
// Two-byte nop (bytes 0x66 0x90) that replaces the leading jmp of an event
// sled when the sled is enabled.
static constexpr uint16_t NopwSeq{0x9066};

// Range of offsets representable in the signed 32-bit operand of the patched
// call/jmp instructions.
static constexpr int64_t MinOffset = std::numeric_limits<int32_t>::min();
static constexpr int64_t MaxOffset = std::numeric_limits<int32_t>::max();
|
|
|
|
bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
|
|
const XRaySledEntry &Sled,
|
|
const XRayTrampolines &Trampolines,
|
|
bool LogArgs) XRAY_NEVER_INSTRUMENT {
|
|
// Here we do the dance of replacing the following sled:
|
|
//
|
|
// xray_sled_n:
|
|
// jmp +9
|
|
// <9 byte nop>
|
|
//
|
|
// With the following:
|
|
//
|
|
// mov r10d, <function id>
|
|
// call <relative 32bit offset to entry trampoline>
|
|
//
|
|
// We need to do this in the following order:
|
|
//
|
|
// 1. Put the function id first, 2 bytes from the start of the sled (just
|
|
// after the 2-byte jmp instruction).
|
|
// 2. Put the call opcode 6 bytes from the start of the sled.
|
|
// 3. Put the relative offset 7 bytes from the start of the sled.
|
|
// 4. Do an atomic write over the jmp instruction for the "mov r10d"
|
|
// opcode and first operand.
|
|
//
|
|
// Prerequisite is to compute the relative offset to the trampoline's address.
|
|
auto Trampoline =
|
|
LogArgs ? Trampolines.LogArgsTrampoline : Trampolines.EntryTrampoline;
|
|
const uint64_t Address = Sled.address();
|
|
int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) -
|
|
(static_cast<int64_t>(Address) + 11);
|
|
if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
|
|
Report("XRay Entry trampoline (%p) too far from sled (%p)\n",
|
|
reinterpret_cast<void *>(Trampoline),
|
|
reinterpret_cast<void *>(Address));
|
|
return false;
|
|
}
|
|
if (Enable) {
|
|
*reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
|
|
*reinterpret_cast<uint8_t *>(Address + 6) = CallOpCode;
|
|
*reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
|
|
std::memory_order_release);
|
|
} else {
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp9Seq,
|
|
std::memory_order_release);
|
|
// FIXME: Write out the nops still?
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool patchFunctionExit(
|
|
const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled,
|
|
const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT {
|
|
// Here we do the dance of replacing the following sled:
|
|
//
|
|
// xray_sled_n:
|
|
// ret
|
|
// <10 byte nop>
|
|
//
|
|
// With the following:
|
|
//
|
|
// mov r10d, <function id>
|
|
// jmp <relative 32bit offset to exit trampoline>
|
|
//
|
|
// 1. Put the function id first, 2 bytes from the start of the sled (just
|
|
// after the 1-byte ret instruction).
|
|
// 2. Put the jmp opcode 6 bytes from the start of the sled.
|
|
// 3. Put the relative offset 7 bytes from the start of the sled.
|
|
// 4. Do an atomic write over the jmp instruction for the "mov r10d"
|
|
// opcode and first operand.
|
|
//
|
|
// Prerequisite is to compute the relative offset fo the
|
|
// __xray_FunctionExit function's address.
|
|
auto Trampoline = Trampolines.ExitTrampoline;
|
|
const uint64_t Address = Sled.address();
|
|
int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) -
|
|
(static_cast<int64_t>(Address) + 11);
|
|
if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
|
|
Report("XRay Exit trampoline (%p) too far from sled (%p)\n",
|
|
reinterpret_cast<void *>(Trampoline),
|
|
reinterpret_cast<void *>(Address));
|
|
return false;
|
|
}
|
|
if (Enable) {
|
|
*reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
|
|
*reinterpret_cast<uint8_t *>(Address + 6) = JmpOpCode;
|
|
*reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
|
|
std::memory_order_release);
|
|
} else {
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint8_t> *>(Address), RetOpCode,
|
|
std::memory_order_release);
|
|
// FIXME: Write out the nops still?
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool patchFunctionTailExit(
|
|
const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled,
|
|
const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT {
|
|
// Here we do the dance of replacing the tail call sled with a similar
|
|
// sequence as the entry sled, but calls the tail exit sled instead.
|
|
auto Trampoline = Trampolines.TailExitTrampoline;
|
|
const uint64_t Address = Sled.address();
|
|
int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) -
|
|
(static_cast<int64_t>(Address) + 11);
|
|
if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
|
|
Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n",
|
|
reinterpret_cast<void *>(Trampoline),
|
|
reinterpret_cast<void *>(Address));
|
|
return false;
|
|
}
|
|
if (Enable) {
|
|
*reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
|
|
*reinterpret_cast<uint8_t *>(Address + 6) = CallOpCode;
|
|
*reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
|
|
std::memory_order_release);
|
|
} else {
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp9Seq,
|
|
std::memory_order_release);
|
|
// FIXME: Write out the nops still?
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
|
|
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
|
|
// Here we do the dance of replacing the following sled:
|
|
//
|
|
// xray_sled_n:
|
|
// jmp +15 // 2 bytes
|
|
// ...
|
|
//
|
|
// With the following:
|
|
//
|
|
// nopw // 2 bytes*
|
|
// ...
|
|
//
|
|
//
|
|
// The "unpatch" should just turn the 'nopw' back to a 'jmp +15'.
|
|
const uint64_t Address = Sled.address();
|
|
if (Enable) {
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), NopwSeq,
|
|
std::memory_order_release);
|
|
} else {
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp15Seq,
|
|
std::memory_order_release);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
|
|
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
|
|
// Here we do the dance of replacing the following sled:
|
|
//
|
|
// xray_sled_n:
|
|
// jmp +20 // 2 byte instruction
|
|
// ...
|
|
//
|
|
// With the following:
|
|
//
|
|
// nopw // 2 bytes
|
|
// ...
|
|
//
|
|
//
|
|
// The "unpatch" should just turn the 'nopw' back to a 'jmp +20'.
|
|
// The 20 byte sled stashes three argument registers, calls the trampoline,
|
|
// unstashes the registers and returns. If the arguments are already in
|
|
// the correct registers, the stashing and unstashing become equivalently
|
|
// sized nops.
|
|
const uint64_t Address = Sled.address();
|
|
if (Enable) {
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), NopwSeq,
|
|
std::memory_order_release);
|
|
} else {
|
|
std::atomic_store_explicit(
|
|
reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp20Seq,
|
|
std::memory_order_release);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
#if !SANITIZER_FUCHSIA
|
|
// We determine whether the CPU we're running on has the correct features we
|
|
// need. In x86_64 this will be rdtscp support.
|
|
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
|
|
unsigned int EAX, EBX, ECX, EDX;
|
|
|
|
// We check whether rdtscp support is enabled. According to the x86_64 manual,
|
|
// level should be set at 0x80000001, and we should have a look at bit 27 in
|
|
// EDX. That's 0x8000000 (or 1u << 27).
|
|
__asm__ __volatile__("cpuid" : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
|
|
: "0"(0x80000001));
|
|
if (!(EDX & (1u << 27))) {
|
|
Report("Missing rdtscp support.\n");
|
|
return false;
|
|
}
|
|
// Also check whether we can determine the CPU frequency, since if we cannot,
|
|
// we should use the emulated TSC instead.
|
|
if (!getTSCFrequency()) {
|
|
Report("Unable to determine CPU frequency.\n");
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
} // namespace __xray
|