[BOLT][Instrumentation] Initial instrumentation support for RISCV64 (#133882)

This patch adds code generation for RISCV64 instrumentation. The work
    involved includes the following three points:

a) Implements support for instrumenting direct function calls and jumps
    on RISC-V, which relies on the A extension: atomic instructions
    (used to increment counters) are only available on RISC-V when the A
    extension is enabled. The emitted counter update is sketched below.
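For reference, the counter update emitted by createInstrIncMemory (per the
comments in RISCVMCPlusBuilder.cpp below) is roughly the following sequence;
x10/x11 are the scratch registers the implementation actually uses, and this
is an illustrative sketch rather than the literal encoder output:

    addi sp, sp, -16
    sd x10, 0(sp)
    sd x11, 8(sp)
    la x10, target            # auipc x10, %pcrel_hi(target)
                              # addi x10, x10, %pcrel_lo(1b)
    li x11, 1                 # addi x11, zero, 1
    amoadd.d zero, x11, (x10) # atomic increment; requires the A extension
    ld x10, 0(sp)
    ld x11, 8(sp)
    addi sp, sp, 16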

b) Implements support for instrumenting indirect function calls by
    implementing the createInstrumentedIndCallHandlerEntryBB and
    createInstrumentedIndCallHandlerExitBB interfaces. In this process, we
    need to accurately record the target address and IndCallID to ensure
    the correct recording of the indirect call counters. The call-site
    entry sequence is sketched below.
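At each instrumented indirect call site, the target address and the call-site
ID are handed to the runtime handler. A sketch of the entry sequence,
following the createInstrumentedIndirectCall comments below (a0 holds the
target, a1 the CallSiteID, x5 the handler address):

    addi sp, sp, -16
    sd a0, 0(sp)
    sd a1, 8(sp)
    add a0, zero, target    # convertIndirectCallToLoad
    li a1, CallSiteID       # createLoadImmediate
    addi sp, sp, -16
    sd a0, 0(sp)
    sd a1, 8(sp)
    la x5, HandlerFuncAddr  # auipc + addi
    jalr x5                 # enter the instrumentation helper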

c) Implements the RISCV64 BOLT runtime library, with several system call
    interfaces implemented through inline assembly. The runtime computes
    the difference between the runtime address of the .text section and
    its static address in the section header table, which in turn can be
    used to search for indirect call descriptions; see the sketch below.
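The delta is obtained by materializing the same symbol twice, once
pc-relative (runtime address) and once absolutely (static address), as
getTextBaseAddress in sys_riscv64.h does; a sketch using the __hot_end
symbol from the patch:

    lla t0, __hot_end           # runtime (dynamic) address
    lui t1, %hi(__hot_end)
    addi t1, t1, %lo(__hot_end) # static (link-time) address
    sub t0, t0, t1              # runtime/static delta

Subtracting this delta from a pc value recorded at runtime recovers the
static address used to look up the indirect call description.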

However, the community code currently has problems with relocations in
    some scenarios; these are unrelated to instrumentation. We may
    continue to submit patches to fix the related bugs.
wangjue 2025-04-17 14:01:00 +08:00 committed by GitHub
parent a2029ee91d
commit dbb79c30c9
11 changed files with 984 additions and 11 deletions

View File

@ -82,7 +82,8 @@ endforeach()
set(BOLT_ENABLE_RUNTIME_default OFF)
if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
- OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$")
+ OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$"
+ OR CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"
OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
AND (NOT CMAKE_CROSSCOMPILING))

View File

@ -123,6 +123,7 @@ static bool isSupportedRISCV(uint32_t Type) {
case ELF::R_RISCV_LO12_S:
case ELF::R_RISCV_64:
case ELF::R_RISCV_TLS_GOT_HI20:
case ELF::R_RISCV_TLS_GD_HI20:
case ELF::R_RISCV_TPREL_HI20:
case ELF::R_RISCV_TPREL_ADD:
case ELF::R_RISCV_TPREL_LO12_I:
@ -236,6 +237,7 @@ static size_t getSizeForTypeRISCV(uint32_t Type) {
case ELF::R_RISCV_64:
case ELF::R_RISCV_GOT_HI20:
case ELF::R_RISCV_TLS_GOT_HI20:
case ELF::R_RISCV_TLS_GD_HI20:
// See extractValueRISCV for why this is necessary.
return 8;
}
@ -491,6 +493,7 @@ static uint64_t extractValueRISCV(uint32_t Type, uint64_t Contents,
return extractBImmRISCV(Contents);
case ELF::R_RISCV_GOT_HI20:
case ELF::R_RISCV_TLS_GOT_HI20:
case ELF::R_RISCV_TLS_GD_HI20:
// We need to know the exact address of the GOT entry so we extract the
// value from both the AUIPC and L[D|W]. We cannot rely on the symbol in the
// relocation for this since it simply refers to the object that is stored
@ -707,6 +710,7 @@ static bool isPCRelativeRISCV(uint32_t Type) {
case ELF::R_RISCV_RVC_BRANCH:
case ELF::R_RISCV_32_PCREL:
case ELF::R_RISCV_TLS_GOT_HI20:
case ELF::R_RISCV_TLS_GD_HI20:
return true;
}
}

View File

@ -2926,12 +2926,12 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
if (BinaryData *BD = BC->getBinaryDataContainingAddress(SymbolAddress)) {
// Note: this assertion is trying to check sanity of BinaryData objects
- // but AArch64 has inferred and incomplete object locations coming from
- // GOT/TLS or any other non-trivial relocation (that requires creation
- // of sections and whose symbol address is not really what should be
- // encoded in the instruction). So we essentially disabled this check
+ // but AArch64 and RISCV have inferred and incomplete object locations
+ // coming from GOT/TLS or any other non-trivial relocation (that requires
+ // creation of sections and whose symbol address is not really what should
+ // be encoded in the instruction). So we essentially disabled this check
// for AArch64 and live with bogus names for objects.
- assert((IsAArch64 || IsSectionRelocation ||
+ assert((IsAArch64 || BC->isRISCV() || IsSectionRelocation ||
BD->nameStartsWith(SymbolName) ||
BD->nameStartsWith("PG" + SymbolName) ||
(BD->nameStartsWith("ANONYMOUS") &&

View File

@ -14,7 +14,9 @@
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "bolt/Core/MCPlusBuilder.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
@ -72,6 +74,7 @@ public:
case ELF::R_RISCV_LO12_I:
case ELF::R_RISCV_LO12_S:
case ELF::R_RISCV_TLS_GOT_HI20:
case ELF::R_RISCV_TLS_GD_HI20:
return true;
default:
llvm_unreachable("Unexpected RISCV relocation type in code");
@ -252,6 +255,11 @@ public:
return createCall(RISCV::PseudoCALL, Inst, Target, Ctx);
}
void createLongTailCall(InstructionListType &Seq, const MCSymbol *Target,
MCContext *Ctx) override {
createShortJmp(Seq, Target, Ctx, /*IsTailCall*/ true);
}
void createTailCall(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) override {
return createCall(RISCV::PseudoTAIL, Inst, Target, Ctx);
@ -424,6 +432,7 @@ public:
return Expr;
case ELF::R_RISCV_GOT_HI20:
case ELF::R_RISCV_TLS_GOT_HI20:
case ELF::R_RISCV_TLS_GD_HI20:
// The GOT is reused so no need to create GOT relocations
case ELF::R_RISCV_PCREL_HI20:
return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_PCREL_HI, Ctx);
@ -483,6 +492,375 @@ public:
return 2;
return 4;
}
void createStackPointerIncrement(
MCInst &Inst, int imm,
bool NoFlagsClobber = false /*unused for RISCV*/) const override {
Inst = MCInstBuilder(RISCV::ADDI)
.addReg(RISCV::X2)
.addReg(RISCV::X2)
.addImm(-imm);
}
void createStackPointerDecrement(
MCInst &Inst, int imm,
bool NoFlagsClobber = false /*unused for RISCV*/) const override {
Inst = MCInstBuilder(RISCV::ADDI)
.addReg(RISCV::X2)
.addReg(RISCV::X2)
.addImm(imm);
}
void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From,
int64_t offset) const {
Inst = MCInstBuilder(RISCV::LD).addReg(To).addReg(From).addImm(offset);
}
void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To,
int64_t offset) const {
Inst = MCInstBuilder(RISCV::SD).addReg(From).addReg(To).addImm(offset);
}
void spillRegs(InstructionListType &Insts,
const SmallVector<unsigned> &Regs) const {
Insts.emplace_back();
createStackPointerIncrement(Insts.back(), Regs.size() * 8);
int64_t Offset = 0;
for (auto Reg : Regs) {
Insts.emplace_back();
storeReg(Insts.back(), Reg, RISCV::X2, Offset);
Offset += 8;
}
}
void reloadRegs(InstructionListType &Insts,
const SmallVector<unsigned> &Regs) const {
int64_t Offset = 0;
for (auto Reg : Regs) {
Insts.emplace_back();
loadReg(Insts.back(), Reg, RISCV::X2, Offset);
Offset += 8;
}
Insts.emplace_back();
createStackPointerDecrement(Insts.back(), Regs.size() * 8);
}
void atomicAdd(MCInst &Inst, MCPhysReg RegAtomic, MCPhysReg RegTo,
MCPhysReg RegCnt) const {
Inst = MCInstBuilder(RISCV::AMOADD_D)
.addReg(RegAtomic)
.addReg(RegTo)
.addReg(RegCnt);
}
InstructionListType createCmpJE(MCPhysReg RegNo, MCPhysReg RegTmp,
const MCSymbol *Target,
MCContext *Ctx) const {
InstructionListType Insts;
Insts.emplace_back(
MCInstBuilder(RISCV::SUB).addReg(RegTmp).addReg(RegNo).addReg(RegNo));
Insts.emplace_back(MCInstBuilder(RISCV::BEQ)
.addReg(RegNo)
.addReg(RegTmp)
.addExpr(MCSymbolRefExpr::create(
Target, MCSymbolRefExpr::VK_None, *Ctx)));
return Insts;
}
void createTrap(MCInst &Inst) const override {
Inst.clear();
Inst.setOpcode(RISCV::EBREAK);
}
void createShortJmp(InstructionListType &Seq, const MCSymbol *Target,
MCContext *Ctx, bool IsTailCall) override {
// The sequence of instructions we create here is the following:
// auipc x5, hi20(Target)
// addi x5, x5, low12(Target)
// jr x5 => jalr x0, x5, 0
MCPhysReg Reg = RISCV::X5;
InstructionListType Insts = materializeAddress(Target, Ctx, Reg);
Insts.emplace_back();
MCInst &Inst = Insts.back();
Inst.clear();
Inst = MCInstBuilder(RISCV::JALR).addReg(RISCV::X0).addReg(Reg).addImm(0);
if (IsTailCall)
setTailCall(Inst);
Seq.swap(Insts);
}
InstructionListType createGetter(MCContext *Ctx, const char *name) const {
InstructionListType Insts(4);
MCSymbol *Locs = Ctx->getOrCreateSymbol(name);
InstructionListType Addr = materializeAddress(Locs, Ctx, RISCV::X10);
std::copy(Addr.begin(), Addr.end(), Insts.begin());
loadReg(Insts[2], RISCV::X10, RISCV::X10, 0);
createReturn(Insts[3]);
return Insts;
}
InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegCnt,
MCPhysReg RegAtomic) const {
InstructionListType Insts;
Insts.emplace_back();
Insts.back() =
MCInstBuilder(RISCV::ADDI).addReg(RegCnt).addReg(RegAtomic).addImm(1);
Insts.emplace_back();
atomicAdd(Insts.back(), RegAtomic, RegTo, RegCnt);
return Insts;
}
InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx,
MCPhysReg RegName,
int64_t Addend = 0) const override {
// Get the symbol address by auipc + addi
InstructionListType Insts(2);
MCSymbol *AuipcLabel = Ctx->createNamedTempSymbol("pcrel_hi");
Insts[0] = MCInstBuilder(RISCV::AUIPC).addReg(RegName).addImm(0);
setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, Addend, Ctx,
ELF::R_RISCV_PCREL_HI20);
setInstLabel(Insts[0], AuipcLabel);
Insts[1] =
MCInstBuilder(RISCV::ADDI).addReg(RegName).addReg(RegName).addImm(0);
setOperandToSymbolRef(Insts[1], /* OpNum */ 2, AuipcLabel, Addend, Ctx,
ELF::R_RISCV_PCREL_LO12_I);
return Insts;
}
InstructionListType
createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
unsigned CodePointerSize) const override {
// We need 2 scratch registers: one for the target address (x10), and one
// for the increment value (x11).
// addi sp, sp, -16
// sd x10, 0(sp)
// sd x11, 8(sp)
// la x10, target # 1: auipc x10, %pcrel_hi(target)
// # addi x10, x10, %pcrel_lo(1b)
// li x11, 1 # addi x11, zero, 1
// amoadd.d zero, x10, x11
// ld x10, 0(sp)
// ld x11, 8(sp)
// addi sp, sp, 16
InstructionListType Insts;
spillRegs(Insts, {RISCV::X10, RISCV::X11});
InstructionListType Addr = materializeAddress(Target, Ctx, RISCV::X10);
Insts.insert(Insts.end(), Addr.begin(), Addr.end());
InstructionListType IncInsts =
createIncMemory(RISCV::X10, RISCV::X11, RISCV::X0);
Insts.insert(Insts.end(), IncInsts.begin(), IncInsts.end());
reloadRegs(Insts, {RISCV::X10, RISCV::X11});
return Insts;
}
void createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
bool IsTailCall) override {
Inst.setOpcode(RISCV::JAL);
Inst.clear();
if (IsTailCall) {
Inst.addOperand(MCOperand::createReg(RISCV::X0));
Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
*Ctx, 0)));
convertJmpToTailCall(Inst);
} else {
Inst.addOperand(MCOperand::createReg(RISCV::X1));
Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
*Ctx, 0)));
}
}
void createIndirectCallInst(MCInst &Inst, bool IsTailCall, MCPhysReg Reg,
int64_t Disp) const {
Inst.clear();
Inst.setOpcode(RISCV::JALR);
Inst.clear();
if (IsTailCall) {
Inst.addOperand(MCOperand::createReg(RISCV::X0));
Inst.addOperand(MCOperand::createReg(Reg));
Inst.addOperand(MCOperand::createImm(Disp));
} else {
Inst.addOperand(MCOperand::createReg(RISCV::X1));
Inst.addOperand(MCOperand::createReg(Reg));
Inst.addOperand(MCOperand::createImm(Disp));
}
}
InstructionListType
createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline,
const MCSymbol *IndCallHandler,
MCContext *Ctx) override {
// Code sequence used to check whether InstrTrampoline was initialized,
// and call it if so; returns via IndCallHandler:
// addi sp, sp, -16
// sd x10, 0(sp)
// sd x11, 8(sp)
// la x10, InstrTrampoline -> auipc + addi
// ld x10, 0(x10)
// beq x10, x11, IndCallHandler
// addi sp, sp, -16
// sd x1, 0(sp)
// jalr x1, x10, 0
// ld x1, 0(sp)
// addi sp, sp, 16
// jal x0, IndCallHandler
InstructionListType Insts;
spillRegs(Insts, {RISCV::X10, RISCV::X11});
InstructionListType Addr =
materializeAddress(InstrTrampoline, Ctx, RISCV::X10);
Insts.insert(Insts.end(), Addr.begin(), Addr.end());
Insts.emplace_back();
loadReg(Insts.back(), RISCV::X10, RISCV::X10, 0);
InstructionListType cmpJmp =
createCmpJE(RISCV::X10, RISCV::X11, IndCallHandler, Ctx);
Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
Insts.emplace_back();
createStackPointerIncrement(Insts.back(), 16);
Insts.emplace_back();
storeReg(Insts.back(), RISCV::X1, RISCV::X2, 0);
Insts.emplace_back();
createIndirectCallInst(Insts.back(), /*IsTailCall*/ false, RISCV::X10, 0);
Insts.emplace_back();
loadReg(Insts.back(), RISCV::X1, RISCV::X2, 0);
Insts.emplace_back();
createStackPointerDecrement(Insts.back(), 16);
Insts.emplace_back();
createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true);
return Insts;
}
InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
InstructionListType Insts;
reloadRegs(Insts, {RISCV::X10, RISCV::X11});
Insts.emplace_back();
loadReg(Insts.back(), RISCV::X5, RISCV::X2, 0);
Insts.emplace_back();
createStackPointerDecrement(Insts.back(), 16);
reloadRegs(Insts, {RISCV::X10, RISCV::X11});
Insts.emplace_back();
createIndirectCallInst(Insts.back(), /*IsTailCall*/ true, RISCV::X5, 0);
return Insts;
}
InstructionListType
createInstrumentedIndTailCallHandlerExitBB() const override {
return createInstrumentedIndCallHandlerExitBB();
}
std::vector<MCInst> createSymbolTrampoline(const MCSymbol *TgtSym,
MCContext *Ctx) override {
std::vector<MCInst> Insts;
createShortJmp(Insts, TgtSym, Ctx, /*IsTailCall*/ true);
return Insts;
}
InstructionListType createNumCountersGetter(MCContext *Ctx) const override {
return createGetter(Ctx, "__bolt_num_counters");
}
InstructionListType
createInstrLocationsGetter(MCContext *Ctx) const override {
return createGetter(Ctx, "__bolt_instr_locations");
}
InstructionListType createInstrTablesGetter(MCContext *Ctx) const override {
return createGetter(Ctx, "__bolt_instr_tables");
}
InstructionListType createInstrNumFuncsGetter(MCContext *Ctx) const override {
return createGetter(Ctx, "__bolt_instr_num_funcs");
}
void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg,
MCPhysReg ZeroReg) const {
bool IsTailCall = isTailCall(Inst);
if (IsTailCall)
removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall);
Inst.setOpcode(RISCV::ADD);
Inst.insert(Inst.begin(), MCOperand::createReg(Reg));
Inst.insert(Inst.begin() + 1, MCOperand::createReg(ZeroReg));
return;
}
InstructionListType createLoadImmediate(const MCPhysReg Dest,
uint64_t Imm) const override {
InstructionListType Insts;
// Build the upper 32 bits of the immediate.
Insts.emplace_back(
MCInstBuilder(RISCV::LUI).addReg(Dest).addImm((Imm >> 44) & 0xFFFFF));
Insts.emplace_back(MCInstBuilder(RISCV::LUI)
.addReg(RISCV::X5)
.addImm((Imm >> 32) & 0xFFF));
Insts.emplace_back(MCInstBuilder(RISCV::SRLI)
.addReg(RISCV::X5)
.addReg(RISCV::X5)
.addImm(12));
Insts.emplace_back(
MCInstBuilder(RISCV::OR).addReg(Dest).addReg(Dest).addReg(RISCV::X5));
Insts.emplace_back(
MCInstBuilder(RISCV::SLLI).addReg(Dest).addReg(Dest).addImm(32));
// Build the lower 32 bits of the immediate.
Insts.emplace_back(MCInstBuilder(RISCV::LUI)
.addReg(RISCV::X5)
.addImm((Imm >> 12) & 0xFFFFF));
Insts.emplace_back(
MCInstBuilder(RISCV::LUI).addReg(RISCV::X6).addImm((Imm)&0xFFF));
Insts.emplace_back(MCInstBuilder(RISCV::SRLI)
.addReg(RISCV::X6)
.addReg(RISCV::X6)
.addImm(12));
Insts.emplace_back(
MCInstBuilder(RISCV::OR).addReg(RISCV::X5).addReg(RISCV::X5).addReg(
RISCV::X6));
// Combine into the full 64-bit immediate.
Insts.emplace_back(
MCInstBuilder(RISCV::OR).addReg(Dest).addReg(Dest).addReg(RISCV::X5));
return Insts;
}
InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst,
MCSymbol *HandlerFuncAddr,
int CallSiteID,
MCContext *Ctx) override {
// Code sequence used to enter the indirect call instrumentation helper:
// addi sp, sp, -0x10
// sd a0, 0x0(sp)
// sd a1, 0x8(sp)
// add a0, zero, target    # convertIndirectCallToLoad
// li a1, CallSiteID       # createLoadImmediate
// addi sp, sp, -0x10
// sd a0, 0x0(sp)
// sd a1, 0x8(sp)
// la x5, HandlerFuncAddr  # auipc + addi
// jalr x5
InstructionListType Insts;
spillRegs(Insts, {RISCV::X10, RISCV::X11});
Insts.emplace_back(CallInst);
convertIndirectCallToLoad(Insts.back(), RISCV::X10, RISCV::X0);
InstructionListType LoadImm = createLoadImmediate(RISCV::X11, CallSiteID);
Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
spillRegs(Insts, {RISCV::X10, RISCV::X11});
InstructionListType Addr =
materializeAddress(HandlerFuncAddr, Ctx, RISCV::X5);
Insts.insert(Insts.end(), Addr.begin(), Addr.end());
Insts.emplace_back();
createIndirectCallInst(Insts.back(), isTailCall(CallInst), RISCV::X5, 0);
// Carry over metadata including tail call marker if present.
stripAnnotations(Insts.back());
moveAnnotations(std::move(CallInst), Insts.back());
return Insts;
}
};
} // end anonymous namespace

View File

@ -35,15 +35,21 @@ set(BOLT_RT_FLAGS
-fno-exceptions
-fno-rtti
-fno-stack-protector
- -fPIC
- -mgeneral-regs-only)
+ -fPIC)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
- set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse")
+ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS}
+ -mno-sse
+ -mgeneral-regs-only)
endif()
+ if (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS})
+ endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
check_cxx_compiler_flag("-mno-outline-atomics" CXX_SUPPORTS_OUTLINE_ATOMICS)
if (CXX_SUPPORTS_OUTLINE_ATOMICS)
- set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics")
+ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS}
+ -mno-outline-atomics
+ -mgeneral-regs-only)
endif()
endif()

View File

@ -153,10 +153,12 @@ struct timespec {
#if defined(__aarch64__) || defined(__arm64__)
#include "sys_aarch64.h"
#elif defined(__riscv)
#include "sys_riscv64.h"
#elif defined(__x86_64__)
#include "sys_x86_64.h"
#else
#error "For AArch64/ARM64 and X86_64 only."
#error "For AArch64/ARM64,X86_64 AND RISCV64 only."
#endif
constexpr uint32_t BufSize = 10240;

View File

@ -1674,6 +1674,19 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
"ret\n"
:::);
// clang-format on
#elif defined(__riscv)
// clang-format off
__asm__ __volatile__(
SAVE_ALL
"addi sp, sp, 288\n"
"ld x10, 0(sp)\n"
"ld x11, 8(sp)\n"
"addi sp, sp, -288\n"
"jal x1, instrumentIndirectCall\n"
RESTORE_ALL
"ret\n"
:::);
// clang-format on
#else
// clang-format off
__asm__ __volatile__(SAVE_ALL
@ -1698,6 +1711,18 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
"ret\n"
:::);
// clang-format on
#elif defined(__riscv)
// clang-format off
__asm__ __volatile__(SAVE_ALL
"addi sp, sp, 288\n"
"ld x10, 0(sp)\n"
"ld x11, 8(sp)\n"
"addi sp, sp, -288\n"
"jal x1, instrumentIndirectCall\n"
RESTORE_ALL
"ret\n"
:::);
// clang-format on
#else
// clang-format off
__asm__ __volatile__(SAVE_ALL
@ -1724,6 +1749,18 @@ extern "C" __attribute((naked)) void __bolt_instr_start()
"br x16\n"
:::);
// clang-format on
#elif defined(__riscv)
// clang-format off
__asm__ __volatile__(
SAVE_ALL
"jal x1, __bolt_instr_setup\n"
RESTORE_ALL
"setup_symbol:\n"
"auipc x5, %%pcrel_hi(__bolt_start_trampoline)\n"
"addi x5, x5, %%pcrel_lo(setup_symbol)\n"
"jr x5\n"
:::);
// clang-format on
#else
// clang-format off
__asm__ __volatile__(SAVE_ALL
@ -1746,6 +1783,17 @@ extern "C" void __bolt_instr_fini() {
RESTORE_ALL
:::);
// clang-format on
#elif defined(__riscv)
// clang-format off
__asm__ __volatile__(
SAVE_ALL
"fini_symbol:\n"
"auipc x5, %%pcrel_hi(__bolt_fini_trampoline)\n"
"addi x5, x5, %%pcrel_lo(fini_symbol)\n"
"jalr x1, 0(x5)\n"
RESTORE_ALL
:::);
// clang-format on
#else
__asm__ __volatile__("call __bolt_fini_trampoline\n" :::);
#endif

bolt/runtime/sys_riscv64.h Normal file (460 lines)
View File

@ -0,0 +1,460 @@
#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_RISCV
#define LLVM_TOOLS_LLVM_BOLT_SYS_RISCV
// Save all registers while keeping 16B stack alignment
#define SAVE_ALL \
"addi sp, sp, -16\n" \
"sd x0, 0(sp)\n" \
"sd x1, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x2, 0(sp)\n" \
"sd x3, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x4, 0(sp)\n" \
"sd x5, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x6, 0(sp)\n" \
"sd x7, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x8, 0(sp)\n" \
"sd x9, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x10, 0(sp)\n" \
"sd x11, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x12, 0(sp)\n" \
"sd x13, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x14, 0(sp)\n" \
"sd x15, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x16, 0(sp)\n" \
"sd x17, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x18, 0(sp)\n" \
"sd x19, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x20, 0(sp)\n" \
"sd x21, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x22, 0(sp)\n" \
"sd x23, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x24, 0(sp)\n" \
"sd x25, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x26, 0(sp)\n" \
"sd x27, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x28, 0(sp)\n" \
"sd x29, 8(sp)\n" \
"addi sp, sp, -16\n" \
"sd x30, 0(sp)\n" \
"sd x31, 8(sp)\n"
// Mirrors SAVE_ALL
#define RESTORE_ALL \
"ld x30, 0(sp)\n" \
"ld x31, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x28, 0(sp)\n" \
"ld x29, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x26, 0(sp)\n" \
"ld x27, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x24, 0(sp)\n" \
"ld x25, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x22, 0(sp)\n" \
"ld x23, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x20, 0(sp)\n" \
"ld x21, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x18, 0(sp)\n" \
"ld x19, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x16, 0(sp)\n" \
"ld x17, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x14, 0(sp)\n" \
"ld x15, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x12, 0(sp)\n" \
"ld x13, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x10, 0(sp)\n" \
"ld x11, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x8, 0(sp)\n" \
"ld x9, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x6, 0(sp)\n" \
"ld x7, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x4, 0(sp)\n" \
"ld x5, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x2, 0(sp)\n" \
"ld x3, 8(sp)\n" \
"addi sp, sp, 16\n" \
"ld x0, 0(sp)\n" \
"ld x1, 8(sp)\n" \
"addi sp, sp, 16\n"
// Anonymous namespace covering everything but our library entry point
namespace {
// Get the difference between the runtime address of the .text section and
// its static address in the section header table. This delta can be
// subtracted from an arbitrary pc value recorded at runtime to get the
// corresponding static address, which in turn can be used to search for an
// indirect call description. Needed because indirect call descriptions are
// read-only non-relocatable data.
uint64_t getTextBaseAddress() {
uint64_t DynAddr;
uint64_t StaticAddr;
__asm__ volatile("lla %0, __hot_end\n\t"
"lui %1, %%hi(__hot_end)\n\t"
"addi %1, %1, %%lo(__hot_end)\n\t"
: "=r"(DynAddr), "=r"(StaticAddr));
return DynAddr - StaticAddr;
}
uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
uint64_t ret;
register uint64_t a0 __asm__("a0") = fd;
register const void *a1 __asm__("a1") = buf;
register uint64_t a2 __asm__("a2") = count;
register uint64_t a7 __asm__("a7") =
63; // read: syscall number 63 on riscv64 Linux
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret)
: "r"(a0), "r"(a1), "r"(a2), "r"(a7)
: "memory");
return ret;
}
uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
uint64_t ret;
register uint64_t a0 __asm__("a0") = fd;
register const void *a1 __asm__("a1") = buf;
register uint64_t a2 __asm__("a2") = count;
register uint32_t a7 __asm__("a7") =
64; // write: syscall number 64 on riscv64 Linux
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret)
: "r"(a0), "r"(a1), "r"(a2), "r"(a7)
: "memory");
return ret;
}
void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
uint64_t fd, uint64_t offset) {
void *ret;
register uint64_t a0 __asm__("a0") = addr;
register uint64_t a1 __asm__("a1") = size;
register uint64_t a2 __asm__("a2") = prot;
register uint64_t a3 __asm__("a3") = flags;
register uint64_t a4 __asm__("a4") = fd;
register uint64_t a5 __asm__("a5") = offset;
register uint32_t a7 __asm__("a7") =
222; // mmap: syscall number 222 on riscv64 Linux
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret)
: "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4), "r"(a5),
"r"(a7)
: "memory");
return ret;
}
uint64_t __munmap(void *addr, uint64_t size) {
uint64_t ret;
register void *a0 __asm__("a0") = addr;
register uint64_t a1 __asm__("a1") = size;
register uint32_t a7 __asm__("a7") = 215;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a7)
: "memory");
return ret;
}
uint64_t __exit(uint64_t code) {
uint64_t ret;
register uint64_t a0 __asm__("a0") = code;
register uint32_t a7 __asm__("a7") = 94;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0)
: "r"(a7)
: "memory");
return ret;
}
uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
uint64_t ret;
register int a0 __asm__("a0") =
-100; // AT_FDCWD: resolve pathname relative to the current directory
register const char *a1 __asm__("a1") = pathname;
register uint64_t a2 __asm__("a2") = flags;
register uint64_t a3 __asm__("a3") = mode;
register uint64_t a7 __asm__("a7") =
56; // openat: syscall number 56 on riscv64 Linux
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret)
: "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a7)
: "memory");
return ret;
}
long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
long ret;
register unsigned int a0 __asm__("a0") = fd;
register dirent64 *a1 __asm__("a1") = dirp;
register size_t a2 __asm__("a2") = count;
register uint32_t a7 __asm__("a7") = 61;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a2), "r"(a7)
: "memory");
return ret;
}
uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
uint64_t ret;
register int a0 __asm__("a0") = -100;
register const char *a1 __asm__("a1") = pathname;
register char *a2 __asm__("a2") = buf;
register size_t a3 __asm__("a3") = bufsize;
register uint32_t a7 __asm__("a7") = 78; // readlinkat
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a2), "r"(a3), "r"(a7)
: "memory");
return ret;
}
uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
uint64_t ret;
register uint64_t a0 __asm__("a0") = fd;
register uint64_t a1 __asm__("a1") = pos;
register uint64_t a2 __asm__("a2") = whence;
register uint32_t a7 __asm__("a7") = 62;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a2), "r"(a7)
: "memory");
return ret;
}
int __ftruncate(uint64_t fd, uint64_t length) {
int ret;
register uint64_t a0 __asm__("a0") = fd;
register uint64_t a1 __asm__("a1") = length;
register uint32_t a7 __asm__("a7") = 46;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a7)
: "memory");
return ret;
}
int __close(uint64_t fd) {
int ret;
register uint64_t a0 __asm__("a0") = fd;
register uint32_t a7 __asm__("a7") =
57; // close: syscall number 57 on riscv64 Linux
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret)
: "r"(a0), "r"(a7)
: "memory");
return ret;
}
int __madvise(void *addr, size_t length, int advice) {
int ret;
register void *a0 __asm__("a0") = addr;
register size_t a1 __asm__("a1") = length;
register int a2 __asm__("a2") = advice;
register uint32_t a7 __asm__("a7") = 233;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a2), "r"(a7)
: "memory");
return ret;
}
int __uname(struct UtsNameTy *buf) {
int ret;
register UtsNameTy *a0 __asm__("a0") = buf;
register uint32_t a7 __asm__("a7") = 160;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0)
: "r"(a7)
: "memory");
return ret;
}
uint64_t __nanosleep(const timespec *req, timespec *rem) {
uint64_t ret;
register const timespec *a0 __asm__("a0") = req;
register timespec *a1 __asm__("a1") = rem;
register uint32_t a7 __asm__("a7") = 101;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a7)
: "memory");
return ret;
}
int64_t __fork() {
uint64_t ret;
// clone instead of fork with flags
// "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD"
register uint64_t a0 __asm__("a0") = 0x1200011;
register uint64_t a1 __asm__("a1") = 0;
register uint64_t a2 __asm__("a2") = 0;
register uint64_t a3 __asm__("a3") = 0;
register uint64_t a4 __asm__("a4") = 0;
register uint32_t a7 __asm__("a7") = 220;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a2), "r"(a3), "r"(a4), "r"(a7)
: "memory");
return ret;
}
int __mprotect(void *addr, size_t len, int prot) {
int ret;
register void *a0 __asm__("a0") = addr;
register size_t a1 __asm__("a1") = len;
register int a2 __asm__("a2") = prot;
register uint32_t a7 __asm__("a7") = 226;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a2), "r"(a7)
: "memory");
return ret;
}
uint64_t __getpid() {
uint64_t ret;
register uint32_t a7 __asm__("a7") = 172;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret)
: "r"(a7)
: "memory");
return ret;
}
uint64_t __getppid() {
uint64_t ret;
register uint32_t a7 __asm__("a7") = 173;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret)
: "r"(a7)
: "memory");
return ret;
}
int __setpgid(uint64_t pid, uint64_t pgid) {
int ret;
register uint64_t a0 __asm__("a0") = pid;
register uint64_t a1 __asm__("a1") = pgid;
register uint32_t a7 __asm__("a7") = 154;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a7)
: "memory");
return ret;
}
uint64_t __getpgid(uint64_t pid) {
uint64_t ret;
register uint64_t a0 __asm__("a0") = pid;
register uint32_t a7 __asm__("a7") = 155;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0)
: "r"(a7)
: "memory");
return ret;
}
int __kill(uint64_t pid, int sig) {
int ret;
register uint64_t a0 __asm__("a0") = pid;
register int a1 __asm__("a1") = sig;
register uint32_t a7 __asm__("a7") = 129;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a7)
: "memory");
return ret;
}
int __fsync(int fd) {
int ret;
register int a0 __asm__("a0") = fd;
register uint32_t a7 __asm__("a7") = 82;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0)
: "r"(a7)
: "memory");
return ret;
}
uint64_t __sigprocmask(int how, const void *set, void *oldset) {
uint64_t ret;
register int a0 __asm__("a0") = how;
register const void *a1 __asm__("a1") = set;
register void *a2 __asm__("a2") = oldset;
register long a3 __asm__("a3") = 8; // sigsetsize in bytes
register uint32_t a7 __asm__("a7") = 135;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a2), "r"(a3), "r"(a7)
: "memory");
return ret;
}
int __prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5) {
int ret;
register int a0 __asm__("a0") = option;
register unsigned long a1 __asm__("a1") = arg2;
register unsigned long a2 __asm__("a2") = arg3;
register unsigned long a3 __asm__("a3") = arg4;
register unsigned long a4 __asm__("a4") = arg5;
register uint32_t a7 __asm__("a7") = 167;
__asm__ __volatile__("ecall\n\t"
"mv %0, a0"
: "=r"(ret), "+r"(a0), "+r"(a1)
: "r"(a2), "r"(a3), "r"(a4), "r"(a7)
: "cc", "memory");
return ret;
}
} // anonymous namespace
#endif

View File

@ -0,0 +1,33 @@
# REQUIRES: system-linux,bolt-runtime
# RUN: %clang %cflags -Wl,-q -o %t.exe %s
# RUN: llvm-bolt --instrument --instrumentation-file=%t.fdata -o %t.instr %t.exe
## Run the profiled binary and check that the profile reports at least that `f`
## has been called.
# RUN: rm -f %t.fdata
# RUN: %t.instr
# RUN: cat %t.fdata | FileCheck %s
# CHECK: f 0 0 1{{$}}
## Check BOLT works with this profile
# RUN: llvm-bolt --data %t.fdata --reorder-blocks=cache -o %t.bolt %t.exe
.text
.globl main
.type main, @function
main:
addi sp, sp, -8
sd ra, 0(sp)
call f
ld ra, 0(sp)
addi sp, sp, 8
li a0, 0
ret
.size main, .-main
.globl f
.type f, @function
f:
ret
.size f, .-f

View File

@ -0,0 +1,39 @@
/*
REQUIRES: system-linux,bolt-runtime
RUN: %clang %cflags %s -o %t.exe -Wl,-q
RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
RUN: -o %t.instrumented
# Instrumented program needs to finish returning zero
RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT
# Test that the instrumented data makes sense
RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \
RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
RUN: --print-only=main --print-finalized | FileCheck %s
RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT
CHECK-OUTPUT: The sum is: 30
# Check that our indirect call has 1 hit recorded in the fdata file and that
# this was processed correctly by BOLT
CHECK: jalr a2 # CallProfile: 1 (0 misses) :
CHECK-NEXT: { add: 1 (0 misses) }
*/
#include <stdio.h>
typedef int (*func_ptr)(int, int);
int add(int a, int b) { return a + b; }
int main() {
func_ptr fun;
fun = add;
int sum = fun(10, 20); // indirect call to 'add'
printf("The sum is: %d\n", sum);
return 0;
}

View File

@ -0,0 +1,2 @@
if config.host_arch not in ["riscv64"]:
config.unsupported = True