mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-25 22:06:06 +00:00
[BOLT] Add support for dumping profile on MacOS
Summary: Add support for dumping profile on MacOS. (cherry picked from FBD25751363)
This commit is contained in:
parent
3b876cc3e7
commit
a0dd5b05dc
@ -17,6 +17,7 @@ typedef __SSIZE_TYPE__ ssize_t;
|
||||
|
||||
typedef unsigned long long uint64_t;
|
||||
typedef unsigned uint32_t;
|
||||
typedef unsigned char uint8_t;
|
||||
|
||||
typedef long long int64_t;
|
||||
typedef int int32_t;
|
||||
@ -127,6 +128,21 @@ uint64_t __munmap(void *addr, uint64_t size) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/// Issue the exit syscall directly, bypassing libc. \p code is passed in
/// %rdi and whatever the kernel leaves in %rax is handed back to the caller.
/// The syscall number is 0x2000001 when __APPLE__ is defined and 231
/// otherwise; it is spliced into the asm string via STRINGIFY.
uint64_t __exit(uint64_t code) {
#if !defined(__APPLE__)
#define EXIT_SYSCALL 231
#else
#define EXIT_SYSCALL 0x2000001
#endif
  uint64_t Status;
  // rcx and r11 are listed as clobbers because `syscall` overwrites them.
  __asm__ __volatile__(
      "movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
      "syscall\n"
      : "=a"(Status)
      : "D"(code)
      : "cc", "rcx", "r11", "memory");
  return Status;
}
|
||||
|
||||
// Helper functions for writing strings to the .fdata file. We intentionally
|
||||
// avoid using libc names (lowercase memset) to make it clear it is our impl.
|
||||
|
||||
@ -317,15 +333,7 @@ uint64_t __getppid() {
|
||||
return ret;
|
||||
}
|
||||
|
||||
uint64_t __exit(uint64_t code) {
|
||||
uint64_t ret;
|
||||
__asm__ __volatile__("movq $231, %%rax\n"
|
||||
"syscall\n"
|
||||
: "=a"(ret)
|
||||
: "D"(code)
|
||||
: "cc", "rcx", "r11", "memory");
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
void reportError(const char *Msg, uint64_t Size) {
|
||||
__write(2, Msg, Size);
|
||||
@ -372,6 +380,4 @@ inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
|
||||
return (Value + Align - 1) / Align * Align;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // anonymous namespace
|
||||
|
@ -60,10 +60,11 @@
|
||||
|
||||
#if defined(__APPLE__)
|
||||
extern "C" {
|
||||
|
||||
extern uint64_t* _bolt_instr_locations_getter();
|
||||
extern uint32_t _bolt_num_counters_getter();
|
||||
|
||||
extern uint8_t* _bolt_instr_tables_getter();
|
||||
extern uint32_t _bolt_instr_num_funcs_getter();
|
||||
}
|
||||
|
||||
#else
|
||||
@ -106,6 +107,8 @@ extern void (*__bolt_trampoline_ind_tailcall)();
|
||||
extern void (*__bolt_instr_init_ptr)();
|
||||
extern void (*__bolt_instr_fini_ptr)();
|
||||
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
/// A simple allocator that mmaps a fixed size region and manages this space
|
||||
@ -124,14 +127,21 @@ class BumpPtrAllocator {
|
||||
public:
|
||||
void *allocate(size_t Size) {
|
||||
Lock L(M);
|
||||
|
||||
if (StackBase == nullptr) {
|
||||
#if defined(__APPLE__)
|
||||
int MAP_PRIVATE_MAP_ANONYMOUS = 0x1002;
|
||||
#else
|
||||
int MAP_PRIVATE_MAP_ANONYMOUS = 0x22;
|
||||
#endif
|
||||
StackBase = reinterpret_cast<uint8_t *>(
|
||||
__mmap(0, MaxSize, 0x3 /* PROT_READ | PROT_WRITE*/,
|
||||
Shared ? 0x21 /*MAP_SHARED | MAP_ANONYMOUS*/
|
||||
: 0x22 /* MAP_PRIVATE | MAP_ANONYMOUS*/,
|
||||
: MAP_PRIVATE_MAP_ANONYMOUS /* MAP_PRIVATE | MAP_ANONYMOUS*/,
|
||||
-1, 0));
|
||||
StackSize = 0;
|
||||
}
|
||||
|
||||
Size = alignTo(Size + sizeof(EntryMetadata), 16);
|
||||
uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
|
||||
auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
|
||||
@ -555,7 +565,7 @@ FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) {
|
||||
|
||||
/// Read and mmap descriptions written by BOLT from the executable's notes
|
||||
/// section
|
||||
#ifdef HAVE_ELF_H
|
||||
#if defined(HAVE_ELF_H) and !defined(__APPLE__)
|
||||
ProfileWriterContext readDescriptions() {
|
||||
ProfileWriterContext Result;
|
||||
uint64_t FD = __open("/proc/self/exe",
|
||||
@ -614,16 +624,31 @@ ProfileWriterContext readDescriptions() {
|
||||
reportError(ErrMsg, sizeof(ErrMsg));
|
||||
return Result;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/// Variant of readDescriptions() compiled in the #else branch of the
/// HAVE_ELF_H / !__APPLE__ check. Instead of parsing a note section it takes
/// the tables blob returned by _bolt_instr_tables_getter() and fills in the
/// pointers of a ProfileWriterContext.
///
/// Blob layout as consumed below (all size fields are 4 bytes):
///   [IndCallDescSize][indirect call descriptions]
///   [IndCallTargetDescSize][indirect call target descriptions]
///   [FuncDescSize][function descriptions][string table]
ProfileWriterContext readDescriptions() {
|
||||
ProfileWriterContext Result;
|
||||
// NOTE(review): this error is reported unconditionally, right before the
// tables are read successfully. It looks like a leftover from the earlier
// stub implementation (possibly a removed diff line) -- confirm.
const char ErrMsg[] =
|
||||
"BOLT instrumentation runtime error: unsupported binary format.\n";
|
||||
reportError(ErrMsg, sizeof(ErrMsg));
|
||||
uint8_t *Tables = _bolt_instr_tables_getter();
|
||||
// Each size field sits immediately after the previous table, hence the
// growing constant (4, 8) that skips the size fields already passed.
uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
|
||||
uint32_t IndCallTargetDescSize =
|
||||
*reinterpret_cast<uint32_t *>(Tables + 4 + IndCallDescSize);
|
||||
uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
|
||||
Tables + 8 + IndCallDescSize + IndCallTargetDescSize);
|
||||
// Payload of every table starts right after its own 4-byte size field.
Result.IndCallDescriptions =
|
||||
reinterpret_cast<IndCallDescription *>(Tables + 4);
|
||||
Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
|
||||
Tables + 8 + IndCallDescSize);
|
||||
Result.FuncDescriptions =
|
||||
Tables + 12 + IndCallDescSize + IndCallTargetDescSize;
|
||||
// The string table follows the function descriptions directly.
Result.Strings = reinterpret_cast<char *>(
|
||||
Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize);
|
||||
return Result;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__APPLE__)
|
||||
/// Debug by printing overall metadata global numbers to check it is sane
|
||||
void printStats(const ProfileWriterContext &Ctx) {
|
||||
char StatMsg[BufSize];
|
||||
@ -646,6 +671,8 @@ void printStats(const ProfileWriterContext &Ctx) {
|
||||
StatPtr = strCopy(StatPtr, "\n");
|
||||
__write(2, StatMsg, StatPtr - StatMsg);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/// This is part of a simple CFG representation in memory, where we store
|
||||
/// a dynamically sized array of input and output edges per node, and store
|
||||
@ -708,6 +735,7 @@ Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
|
||||
if (static_cast<int32_t>(D.Edges[I].ToNode) > MaxNodes)
|
||||
MaxNodes = D.Edges[I].ToNode;
|
||||
}
|
||||
|
||||
for (int I = 0; I < D.NumLeafNodes; ++I) {
|
||||
if (static_cast<int32_t>(D.LeafNodes[I].Node) > MaxNodes)
|
||||
MaxNodes = D.LeafNodes[I].Node;
|
||||
@ -730,6 +758,7 @@ Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
|
||||
|
||||
// Initial allocations
|
||||
CFGNodes = new (Alloc) Node[MaxNodes];
|
||||
|
||||
DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
|
||||
SpanningTreeNodes = new (Alloc) Node[MaxNodes];
|
||||
DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
|
||||
@ -1104,25 +1133,31 @@ const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
|
||||
const FunctionDescription F(FuncDesc);
|
||||
const uint8_t *next = FuncDesc + F.getSize();
|
||||
|
||||
#if !defined(__APPLE__)
|
||||
uint64_t *bolt_instr_locations = __bolt_instr_locations;
|
||||
#else
|
||||
uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
|
||||
#endif
|
||||
|
||||
// Skip funcs we know are cold
|
||||
#ifndef ENABLE_DEBUG
|
||||
uint64_t CountersFreq = 0;
|
||||
for (int I = 0; I < F.NumLeafNodes; ++I) {
|
||||
CountersFreq += __bolt_instr_locations[F.LeafNodes[I].Counter];
|
||||
CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
|
||||
}
|
||||
if (CountersFreq == 0) {
|
||||
for (int I = 0; I < F.NumEdges; ++I) {
|
||||
const uint32_t C = F.Edges[I].Counter;
|
||||
if (C == 0xffffffff)
|
||||
continue;
|
||||
CountersFreq += __bolt_instr_locations[C];
|
||||
CountersFreq += bolt_instr_locations[C];
|
||||
}
|
||||
if (CountersFreq == 0) {
|
||||
for (int I = 0; I < F.NumCalls; ++I) {
|
||||
const uint32_t C = F.Calls[I].Counter;
|
||||
if (C == 0xffffffff)
|
||||
continue;
|
||||
CountersFreq += __bolt_instr_locations[C];
|
||||
CountersFreq += bolt_instr_locations[C];
|
||||
}
|
||||
if (CountersFreq == 0)
|
||||
return next;
|
||||
@ -1130,8 +1165,9 @@ const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
|
||||
}
|
||||
#endif
|
||||
|
||||
Graph *G = new (Alloc) Graph(Alloc, F, __bolt_instr_locations, Ctx);
|
||||
Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
|
||||
DEBUG(G->dump());
|
||||
|
||||
if (!G->EdgeFreqs && !G->CallFreqs) {
|
||||
G->~Graph();
|
||||
Alloc.deallocate(G);
|
||||
@ -1173,6 +1209,7 @@ const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
|
||||
return next;
|
||||
}
|
||||
|
||||
#if !defined(__APPLE__)
|
||||
const IndCallTargetDescription *
|
||||
ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
|
||||
uint32_t B = 0;
|
||||
@ -1293,8 +1330,13 @@ int openProfile() {
|
||||
}
|
||||
return FD;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
#if !defined(__APPLE__)
|
||||
|
||||
/// Reset all counters in case you want to start profiling a new phase of your
|
||||
/// program independently of prior phases.
|
||||
/// The address of this function is printed by BOLT and this can be called by
|
||||
@ -1476,42 +1518,42 @@ extern "C" void __bolt_instr_fini() {
|
||||
|
||||
#if defined(__APPLE__)
|
||||
|
||||
extern "C" void __bolt_instr_data_dump() {
|
||||
ProfileWriterContext Ctx = readDescriptions();
|
||||
|
||||
int FD = 2;
|
||||
BumpPtrAllocator Alloc;
|
||||
const uint8_t *FuncDesc = Ctx.FuncDescriptions;
|
||||
uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();
|
||||
|
||||
for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
|
||||
FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
|
||||
Alloc.clear();
|
||||
DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
|
||||
}
|
||||
assert(FuncDesc == (void *)Ctx.Strings,
|
||||
"FuncDesc ptr must be equal to stringtable");
|
||||
}
|
||||
|
||||
// On OSX/iOS the final symbol name of an extern "C" function/variable contains
|
||||
// one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
|
||||
// Setup hook placed in the "__TEXT,__setup" section. It prints a greeting,
// the number of counters, the address of the counter array and every counter
// value to file descriptor 2.
// NOTE(review): the greeting is emitted twice (raw __write and report()) and
// SAVE_ALL comes after the first write; presumably old and new diff lines
// are interleaved here -- confirm against the actual file.
extern "C"
|
||||
__attribute__((section("__TEXT,__setup")))
|
||||
__attribute__((force_align_arg_pointer))
|
||||
void _bolt_instr_setup() {
|
||||
const char *Message = "Hello!\n";
|
||||
__write(2, Message, 7);
|
||||
// SAVE_ALL / RESTORE_ALL bracket the rest of the body; presumably they
// preserve the register state of the interrupted program -- TODO confirm.
__asm__ __volatile__(SAVE_ALL :::);
|
||||
|
||||
uint32_t NumCounters = _bolt_num_counters_getter();
|
||||
reportNumber("__bolt_instr_setup, number of counters: ", NumCounters, 10);
|
||||
report("Hello!\n");
|
||||
|
||||
uint64_t *Locs = _bolt_instr_locations_getter();
|
||||
reportNumber("__bolt_instr_setup, address of counters: ",
|
||||
reinterpret_cast<uint64_t>(Locs), 10);
|
||||
|
||||
// Dump every counter in base 10.
for (size_t I = 0; I < NumCounters; ++I)
|
||||
reportNumber("Counter value: ", Locs[I], 10);
|
||||
__asm__ __volatile__(RESTORE_ALL :::);
|
||||
}
|
||||
|
||||
// Finalization hook placed in the "__TEXT,__fini" section. It prints the
// number of counters, the address of the counter array and every counter
// value, says goodbye, and then writes the profile through
// __bolt_instr_data_dump().
// NOTE(review): the farewell is emitted twice (raw __write and report());
// presumably old and new diff lines are interleaved here -- confirm.
extern "C"
|
||||
__attribute__((section("__TEXT,__fini")))
|
||||
__attribute__((force_align_arg_pointer))
|
||||
void _bolt_instr_fini() {
|
||||
uint32_t NumCounters = _bolt_num_counters_getter();
|
||||
reportNumber("__bolt_instr_fini, number of counters: ", NumCounters, 10);
|
||||
|
||||
uint64_t *Locs = _bolt_instr_locations_getter();
|
||||
reportNumber("__bolt_instr_fini, address of counters: ",
|
||||
reinterpret_cast<uint64_t>(Locs), 10);
|
||||
|
||||
// Dump every counter in base 10.
for (size_t I = 0; I < NumCounters; ++I)
|
||||
reportNumber("Counter value: ", Locs[I], 10);
|
||||
|
||||
const char *Message = "Bye!\n";
|
||||
__write(2, Message, 5);
|
||||
report("Bye!\n");
|
||||
// Write out the collected profile as the last step.
__bolt_instr_data_dump();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -1748,6 +1748,16 @@ public:
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Return the instruction sequence forming the body of the getter function
/// for the instrumentation tables. This default implementation is a
/// placeholder that hits llvm_unreachable; a target that supports the getter
/// overrides it.
virtual std::vector<MCInst> createInstrTablesGetter(MCContext *Ctx) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Return the instruction sequence forming the body of the getter function
/// for the number of instrumented functions. This default implementation is
/// a placeholder that hits llvm_unreachable; a target that supports the
/// getter overrides it.
virtual std::vector<MCInst> createInstrNumFuncsGetter(MCContext *Ctx) const {
|
||||
llvm_unreachable("not implemented");
|
||||
return {};
|
||||
}
|
||||
|
||||
/// This method takes an indirect call instruction and splits it up into an
|
||||
/// equivalent set of instructions that use direct calls for target
|
||||
/// symbols/addresses that are contained in the Targets vector. This is done
|
||||
|
@ -424,6 +424,7 @@ void MachORewriteInstance::emitAndLink() {
|
||||
if (Key == K) {
|
||||
mapCodeSections(Key);
|
||||
mapInstrumentationSection(Key, "__counters");
|
||||
mapInstrumentationSection(Key, "__tables");
|
||||
} else {
|
||||
// TODO: Refactor addRuntimeLibSections to work properly on Mach-O
|
||||
// and use it here.
|
||||
@ -494,6 +495,7 @@ void MachORewriteInstance::rewriteFile() {
|
||||
}
|
||||
|
||||
writeInstrumentationSection("__counters", OS);
|
||||
writeInstrumentationSection("__tables", OS);
|
||||
|
||||
// TODO: Refactor addRuntimeLibSections to work properly on Mach-O and
|
||||
// use it here.
|
||||
|
@ -612,6 +612,10 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) {
|
||||
BC.MIB->createNumCountersGetter(BC.Ctx.get()));
|
||||
createSimpleFunction("__bolt_instr_locations_getter",
|
||||
BC.MIB->createInstrLocationsGetter(BC.Ctx.get()));
|
||||
createSimpleFunction("__bolt_instr_tables_getter",
|
||||
BC.MIB->createInstrTablesGetter(BC.Ctx.get()));
|
||||
createSimpleFunction("__bolt_instr_num_funcs_getter",
|
||||
BC.MIB->createInstrNumFuncsGetter(BC.Ctx.get()));
|
||||
}
|
||||
|
||||
void Instrumentation::setupRuntimeLibrary(BinaryContext &BC) {
|
||||
|
@ -152,6 +152,7 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
|
||||
Streamer.EmitIntValue(Summary->IndCallTargetDescriptions.size(), /*Size=*/4);
|
||||
Streamer.EmitLabel(NumFuncs);
|
||||
Streamer.EmitSymbolAttribute(NumFuncs, MCSymbolAttr::MCSA_Global);
|
||||
|
||||
Streamer.EmitIntValue(Summary->FunctionDescriptions.size(), /*Size=*/4);
|
||||
Streamer.EmitLabel(FilenameSym);
|
||||
Streamer.EmitBytes(opts::InstrumentationFilename);
|
||||
@ -169,6 +170,18 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
|
||||
Streamer.EmitValue(
|
||||
MCSymbolRefExpr::create(FiniFunction->getSymbol(), *BC.Ctx), /*Size=*/8);
|
||||
}
|
||||
|
||||
if (BC.isMachO()) {
|
||||
MCSection *TablesSection = BC.Ctx->getMachOSection(
|
||||
"__BOLT", "__tables", MachO::S_REGULAR,
|
||||
SectionKind::getData());
|
||||
MCSymbol *Tables = BC.Ctx->getOrCreateSymbol("__bolt_instr_tables");
|
||||
TablesSection->setAlignment(BC.RegularPageSize);
|
||||
Streamer.SwitchSection(TablesSection);
|
||||
Streamer.EmitLabel(Tables);
|
||||
Streamer.EmitSymbolAttribute(Tables, MCSymbolAttr::MCSA_Global);
|
||||
Streamer.EmitBytes(buildTables(BC));
|
||||
}
|
||||
}
|
||||
|
||||
void InstrumentationRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath,
|
||||
@ -208,11 +221,11 @@ void InstrumentationRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath,
|
||||
emitTablesAsELFNote(BC);
|
||||
}
|
||||
|
||||
void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
|
||||
std::string InstrumentationRuntimeLibrary::buildTables(BinaryContext &BC) {
|
||||
std::string TablesStr;
|
||||
raw_string_ostream OS(TablesStr);
|
||||
// This is sync'ed with runtime/instr.cpp:readDescriptions()
|
||||
|
||||
// This is sync'ed with runtime/instr.cpp:readDescriptions()
|
||||
auto getOutputAddress = [](const BinaryFunction &Func,
|
||||
uint64_t Offset) -> uint64_t {
|
||||
return Offset == 0
|
||||
@ -238,6 +251,7 @@ void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.FromLoc.FuncString), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.FromLoc.Offset), 4);
|
||||
}
|
||||
|
||||
const auto ITDSize = Summary->IndCallTargetDescriptions.size() *
|
||||
sizeof(IndCallTargetDescription);
|
||||
OS.write(reinterpret_cast<const char *>(&ITDSize), 4);
|
||||
@ -248,6 +262,7 @@ void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
|
||||
getOutputAddress(*Desc.Target, Desc.ToLoc.Offset);
|
||||
OS.write(reinterpret_cast<const char *>(&TargetFuncAddress), 8);
|
||||
}
|
||||
|
||||
auto FuncDescSize = Summary->getFDSize();
|
||||
OS.write(reinterpret_cast<const char *>(&FuncDescSize), 4);
|
||||
for (const auto &Desc : Summary->FunctionDescriptions) {
|
||||
@ -293,6 +308,12 @@ void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
|
||||
// Our string table lives immediately after descriptions vector
|
||||
OS << Summary->StringTable;
|
||||
OS.flush();
|
||||
|
||||
return TablesStr;
|
||||
}
|
||||
|
||||
void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
|
||||
std::string TablesStr = buildTables(BC);
|
||||
const auto BoltInfo = BinarySection::encodeELFNote(
|
||||
"BOLT", TablesStr, BinarySection::NT_BOLT_INSTRUMENTATION_TABLES);
|
||||
BC.registerOrUpdateNoteSection(".bolt.instr.tables", copyByteArray(BoltInfo),
|
||||
|
@ -34,6 +34,8 @@ public:
|
||||
orc::RTDyldObjectLinkingLayer &OLT) override;
|
||||
|
||||
private:
|
||||
std::string buildTables(BinaryContext &BC);
|
||||
|
||||
/// Create a non-allocatable ELF section with read-only tables necessary for
|
||||
/// writing the instrumented data profile during program finish. The runtime
|
||||
/// library needs to open the program executable file and read this data from
|
||||
|
@ -3341,6 +3341,22 @@ public:
|
||||
return Insts;
|
||||
}
|
||||
|
||||
/// Build the two-instruction body of the tables getter: an LEA of the
/// "__bolt_instr_tables" symbol into the return register, then a return.
std::vector<MCInst> createInstrTablesGetter(MCContext *Ctx) const override {
  std::vector<MCInst> Body(2);
  MCInst &LoadAddress = Body[0];
  MCInst &Ret = Body[1];

  MCSymbol *TablesSym = Ctx->getOrCreateSymbol("__bolt_instr_tables");
  createLea(LoadAddress, TablesSym, X86::EAX, Ctx);
  createReturn(Ret);
  return Body;
}
|
||||
|
||||
/// Build the two-instruction body of the function-count getter: a move that
/// references the "__bolt_instr_num_funcs" symbol with the return register
/// as its register operand, then a return.
std::vector<MCInst> createInstrNumFuncsGetter(MCContext *Ctx) const override {
  std::vector<MCInst> Body(2);
  MCInst &LoadValue = Body[0];
  MCInst &Ret = Body[1];

  MCSymbol *NumFuncsSym = Ctx->getOrCreateSymbol("__bolt_instr_num_funcs");
  createMove(LoadValue, NumFuncsSym, X86::EAX, Ctx);
  createReturn(Ret);
  return Body;
}
|
||||
|
||||
BlocksVectorTy indirectCallPromotion(
|
||||
const MCInst &CallInst,
|
||||
const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
|
||||
|
Loading…
x
Reference in New Issue
Block a user