[BOLT] Add support for dumping profile on MacOS

Summary: Add support for dumping profile on MacOS.

(cherry picked from FBD25751363)
This commit is contained in:
Alexander Shaposhnikov 2021-01-28 12:44:14 -08:00 committed by Maksim Panchenko
parent 3b876cc3e7
commit a0dd5b05dc
8 changed files with 148 additions and 45 deletions

View File

@ -17,6 +17,7 @@ typedef __SSIZE_TYPE__ ssize_t;
typedef unsigned long long uint64_t;
typedef unsigned uint32_t;
typedef unsigned char uint8_t;
typedef long long int64_t;
typedef int int32_t;
@ -127,6 +128,21 @@ uint64_t __munmap(void *addr, uint64_t size) {
return ret;
}
/// Issue the platform's exit system call directly via the `syscall`
/// instruction. \p code is handed to the kernel in RDI (the "D" constraint) and
/// whatever the kernel leaves in RAX is returned to the caller.
uint64_t __exit(uint64_t code) {
// Pick the syscall number per platform. It is defined as a macro so that
// STRINGIFY can paste it into the asm template as an immediate.
// NOTE(review): 0x2000001 is presumably the Darwin BSD syscall class
// (0x2000000) combined with exit (1), and 231 is presumably exit_group on
// x86-64 Linux -- confirm against the respective syscall tables.
#if defined(__APPLE__)
#define EXIT_SYSCALL 0x2000001
#else
#define EXIT_SYSCALL 231
#endif
uint64_t ret;
// The syscall instruction itself overwrites RCX and R11, hence their presence
// in the clobber list next to "cc" and "memory".
__asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(code)
: "cc", "rcx", "r11", "memory");
return ret;
}
// Helper functions for writing strings to the .fdata file. We intentionally
// avoid using libc names (lowercase memset) to make it clear it is our impl.
@ -317,15 +333,7 @@ uint64_t __getppid() {
return ret;
}
/// Issue system call number 231 directly via the `syscall` instruction, with
/// \p code passed in RDI (the "D" constraint). The value the kernel leaves in
/// RAX is returned to the caller.
// NOTE(review): 231 is presumably exit_group on x86-64 Linux -- confirm
// against the syscall table.
uint64_t __exit(uint64_t code) {
uint64_t ret;
// The syscall instruction itself overwrites RCX and R11, hence their presence
// in the clobber list next to "cc" and "memory".
__asm__ __volatile__("movq $231, %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(code)
: "cc", "rcx", "r11", "memory");
return ret;
}
#endif
void reportError(const char *Msg, uint64_t Size) {
__write(2, Msg, Size);
@ -372,6 +380,4 @@ inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
return (Value + Align - 1) / Align * Align;
}
#endif
} // anonymous namespace

View File

@ -60,10 +60,11 @@
#if defined(__APPLE__)
// Accessor functions the runtime calls on the __APPLE__ path to reach the
// instrumentation data. They are only declared here.
// NOTE(review): presumably these resolve to the "__bolt_instr_*_getter" /
// "__bolt_num_counters_getter" helper functions BOLT synthesizes into the
// rewritten binary (one extra leading underscore in the final symbol name) --
// confirm against Instrumentation::createAuxiliaryFunctions.
extern "C" {
// Base address of the array of 64-bit counters.
extern uint64_t* _bolt_instr_locations_getter();
// Number of counters in that array.
extern uint32_t _bolt_num_counters_getter();
// Start of the serialized description tables parsed by readDescriptions().
extern uint8_t* _bolt_instr_tables_getter();
// Number of function descriptions stored in the tables.
extern uint32_t _bolt_instr_num_funcs_getter();
}
#else
@ -106,6 +107,8 @@ extern void (*__bolt_trampoline_ind_tailcall)();
extern void (*__bolt_instr_init_ptr)();
extern void (*__bolt_instr_fini_ptr)();
#endif
namespace {
/// A simple allocator that mmaps a fixed size region and manages this space
@ -124,14 +127,21 @@ class BumpPtrAllocator {
public:
void *allocate(size_t Size) {
Lock L(M);
if (StackBase == nullptr) {
#if defined(__APPLE__)
int MAP_PRIVATE_MAP_ANONYMOUS = 0x1002;
#else
int MAP_PRIVATE_MAP_ANONYMOUS = 0x22;
#endif
StackBase = reinterpret_cast<uint8_t *>(
__mmap(0, MaxSize, 0x3 /* PROT_READ | PROT_WRITE*/,
Shared ? 0x21 /*MAP_SHARED | MAP_ANONYMOUS*/
: 0x22 /* MAP_PRIVATE | MAP_ANONYMOUS*/,
: MAP_PRIVATE_MAP_ANONYMOUS /* MAP_PRIVATE | MAP_ANONYMOUS*/,
-1, 0));
StackSize = 0;
}
Size = alignTo(Size + sizeof(EntryMetadata), 16);
uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
@ -555,7 +565,7 @@ FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) {
/// Read and mmap descriptions written by BOLT from the executable's notes
/// section
#ifdef HAVE_ELF_H
#if defined(HAVE_ELF_H) and !defined(__APPLE__)
ProfileWriterContext readDescriptions() {
ProfileWriterContext Result;
uint64_t FD = __open("/proc/self/exe",
@ -614,16 +624,31 @@ ProfileWriterContext readDescriptions() {
reportError(ErrMsg, sizeof(ErrMsg));
return Result;
}
#else
/// Locate the description tables for the non-ELF build of the runtime.
///
/// The tables are not read from a file here: their start address comes from
/// _bolt_instr_tables_getter() and the returned context simply points into
/// that memory. The image is laid out as
///
///   uint32_t IndCallDescSize        (bytes)
///   indirect call descriptions      (IndCallDescSize bytes)
///   uint32_t IndCallTargetDescSize  (bytes)
///   indirect call target descs      (IndCallTargetDescSize bytes)
///   uint32_t FuncDescSize           (bytes)
///   function descriptions           (FuncDescSize bytes)
///   string table
///
/// \returns a context whose IndCallDescriptions, IndCallTargets,
/// FuncDescriptions and Strings members point at the respective regions.
/// No other member of the context is assigned here.
ProfileWriterContext readDescriptions() {
  ProfileWriterContext Result;
  uint8_t *Tables = _bolt_instr_tables_getter();

  // Each region is preceded by its own 4-byte size, so the offset of every
  // later size field depends on the sizes read before it.
  uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
  uint32_t IndCallTargetDescSize =
      *reinterpret_cast<uint32_t *>(Tables + 4 + IndCallDescSize);
  uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
      Tables + 8 + IndCallDescSize + IndCallTargetDescSize);

  Result.IndCallDescriptions =
      reinterpret_cast<IndCallDescription *>(Tables + 4);
  Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
      Tables + 8 + IndCallDescSize);
  Result.FuncDescriptions =
      Tables + 12 + IndCallDescSize + IndCallTargetDescSize;
  // The string table starts right after the last function description.
  Result.Strings = reinterpret_cast<char *>(
      Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize);
  return Result;
}
#endif
#if !defined(__APPLE__)
/// Debug by printing overall metadata global numbers to check it is sane
void printStats(const ProfileWriterContext &Ctx) {
char StatMsg[BufSize];
@ -646,6 +671,8 @@ void printStats(const ProfileWriterContext &Ctx) {
StatPtr = strCopy(StatPtr, "\n");
__write(2, StatMsg, StatPtr - StatMsg);
}
#endif
/// This is part of a simple CFG representation in memory, where we store
/// a dynamically sized array of input and output edges per node, and store
@ -708,6 +735,7 @@ Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
if (static_cast<int32_t>(D.Edges[I].ToNode) > MaxNodes)
MaxNodes = D.Edges[I].ToNode;
}
for (int I = 0; I < D.NumLeafNodes; ++I) {
if (static_cast<int32_t>(D.LeafNodes[I].Node) > MaxNodes)
MaxNodes = D.LeafNodes[I].Node;
@ -730,6 +758,7 @@ Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
// Initial allocations
CFGNodes = new (Alloc) Node[MaxNodes];
DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
SpanningTreeNodes = new (Alloc) Node[MaxNodes];
DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
@ -1104,25 +1133,31 @@ const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
const FunctionDescription F(FuncDesc);
const uint8_t *next = FuncDesc + F.getSize();
#if !defined(__APPLE__)
uint64_t *bolt_instr_locations = __bolt_instr_locations;
#else
uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
#endif
// Skip funcs we know are cold
#ifndef ENABLE_DEBUG
uint64_t CountersFreq = 0;
for (int I = 0; I < F.NumLeafNodes; ++I) {
CountersFreq += __bolt_instr_locations[F.LeafNodes[I].Counter];
CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
}
if (CountersFreq == 0) {
for (int I = 0; I < F.NumEdges; ++I) {
const uint32_t C = F.Edges[I].Counter;
if (C == 0xffffffff)
continue;
CountersFreq += __bolt_instr_locations[C];
CountersFreq += bolt_instr_locations[C];
}
if (CountersFreq == 0) {
for (int I = 0; I < F.NumCalls; ++I) {
const uint32_t C = F.Calls[I].Counter;
if (C == 0xffffffff)
continue;
CountersFreq += __bolt_instr_locations[C];
CountersFreq += bolt_instr_locations[C];
}
if (CountersFreq == 0)
return next;
@ -1130,8 +1165,9 @@ const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
}
#endif
Graph *G = new (Alloc) Graph(Alloc, F, __bolt_instr_locations, Ctx);
Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
DEBUG(G->dump());
if (!G->EdgeFreqs && !G->CallFreqs) {
G->~Graph();
Alloc.deallocate(G);
@ -1173,6 +1209,7 @@ const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
return next;
}
#if !defined(__APPLE__)
const IndCallTargetDescription *
ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
uint32_t B = 0;
@ -1293,8 +1330,13 @@ int openProfile() {
}
return FD;
}
#endif
} // anonymous namespace
#if !defined(__APPLE__)
/// Reset all counters in case you want to start profiling a new phase of your
/// program independently of prior phases.
/// The address of this function is printed by BOLT and this can be called by
@ -1476,42 +1518,42 @@ extern "C" void __bolt_instr_fini() {
#if defined(__APPLE__)
extern "C" void __bolt_instr_data_dump() {
ProfileWriterContext Ctx = readDescriptions();
int FD = 2;
BumpPtrAllocator Alloc;
const uint8_t *FuncDesc = Ctx.FuncDescriptions;
uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();
for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
Alloc.clear();
DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
}
assert(FuncDesc == (void *)Ctx.Strings,
"FuncDesc ptr must be equal to stringtable");
}
// On OSX/iOS the final symbol name of an extern "C" function/variable contains
// one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
//
// Setup hook, placed in the __TEXT,__setup section. It prints a single
// greeting marker and does nothing else. All work is bracketed by
// SAVE_ALL / RESTORE_ALL so that no call is made before SAVE_ALL has run.
// NOTE(review): SAVE_ALL / RESTORE_ALL presumably preserve the interrupted
// code's registers -- confirm against their definitions.
extern "C"
__attribute__((section("__TEXT,__setup")))
__attribute__((force_align_arg_pointer))
void _bolt_instr_setup() {
  __asm__ __volatile__(SAVE_ALL :::);

  report("Hello!\n");

  __asm__ __volatile__(RESTORE_ALL :::);
}
// Teardown hook, placed in the __TEXT,__fini section. It prints a single
// farewell marker and then writes out the collected profile through
// __bolt_instr_data_dump().
extern "C"
__attribute__((section("__TEXT,__fini")))
__attribute__((force_align_arg_pointer))
void _bolt_instr_fini() {
  report("Bye!\n");
  __bolt_instr_data_dump();
}
#endif

View File

@ -1748,6 +1748,16 @@ public:
return {};
}
/// Build the instruction sequence for the body of a helper function that
/// hands the instrumentation tables to the runtime. This base version is a
/// placeholder that aborts via llvm_unreachable; targets that support the
/// helper are expected to override it.
virtual std::vector<MCInst> createInstrTablesGetter(MCContext *Ctx) const {
llvm_unreachable("not implemented");
return {};
}
/// Build the instruction sequence for the body of a helper function that
/// reports the number of instrumented function descriptions to the runtime.
/// This base version is a placeholder that aborts via llvm_unreachable;
/// targets that support the helper are expected to override it.
virtual std::vector<MCInst> createInstrNumFuncsGetter(MCContext *Ctx) const {
llvm_unreachable("not implemented");
return {};
}
/// This method takes an indirect call instruction and splits it up into an
/// equivalent set of instructions that use direct calls for target
/// symbols/addresses that are contained in the Targets vector. This is done

View File

@ -424,6 +424,7 @@ void MachORewriteInstance::emitAndLink() {
if (Key == K) {
mapCodeSections(Key);
mapInstrumentationSection(Key, "__counters");
mapInstrumentationSection(Key, "__tables");
} else {
// TODO: Refactor addRuntimeLibSections to work properly on Mach-O
// and use it here.
@ -494,6 +495,7 @@ void MachORewriteInstance::rewriteFile() {
}
writeInstrumentationSection("__counters", OS);
writeInstrumentationSection("__tables", OS);
// TODO: Refactor addRuntimeLibSections to work properly on Mach-O and
// use it here.

View File

@ -612,6 +612,10 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) {
BC.MIB->createNumCountersGetter(BC.Ctx.get()));
createSimpleFunction("__bolt_instr_locations_getter",
BC.MIB->createInstrLocationsGetter(BC.Ctx.get()));
createSimpleFunction("__bolt_instr_tables_getter",
BC.MIB->createInstrTablesGetter(BC.Ctx.get()));
createSimpleFunction("__bolt_instr_num_funcs_getter",
BC.MIB->createInstrNumFuncsGetter(BC.Ctx.get()));
}
void Instrumentation::setupRuntimeLibrary(BinaryContext &BC) {

View File

@ -152,6 +152,7 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
Streamer.EmitIntValue(Summary->IndCallTargetDescriptions.size(), /*Size=*/4);
Streamer.EmitLabel(NumFuncs);
Streamer.EmitSymbolAttribute(NumFuncs, MCSymbolAttr::MCSA_Global);
Streamer.EmitIntValue(Summary->FunctionDescriptions.size(), /*Size=*/4);
Streamer.EmitLabel(FilenameSym);
Streamer.EmitBytes(opts::InstrumentationFilename);
@ -169,6 +170,18 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
Streamer.EmitValue(
MCSymbolRefExpr::create(FiniFunction->getSymbol(), *BC.Ctx), /*Size=*/8);
}
if (BC.isMachO()) {
MCSection *TablesSection = BC.Ctx->getMachOSection(
"__BOLT", "__tables", MachO::S_REGULAR,
SectionKind::getData());
MCSymbol *Tables = BC.Ctx->getOrCreateSymbol("__bolt_instr_tables");
TablesSection->setAlignment(BC.RegularPageSize);
Streamer.SwitchSection(TablesSection);
Streamer.EmitLabel(Tables);
Streamer.EmitSymbolAttribute(Tables, MCSymbolAttr::MCSA_Global);
Streamer.EmitBytes(buildTables(BC));
}
}
void InstrumentationRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath,
@ -208,11 +221,11 @@ void InstrumentationRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath,
emitTablesAsELFNote(BC);
}
void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
std::string InstrumentationRuntimeLibrary::buildTables(BinaryContext &BC) {
std::string TablesStr;
raw_string_ostream OS(TablesStr);
// This is sync'ed with runtime/instr.cpp:readDescriptions()
// This is sync'ed with runtime/instr.cpp:readDescriptions()
auto getOutputAddress = [](const BinaryFunction &Func,
uint64_t Offset) -> uint64_t {
return Offset == 0
@ -238,6 +251,7 @@ void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
OS.write(reinterpret_cast<const char *>(&Desc.FromLoc.FuncString), 4);
OS.write(reinterpret_cast<const char *>(&Desc.FromLoc.Offset), 4);
}
const auto ITDSize = Summary->IndCallTargetDescriptions.size() *
sizeof(IndCallTargetDescription);
OS.write(reinterpret_cast<const char *>(&ITDSize), 4);
@ -248,6 +262,7 @@ void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
getOutputAddress(*Desc.Target, Desc.ToLoc.Offset);
OS.write(reinterpret_cast<const char *>(&TargetFuncAddress), 8);
}
auto FuncDescSize = Summary->getFDSize();
OS.write(reinterpret_cast<const char *>(&FuncDescSize), 4);
for (const auto &Desc : Summary->FunctionDescriptions) {
@ -293,6 +308,12 @@ void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
// Our string table lives immediately after descriptions vector
OS << Summary->StringTable;
OS.flush();
return TablesStr;
}
void InstrumentationRuntimeLibrary::emitTablesAsELFNote(BinaryContext &BC) {
std::string TablesStr = buildTables(BC);
const auto BoltInfo = BinarySection::encodeELFNote(
"BOLT", TablesStr, BinarySection::NT_BOLT_INSTRUMENTATION_TABLES);
BC.registerOrUpdateNoteSection(".bolt.instr.tables", copyByteArray(BoltInfo),

View File

@ -34,6 +34,8 @@ public:
orc::RTDyldObjectLinkingLayer &OLT) override;
private:
/// Serialize the instrumentation description tables into a byte string:
/// indirect call descriptions, indirect call target descriptions and
/// function descriptions, each preceded by a 4-byte size, followed by the
/// string table. The layout must stay in sync with readDescriptions() in
/// runtime/instr.cpp.
std::string buildTables(BinaryContext &BC);
/// Create a non-allocatable ELF section with read-only tables necessary for
/// writing the instrumented data profile during program finish. The runtime
/// library needs to open the program executable file and read this data from

View File

@ -3341,6 +3341,22 @@ public:
return Insts;
}
/// Two-instruction getter body: load the address of the
/// "__bolt_instr_tables" symbol into the return register, then return.
// NOTE(review): the destination is X86::EAX while the runtime declares this
// getter as returning a pointer -- confirm createLea emits a 64-bit LEA for
// this register operand.
std::vector<MCInst> createInstrTablesGetter(MCContext *Ctx) const override {
  MCSymbol *TablesSym = Ctx->getOrCreateSymbol("__bolt_instr_tables");

  std::vector<MCInst> Seq(2);
  MCInst &LoadAddr = Seq.front();
  MCInst &Ret = Seq.back();

  createLea(LoadAddr, TablesSym, X86::EAX, Ctx);
  createReturn(Ret);
  return Seq;
}
/// Two-instruction getter body: move the value stored at the
/// "__bolt_instr_num_funcs" symbol into EAX, then return.
std::vector<MCInst> createInstrNumFuncsGetter(MCContext *Ctx) const override {
  MCSymbol *CountSym = Ctx->getOrCreateSymbol("__bolt_instr_num_funcs");

  std::vector<MCInst> Seq(2);
  MCInst &LoadCount = Seq.front();
  MCInst &Ret = Seq.back();

  createMove(LoadCount, CountSym, X86::EAX, Ctx);
  createReturn(Ret);
  return Seq;
}
BlocksVectorTy indirectCallPromotion(
const MCInst &CallInst,
const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,