llvm-project/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
Shaw Young 9a9af0a23f
[BOLT] Match blocks with pseudo probes (#99891)
Match inline trees first between profile and the binary: by GUID,
checksum, parent, and inline site for inlined functions. Map profile
probes to binary probes via matched inline tree nodes. Each binary probe
has an associated binary basic block. If all probes from one profile
basic block map to the same binary basic block, it’s an exact match,
otherwise the block is determined by majority vote and reported as loose
match.

Pseudo probe matching happens between exact hash matching and call/loose
matching.

Introduce ProbeMatchSpec - a mechanism to match probes belonging to
another binary function. For example, given functions foo and bar:
```
void foo() {
  bar();
}
```
profiled binary: bar is not inlined => have top-level function bar
new binary where the profile is applied to: bar is inlined into foo.

Currently, BOLT does 1:1 matching between profile functions and binary
functions based on the name. #100446 will extend this to N:M where
multiple profiles can be matched to one binary function (as in the
example above where binary function foo would use profiles for foo and
bar), and one profile can be matched to multiple binary functions (e.g.
if bar was inlined into multiple functions).

In this diff, ProbeMatchSpecs would only have one BinaryFunctionProfile
(existing name-based matching). 

Test Plan: Added match-blocks-with-pseudo-probes.test

Performance test:
- Setup:
  - Baseline no-BOLT: Clang with pseudo probes, ThinLTO + CSSPGO
  (#79942)
  - BOLT fresh: BOLTed Clang using fresh profile,
  - BOLT stale (hash): BOLTed Clang using stale profile (collected on
    Clang 10K commits back), `-infer-stale-profile` (hash+call block
    matching)
  - BOLT stale (+probe): BOLTed Clang using stale profile,
    `-infer-stale-profile` with `-stale-matching-with-pseudo-probes`
    (hash+call+pseudo probe block matching)
  - 2S Intel SKX Xeon 6138 with 40C/80T and 256GB RAM, using 20C/40T for
    build,
  - BOLT profiles are collected on Clang compiling large preprocessed
    C++ file.
- Benchmark: building Clang (average of 5 runs), see driver in
  aaupov/llvm-devmtg-2022
- Results, wall time, lower is better:
  - Baseline no-BOLT: 429.52 +- 2.61s,
  - BOLT stale (hash): 413.21 +- 2.19s,
  - BOLT stale (+probe): 409.69 +- 1.41s,
  - BOLT fresh: 384.50 +- 1.80s.

---------

Co-authored-by: Amir Ayupov <aaupov@fb.com>
2024-11-12 07:21:03 -08:00

453 lines
17 KiB
C++

//===- bolt/Rewrite/PseudoProbeRewriter.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implement support for pseudo probes.
//
//===----------------------------------------------------------------------===//
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Rewrite/MetadataRewriter.h"
#include "bolt/Rewrite/MetadataRewriters.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "bolt/Utils/Utils.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCPseudoProbe.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/LEB128.h"
#include <memory>
#undef DEBUG_TYPE
#define DEBUG_TYPE "pseudo-probe-rewriter"
using namespace llvm;
using namespace bolt;
namespace opts {
enum PrintPseudoProbesOptions {
PPP_None = 0,
PPP_Probes_Section_Decode = 0x1,
PPP_Probes_Address_Conversion = 0x2,
PPP_Encoded_Probes = 0x3,
PPP_All = 0xf
};
static cl::opt<PrintPseudoProbesOptions> PrintPseudoProbes(
"print-pseudo-probes", cl::desc("print pseudo probe info"),
cl::init(PPP_None),
cl::values(clEnumValN(PPP_Probes_Section_Decode, "decode",
"decode probes section from binary"),
clEnumValN(PPP_Probes_Address_Conversion, "address_conversion",
"update address2ProbesMap with output block address"),
clEnumValN(PPP_Encoded_Probes, "encoded_probes",
"display the encoded probes in binary section"),
clEnumValN(PPP_All, "all", "enable all debugging printout")),
cl::Hidden, cl::cat(BoltCategory));
extern cl::opt<bool> ProfileWritePseudoProbes;
extern cl::opt<bool> StaleMatchingWithPseudoProbes;
} // namespace opts
namespace {
class PseudoProbeRewriter final : public MetadataRewriter {
/// .pseudo_probe_desc section.
/// Contains information about pseudo probe description, like its related
/// function
ErrorOr<BinarySection &> PseudoProbeDescSection{std::errc::bad_address};
/// .pseudo_probe section.
/// Contains information about pseudo probe details, like its address
ErrorOr<BinarySection &> PseudoProbeSection{std::errc::bad_address};
/// Update address of MCDecodedPseudoProbe.
void updatePseudoProbes();
/// Encode MCDecodedPseudoProbe.
void encodePseudoProbes();
/// Parse .pseudo_probe_desc section and .pseudo_probe section
/// Setup Pseudo probe decoder
/// If \p ProfiledOnly is set, only parse records for functions with profile.
void parsePseudoProbe(bool ProfiledOnly = false);
/// PseudoProbe decoder
std::shared_ptr<MCPseudoProbeDecoder> ProbeDecoderPtr;
public:
PseudoProbeRewriter(BinaryContext &BC)
: MetadataRewriter("pseudo-probe-rewriter", BC),
ProbeDecoderPtr(std::make_shared<MCPseudoProbeDecoder>()) {
BC.setPseudoProbeDecoder(ProbeDecoderPtr);
}
Error preCFGInitializer() override;
Error postEmitFinalizer() override;
~PseudoProbeRewriter() override { ProbeDecoderPtr.reset(); }
};
Error PseudoProbeRewriter::preCFGInitializer() {
if (opts::ProfileWritePseudoProbes || opts::StaleMatchingWithPseudoProbes)
parsePseudoProbe(opts::ProfileWritePseudoProbes);
return Error::success();
}
Error PseudoProbeRewriter::postEmitFinalizer() {
if (!opts::StaleMatchingWithPseudoProbes)
parsePseudoProbe();
updatePseudoProbes();
return Error::success();
}
void PseudoProbeRewriter::parsePseudoProbe(bool ProfiledOnly) {
MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr);
PseudoProbeDescSection = BC.getUniqueSectionByName(".pseudo_probe_desc");
PseudoProbeSection = BC.getUniqueSectionByName(".pseudo_probe");
if (!PseudoProbeDescSection && !PseudoProbeSection) {
// pesudo probe is not added to binary. It is normal and no warning needed.
return;
}
// If only one section is found, it might mean the ELF is corrupted.
if (!PseudoProbeDescSection) {
errs() << "BOLT-WARNING: fail in reading .pseudo_probe_desc binary\n";
return;
} else if (!PseudoProbeSection) {
errs() << "BOLT-WARNING: fail in reading .pseudo_probe binary\n";
return;
}
StringRef Contents = PseudoProbeDescSection->getContents();
if (!ProbeDecoder.buildGUID2FuncDescMap(
reinterpret_cast<const uint8_t *>(Contents.data()), Contents.size(),
/*IsMMapped*/ true)) {
errs() << "BOLT-WARNING: fail in building GUID2FuncDescMap\n";
return;
}
MCPseudoProbeDecoder::Uint64Set GuidFilter;
MCPseudoProbeDecoder::Uint64Map FuncStartAddrs;
SmallVector<StringRef, 0> Suffixes(
{".destroy", ".resume", ".llvm.", ".cold", ".warm"});
for (const BinaryFunction *F : BC.getAllBinaryFunctions()) {
bool HasProfile = F->hasProfileAvailable();
for (const MCSymbol *Sym : F->getSymbols()) {
StringRef SymName = Sym->getName();
for (auto Name : {std::optional(NameResolver::restore(SymName)),
getCommonName(SymName, false, Suffixes)}) {
if (!Name)
continue;
SymName = *Name;
uint64_t GUID = Function::getGUID(SymName);
FuncStartAddrs[GUID] = F->getAddress();
if (ProfiledOnly && HasProfile)
GuidFilter.insert(GUID);
}
}
}
Contents = PseudoProbeSection->getContents();
if (!ProbeDecoder.buildAddress2ProbeMap(
reinterpret_cast<const uint8_t *>(Contents.data()), Contents.size(),
GuidFilter, FuncStartAddrs)) {
errs() << "BOLT-WARNING: fail in building Address2ProbeMap\n";
return;
}
if (opts::PrintPseudoProbes == opts::PrintPseudoProbesOptions::PPP_All ||
opts::PrintPseudoProbes ==
opts::PrintPseudoProbesOptions::PPP_Probes_Section_Decode) {
outs() << "Report of decoding input pseudo probe binaries \n";
ProbeDecoder.printGUID2FuncDescMap(outs());
ProbeDecoder.printProbesForAllAddresses(outs());
}
const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap();
// Checks GUID in GUID2Func and returns it if it's present or null otherwise.
auto checkGUID = [&](StringRef SymName) -> uint64_t {
uint64_t GUID = Function::getGUID(SymName);
if (GUID2Func.find(GUID) == GUID2Func.end())
return 0;
return GUID;
};
for (BinaryFunction *F : BC.getAllBinaryFunctions()) {
for (const MCSymbol *Sym : F->getSymbols()) {
StringRef SymName = NameResolver::restore(Sym->getName());
uint64_t GUID = checkGUID(SymName);
std::optional<StringRef> CommonName =
getCommonName(SymName, false, Suffixes);
if (!GUID && CommonName)
GUID = checkGUID(*CommonName);
if (GUID)
F->setGUID(GUID);
}
}
}
void PseudoProbeRewriter::updatePseudoProbes() {
MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr);
// check if there is pseudo probe section decoded
if (ProbeDecoder.getAddress2ProbesMap().empty())
return;
// input address converted to output
AddressProbesMap &Address2ProbesMap = ProbeDecoder.getAddress2ProbesMap();
const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap();
for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) {
uint64_t Address = Probe.getAddress();
BinaryFunction *F = BC.getBinaryFunctionContainingAddress(Address);
// If F is removed, eliminate all probes inside it from inline tree
// Setting probes' addresses as INT64_MAX means elimination
if (!F) {
Probe.setAddress(INT64_MAX);
continue;
}
// If F is not emitted, the function will remain in the same address as its
// input
if (!F->isEmitted())
continue;
uint64_t Offset = Address - F->getAddress();
const BinaryBasicBlock *BB = F->getBasicBlockContainingOffset(Offset);
uint64_t BlkOutputAddress = BB->getOutputAddressRange().first;
// Check if block output address is defined.
// If not, such block is removed from binary. Then remove the probes from
// inline tree
if (BlkOutputAddress == 0) {
Probe.setAddress(INT64_MAX);
continue;
}
if (Probe.isBlock()) {
Probe.setAddress(BlkOutputAddress);
} else if (Probe.isCall()) {
// A call probe may be duplicated due to ICP
// Go through output of InputOffsetToAddressMap to collect all related
// probes
auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(Address);
auto CallOutputAddress = CallOutputAddresses.first;
if (CallOutputAddress == CallOutputAddresses.second) {
Probe.setAddress(INT64_MAX);
} else {
Probe.setAddress(CallOutputAddress->second);
CallOutputAddress = std::next(CallOutputAddress);
}
while (CallOutputAddress != CallOutputAddresses.second) {
ProbeDecoder.addInjectedProbe(Probe, CallOutputAddress->second);
CallOutputAddress = std::next(CallOutputAddress);
}
}
}
if (opts::PrintPseudoProbes == opts::PrintPseudoProbesOptions::PPP_All ||
opts::PrintPseudoProbes ==
opts::PrintPseudoProbesOptions::PPP_Probes_Address_Conversion) {
outs() << "Pseudo Probe Address Conversion results:\n";
// table that correlates address to block
std::unordered_map<uint64_t, StringRef> Addr2BlockNames;
for (auto &F : BC.getBinaryFunctions())
for (BinaryBasicBlock &BinaryBlock : F.second)
Addr2BlockNames[BinaryBlock.getOutputAddressRange().first] =
BinaryBlock.getName();
// scan all addresses -> correlate probe to block when print out
for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) {
if (Probe.getAddress() == INT64_MAX)
outs() << "Deleted Probe: ";
else
outs() << "Address: " << format_hex(Probe.getAddress(), 8) << " ";
Probe.print(outs(), GUID2Func, true);
// print block name only if the probe is block type and undeleted.
if (Probe.isBlock() && Probe.getAddress() != INT64_MAX)
outs() << format_hex(Probe.getAddress(), 8) << " Probe is in "
<< Addr2BlockNames[Probe.getAddress()] << "\n";
}
outs() << "=======================================\n";
}
// encode pseudo probes with updated addresses
encodePseudoProbes();
}
void PseudoProbeRewriter::encodePseudoProbes() {
MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr);
// Buffer for new pseudo probes section
SmallString<8> Contents;
MCDecodedPseudoProbe *LastProbe = nullptr;
auto EmitInt = [&](uint64_t Value, uint32_t Size) {
const bool IsLittleEndian = BC.AsmInfo->isLittleEndian();
uint64_t Swapped = support::endian::byte_swap(
Value,
IsLittleEndian ? llvm::endianness::little : llvm::endianness::big);
unsigned Index = IsLittleEndian ? 0 : 8 - Size;
auto Entry = StringRef(reinterpret_cast<char *>(&Swapped) + Index, Size);
Contents.append(Entry.begin(), Entry.end());
};
auto EmitULEB128IntValue = [&](uint64_t Value) {
SmallString<128> Tmp;
raw_svector_ostream OSE(Tmp);
encodeULEB128(Value, OSE, 0);
Contents.append(OSE.str().begin(), OSE.str().end());
};
auto EmitSLEB128IntValue = [&](int64_t Value) {
SmallString<128> Tmp;
raw_svector_ostream OSE(Tmp);
encodeSLEB128(Value, OSE);
Contents.append(OSE.str().begin(), OSE.str().end());
};
// Emit indiviual pseudo probes in a inline tree node
// Probe index, type, attribute, address type and address are encoded
// Address of the first probe is absolute.
// Other probes' address are represented by delta
auto EmitDecodedPseudoProbe = [&](MCDecodedPseudoProbe *&CurProbe) {
assert(!isSentinelProbe(CurProbe->getAttributes()) &&
"Sentinel probes should not be emitted");
EmitULEB128IntValue(CurProbe->getIndex());
uint8_t PackedType = CurProbe->getType() | (CurProbe->getAttributes() << 4);
uint8_t Flag =
LastProbe ? ((int8_t)MCPseudoProbeFlag::AddressDelta << 7) : 0;
EmitInt(Flag | PackedType, 1);
if (LastProbe) {
// Emit the delta between the address label and LastProbe.
int64_t Delta = CurProbe->getAddress() - LastProbe->getAddress();
EmitSLEB128IntValue(Delta);
} else {
// Emit absolute address for encoding the first pseudo probe.
uint32_t AddrSize = BC.AsmInfo->getCodePointerSize();
EmitInt(CurProbe->getAddress(), AddrSize);
}
};
std::map<InlineSite, MCDecodedPseudoProbeInlineTree *,
std::greater<InlineSite>>
Inlinees;
// DFS of inline tree to emit pseudo probes in all tree node
// Inline site index of a probe is emitted first.
// Then tree node Guid, size of pseudo probes and children nodes, and detail
// of contained probes are emitted Deleted probes are skipped Root node is not
// encoded to binaries. It's a "wrapper" of inline trees of each function.
std::list<std::pair<uint64_t, MCDecodedPseudoProbeInlineTree *>> NextNodes;
const MCDecodedPseudoProbeInlineTree &Root =
ProbeDecoder.getDummyInlineRoot();
for (auto Child = Root.getChildren().begin();
Child != Root.getChildren().end(); ++Child)
Inlinees[Child->getInlineSite()] = &*Child;
for (auto Inlinee : Inlinees)
// INT64_MAX is "placeholder" of unused callsite index field in the pair
NextNodes.push_back({INT64_MAX, Inlinee.second});
Inlinees.clear();
while (!NextNodes.empty()) {
uint64_t ProbeIndex = NextNodes.back().first;
MCDecodedPseudoProbeInlineTree *Cur = NextNodes.back().second;
NextNodes.pop_back();
if (Cur->Parent && !Cur->Parent->isRoot())
// Emit probe inline site
EmitULEB128IntValue(ProbeIndex);
// Emit probes grouped by GUID.
LLVM_DEBUG({
dbgs().indent(MCPseudoProbeTable::DdgPrintIndent);
dbgs() << "GUID: " << Cur->Guid << "\n";
});
// Emit Guid
EmitInt(Cur->Guid, 8);
// Emit number of probes in this node
uint64_t Deleted = 0;
for (MCDecodedPseudoProbe *&Probe :
llvm::make_pointer_range(Cur->getProbes()))
if (Probe->getAddress() == INT64_MAX)
Deleted++;
LLVM_DEBUG(dbgs() << "Deleted Probes:" << Deleted << "\n");
size_t InjectedProbes = ProbeDecoder.getNumInjectedProbes(Cur);
uint64_t ProbesSize = Cur->getProbes().size() - Deleted + InjectedProbes;
EmitULEB128IntValue(ProbesSize);
// Emit number of direct inlinees
EmitULEB128IntValue(Cur->getChildren().size());
// Emit probes in this group
for (MCDecodedPseudoProbe *&Probe :
llvm::make_pointer_range(Cur->getProbes())) {
if (Probe->getAddress() == INT64_MAX)
continue;
EmitDecodedPseudoProbe(Probe);
LastProbe = Probe;
}
if (InjectedProbes) {
for (MCDecodedPseudoProbe *&Probe :
llvm::make_pointer_range(ProbeDecoder.getInjectedProbes(Cur))) {
if (Probe->getAddress() == INT64_MAX)
continue;
EmitDecodedPseudoProbe(Probe);
LastProbe = Probe;
}
}
for (auto Child = Cur->getChildren().begin();
Child != Cur->getChildren().end(); ++Child)
Inlinees[Child->getInlineSite()] = &*Child;
for (const auto &Inlinee : Inlinees) {
assert(Cur->Guid != 0 && "non root tree node must have nonzero Guid");
NextNodes.push_back({std::get<1>(Inlinee.first), Inlinee.second});
LLVM_DEBUG({
dbgs().indent(MCPseudoProbeTable::DdgPrintIndent);
dbgs() << "InlineSite: " << std::get<1>(Inlinee.first) << "\n";
});
}
Inlinees.clear();
}
// Create buffer for new contents for the section
// Freed when parent section is destroyed
uint8_t *Output = new uint8_t[Contents.str().size()];
memcpy(Output, Contents.str().data(), Contents.str().size());
BC.registerOrUpdateSection(".pseudo_probe", PseudoProbeSection->getELFType(),
PseudoProbeSection->getELFFlags(), Output,
Contents.str().size(), 1);
if (opts::PrintPseudoProbes == opts::PrintPseudoProbesOptions::PPP_All ||
opts::PrintPseudoProbes ==
opts::PrintPseudoProbesOptions::PPP_Encoded_Probes) {
// create a dummy decoder;
MCPseudoProbeDecoder DummyDecoder;
StringRef DescContents = PseudoProbeDescSection->getContents();
DummyDecoder.buildGUID2FuncDescMap(
reinterpret_cast<const uint8_t *>(DescContents.data()),
DescContents.size());
StringRef ProbeContents = PseudoProbeSection->getOutputContents();
MCPseudoProbeDecoder::Uint64Set GuidFilter;
MCPseudoProbeDecoder::Uint64Map FuncStartAddrs;
for (const BinaryFunction *F : BC.getAllBinaryFunctions()) {
const uint64_t Addr =
F->isEmitted() ? F->getOutputAddress() : F->getAddress();
FuncStartAddrs[Function::getGUID(
NameResolver::restore(F->getOneName()))] = Addr;
}
DummyDecoder.buildAddress2ProbeMap(
reinterpret_cast<const uint8_t *>(ProbeContents.data()),
ProbeContents.size(), GuidFilter, FuncStartAddrs);
DummyDecoder.printProbesForAllAddresses(outs());
}
}
} // namespace
std::unique_ptr<MetadataRewriter>
llvm::bolt::createPseudoProbeRewriter(BinaryContext &BC) {
return std::make_unique<PseudoProbeRewriter>(BC);
}