From 105ecd8bb2c90c874375100e61f85678a0f6e310 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 21 Nov 2024 18:18:30 -0800 Subject: [PATCH] [BOLT] Avoid EH trampolines for PIEs/DSOs (#117106) We used to emit EH trampolines for PIE/DSO whenever a function fragment contained a landing pad outside of it. However, it is common to have all landing pads in a cold fragment even when their throwers are in a hot one. To reduce the number of trampolines, analyze landing pads for any given function fragment, and if they all belong to the same (possibly different) fragment, designate that fragment as a landing pad fragment for the "thrower" fragment. Later, emit landing pad fragment symbol as an LPStart for the thrower LSDA. --- bolt/include/bolt/Core/BinaryFunction.h | 41 +++++++++ bolt/lib/Core/BinaryEmitter.cpp | 37 +++++--- bolt/lib/Passes/SplitFunctions.cpp | 43 +++++++++- bolt/test/X86/pie-eh-split-undo.s | 86 +++++++++++++++++++ .../X86/Inputs/pie-exceptions-failed-split.s | 2 +- ...d-split.test => pie-exceptions-split.test} | 25 ++---- 6 files changed, 200 insertions(+), 34 deletions(-) create mode 100644 bolt/test/X86/pie-eh-split-undo.s rename bolt/test/runtime/X86/{pie-exceptions-failed-split.test => pie-exceptions-split.test} (54%) diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 0b3682353f73..9730cca148c7 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -527,6 +527,11 @@ private: /// fragment of the function. SmallVector LSDASymbols; + /// Each function fragment may have another fragment containing all landing + /// pads for it. If that's the case, the LP fragment will be stored in the + /// vector below with indexing starting with the main fragment. + SmallVector, 0> LPFragments; + /// Map to discover which CFIs are attached to a given instruction offset. /// Maps an instruction offset into a FrameInstructions offset. /// This is only relevant to the buildCFG phase and is discarded afterwards. @@ -1885,6 +1890,42 @@ public: return LSDASymbols[F.get()]; } + /// If all landing pads for the function fragment \p F are located in fragment + /// \p LPF, designate \p LPF as a landing-pad fragment for \p F. Passing + /// std::nullopt in LPF, means that landing pads for \p F are located in more + /// than one fragment. + void setLPFragment(const FragmentNum F, std::optional LPF) { + if (F.get() >= LPFragments.size()) + LPFragments.resize(F.get() + 1); + + LPFragments[F.get()] = LPF; + } + + /// If function fragment \p F has a designated landing pad fragment, i.e. a + /// fragment that contains all landing pads for throwers in \p F, then return + /// that landing pad fragment number. If \p F does not need landing pads, + /// return \p F. Return nullptr if landing pads for \p F are scattered among + /// several function fragments. + std::optional getLPFragment(const FragmentNum F) { + if (!isSplit()) { + assert(F == FragmentNum::main() && "Invalid fragment number"); + return FragmentNum::main(); + } + + if (F.get() >= LPFragments.size()) + return std::nullopt; + + return LPFragments[F.get()]; + } + + /// Return a symbol corresponding to a landing pad fragment for fragment \p F. + /// See getLPFragment(). + MCSymbol *getLPStartSymbol(const FragmentNum F) { + if (std::optional LPFragment = getLPFragment(F)) + return getSymbol(*LPFragment); + return nullptr; + } + void setOutputDataAddress(uint64_t Address) { OutputDataOffset = Address; } uint64_t getOutputDataAddress() const { return OutputDataOffset; } diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 4b5d8154728c..b11ae68b3fea 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -140,7 +140,7 @@ private: void emitCFIInstruction(const MCCFIInstruction &Inst) const; - /// Emit exception handling ranges for the function. + /// Emit exception handling ranges for the function fragment. void emitLSDA(BinaryFunction &BF, const FunctionFragment &FF); /// Emit line number information corresponding to \p NewLoc. \p PrevLoc @@ -915,15 +915,15 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, const FunctionFragment &FF) { // Emit the LSDA header. // If LPStart is omitted, then the start of the FDE is used as a base for - // landing pad displacements. Then, if a cold fragment starts with a landing - // pad, this means that the first landing pad offset will be 0. However, C++ - // runtime treats 0 as if there is no landing pad present, thus we *must* emit - // non-zero offsets for all valid LPs. + // landing pad displacements. Then, if a cold fragment starts with + // a landing pad, this means that the first landing pad offset will be 0. + // However, C++ runtime will treat 0 as if there is no landing pad, thus we + // cannot emit LP offset as 0. // // As a solution, for fixed-address binaries we set LPStart to 0, and for - // position-independent binaries we set LP start to FDE start minus one byte - // for FDEs that start with a landing pad. - const bool NeedsLPAdjustment = !FF.empty() && FF.front()->isLandingPad(); + // position-independent binaries we offset LP start by one byte. + bool NeedsLPAdjustment = false; + const MCSymbol *LPStartSymbol = nullptr; std::function emitLandingPad; if (BC.HasFixedLoadAddress) { Streamer.emitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format @@ -935,17 +935,26 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, const FunctionFragment &FF) { Streamer.emitIntValue(0, 4); }; } else { - if (NeedsLPAdjustment) { - // Use relative LPStart format and emit LPStart as [SymbolStart - 1]. + std::optional LPFN = BF.getLPFragment(FF.getFragmentNum()); + LPStartSymbol = BF.getLPStartSymbol(FF.getFragmentNum()); + assert(LPFN && LPStartSymbol && "Expected LPStart symbol to be set"); + + const FunctionFragment &LPFragment = BF.getLayout().getFragment(*LPFN); + NeedsLPAdjustment = + (!LPFragment.empty() && LPFragment.front()->isLandingPad()); + + // Emit LPStart encoding and optionally LPStart. + if (NeedsLPAdjustment || LPStartSymbol != StartSymbol) { Streamer.emitIntValue(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4, 1); MCSymbol *DotSymbol = BC.Ctx->createTempSymbol("LPBase"); Streamer.emitLabel(DotSymbol); const MCExpr *LPStartExpr = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(StartSymbol, *BC.Ctx), + MCSymbolRefExpr::create(LPStartSymbol, *BC.Ctx), MCSymbolRefExpr::create(DotSymbol, *BC.Ctx), *BC.Ctx); - LPStartExpr = MCBinaryExpr::createSub( - LPStartExpr, MCConstantExpr::create(1, *BC.Ctx), *BC.Ctx); + if (NeedsLPAdjustment) + LPStartExpr = MCBinaryExpr::createSub( + LPStartExpr, MCConstantExpr::create(1, *BC.Ctx), *BC.Ctx); Streamer.emitValue(LPStartExpr, 4); } else { // DW_EH_PE_omit means FDE start (StartSymbol) will be used as LPStart. @@ -955,7 +964,7 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, const FunctionFragment &FF) { if (LPSymbol) { const MCExpr *LPOffsetExpr = MCBinaryExpr::createSub( MCSymbolRefExpr::create(LPSymbol, *BC.Ctx), - MCSymbolRefExpr::create(StartSymbol, *BC.Ctx), *BC.Ctx); + MCSymbolRefExpr::create(LPStartSymbol, *BC.Ctx), *BC.Ctx); if (NeedsLPAdjustment) LPOffsetExpr = MCBinaryExpr::createAdd( LPOffsetExpr, MCConstantExpr::create(1, *BC.Ctx), *BC.Ctx); diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp index bd0b6dea0e06..3c527038b5e3 100644 --- a/bolt/lib/Passes/SplitFunctions.cpp +++ b/bolt/lib/Passes/SplitFunctions.cpp @@ -901,8 +901,43 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) { // have to be placed in the same fragment. When we split them, create // trampoline landing pads that will redirect the execution to real LPs. TrampolineSetType Trampolines; - if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit()) - Trampolines = createEHTrampolines(BF); + if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit()) { + // If all landing pads for this fragment are grouped in one (potentially + // different) fragment, we can set LPStart to the start of that fragment + // and avoid trampoline code. + bool NeedsTrampolines = false; + for (FunctionFragment &FF : BF.getLayout().fragments()) { + // Vector of fragments that contain landing pads for this fragment. + SmallVector LandingPadFragments; + for (const BinaryBasicBlock *BB : FF) + for (const BinaryBasicBlock *LPB : BB->landing_pads()) + LandingPadFragments.push_back(LPB->getFragmentNum()); + + // Eliminate duplicate entries from the vector. + llvm::sort(LandingPadFragments); + auto Last = llvm::unique(LandingPadFragments); + LandingPadFragments.erase(Last, LandingPadFragments.end()); + + if (LandingPadFragments.size() == 0) { + // If the fragment has no landing pads, we can safely set itself as its + // landing pad fragment. + BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum()); + } else if (LandingPadFragments.size() == 1) { + BF.setLPFragment(FF.getFragmentNum(), LandingPadFragments.front()); + } else { + NeedsTrampolines = true; + break; + } + } + + // Trampolines guarantee that all landing pads for any given fragment will + // be contained in the same fragment. + if (NeedsTrampolines) { + for (FunctionFragment &FF : BF.getLayout().fragments()) + BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum()); + Trampolines = createEHTrampolines(BF); + } + } // Check the new size to see if it's worth splitting the function. if (BC.isX86() && LayoutUpdated) { @@ -933,6 +968,10 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) { } } + // Restore LP fragment for the main fragment if the splitting was undone. + if (BF.hasEHRanges() && !BF.isSplit()) + BF.setLPFragment(FragmentNum::main(), FragmentNum::main()); + // Fix branches if the splitting decision of the pass after function // reordering is different from that of the pass before function reordering. if (LayoutUpdated && BC.HasFinalizedFunctionOrder) diff --git a/bolt/test/X86/pie-eh-split-undo.s b/bolt/test/X86/pie-eh-split-undo.s new file mode 100644 index 000000000000..6192dfa768af --- /dev/null +++ b/bolt/test/X86/pie-eh-split-undo.s @@ -0,0 +1,86 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: ld.lld --pie %t.o -o %t.exe -q +# RUN: llvm-bolt %t.exe -o %t.out --data %t.fdata --split-functions --split-eh \ +# RUN: --split-all-cold --print-after-lowering --print-only=_start 2>&1 \ +# RUN: | FileCheck %s + +## _start has two landing pads: one hot and one cold. Hence, BOLT will introduce +## a landing pad trampoline. However, the trampoline code will make the main +## split fragment larger than the whole function before split. Then BOLT will +## undo the splitting and remove the trampoline. + +# CHECK: Binary Function "_start" +# CHECK: IsSplit : +# CHECK-SAME: 0 + +## Check that a landing pad trampoline was created, but contains no instructions +## and falls though to the real landing pad. + +# CHECK: {{^[^[:space:]]+}} (0 instructions +# CHECK-NEXT: Landing Pad{{$}} +# CHECK: Exec Count +# CHECK-SAME: : 0 +# CHECK: Successors: +# CHECK-SAME: [[LP:[^[:space:]]+]] +# CHECK-EMPTY: +# CHECK-NEXT: [[LP]] + + .text + .global foo + .type foo, %function +foo: + .cfi_startproc + ret + .cfi_endproc + .size foo, .-foo + + .globl _start + .type _start, %function +_start: +# FDATA: 0 [unknown] 0 1 _start 0 1 100 +.Lfunc_begin0: + .cfi_startproc + .cfi_lsda 27, .Lexception0 + call foo +.Ltmp0: + call foo +.Ltmp1: + ret + +## Cold landing pad. +.LLP1: + ret + +## Hot landing pad. +LLP0: +# FDATA: 0 [unknown] 0 1 _start #LLP0# 1 100 + ret + + .cfi_endproc +.Lfunc_end0: + .size _start, .-_start + +## EH table. + .section .gcc_except_table,"a",@progbits + .p2align 2 +GCC_except_table0: +.Lexception0: + .byte 255 # @LPStart Encoding = omit + .byte 255 # @TType Encoding = omit + .byte 1 # Call site Encoding = uleb128 + .uleb128 .Lcst_end0-.Lcst_begin0 +.Lcst_begin0: + .uleb128 .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 << + .uleb128 .Ltmp0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Ltmp0 + .uleb128 LLP0-.Lfunc_begin0 # jumps to LLP0 + .byte 0 # On action: cleanup + .uleb128 .Ltmp0-.Lfunc_begin0 # >> Call Site 2 << + .uleb128 .Ltmp1-.Ltmp0 # Call between .Ltmp0 and .Ltmp1 + .uleb128 .LLP1-.Lfunc_begin0 # jumps to .LLP1 + .byte 0 # On action: cleanup +.Lcst_end0: + diff --git a/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s b/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s index 1ead293bb5cf..5195d298b1bb 100644 --- a/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s +++ b/bolt/test/runtime/X86/Inputs/pie-exceptions-failed-split.s @@ -1,4 +1,4 @@ -# Assembly generated from building the followingC++ code with the following +# Assembly generated from building the following C++ code with the following # command using trunk clang. Then, basic block at .LBB1_7 was moved before the # landing pad. # diff --git a/bolt/test/runtime/X86/pie-exceptions-failed-split.test b/bolt/test/runtime/X86/pie-exceptions-split.test similarity index 54% rename from bolt/test/runtime/X86/pie-exceptions-failed-split.test rename to bolt/test/runtime/X86/pie-exceptions-split.test index eb46dca6d98e..550c1ac67e6c 100644 --- a/bolt/test/runtime/X86/pie-exceptions-failed-split.test +++ b/bolt/test/runtime/X86/pie-exceptions-split.test @@ -11,25 +11,16 @@ RUN: llvm-bolt %t -o %t.bolt --data %t.fdata --reorder-blocks=ext-tsp \ RUN: --split-functions --split-eh --print-after-lowering \ RUN: --print-only=_Z10throw_testiPPc 2>&1 | FileCheck %s -## Hot code in the test case gets larger after splitting because of jump -## instruction relaxation. Check that BOLT reverts the split correctly. +## Check that a landing pad is split from its thrower and does not require a +## trampoline LP. CHECK: Binary Function "_Z10throw_testiPPc" CHECK: IsSplit : -CHECK-SAME: 0 - -## Check that the landing pad trampoline was created, but contains no -## instructions and falls to the real landing pad. -CHECK: {{^[^[:space:]]+}} (0 instructions -CHECK-NEXT: Landing Pad{{$}} -CHECK: Exec Count -CHECK-SAME: : 0 -CHECK: Successors: -CHECK-SAME: [[LP:[^[:space:]]+]] -CHECK-EMPTY: -CHECK-NEXT: [[LP]] -CHECK-DAG: Exec Count -CHECK-NOT: Exec Count -CHECK-DAG: callq __cxa_begin_catch +CHECK-SAME: 1 +CHECK: callq {{.*}} # handler: [[LPAD:.*]]; +CHECK-NOT: Landing Pad{{$}} +CHECK: HOT-COLD SPLIT POINT +CHECK: {{^}}[[LPAD]] +CHECK-NEXT: Landing Pad ## Verify the output still executes correctly when the exception path is being ## taken.