From db3f9774eefc662cbcf976b51f459c80d2664d82 Mon Sep 17 00:00:00 2001
From: Vedant Kumar
Date: Fri, 25 Jan 2019 18:30:37 +0000
Subject: [PATCH] [HotColdSplit] Introduce a cost model to control splitting behavior

The main goal of the model is to avoid *increasing* function size, as that would eradicate any memory locality benefits from splitting. This happens when:

- There are too many inputs or outputs to the cold region. Argument materialization and reloads of outputs have a cost.

- The cold region has too many distinct exit blocks, causing a large switch to be formed in the caller.

- The code size cost of the split code is less than the cost of a set-up call.

A secondary goal is to prevent excessive overall binary size growth.

With the cost model in place, I experimented to find a splitting threshold that works well in practice. To make warm & cold code easily separable for analysis purposes, I moved split functions to a "cold" section. I experimented with thresholds between [0, 4] and set the default to the threshold which minimized geomean __text size.

Experiment data from building LNT+externals for X86 (N = 639 programs, all sizes in bytes):

| Configuration | __text geom size | __cold geom size | TEXT geom size |
| **-Os** | 1736.3 | 0, n=0 | 10961.6 |
| -Os, thresh=0 | 1740.53 | 124.482, n=134 | 11014 |
| -Os, thresh=1 | 1734.79 | 57.8781, n=90 | 10978.6 |
| -Os, thresh=2 | ** 1733.85 ** | 65.6604, n=61 | 10977.6 |
| -Os, thresh=3 | 1733.85 | 65.3071, n=61 | 10977.6 |
| -Os, thresh=4 | 1735.08 | 67.5156, n=54 | 10965.7 |
| **-Oz** | 1554.4 | 0, n=0 | 10153 |
| -Oz, thresh=2 | ** 1552.2 ** | 65.633, n=61 | 10176 |
| **-O3** | 2563.37 | 0, n=0 | 13105.4 |
| -O3, thresh=2 | ** 2559.49 ** | 71.1072, n=61 | 13162.4 |

Picking thresh=2 reduces the geomean __text section size by 0.14% at -Os, -Oz, and -O3 and causes ~0.2% growth in the TEXT segment. Note that TEXT size is page-aligned, whereas section sizes are byte-aligned.

Experiment data from building LNT+externals for ARM64 (N = 558 programs, all sizes in bytes):

| Configuration | __text geom size | __cold geom size | TEXT geom size |
| **-Os** | 1763.96 | 0, n=0 | 42934.9 |
| -Os, thresh=2 | ** 1760.9 ** | 76.6755, n=61 | 42934.9 |

Picking thresh=2 reduces the geomean __text section size by 0.17% at -Os and causes no growth in the TEXT segment.

Measurements were done with D57082 (r352080) applied.

Differential Revision: https://reviews.llvm.org/D57125

llvm-svn: 352228
---
 llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 127 +++++++++++++-----
 .../X86/extraction-subregion-breaks-phis.ll | 63 ---------
 .../HotColdSplit/X86/outline-expensive.ll | 25 ----
 .../Transforms/HotColdSplit/addr-taken.ll | 2 +-
 .../HotColdSplit/apply-noreturn-bonus.ll | 26 ++++
 .../HotColdSplit/apply-penalty-for-inputs.ll | 19 +++
 .../HotColdSplit/apply-penalty-for-outputs.ll | 22 +++
 .../HotColdSplit/apply-successor-penalty.ll | 53 ++++++++
 .../HotColdSplit/outline-disjoint-diamonds.ll | 2 +-
 llvm/test/Transforms/HotColdSplit/resume.ll | 2 +-
 .../Transforms/HotColdSplit/split-cold-2.ll | 4 +-
 11 files changed, 216 insertions(+), 129 deletions(-)
 delete mode 100644 llvm/test/Transforms/HotColdSplit/X86/extraction-subregion-breaks-phis.ll
 delete mode 100644 llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll
 create mode 100644 llvm/test/Transforms/HotColdSplit/apply-noreturn-bonus.ll
 create mode 100644 llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
 create mode 100644 llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
 create mode 100644 llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll

diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index 8250db706e3a..36dd6fa4be7a 100644
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -80,9 +80,9 @@ static cl::opt EnableStaticAnalyis("hot-cold-static-analysis",
cl::init(true), cl::Hidden); static cl::opt - SplittingThreshold("hotcoldsplit-threshold", cl::init(3), cl::Hidden, - cl::desc("Code size threshold for splitting cold code " - "(as a multiple of TCC_Basic)")); + SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, + cl::desc("Base penalty for splitting cold code (as a " + "multiple of TCC_Basic)")); namespace { @@ -139,31 +139,6 @@ static bool mayExtractBlock(const BasicBlock &BB) { !isa(BB.getTerminator()); } -/// Check whether \p Region is profitable to outline. -static bool isProfitableToOutline(const BlockSequence &Region, - TargetTransformInfo &TTI) { - // If the splitting threshold is set at or below zero, skip the usual - // profitability check. - if (SplittingThreshold <= 0) - return true; - - if (Region.size() > 1) - return true; - - int Cost = 0; - const BasicBlock &BB = *Region[0]; - for (const Instruction &I : BB) { - if (isa(&I) || &I == BB.getTerminator()) - continue; - - Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); - - if (Cost >= (SplittingThreshold * TargetTransformInfo::TCC_Basic)) - return true; - } - return false; -} - /// Mark \p F cold. Based on this assumption, also optimize it for minimum size. /// Return true if the function is changed. static bool markFunctionCold(Function &F) { @@ -247,6 +222,82 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { return true; } +/// Get the benefit score of outlining \p Region. +static int getOutliningBenefit(ArrayRef Region, + TargetTransformInfo &TTI) { + // Sum up the code size costs of non-terminator instructions. Tight coupling + // with \ref getOutliningPenalty is needed to model the costs of terminators. 
+ int Benefit = 0; + for (BasicBlock *BB : Region) + for (Instruction &I : BB->instructionsWithoutDebug()) + if (&I != BB->getTerminator()) + Benefit += + TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); + + return Benefit; +} + +/// Get the penalty score for outlining \p Region. +static int getOutliningPenalty(ArrayRef Region, + unsigned NumInputs, unsigned NumOutputs) { + int Penalty = SplittingThreshold; + LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n"); + + // If the splitting threshold is set at or below zero, skip the usual + // profitability check. + if (SplittingThreshold <= 0) + return Penalty; + + // The typical code size cost for materializing an argument for the outlined + // call. + LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n"); + const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic; + Penalty += CostForArgMaterialization * NumInputs; + + // The typical code size cost for an output alloca, its associated store, and + // its associated reload. + LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n"); + const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic; + Penalty += CostForRegionOutput * NumOutputs; + + // Find the number of distinct exit blocks for the region. Use a conservative + // check to determine whether control returns from the region. + bool NoBlocksReturn = true; + SmallPtrSet SuccsOutsideRegion; + for (BasicBlock *BB : Region) { + // If a block has no successors, only assume it does not return if it's + // unreachable. + if (succ_empty(BB)) { + NoBlocksReturn &= isa(BB->getTerminator()); + continue; + } + + for (BasicBlock *SuccBB : successors(BB)) { + if (find(Region, SuccBB) == Region.end()) { + NoBlocksReturn = false; + SuccsOutsideRegion.insert(SuccBB); + } + } + } + + // Apply a `noreturn` bonus. 
+ if (NoBlocksReturn) { + LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size() + << " non-returning terminators\n"); + Penalty -= Region.size(); + } + + // Apply a penalty for having more than one successor outside of the region. + // This penalty accounts for the switch needed in the caller. + if (!SuccsOutsideRegion.empty()) { + LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size() + << " non-region successors\n"); + Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic; + } + + return Penalty; +} + Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, DominatorTree &DT, BlockFrequencyInfo *BFI, @@ -261,6 +312,18 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, /* AllowAlloca */ false, /* Suffix */ "cold." + std::to_string(Count)); + // Perform a simple cost/benefit analysis to decide whether or not to permit + // splitting. + SetVector Inputs, Outputs, Sinks; + CE.findInputsOutputs(Inputs, Outputs, Sinks); + int OutliningBenefit = getOutliningBenefit(Region, TTI); + int OutliningPenalty = + getOutliningPenalty(Region, Inputs.size(), Outputs.size()); + LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit + << ", penalty = " << OutliningPenalty << "\n"); + if (OutliningBenefit <= OutliningPenalty) + return nullptr; + Function *OrigF = Region[0]->getParent(); if (Function *OutF = CE.extractCodeRegion()) { User *U = *OutF->user_begin(); @@ -556,14 +619,6 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { assert(!Region.empty() && "Empty outlining region in worklist"); do { BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT); - if (!isProfitableToOutline(SubRegion, TTI)) { - LLVM_DEBUG({ - dbgs() << "Skipping outlining; not profitable to outline\n"; - SubRegion[0]->dump(); - }); - continue; - } - LLVM_DEBUG({ dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; for (BasicBlock *BB : 
SubRegion) diff --git a/llvm/test/Transforms/HotColdSplit/X86/extraction-subregion-breaks-phis.ll b/llvm/test/Transforms/HotColdSplit/X86/extraction-subregion-breaks-phis.ll deleted file mode 100644 index 9a751e3b28db..000000000000 --- a/llvm/test/Transforms/HotColdSplit/X86/extraction-subregion-breaks-phis.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: opt -S -hotcoldsplit -hotcoldsplit-threshold=1 < %s | FileCheck %s - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.14.0" - -; CHECK-LABEL: define {{.*}}@foo( -; CHECK: call {{.*}}@foo.cold.1( -; CHECK: unreachable - -; CHECK-LABEL: define {{.*}}@foo.cold.1( -; CHECK: switch i32 undef, label %sw.epilog.i -define void @foo(i32 %QMM) { -entry: - switch i32 %QMM, label %entry.if.end16_crit_edge [ - i32 1, label %if.then - ] - -entry.if.end16_crit_edge: ; preds = %entry - br label %if.end16 - -if.then: ; preds = %entry - br i1 undef, label %cond.true.i.i, label %_ZN10StringView8popFrontEv.exit.i - -cond.true.i.i: ; preds = %if.then - ret void - -_ZN10StringView8popFrontEv.exit.i: ; preds = %if.then - switch i32 undef, label %sw.epilog.i [ - i32 81, label %if.end16 - i32 82, label %sw.bb4.i - i32 83, label %sw.bb8.i - i32 84, label %sw.bb12.i - i32 65, label %if.end16 - i32 66, label %sw.bb20.i - i32 67, label %sw.bb24.i - i32 68, label %sw.bb28.i - ] - -sw.bb4.i: ; preds = %_ZN10StringView8popFrontEv.exit.i - br label %if.end16 - -sw.bb8.i: ; preds = %_ZN10StringView8popFrontEv.exit.i - br label %if.end16 - -sw.bb12.i: ; preds = %_ZN10StringView8popFrontEv.exit.i - br label %if.end16 - -sw.bb20.i: ; preds = %_ZN10StringView8popFrontEv.exit.i - br label %if.end16 - -sw.bb24.i: ; preds = %_ZN10StringView8popFrontEv.exit.i - br label %if.end16 - -sw.bb28.i: ; preds = %_ZN10StringView8popFrontEv.exit.i - br label %if.end16 - -sw.epilog.i: ; preds = %_ZN10StringView8popFrontEv.exit.i - br label %if.end16 - -if.end16: ; preds = %sw.epilog.i, %sw.bb28.i, %sw.bb24.i, %sw.bb20.i, 
%sw.bb12.i, %sw.bb8.i, %sw.bb4.i, %_ZN10StringView8popFrontEv.exit.i, %_ZN10StringView8popFrontEv.exit.i, %entry.if.end16_crit_edge - %0 = phi i8 [ 0, %entry.if.end16_crit_edge ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 1, %sw.bb4.i ], [ 2, %sw.bb8.i ], [ 3, %sw.bb12.i ], [ 1, %sw.bb20.i ], [ 2, %sw.bb24.i ], [ 3, %sw.bb28.i ], [ 0, %sw.epilog.i ] - unreachable -} diff --git a/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll b/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll deleted file mode 100644 index 3f04283b0c1a..000000000000 --- a/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll +++ /dev/null @@ -1,25 +0,0 @@ -; The magic number 6 comes from (1 * TCC_Expensive) + (1 * CostOfCallX86). -; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=6 -S < %s | FileCheck %s - -; Test that we outline even though there are only two cold instructions. TTI -; should determine that they are expensive in terms of code size. - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.14.0" - -; CHECK-LABEL: @fun -; CHECK: call void @fun.cold.1 -define void @fun(i32 %x) { -entry: - br i1 undef, label %if.then, label %if.else - -if.then: - ret void - -if.else: - %y = sdiv i32 %x, 111 - call void @sink(i32 %y) - ret void -} - -declare void @sink(i32 %x) cold diff --git a/llvm/test/Transforms/HotColdSplit/addr-taken.ll b/llvm/test/Transforms/HotColdSplit/addr-taken.ll index f2f448c8a46e..19f1d4f1974b 100644 --- a/llvm/test/Transforms/HotColdSplit/addr-taken.ll +++ b/llvm/test/Transforms/HotColdSplit/addr-taken.ll @@ -1,4 +1,4 @@ -; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -S < %s | FileCheck %s +; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -S < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.14.0" diff --git a/llvm/test/Transforms/HotColdSplit/apply-noreturn-bonus.ll 
b/llvm/test/Transforms/HotColdSplit/apply-noreturn-bonus.ll new file mode 100644 index 000000000000..c1d9af88595c --- /dev/null +++ b/llvm/test/Transforms/HotColdSplit/apply-noreturn-bonus.ll @@ -0,0 +1,26 @@ +; REQUIRES: asserts +; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s + +declare void @sink() cold + +define void @foo(i32 %arg) { +entry: + br i1 undef, label %cold1, label %exit + +cold1: + ; CHECK: Applying bonus for: 4 non-returning terminators + call void @sink() + br i1 undef, label %cold2, label %cold3 + +cold2: + br label %cold4 + +cold3: + br label %cold4 + +cold4: + unreachable + +exit: + ret void +} diff --git a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll new file mode 100644 index 000000000000..fffd6f9f5dcf --- /dev/null +++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll @@ -0,0 +1,19 @@ +; REQUIRES: asserts +; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s + +declare void @sink(i32*, i32, i32) cold + +@g = global i32 0 + +define void @foo(i32 %arg) { + %local = load i32, i32* @g + br i1 undef, label %cold, label %exit + +cold: + ; CHECK: Applying penalty for: 2 inputs + call void @sink(i32* @g, i32 %arg, i32 %local) + ret void + +exit: + ret void +} diff --git a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll new file mode 100644 index 000000000000..a7d9f97ab030 --- /dev/null +++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll @@ -0,0 +1,22 @@ +; REQUIRES: asserts +; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s + +declare void @sink() cold + +@g = global i32 0 + +define i32 @foo(i32 %arg) { +entry: + br i1 undef, label %cold, label %exit + +cold: + ; CHECK: Applying penalty for: 1 output + ; CHECK: Applying penalty for: 1 
non-region successors + %local = load i32, i32* @g + call void @sink() + br label %exit + +exit: + %p = phi i32 [ %local, %cold ], [ 0, %entry ] + ret i32 %p +} diff --git a/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll new file mode 100644 index 000000000000..3886d76da016 --- /dev/null +++ b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll @@ -0,0 +1,53 @@ +; REQUIRES: asserts +; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s + +declare void @sink() cold + +; CHECK-LABEL: Outlining in one_non_region_successor +define void @one_non_region_successor(i32 %arg) { +entry: + br i1 undef, label %cold1, label %exit + +cold1: + ; CHECK: Applying penalty for: 1 non-region successor + call void @sink() + br i1 undef, label %cold2, label %cold3 + +cold2: + br i1 undef, label %cold4, label %exit + +cold3: + br i1 undef, label %cold4, label %exit + +cold4: + unreachable + +exit: + ret void +} + +; CHECK-LABEL: Outlining in two_non_region_successor +define void @two_non_region_successors(i32 %arg) { +entry: + br i1 undef, label %cold1, label %exit1 + +cold1: + ; CHECK: Applying penalty for: 2 non-region successors + call void @sink() + br i1 undef, label %cold2, label %cold3 + +cold2: + br i1 undef, label %cold4, label %exit1 + +cold3: + br i1 undef, label %cold4, label %exit2 + +cold4: + unreachable + +exit1: + br label %exit2 + +exit2: + ret void +} diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll index 64bc94ebd545..b33454b5c4ef 100644 --- a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll +++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -hotcoldsplit -hotcoldsplit-threshold=0 < %s 2>&1 | FileCheck %s +; RUN: opt -S -hotcoldsplit -hotcoldsplit-threshold=-1 < %s 2>&1 | FileCheck %s ; 
CHECK-LABEL: define {{.*}}@fun ; CHECK: call {{.*}}@fun.cold.2( diff --git a/llvm/test/Transforms/HotColdSplit/resume.ll b/llvm/test/Transforms/HotColdSplit/resume.ll index cbda078da90c..2b8ea7d91d9e 100644 --- a/llvm/test/Transforms/HotColdSplit/resume.ll +++ b/llvm/test/Transforms/HotColdSplit/resume.ll @@ -1,4 +1,4 @@ -; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -S < %s | FileCheck %s +; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -S < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.14.0" diff --git a/llvm/test/Transforms/HotColdSplit/split-cold-2.ll b/llvm/test/Transforms/HotColdSplit/split-cold-2.ll index 0ce168179307..0b228a58897c 100644 --- a/llvm/test/Transforms/HotColdSplit/split-cold-2.ll +++ b/llvm/test/Transforms/HotColdSplit/split-cold-2.ll @@ -1,5 +1,5 @@ -; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s -; RUN: opt -hotcoldsplit-threshold=0 -passes=hotcoldsplit -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s +; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s +; RUN: opt -passes=hotcoldsplit -hotcoldsplit-threshold=-1 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s ; Make sure this compiles. This test used to fail with an invalid phi node: the ; two predecessors were outlined and the SSA representation was invalid.