mirror of
https://github.com/llvm/llvm-project.git
synced 2025-05-02 05:56:07 +00:00

This is phase 2 of the MD5 refactoring on Sample Profile following https://reviews.llvm.org/D147740. In the previous implementation, when an MD5 Sample Profile is read, the reader first converts the MD5 values to strings, then creates a StringRef as if the numerical strings were regular function names, and later on IPO transformation passes perform string comparison over these numerical strings for profile matching. This is inefficient since it causes many small heap allocations. In this patch I created a class `ProfileFuncRef` that is similar to `StringRef` but can represent a hash value directly without any conversion, and it is more efficient (I will attach some benchmark results later) when used in associative containers. ProfileFuncRef guarantees that the same function name in string form or in MD5 form has the same hash value, which also fixes a few issues in IPO passes where function matching/lookup only checked the function name string, and returned a no-match if the profile was MD5. When testing on an internal large profile (> 1 GB, with more than 10 million functions), the full profile load time is reduced from 28 sec to 25 sec on average, and reading the function offset table from 0.78s to 0.7s.
97 lines
3.6 KiB
C++
97 lines
3.6 KiB
C++
//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H
|
|
#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H
|
|
|
|
#include "ProfiledBinary.h"
|
|
#include "llvm/ADT/PriorityQueue.h"
|
|
#include "llvm/ProfileData/ProfileCommon.h"
|
|
#include "llvm/ProfileData/SampleProf.h"
|
|
#include "llvm/Transforms/IPO/ProfiledCallGraph.h"
|
|
#include "llvm/Transforms/IPO/SampleContextTracker.h"
|
|
|
|
using namespace llvm;
|
|
using namespace sampleprof;
|
|
|
|
namespace llvm {
|
|
namespace sampleprof {
|
|
|
|
// Inline candidate seen from profile
struct ProfiledInlineCandidate {
  ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count,
                          uint32_t Size)
      : CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {}
  // Context-sensitive function profile for inline candidate
  const FunctionSamples *CalleeSamples;
  // Call site count for an inline candidate
  // TODO: make sure entry count for context profile and call site
  // target count for corresponding call are consistent.
  uint64_t CallsiteCount;
  // Size proxy for function under particular call context.
  // Note: initialized from a uint32_t constructor argument, so the upper
  // 32 bits are always zero at construction; the wider type leaves room
  // for later accumulation.
  uint64_t SizeCost;
};
|
|
|
|
// Inline candidate comparer using call site weight
|
|
struct ProfiledCandidateComparer {
|
|
bool operator()(const ProfiledInlineCandidate &LHS,
|
|
const ProfiledInlineCandidate &RHS) {
|
|
// Always prioritize inlining zero-sized functions as they do not affect the
|
|
// size budget. This could happen when all of the callee's code is gone and
|
|
// only pseudo probes are left.
|
|
if ((LHS.SizeCost == 0 || RHS.SizeCost == 0) &&
|
|
(LHS.SizeCost != RHS.SizeCost))
|
|
return RHS.SizeCost == 0;
|
|
|
|
if (LHS.CallsiteCount != RHS.CallsiteCount)
|
|
return LHS.CallsiteCount < RHS.CallsiteCount;
|
|
|
|
if (LHS.SizeCost != RHS.SizeCost)
|
|
return LHS.SizeCost > RHS.SizeCost;
|
|
|
|
// Tie breaker using GUID so we have stable/deterministic inlining order
|
|
assert(LHS.CalleeSamples && RHS.CalleeSamples &&
|
|
"Expect non-null FunctionSamples");
|
|
return LHS.CalleeSamples->getGUID() < RHS.CalleeSamples->getGUID();
|
|
}
|
|
};
|
|
|
|
using ProfiledCandidateQueue =
|
|
PriorityQueue<ProfiledInlineCandidate, std::vector<ProfiledInlineCandidate>,
|
|
ProfiledCandidateComparer>;
|
|
|
|
// Pre-compilation inliner based on context-sensitive profile.
// The PreInliner estimates inline decisions using hotness from the profile
// and cost estimation from machine code size. It helps merge context
// profiles globally and achieves better post-inline profile quality, which
// otherwise won't be possible for ThinLTO. It also reduces context profile
// size by only keeping contexts that are estimated to be inlined.
class CSPreInliner {
public:
  CSPreInliner(SampleContextTracker &Tracker, ProfiledBinary &Binary,
               ProfileSummary *Summary);
  // Drive pre-inlining over all profiled functions (definition in the .cpp).
  void run();

private:
  // Collect inline candidates of the given caller context into CQueue.
  // NOTE(review): exact return semantics live in the .cpp -- presumably true
  // when at least one candidate was found; confirm against the definition.
  bool getInlineCandidates(ProfiledCandidateQueue &CQueue,
                           const FunctionSamples *FCallerContextSamples);
  // Compute the order in which functions are processed. The name suggests a
  // top-down walk of the profiled call graph; definition not visible here.
  std::vector<FunctionId> buildTopDownOrder();
  // Evaluate inline candidates for the function identified by Name.
  void processFunction(FunctionId Name);
  // Decide whether a single candidate should be inlined.
  bool shouldInline(ProfiledInlineCandidate &Candidate);
  // Size estimate for the function rooted at ContextNode.
  uint32_t getFuncSize(const ContextTrieNode *ContextNode);
  // Whether size cost is computed per calling context; set in the
  // constructor (not visible in this header).
  bool UseContextCost;
  SampleContextTracker &ContextTracker; // Context profile trie (not owned).
  ProfiledBinary &Binary;               // Profiled binary (not owned).
  ProfileSummary *Summary;              // Profile summary (not owned).
};
|
|
|
|
} // end namespace sampleprof
|
|
} // end namespace llvm
|
|
|
|
#endif
|