llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
Jon Chesterfield 0507448d82 [amdgpu] Implement dynamic LDS accesses from non-kernel functions
The premise here is to let non-kernel functions locate external LDS variables without spending LDS or extra magic SGPRs on the lookup.

1/ First it crawls the callgraph to work out which external LDS variables are reachable from a given kernel
2/ Then it creates a new `extern char[0]` variable for each kernel, which will alias all the other extern LDS variables because that's the documented behaviour of these variables
3/ The address of that variable is written to a lookup table. The global variable is tagged with metadata to track what address it was allocated at by codegen
4/ The assembler builds the lookup table using the metadata
5/ Any non-kernel function uses the same magic intrinsic used for table lookups of non-dynamic LDS variables to find the address to use

This overlaps heavily with the code paths taken for the other LDS lowering; in particular, the same intrinsic is used to pass the dynamic scope information through the same SGPR as for table lookups of static LDS.
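
As a rough illustration (not the actual compiler output): the per-kernel table maps a kernel id to the base of that kernel's dynamic LDS allocation, and non-kernel functions index it rather than receiving the address through LDS or an extra SGPR. A minimal C++ model, where DynamicLdsTable and the KernelId parameter stand in for what the assembler and the magic intrinsic provide:

  #include <cstdio>

  // One slot per kernel: the base offset at which codegen placed that
  // kernel's dynamic LDS block. In the real scheme the assembler fills
  // this in from the metadata attached to the extern char[0] variable.
  static const unsigned DynamicLdsTable[2] = {0x800, 0x400};

  // A non-kernel function locates dynamic LDS by indexing the table with
  // the calling kernel's id, modelling what the intrinsic returns.
  unsigned dynamicLdsBase(unsigned KernelId) {
    return DynamicLdsTable[KernelId];
  }

  int main() {
    printf("kernel 1's dynamic LDS starts at offset 0x%x\n",
           dynamicLdsBase(1));
    return 0;
  }

In other words, the scheme trades the extra SGPR or LDS slot for one load from the lookup table.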

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D144233
2023-04-04 20:06:34 +01:00

//===-- AMDGPUAlwaysInlinePass.cpp - Inline all functions -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass marks functions that must be inlined as always_inline: in
/// particular, any non-entry function that uses an LDS global when module LDS
/// lowering is disabled. When function calls are unsupported it marks all
/// other used functions always_inline; under -amdgpu-stress-function-calls it
/// marks them noinline instead.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

namespace {

static cl::opt<bool> StressCalls(
  "amdgpu-stress-function-calls",
  cl::Hidden,
  cl::desc("Force all functions to be noinline"),
  cl::init(false));

class AMDGPUAlwaysInline : public ModulePass {
  bool GlobalOpt;

public:
  static char ID;

  AMDGPUAlwaysInline(bool GlobalOpt = false) :
    ModulePass(ID), GlobalOpt(GlobalOpt) { }

  bool runOnModule(Module &M) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
                "AMDGPU Inline All Functions", false, false)

char AMDGPUAlwaysInline::ID = 0;
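
// Walk the transitive users of GV: any use inside a non-entry function marks
// that function, and transitively its callers, for always-inlining so that
// the reference to GV ultimately lands inside a kernel.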
static void
recursivelyVisitUsers(GlobalValue &GV,
                      SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
  SmallVector<User *, 16> Stack(GV.users());

  SmallPtrSet<const Value *, 8> Visited;

  while (!Stack.empty()) {
    User *U = Stack.pop_back_val();
    if (!Visited.insert(U).second)
      continue;

    if (Instruction *I = dyn_cast<Instruction>(U)) {
      Function *F = I->getParent()->getParent();
      if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
        // FIXME: This is a horrible hack. We should always respect noinline,
        // and just let us hit the error when we can't handle this.
        //
        // Unfortunately, clang adds noinline to all functions at -O0. We have
        // to override this here until that's fixed.
        F->removeFnAttr(Attribute::NoInline);

        FuncsToAlwaysInline.insert(F);
        Stack.push_back(F);
      }

      // No need to look at further users, but we do need to inline any callers.
      continue;
    }

    append_range(Stack, U->users());
  }
}
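
// Shared implementation for the legacy and new pass manager entry points:
// strip replaceable function aliases, then collect and apply the inline
// attribute decisions.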
static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
  std::vector<GlobalAlias*> AliasesToRemove;

  SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
  SmallPtrSet<Function *, 8> FuncsToNoInline;

  Triple TT(M.getTargetTriple());

  for (GlobalAlias &A : M.aliases()) {
    if (Function* F = dyn_cast<Function>(A.getAliasee())) {
      if (TT.getArch() == Triple::amdgcn &&
          A.getLinkage() != GlobalValue::InternalLinkage)
        continue;

      A.replaceAllUsesWith(F);
      AliasesToRemove.push_back(&A);
    }
    // FIXME: If the aliasee isn't a function, it's some kind of constant expr
    // cast that won't be inlined through.
  }

  if (GlobalOpt) {
    for (GlobalAlias* A : AliasesToRemove) {
      A->eraseFromParent();
    }
  }

  // Always force inlining of any function that uses an LDS global address. This
  // is something of a workaround because we don't have a way of supporting LDS
  // objects defined in functions. LDS is always allocated by a kernel, and it
  // is difficult to manage LDS usage if a function may be used by multiple
  // kernels.
  //
  // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
  // should only appear when IPO passes manage to move LDS defined in a kernel
  // into a single user function.
  for (GlobalVariable &GV : M.globals()) {
    // TODO: Region address
    unsigned AS = GV.getAddressSpace();
    if ((AS == AMDGPUAS::REGION_ADDRESS) ||
        (AS == AMDGPUAS::LOCAL_ADDRESS &&
         (!AMDGPUTargetMachine::EnableLowerModuleLDS)))
      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
  }

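  // If function calls are not enabled, force every remaining used function
  // inline; under -amdgpu-stress-function-calls, mark such functions noinline
  // instead to exercise the call lowering paths.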
  if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
    auto IncompatAttr
      = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;

    for (Function &F : M) {
      if (!F.isDeclaration() && !F.use_empty() &&
          !F.hasFnAttribute(IncompatAttr)) {
        if (StressCalls) {
          if (!FuncsToAlwaysInline.count(&F))
            FuncsToNoInline.insert(&F);
        } else
          FuncsToAlwaysInline.insert(&F);
      }
    }
  }

  for (Function *F : FuncsToAlwaysInline)
    F->addFnAttr(Attribute::AlwaysInline);

  for (Function *F : FuncsToNoInline)
    F->addFnAttr(Attribute::NoInline);

  return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
}

bool AMDGPUAlwaysInline::runOnModule(Module &M) {
  return alwaysInlineImpl(M, GlobalOpt);
}

ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
  return new AMDGPUAlwaysInline(GlobalOpt);
}

PreservedAnalyses AMDGPUAlwaysInlinePass::run(Module &M,
                                              ModuleAnalysisManager &AM) {
  alwaysInlineImpl(M, GlobalOpt);
  return PreservedAnalyses::all();
}