[AMDGPU] Update code object metadata for kernarg preload

Tracks the registers that explicit and hidden arguments are preloaded to
with new code object metadata.

IR arguments may be split across multiple parts by isel, and SGPR tuple
alignment means that an argument may be spread across multiple
registers.

To support this, some of the utilities for hidden kernel arguments are
moved to `AMDGPUArgumentUsageInfo.h`. Additional bookkeeping is also
needed for tracking purposes.
This commit is contained in:
Austin Kerbow 2025-04-04 23:06:28 -07:00
parent 8f6551935a
commit 62e5168889
21 changed files with 951 additions and 176 deletions

View File

@ -47,7 +47,7 @@ constexpr uint32_t VersionMinorV5 = 2;
/// HSA metadata major version for code object V6.
constexpr uint32_t VersionMajorV6 = 1;
/// HSA metadata minor version for code object V6.
constexpr uint32_t VersionMinorV6 = 2;
constexpr uint32_t VersionMinorV6 = 3;
/// Old HSA metadata beginning assembler directive for V2. This is only used for
/// diagnostics now.

View File

@ -16,12 +16,15 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::KernArgPreload;
#define DEBUG_TYPE "amdgpu-argument-reg-usage-info"
INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE,
"Argument Register Usage Information Storage", false, true)
constexpr HiddenArgInfo HiddenArgUtils::HiddenArgs[END_HIDDEN_ARGS];
void ArgDescriptor::print(raw_ostream &OS,
const TargetRegisterInfo *TRI) const {
if (!isSet()) {
@ -176,6 +179,37 @@ AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
return AI;
}
SmallVector<const KernArgPreloadDescriptor *, 4>
AMDGPUFunctionArgInfo::getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const {
SmallVector<const KernArgPreloadDescriptor *, 4> Results;
for (const auto &KV : PreloadKernArgs) {
if (KV.second.OrigArgIdx == ArgIdx)
Results.push_back(&KV.second);
}
llvm::stable_sort(Results, [](const KernArgPreloadDescriptor *A,
const KernArgPreloadDescriptor *B) {
return A->PartIdx < B->PartIdx;
});
return Results;
}
std::optional<const KernArgPreloadDescriptor *>
AMDGPUFunctionArgInfo::getHiddenArgPreloadDescriptor(HiddenArg HA) const {
assert(HA < END_HIDDEN_ARGS);
auto HiddenArgIt = PreloadHiddenArgsIndexMap.find(HA);
if (HiddenArgIt == PreloadHiddenArgsIndexMap.end())
return std::nullopt;
auto KernArgIt = PreloadKernArgs.find(HiddenArgIt->second);
if (KernArgIt == PreloadKernArgs.end())
return std::nullopt;
return &KernArgIt->second;
}
const AMDGPUFunctionArgInfo &
AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
auto I = ArgInfoMap.find(&F);

View File

@ -11,7 +11,10 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
namespace llvm {
@ -95,11 +98,78 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
return OS;
}
struct KernArgPreloadDescriptor : public ArgDescriptor {
KernArgPreloadDescriptor() {}
SmallVector<MCRegister> Regs;
namespace KernArgPreload {
enum HiddenArg {
HIDDEN_BLOCK_COUNT_X,
HIDDEN_BLOCK_COUNT_Y,
HIDDEN_BLOCK_COUNT_Z,
HIDDEN_GROUP_SIZE_X,
HIDDEN_GROUP_SIZE_Y,
HIDDEN_GROUP_SIZE_Z,
HIDDEN_REMAINDER_X,
HIDDEN_REMAINDER_Y,
HIDDEN_REMAINDER_Z,
END_HIDDEN_ARGS
};
// Stores information about a specific hidden argument.
struct HiddenArgInfo {
// Offset in bytes from the location in the kernearg segment pointed to by
// the implicitarg pointer.
uint8_t Offset;
// The size of the hidden argument in bytes.
uint8_t Size;
// The name of the hidden argument in the kernel signature.
const char *Name;
};
struct HiddenArgUtils {
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
{0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
{8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
{14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
{18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
{22, 2, "_hidden_remainder_z"}};
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
if (HiddenArgs[I].Offset == Offset)
return static_cast<HiddenArg>(I);
return END_HIDDEN_ARGS;
}
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
if (HA < END_HIDDEN_ARGS)
return static_cast<Type *>(Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8));
llvm_unreachable("Unexpected hidden argument.");
}
static const char *getHiddenArgName(HiddenArg HA) {
if (HA < END_HIDDEN_ARGS) {
return HiddenArgs[HA].Name;
}
llvm_unreachable("Unexpected hidden argument.");
}
};
struct KernArgPreloadDescriptor {
// Id of the original argument in the IR kernel function argument list.
unsigned OrigArgIdx = 0;
// If this IR argument was split into multiple parts, this is the index of the
// part in the original argument.
unsigned PartIdx = 0;
// The registers that the argument is preloaded into. The argument may be
// split accross multilpe registers.
SmallVector<MCRegister, 2> Regs;
};
} // namespace KernArgPreload
struct AMDGPUFunctionArgInfo {
// clang-format off
enum PreloadedValue {
@ -161,7 +231,10 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor WorkItemIDZ;
// Map the index of preloaded kernel arguments to its descriptor.
SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{};
SmallDenseMap<int, KernArgPreload::KernArgPreloadDescriptor>
PreloadKernArgs{};
// Map hidden argument to the index of it's descriptor.
SmallDenseMap<KernArgPreload::HiddenArg, int> PreloadHiddenArgsIndexMap{};
// The first user SGPR allocated for kernarg preloading.
Register FirstKernArgPreloadReg;
@ -169,6 +242,16 @@ struct AMDGPUFunctionArgInfo {
getPreloadedValue(PreloadedValue Value) const;
static AMDGPUFunctionArgInfo fixedABILayout();
// Returns preload argument descriptors for an IR argument index. Isel may
// split IR arguments into multiple parts, the return vector holds all parts
// associated with an IR argument in the kernel signature.
SmallVector<const KernArgPreload::KernArgPreloadDescriptor *, 4>
getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const;
// Returns the hidden arguments `KernArgPreloadDescriptor` if it is preloaded.
std::optional<const KernArgPreload::KernArgPreloadDescriptor *>
getHiddenArgPreloadDescriptor(KernArgPreload::HiddenArg HA) const;
};
class AMDGPUArgumentUsageInfo : public ImmutablePass {

View File

@ -15,6 +15,7 @@
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"
@ -290,7 +291,7 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
if (Arg.hasAttribute("amdgpu-hidden-argument"))
continue;
emitKernelArg(Arg, Offset, Args);
emitKernelArg(Arg, Offset, Args, MF);
}
emitHiddenKernelArgs(MF, Offset, Args);
@ -300,7 +301,8 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
unsigned &Offset,
msgpack::ArrayDocNode Args) {
msgpack::ArrayDocNode Args,
const MachineFunction &MF) {
const auto *Func = Arg.getParent();
auto ArgNo = Arg.getArgNo();
const MDNode *Node;
@ -357,17 +359,18 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
Align ArgAlign;
std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
emitKernelArg(DL, ArgTy, ArgAlign,
getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual,
AccQual, TypeQual);
emitKernelArgImpl(DL, ArgTy, ArgAlign,
getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
"" /* PreloadRegisters */, PointeeAlign, Name, TypeName,
BaseTypeName, ActAccQual, AccQual, TypeQual);
}
void MetadataStreamerMsgPackV4::emitKernelArg(
void MetadataStreamerMsgPackV4::emitKernelArgImpl(
const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
StringRef Name, StringRef TypeName, StringRef BaseTypeName,
StringRef ActAccQual, StringRef AccQual, StringRef TypeQual) {
unsigned &Offset, msgpack::ArrayDocNode Args, StringRef PreloadRegisters,
MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName,
StringRef BaseTypeName, StringRef ActAccQual, StringRef AccQual,
StringRef TypeQual) {
auto Arg = Args.getDocument()->getMapNode();
if (!Name.empty())
@ -409,6 +412,11 @@ void MetadataStreamerMsgPackV4::emitKernelArg(
Arg[".is_pipe"] = Arg.getDocument()->getNode(true);
}
if (!PreloadRegisters.empty()) {
Arg[".preload_registers"] =
Arg.getDocument()->getNode(PreloadRegisters, /*Copy=*/true);
}
Args.push_back(Arg);
}
@ -428,14 +436,14 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
if (HiddenArgNumBytes >= 8)
emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
Args);
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
Args);
if (HiddenArgNumBytes >= 16)
emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
Args);
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
Args);
if (HiddenArgNumBytes >= 24)
emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
Args);
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
Args);
auto *Int8PtrTy =
PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
@ -445,42 +453,42 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
// before code object V5, which makes the mutual exclusion between the
// "printf buffer" and "hostcall buffer" here sound.
if (M->getNamedMetadata("llvm.printf.fmts"))
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
Args);
else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr"))
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer",
Offset, Args);
else
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
// Emit "default queue" and "completion action" arguments if enqueue kernel is
// used, otherwise emit dummy "none" arguments.
if (HiddenArgNumBytes >= 40) {
if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
Args);
} else {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
}
if (HiddenArgNumBytes >= 48) {
if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action",
Offset, Args);
} else {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
}
// Emit the pointer argument for multi-grid object.
if (HiddenArgNumBytes >= 56) {
if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg",
Offset, Args);
} else {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
}
}
@ -635,77 +643,83 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
auto *Int16Ty = Type::getInt16Ty(Func.getContext());
Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args);
emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args);
emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args);
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset,
Args);
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset,
Args);
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset,
Args);
emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
// Reserved for hidden_tool_correlation_id.
Offset += 8;
Offset += 8; // Reserved.
emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args);
emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args);
emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args);
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
Args);
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
Args);
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
Args);
emitKernelArg(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
Offset += 6; // Reserved.
auto *Int8PtrTy =
PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
if (M->getNamedMetadata("llvm.printf.fmts")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg",
Offset, Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-heap-ptr"))
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
else
Offset += 8; // Skipped.
if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action",
Offset, Args);
} else {
Offset += 8; // Skipped.
}
// Emit argument for hidden dynamic lds size
if (MFI.isDynamicLDSUsed()) {
emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
Args);
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
Args);
} else {
Offset += 4; // skipped
}
@ -715,14 +729,17 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
// hidden_private_base and hidden_shared_base are only when the subtarget has
// ApertureRegs.
if (!ST.hasApertureRegs()) {
emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args);
emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args);
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_private_base", Offset,
Args);
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_shared_base", Offset,
Args);
} else {
Offset += 8; // Skipped.
}
if (MFI.getUserSGPRInfo().hasQueuePtr())
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset,
Args);
}
void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM,
@ -745,5 +762,241 @@ void MetadataStreamerMsgPackV6::emitVersion() {
getRootMetadata("amdhsa.version") = Version;
}
void MetadataStreamerMsgPackV6::emitHiddenKernelArgWithPreload(
const DataLayout &DL, Type *ArgTy, Align Alignment,
KernArgPreload::HiddenArg HiddenArg, StringRef ArgName, unsigned &Offset,
msgpack::ArrayDocNode Args, const AMDGPUFunctionArgInfo &ArgInfo) {
SmallString<16> PreloadStr;
auto PreloadDesc = ArgInfo.getHiddenArgPreloadDescriptor(HiddenArg);
if (PreloadDesc) {
const auto &Regs = (*PreloadDesc)->Regs;
for (unsigned I = 0; I < Regs.size(); ++I) {
if (I > 0)
PreloadStr += " ";
PreloadStr += AMDGPUInstPrinter::getRegisterName(Regs[I]);
}
}
emitKernelArgImpl(DL, ArgTy, Alignment, ArgName, Offset, Args, PreloadStr);
}
void MetadataStreamerMsgPackV6::emitHiddenKernelArgs(
const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) {
auto &Func = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// No implicit kernel argument is used.
if (ST.getImplicitArgNumBytes(Func) == 0)
return;
const Module *M = Func.getParent();
auto &DL = M->getDataLayout();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
auto *Int64Ty = Type::getInt64Ty(Func.getContext());
auto *Int32Ty = Type::getInt32Ty(Func.getContext());
auto *Int16Ty = Type::getInt16Ty(Func.getContext());
Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
const AMDGPUFunctionArgInfo &ArgInfo = MFI.getArgInfo();
emitHiddenKernelArgWithPreload(DL, Int32Ty, Align(4),
KernArgPreload::HIDDEN_BLOCK_COUNT_X,
"hidden_block_count_x", Offset, Args, ArgInfo);
emitHiddenKernelArgWithPreload(DL, Int32Ty, Align(4),
KernArgPreload::HIDDEN_BLOCK_COUNT_Y,
"hidden_block_count_y", Offset, Args, ArgInfo);
emitHiddenKernelArgWithPreload(DL, Int32Ty, Align(4),
KernArgPreload::HIDDEN_BLOCK_COUNT_Z,
"hidden_block_count_z", Offset, Args, ArgInfo);
emitHiddenKernelArgWithPreload(DL, Int16Ty, Align(2),
KernArgPreload::HIDDEN_GROUP_SIZE_X,
"hidden_group_size_x", Offset, Args, ArgInfo);
emitHiddenKernelArgWithPreload(DL, Int16Ty, Align(2),
KernArgPreload::HIDDEN_GROUP_SIZE_Y,
"hidden_group_size_y", Offset, Args, ArgInfo);
emitHiddenKernelArgWithPreload(DL, Int16Ty, Align(2),
KernArgPreload::HIDDEN_GROUP_SIZE_Z,
"hidden_group_size_z", Offset, Args, ArgInfo);
emitHiddenKernelArgWithPreload(DL, Int16Ty, Align(2),
KernArgPreload::HIDDEN_REMAINDER_X,
"hidden_remainder_x", Offset, Args, ArgInfo);
emitHiddenKernelArgWithPreload(DL, Int16Ty, Align(2),
KernArgPreload::HIDDEN_REMAINDER_Y,
"hidden_remainder_y", Offset, Args, ArgInfo);
emitHiddenKernelArgWithPreload(DL, Int16Ty, Align(2),
KernArgPreload::HIDDEN_REMAINDER_Z,
"hidden_remainder_z", Offset, Args, ArgInfo);
// Reserved for hidden_tool_correlation_id.
Offset += 8;
Offset += 8; // Reserved.
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
Args);
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
Args);
emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
Args);
emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
Offset += 6; // Reserved.
auto *Int8PtrTy =
PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
if (M->getNamedMetadata("llvm.printf.fmts")) {
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) {
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg",
Offset, Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-heap-ptr"))
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
else
Offset += 8; // Skipped.
if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action",
Offset, Args);
} else {
Offset += 8; // Skipped.
}
// Emit argument for hidden dynamic lds size
if (MFI.isDynamicLDSUsed()) {
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
Args);
} else {
Offset += 4; // skipped
}
Offset += 68; // Reserved.
// hidden_private_base and hidden_shared_base are only when the subtarget has
// ApertureRegs.
if (!ST.hasApertureRegs()) {
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_private_base", Offset,
Args);
emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_shared_base", Offset,
Args);
} else {
Offset += 8; // Skipped.
}
if (MFI.getUserSGPRInfo().hasQueuePtr())
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset,
Args);
}
void MetadataStreamerMsgPackV6::emitKernelArg(const Argument &Arg,
unsigned &Offset,
msgpack::ArrayDocNode Args,
const MachineFunction &MF) {
const auto *Func = Arg.getParent();
auto ArgNo = Arg.getArgNo();
const MDNode *Node;
StringRef Name;
Node = Func->getMetadata("kernel_arg_name");
if (Node && ArgNo < Node->getNumOperands())
Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
else if (Arg.hasName())
Name = Arg.getName();
StringRef TypeName;
Node = Func->getMetadata("kernel_arg_type");
if (Node && ArgNo < Node->getNumOperands())
TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
StringRef BaseTypeName;
Node = Func->getMetadata("kernel_arg_base_type");
if (Node && ArgNo < Node->getNumOperands())
BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
StringRef ActAccQual;
// Do we really need NoAlias check here?
if (Arg.getType()->isPointerTy() && Arg.hasNoAliasAttr()) {
if (Arg.onlyReadsMemory())
ActAccQual = "read_only";
else if (Arg.hasAttribute(Attribute::WriteOnly))
ActAccQual = "write_only";
}
StringRef AccQual;
Node = Func->getMetadata("kernel_arg_access_qual");
if (Node && ArgNo < Node->getNumOperands())
AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
StringRef TypeQual;
Node = Func->getMetadata("kernel_arg_type_qual");
if (Node && ArgNo < Node->getNumOperands())
TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
const DataLayout &DL = Func->getDataLayout();
MaybeAlign PointeeAlign;
Type *Ty = Arg.hasByRefAttr() ? Arg.getParamByRefType() : Arg.getType();
// FIXME: Need to distinguish in memory alignment from pointer alignment.
if (auto *PtrTy = dyn_cast<PointerType>(Ty)) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
PointeeAlign = Arg.getParamAlign().valueOrOne();
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
SmallString<8> PreloadRegisters;
if (MFI->getNumKernargPreloadedSGPRs()) {
assert(MF.getSubtarget<GCNSubtarget>().hasKernargPreload());
const auto &PreloadDescs =
MFI->getArgInfo().getPreloadDescriptorsForArgIdx(ArgNo);
for (auto &Desc : PreloadDescs) {
if (!PreloadRegisters.empty())
PreloadRegisters += " ";
for (unsigned I = 0; I < Desc->Regs.size(); ++I) {
if (I > 0)
PreloadRegisters += " ";
PreloadRegisters += AMDGPUInstPrinter::getRegisterName(Desc->Regs[I]);
}
}
}
// There's no distinction between byval aggregates and raw aggregates.
Type *ArgTy;
Align ArgAlign;
std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
emitKernelArgImpl(DL, ArgTy, ArgAlign,
getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
PreloadRegisters, PointeeAlign, Name, TypeName,
BaseTypeName, ActAccQual, AccQual, TypeQual);
}
} // end namespace AMDGPU::HSAMD
} // end namespace llvm

View File

@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUDelayedMCExpr.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
@ -60,6 +61,9 @@ protected:
virtual void emitVersion() = 0;
virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
msgpack::ArrayDocNode Args) = 0;
virtual void emitKernelArg(const Argument &Arg, unsigned &Offset,
msgpack::ArrayDocNode Args,
const MachineFunction &MF) = 0;
virtual void emitKernelAttrs(const AMDGPUTargetMachine &TM,
const Function &Func,
msgpack::MapDocNode Kern) = 0;
@ -108,15 +112,17 @@ protected:
void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern);
void emitKernelArg(const Argument &Arg, unsigned &Offset,
msgpack::ArrayDocNode Args);
void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment,
StringRef ValueKind, unsigned &Offset,
msgpack::ArrayDocNode Args,
MaybeAlign PointeeAlign = std::nullopt,
StringRef Name = "", StringRef TypeName = "",
StringRef BaseTypeName = "", StringRef ActAccQual = "",
StringRef AccQual = "", StringRef TypeQual = "");
const MachineFunction &MF) override;
void emitKernelArgImpl(const DataLayout &DL, Type *Ty, Align Alignment,
StringRef ValueKind, unsigned &Offset,
msgpack::ArrayDocNode Args,
StringRef PreloadRegisters = "",
MaybeAlign PointeeAlign = std::nullopt,
StringRef Name = "", StringRef TypeName = "",
StringRef BaseTypeName = "", StringRef ActAccQual = "",
StringRef AccQual = "", StringRef TypeQual = "");
void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
msgpack::ArrayDocNode Args) override;
@ -160,6 +166,18 @@ public:
class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 {
protected:
void emitVersion() override;
void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
msgpack::ArrayDocNode Args) override;
void emitKernelArg(const Argument &Arg, unsigned &Offset,
msgpack::ArrayDocNode Args,
const MachineFunction &MF) override;
void emitHiddenKernelArgWithPreload(const DataLayout &DL, Type *ArgTy,
Align Alignment,
KernArgPreload::HiddenArg HiddenArg,
StringRef ArgName, unsigned &Offset,
msgpack::ArrayDocNode Args,
const AMDGPUFunctionArgInfo &ArgInfo);
public:
MetadataStreamerMsgPackV6() = default;

View File

@ -24,6 +24,7 @@
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
using namespace llvm;
using namespace llvm::KernArgPreload;
namespace {
@ -33,59 +34,6 @@ private:
const GCNSubtarget &ST;
unsigned NumFreeUserSGPRs;
enum HiddenArg : unsigned {
HIDDEN_BLOCK_COUNT_X,
HIDDEN_BLOCK_COUNT_Y,
HIDDEN_BLOCK_COUNT_Z,
HIDDEN_GROUP_SIZE_X,
HIDDEN_GROUP_SIZE_Y,
HIDDEN_GROUP_SIZE_Z,
HIDDEN_REMAINDER_X,
HIDDEN_REMAINDER_Y,
HIDDEN_REMAINDER_Z,
END_HIDDEN_ARGS
};
// Stores information about a specific hidden argument.
struct HiddenArgInfo {
// Offset in bytes from the location in the kernearg segment pointed to by
// the implicitarg pointer.
uint8_t Offset;
// The size of the hidden argument in bytes.
uint8_t Size;
// The name of the hidden argument in the kernel signature.
const char *Name;
};
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
{0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
{8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
{14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
{18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
{22, 2, "_hidden_remainder_z"}};
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
if (HiddenArgs[I].Offset == Offset)
return static_cast<HiddenArg>(I);
return END_HIDDEN_ARGS;
}
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
if (HA < END_HIDDEN_ARGS)
return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
llvm_unreachable("Unexpected hidden argument.");
}
static const char *getHiddenArgName(HiddenArg HA) {
if (HA < END_HIDDEN_ARGS) {
return HiddenArgs[HA].Name;
}
llvm_unreachable("Unexpected hidden argument.");
}
// Clones the function after adding implicit arguments to the argument list
// and returns the new updated function. Preloaded implicit arguments are
// added up to and including the last one that will be preloaded, indicated by
@ -98,7 +46,7 @@ private:
LLVMContext &Ctx = F.getParent()->getContext();
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
FTypes.push_back(HiddenArgUtils::getHiddenArgType(Ctx, HiddenArg(I)));
FunctionType *NFT =
FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
@ -126,7 +74,7 @@ private:
AttributeList AL = NF->getAttributes();
for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
NFArg++->setName(getHiddenArgName(HiddenArg(I)));
NFArg++->setName(HiddenArgUtils::getHiddenArgName(HiddenArg(I)));
}
NF->setAttributes(AL);
@ -202,8 +150,9 @@ public:
// FIXME: Expand to handle 64-bit implicit args and large merged loads.
LLVMContext &Ctx = F.getParent()->getContext();
Type *LoadTy = Load->getType();
HiddenArg HA = getHiddenArgFromOffset(Offset);
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
HiddenArg HA = HiddenArgUtils::getHiddenArgFromOffset(Offset);
if (HA == END_HIDDEN_ARGS ||
LoadTy != HiddenArgUtils::getHiddenArgType(Ctx, HA))
continue;
ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
@ -238,13 +187,15 @@ public:
if (PreloadEnd == ImplicitArgLoads.begin())
return;
unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
unsigned LastHiddenArgIndex =
HiddenArgUtils::getHiddenArgFromOffset(PreloadEnd[-1].second);
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
assert(NF);
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
LoadInst *LoadInst = I->first;
unsigned LoadOffset = I->second;
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
unsigned HiddenArgIndex =
HiddenArgUtils::getHiddenArgFromOffset(LoadOffset);
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
Argument *Arg = NF->getArg(Index);
LoadInst->replaceAllUsesWith(Arg);

View File

@ -46,6 +46,7 @@
#include <optional>
using namespace llvm;
using namespace llvm::KernArgPreload;
#define DEBUG_TYPE "si-lower"
@ -2537,6 +2538,18 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
// Maps a hidden kernel argument to its preload index in
// PreloadHiddenArgsIndexMap.
static void mapHiddenArgToPreloadIndex(AMDGPUFunctionArgInfo &ArgInfo,
unsigned ArgOffset,
unsigned ImplicitArgOffset,
unsigned ArgIdx) {
auto [It, Inserted] = ArgInfo.PreloadHiddenArgsIndexMap.try_emplace(
HiddenArgUtils::getHiddenArgFromOffset(ArgOffset - ImplicitArgOffset));
assert(Inserted && "Preload hidden kernel argument allocated twice.");
It->second = ArgIdx;
}
// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
@ -2549,6 +2562,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
bool InPreloadSequence = true;
unsigned InIdx = 0;
bool AlignedForImplictArgs = false;
unsigned ImplicitArgOffsetAdjustment = 0;
unsigned ImplicitArgOffset = 0;
for (auto &Arg : F.args()) {
if (!InPreloadSequence || !Arg.hasInRegAttr())
@ -2577,18 +2591,32 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
if (!AlignedForImplictArgs) {
ImplicitArgOffset =
alignTo(LastExplicitArgOffset,
Subtarget->getAlignmentForImplicitArgPtr()) -
LastExplicitArgOffset;
Subtarget->getAlignmentForImplicitArgPtr());
ImplicitArgOffsetAdjustment =
ImplicitArgOffset - LastExplicitArgOffset;
AlignedForImplictArgs = true;
}
ArgOffset += ImplicitArgOffset;
ArgOffset += ImplicitArgOffsetAdjustment;
}
// Arg is preloaded into the previous SGPR.
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
assert(InIdx >= 1 && "No previous SGPR");
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
auto [It, Inserted] =
Info.getArgInfo().PreloadKernArgs.try_emplace(InIdx);
assert(Inserted && "Preload kernel argument allocated twice.");
KernArgPreloadDescriptor &PreloadDesc = It->second;
const KernArgPreloadDescriptor &PrevDesc =
Info.getArgInfo().PreloadKernArgs[InIdx - 1];
PreloadDesc.Regs.push_back(PrevDesc.Regs[0]);
PreloadDesc.OrigArgIdx = Arg.getArgNo();
PreloadDesc.PartIdx = InIdx;
if (Arg.hasAttribute("amdgpu-hidden-argument"))
mapHiddenArgToPreloadIndex(Info.getArgInfo(), ArgOffset,
ImplicitArgOffset, InIdx);
continue;
}
@ -2600,11 +2628,15 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
break;
}
if (Arg.hasAttribute("amdgpu-hidden-argument"))
mapHiddenArgToPreloadIndex(Info.getArgInfo(), ArgOffset,
ImplicitArgOffset, InIdx);
// Preload this argument.
const TargetRegisterClass *RC =
TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
SmallVectorImpl<MCRegister> *PreloadRegs =
Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
SmallVectorImpl<MCRegister> *PreloadRegs = Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, Arg.getArgNo(), PaddingSGPRs);
if (PreloadRegs->size() > 1)
RC = &AMDGPU::SGPR_32RegClass;

View File

@ -253,9 +253,14 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx);
unsigned AllocSizeDWord, unsigned PartIdx, unsigned ArgIdx,
unsigned PaddingSGPRs) {
auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(PartIdx);
assert(Inserted && "Preload kernel argument allocated twice.");
KernArgPreload::KernArgPreloadDescriptor &PreloadDesc = It->second;
PreloadDesc.PartIdx = PartIdx;
PreloadDesc.OrigArgIdx = ArgIdx;
NumUserSGPRs += PaddingSGPRs;
// If the available register tuples are aligned with the kernarg to be
// preloaded use that register, otherwise we need to use a set of SGPRs and
@ -264,7 +269,7 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
Register PreloadReg =
TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
auto &Regs = It->second.Regs;
auto &Regs = PreloadDesc.Regs;
if (PreloadReg &&
(RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
Regs.push_back(PreloadReg);

View File

@ -809,8 +809,8 @@ public:
Register addLDSKernelId();
SmallVectorImpl<MCRegister> *
addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
unsigned AllocSizeDWord, int KernArgIdx,
int PaddingSGPRs);
unsigned AllocSizeDWord, unsigned PartIdx,
unsigned ArgIdx, unsigned PaddingSGPRs);
/// Increment user SGPRs used for padding the argument list only.
Register addReservedUserSGPR() {

View File

@ -0,0 +1,388 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=16 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=16 < %s | FileCheck --check-prefix=CHECK %s
; CHECK: amdhsa.kernels:
; CHECK-NEXT: - .agpr_count: 0
; CHECK-NEXT: .args:
; CHECK-NEXT: - .name: in
; CHECK-NEXT: .offset: 0
; CHECK-NEXT: .preload_registers: s8
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: by_value
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: r
; CHECK-NEXT: .offset: 8
; CHECK-NEXT: .preload_registers: 's[10:11]'
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: a
; CHECK-NEXT: .offset: 16
; CHECK-NEXT: .preload_registers: 's[12:13]'
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: b
; CHECK-NEXT: .offset: 24
; CHECK-NEXT: .preload_registers: 's[14:15]'
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .offset: 32
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_x
; CHECK-NEXT: - .offset: 36
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_y
; CHECK-NEXT: - .offset: 40
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_z
; CHECK-NEXT: - .offset: 44
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_x
; CHECK-NEXT: - .offset: 46
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_y
; CHECK-NEXT: - .offset: 48
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_z
; CHECK-NEXT: - .offset: 50
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_x
; CHECK-NEXT: - .offset: 52
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_y
; CHECK-NEXT: - .offset: 54
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_z
; CHECK-NEXT: - .offset: 72
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_x
; CHECK-NEXT: - .offset: 80
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_y
; CHECK-NEXT: - .offset: 88
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_z
; CHECK-NEXT: - .offset: 96
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_grid_dims
; CHECK-NEXT: - .offset: 104
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_printf_buffer
; CHECK-NEXT: - .offset: 112
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
; CHECK-NEXT: - .offset: 120
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
; CHECK-NEXT: - .offset: 128
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_heap_v1
; CHECK-NEXT: - .offset: 136
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_default_queue
; CHECK-NEXT: - .offset: 144
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_completion_action
; CHECK-NEXT: - .offset: 152
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
; CHECK-NEXT: - .offset: 232
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_queue_ptr
; CHECK-NEXT: .group_segment_fixed_size: 0
; CHECK-NEXT: .kernarg_segment_align: 8
; CHECK-NEXT: .kernarg_segment_size: 288
; CHECK-NEXT: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .name: test_preload_v6
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK-NEXT: .sgpr_count: 22
; CHECK-NEXT: .sgpr_spill_count: 0
; CHECK-NEXT: .symbol: test_preload_v6.kd
; CHECK-NEXT: .uses_dynamic_stack: false
; CHECK-NEXT: .vgpr_count: 3
; CHECK-NEXT: .vgpr_spill_count: 0
; CHECK-NEXT: .wavefront_size: 64
; CHECK-NEXT: - .agpr_count: 0
; CHECK-NEXT: .args:
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: out
; CHECK-NEXT: .offset: 0
; CHECK-NEXT: .preload_registers: 's[2:3]'
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .offset: 8
; CHECK-NEXT: .preload_registers: s4
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_x
; CHECK-NEXT: - .offset: 12
; CHECK-NEXT: .preload_registers: s5
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_y
; CHECK-NEXT: - .offset: 16
; CHECK-NEXT: .preload_registers: s6
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_z
; CHECK-NEXT: - .offset: 20
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_x
; CHECK-NEXT: - .offset: 22
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_y
; CHECK-NEXT: - .offset: 24
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_z
; CHECK-NEXT: - .offset: 26
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_x
; CHECK-NEXT: - .offset: 28
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_y
; CHECK-NEXT: - .offset: 30
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_z
; CHECK-NEXT: - .offset: 48
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_x
; CHECK-NEXT: - .offset: 56
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_y
; CHECK-NEXT: - .offset: 64
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_z
; CHECK-NEXT: - .offset: 72
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_grid_dims
; CHECK-NEXT: - .offset: 80
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_printf_buffer
; CHECK-NEXT: .group_segment_fixed_size: 0
; CHECK-NEXT: .kernarg_segment_align: 8
; CHECK-NEXT: .kernarg_segment_size: 264
; CHECK-NEXT: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .name: test_preload_v6_block_count_xyz
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK-NEXT: .sgpr_count: 13
; CHECK-NEXT: .sgpr_spill_count: 0
; CHECK-NEXT: .symbol: test_preload_v6_block_count_xyz.kd
; CHECK-NEXT: .uses_dynamic_stack: false
; CHECK-NEXT: .vgpr_count: 4
; CHECK-NEXT: .vgpr_spill_count: 0
; CHECK-NEXT: .wavefront_size: 64
; CHECK-NEXT: - .agpr_count: 0
; CHECK-NEXT: .args:
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: out
; CHECK-NEXT: .offset: 0
; CHECK-NEXT: .preload_registers: 's[2:3]'
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .offset: 8
; CHECK-NEXT: .preload_registers: s4
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_x
; CHECK-NEXT: - .offset: 12
; CHECK-NEXT: .preload_registers: s5
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_y
; CHECK-NEXT: - .offset: 16
; CHECK-NEXT: .preload_registers: s6
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_z
; CHECK-NEXT: - .offset: 20
; CHECK-NEXT: .preload_registers: s7
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_x
; CHECK-NEXT: - .offset: 22
; CHECK-NEXT: .preload_registers: s7
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_y
; CHECK-NEXT: - .offset: 24
; CHECK-NEXT: .preload_registers: s8
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_z
; CHECK-NEXT: - .offset: 26
; CHECK-NEXT: .preload_registers: s8
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_x
; CHECK-NEXT: - .offset: 28
; CHECK-NEXT: .preload_registers: s9
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_y
; CHECK-NEXT: - .offset: 30
; CHECK-NEXT: .preload_registers: s9
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_z
; CHECK-NEXT: - .offset: 48
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_x
; CHECK-NEXT: - .offset: 56
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_y
; CHECK-NEXT: - .offset: 64
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_z
; CHECK-NEXT: - .offset: 72
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_grid_dims
; CHECK-NEXT: - .offset: 80
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_printf_buffer
; CHECK-NEXT: .group_segment_fixed_size: 0
; CHECK-NEXT: .kernarg_segment_align: 8
; CHECK-NEXT: .kernarg_segment_size: 264
; CHECK-NEXT: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .name: test_preload_v6_block_count_z_workgroup_size_z_remainder_z
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK-NEXT: .sgpr_count: 16
; CHECK-NEXT: .sgpr_spill_count: 0
; CHECK-NEXT: .symbol: test_preload_v6_block_count_z_workgroup_size_z_remainder_z.kd
; CHECK-NEXT: .uses_dynamic_stack: false
; CHECK-NEXT: .vgpr_count: 4
; CHECK-NEXT: .vgpr_spill_count: 0
; CHECK-NEXT: .wavefront_size: 64
; CHECK-NEXT: - .agpr_count: 0
; CHECK-NEXT: .args:
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: out
; CHECK-NEXT: .offset: 0
; CHECK-NEXT: .preload_registers: 's[2:3]'
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .name: arg0
; CHECK-NEXT: .offset: 8
; CHECK-NEXT: .preload_registers: s4
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: by_value
; CHECK-NEXT: - .name: arg1
; CHECK-NEXT: .offset: 10
; CHECK-NEXT: .preload_registers: s4
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: by_value
; CHECK-NEXT: - .offset: 16
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_x
; CHECK-NEXT: - .offset: 20
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_y
; CHECK-NEXT: - .offset: 24
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_z
; CHECK-NEXT: - .offset: 28
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_x
; CHECK-NEXT: - .offset: 30
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_y
; CHECK-NEXT: - .offset: 32
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_z
; CHECK-NEXT: - .offset: 34
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_x
; CHECK-NEXT: - .offset: 36
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_y
; CHECK-NEXT: - .offset: 38
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_z
; CHECK-NEXT: - .offset: 56
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_x
; CHECK-NEXT: - .offset: 64
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_y
; CHECK-NEXT: - .offset: 72
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_z
; CHECK-NEXT: - .offset: 80
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_grid_dims
; CHECK-NEXT: - .offset: 88
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_printf_buffer
; CHECK-NEXT: .group_segment_fixed_size: 0
; CHECK-NEXT: .kernarg_segment_align: 8
; CHECK-NEXT: .kernarg_segment_size: 272
; CHECK-NEXT: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .name: test_prelaod_v6_ptr1_i16_i16
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK-NEXT: .sgpr_count: 11
; CHECK-NEXT: .sgpr_spill_count: 0
; CHECK-NEXT: .symbol: test_prelaod_v6_ptr1_i16_i16.kd
; CHECK-NEXT: .uses_dynamic_stack: false
; CHECK-NEXT: .vgpr_count: 2
; CHECK-NEXT: .vgpr_spill_count: 0
; CHECK-NEXT: .wavefront_size: 64
; CHECK-NEXT: amdhsa.printf:
; CHECK-NEXT: - '1:1:4:%d\n'
; CHECK-NEXT: - '2:1:8:%g\n'
; CHECK-NEXT: amdhsa.target: amdgcn-amd-amdhsa--gfx942
; CHECK-NEXT: amdhsa.version:
; CHECK-NEXT: - 1
; CHECK-NEXT: - 3
@lds = external hidden addrspace(3) global [0 x i32], align 4
define amdgpu_kernel void @test_preload_v6(
i32 inreg %in,
ptr addrspace(1) inreg %r,
ptr addrspace(1) inreg %a,
ptr addrspace(1) inreg %b) #0 {
%a.val = load half, ptr addrspace(1) %a
%b.val = load half, ptr addrspace(1) %b
%r.val = fadd half %a.val, %b.val
store half %r.val, ptr addrspace(1) %r
store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
ret void
}
define amdgpu_kernel void @test_preload_v6_block_count_xyz(ptr addrspace(1) inreg %out) #1 {
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
%load_x = load i32, ptr addrspace(4) %gep_x
%gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
%load_y = load i32, ptr addrspace(4) %gep_y
%gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%load_z = load i32, ptr addrspace(4) %gep_z
%ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0
%ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
%ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
store <3 x i32> %ins.2, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_preload_v6_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #1 {
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
%gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
%load0 = load i32, ptr addrspace(4) %gep0
%load1 = load i16, ptr addrspace(4) %gep1
%load2 = load i16, ptr addrspace(4) %gep2
%conv1 = zext i16 %load1 to i32
%conv2 = zext i16 %load2 to i32
%ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0
%ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1
%ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2
store <3 x i32> %ins.2, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_prelaod_v6_ptr1_i16_i16(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #1 {
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
%add = add i32 %ext, %ext1
store i32 %add, ptr addrspace(1) %out, align 4
ret void
}
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 600}
!llvm.printf.fmts = !{!1, !2}
!1 = !{!"1:1:4:%d\5Cn"}
!2 = !{!"2:1:8:%g\5Cn"}
attributes #0 = { optnone noinline }
attributes #1 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2

View File

@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM56 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM6 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s
@ -15,7 +15,8 @@
; ASM: amdhsa.version:
; ASM: - 1
; ASM4: - 1
; ASM56: - 2
; ASM5: - 2
; ASM6: - 3
; ELF: OS/ABI: AMDGPU_HSA (0x40)
; ELF4: ABIVersion: 2