mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-26 01:16:06 +00:00
[NVPTX/CUDA] added an optional src_size argument to __nvvm_cp_async*
The optional argument is needed for CUDA-11+ headers when we're compiling for sm_80+ GPUs. Differential Revision: https://reviews.llvm.org/D150820
This commit is contained in:
parent
aa144fbeaf
commit
6963c61f0f
@ -968,10 +968,10 @@ TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PT
|
||||
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1.", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1.", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
|
||||
|
@ -13564,6 +13564,8 @@ private:
|
||||
bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
|
||||
unsigned BuiltinID,
|
||||
CallExpr *TheCall);
|
||||
bool CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
|
||||
CallExpr *TheCall);
|
||||
|
||||
bool SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall);
|
||||
bool SemaBuiltinVAStartARMMicrosoft(CallExpr *Call);
|
||||
|
@ -18177,6 +18177,19 @@ static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
|
||||
{Ptr, CGF.EmitScalarExpr(E->getArg(1))});
|
||||
}
|
||||
|
||||
static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
|
||||
CodeGenFunction &CGF, const CallExpr *E,
|
||||
int SrcSize) {
|
||||
return E->getNumArgs() == 3
|
||||
? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
|
||||
{CGF.EmitScalarExpr(E->getArg(0)),
|
||||
CGF.EmitScalarExpr(E->getArg(1)),
|
||||
CGF.EmitScalarExpr(E->getArg(2))})
|
||||
: CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
|
||||
{CGF.EmitScalarExpr(E->getArg(0)),
|
||||
CGF.EmitScalarExpr(E->getArg(1))});
|
||||
}
|
||||
|
||||
static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
|
||||
const CallExpr *E, CodeGenFunction &CGF) {
|
||||
auto &C = CGF.CGM.getContext();
|
||||
@ -18840,6 +18853,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
|
||||
case NVPTX::BI__nvvm_ldu_h2: {
|
||||
return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
|
||||
}
|
||||
case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
|
||||
return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
|
||||
Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
|
||||
4);
|
||||
case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
|
||||
return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
|
||||
Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
|
||||
8);
|
||||
case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
|
||||
return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
|
||||
Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
|
||||
16);
|
||||
case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
|
||||
return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
|
||||
Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
|
||||
16);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -2028,6 +2028,9 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
|
||||
case llvm::Triple::wasm32:
|
||||
case llvm::Triple::wasm64:
|
||||
return CheckWebAssemblyBuiltinFunctionCall(TI, BuiltinID, TheCall);
|
||||
case llvm::Triple::nvptx:
|
||||
case llvm::Triple::nvptx64:
|
||||
return CheckNVPTXBuiltinFunctionCall(TI, BuiltinID, TheCall);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4815,6 +4818,20 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI,
|
||||
unsigned BuiltinID,
|
||||
CallExpr *TheCall) {
|
||||
switch (BuiltinID) {
|
||||
case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
|
||||
case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
|
||||
case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
|
||||
case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
|
||||
return checkArgCountAtMost(*this, TheCall, 3);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// SemaBuiltinCpuSupports - Handle __builtin_cpu_supports(char *).
|
||||
/// This checks that the target supports __builtin_cpu_supports and
|
||||
/// that the string argument is constant and valid.
|
||||
|
@ -830,15 +830,24 @@ __device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __a
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
|
||||
__nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);
|
||||
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4(
|
||||
__nvvm_cp_async_ca_shared_global_4(dst, src);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8(
|
||||
__nvvm_cp_async_ca_shared_global_8(dst, src);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16(
|
||||
__nvvm_cp_async_ca_shared_global_16(dst, src);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16(
|
||||
__nvvm_cp_async_cg_shared_global_16(dst, src);
|
||||
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4.s({{.*}}, i32 2)
|
||||
__nvvm_cp_async_ca_shared_global_4(dst, src, 2);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8.s({{.*}}, i32 2)
|
||||
__nvvm_cp_async_ca_shared_global_8(dst, src, 2);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16.s({{.*}}, i32 2)
|
||||
__nvvm_cp_async_ca_shared_global_16(dst, src, 2);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16.s({{.*}}, i32 2)
|
||||
__nvvm_cp_async_cg_shared_global_16(dst, src, 2);
|
||||
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
|
||||
__nvvm_cp_async_commit_group();
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
|
||||
|
@ -10,6 +10,7 @@
|
||||
// RUN: -fsyntax-only -verify=host %s
|
||||
// RUN: %clang_cc1 -triple nvptx64-unknown-cuda -fcuda-is-device \
|
||||
// RUN: -aux-triple x86_64-unknown-unknown \
|
||||
// RUN: -target-cpu sm_80 -target-feature +ptx70 \
|
||||
// RUN: -fsyntax-only -verify=dev %s
|
||||
|
||||
#if !(defined(__amd64__) && defined(__PTX__))
|
||||
@ -28,3 +29,23 @@ __attribute__((device)) void df() {
|
||||
int y = __builtin_ia32_rdtsc(); // dev-error {{reference to __host__ function '__builtin_ia32_rdtsc' in __device__ function}}
|
||||
x = __builtin_abs(1);
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
__attribute__((device)) void nvvm_async_copy(__attribute__((address_space(3))) void* dst,
|
||||
__attribute__((address_space(1))) const void* src) {
|
||||
__nvvm_cp_async_ca_shared_global_4(dst, src);
|
||||
__nvvm_cp_async_ca_shared_global_8(dst, src);
|
||||
__nvvm_cp_async_ca_shared_global_16(dst, src);
|
||||
__nvvm_cp_async_cg_shared_global_16(dst, src);
|
||||
|
||||
__nvvm_cp_async_ca_shared_global_4(dst, src, 2);
|
||||
__nvvm_cp_async_ca_shared_global_8(dst, src, 2);
|
||||
__nvvm_cp_async_ca_shared_global_16(dst, src, 2);
|
||||
__nvvm_cp_async_cg_shared_global_16(dst, src, 2);
|
||||
|
||||
__nvvm_cp_async_ca_shared_global_4(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
|
||||
__nvvm_cp_async_ca_shared_global_8(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
|
||||
__nvvm_cp_async_ca_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
|
||||
__nvvm_cp_async_cg_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
|
||||
}
|
||||
#endif
|
||||
|
@ -1380,30 +1380,21 @@ def int_nvvm_cp_async_mbarrier_arrive_noinc_shared :
|
||||
ClangBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared">,
|
||||
Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent, IntrNoCallback]>;
|
||||
|
||||
def int_nvvm_cp_async_ca_shared_global_4 :
|
||||
ClangBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
|
||||
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async.ca.shared.global.4">;
|
||||
def int_nvvm_cp_async_ca_shared_global_8 :
|
||||
ClangBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
|
||||
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async.ca.shared.global.8">;
|
||||
def int_nvvm_cp_async_ca_shared_global_16 :
|
||||
ClangBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
|
||||
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async.ca.shared.global.16">;
|
||||
def int_nvvm_cp_async_cg_shared_global_16 :
|
||||
ClangBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
|
||||
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async.cg.shared.global.16">;
|
||||
multiclass CP_ASYNC_SHARED_GLOBAL<string n, string cc> {
|
||||
def NAME: Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async." # cc # ".shared.global." # n>;
|
||||
def _s: Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
|
||||
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async." # cc # ".shared.global." # n # ".s">;
|
||||
}
|
||||
|
||||
defm int_nvvm_cp_async_ca_shared_global_4 : CP_ASYNC_SHARED_GLOBAL<"4", "ca">;
|
||||
defm int_nvvm_cp_async_ca_shared_global_8 : CP_ASYNC_SHARED_GLOBAL<"8", "ca">;
|
||||
defm int_nvvm_cp_async_ca_shared_global_16 : CP_ASYNC_SHARED_GLOBAL<"16", "ca">;
|
||||
defm int_nvvm_cp_async_cg_shared_global_16 : CP_ASYNC_SHARED_GLOBAL<"16", "cg">;
|
||||
|
||||
def int_nvvm_cp_async_commit_group :
|
||||
ClangBuiltin<"__nvvm_cp_async_commit_group">,
|
||||
|
@ -328,39 +328,49 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
|
||||
defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
|
||||
CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
|
||||
|
||||
multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
|
||||
multiclass CP_ASYNC_SHARED_GLOBAL_I<string cc, string cpsize, Intrinsic Intrin, Intrinsic IntrinS> {
|
||||
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
|
||||
!strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
[(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
|
||||
!strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
[(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
// Variant with src_size parameter
|
||||
def _32s : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size),
|
||||
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
|
||||
[(IntrinS Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _32si: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size),
|
||||
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
|
||||
[(IntrinS Int32Regs:$dst, Int32Regs:$src, imm:$src_size)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64s : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size),
|
||||
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
|
||||
[(IntrinS Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64si: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size),
|
||||
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
|
||||
[(IntrinS Int64Regs:$dst, Int64Regs:$src, imm:$src_size)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
|
||||
CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
|
||||
CP_ASYNC_SHARED_GLOBAL_I<"ca", "4", int_nvvm_cp_async_ca_shared_global_4,
|
||||
int_nvvm_cp_async_ca_shared_global_4_s>;
|
||||
|
||||
defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
|
||||
CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
|
||||
CP_ASYNC_SHARED_GLOBAL_I<"ca", "8", int_nvvm_cp_async_ca_shared_global_8,
|
||||
int_nvvm_cp_async_ca_shared_global_8_s>;
|
||||
|
||||
defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
|
||||
CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
|
||||
|
||||
multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
|
||||
!strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
[(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
|
||||
!strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
[(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
CP_ASYNC_SHARED_GLOBAL_I<"ca", "16", int_nvvm_cp_async_ca_shared_global_16,
|
||||
int_nvvm_cp_async_ca_shared_global_16_s>;
|
||||
|
||||
defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
|
||||
CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
|
||||
CP_ASYNC_SHARED_GLOBAL_I<"cg", "16", int_nvvm_cp_async_cg_shared_global_16,
|
||||
int_nvvm_cp_async_cg_shared_global_16_s>;
|
||||
|
||||
def CP_ASYNC_COMMIT_GROUP :
|
||||
NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
|
||||
|
@ -1,35 +1,35 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
|
||||
; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
|
||||
; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.wait.group(i32)
|
||||
|
||||
; ALL-LABEL: asyncwaitgroup
|
||||
; CHECK-LABEL: asyncwaitgroup
|
||||
define void @asyncwaitgroup() {
|
||||
; ALL: cp.async.wait_group 8;
|
||||
; CHECK: cp.async.wait_group 8;
|
||||
tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
|
||||
; ALL: cp.async.wait_group 0;
|
||||
; CHECK: cp.async.wait_group 0;
|
||||
tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
|
||||
; ALL: cp.async.wait_group 16;
|
||||
; CHECK: cp.async.wait_group 16;
|
||||
tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.wait.all()
|
||||
|
||||
; ALL-LABEL: asyncwaitall
|
||||
; CHECK-LABEL: asyncwaitall
|
||||
define void @asyncwaitall() {
|
||||
; ALL: cp.async.wait_all
|
||||
; CHECK: cp.async.wait_all
|
||||
tail call void @llvm.nvvm.cp.async.wait.all()
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.commit.group()
|
||||
|
||||
; ALL-LABEL: asynccommitgroup
|
||||
; CHECK-LABEL: asynccommitgroup
|
||||
define void @asynccommitgroup() {
|
||||
; ALL: cp.async.commit_group
|
||||
; CHECK: cp.async.commit_group
|
||||
tail call void @llvm.nvvm.cp.async.commit.group()
|
||||
ret void
|
||||
}
|
||||
@ -41,72 +41,87 @@ declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %
|
||||
|
||||
; CHECK-LABEL: asyncmbarrier
|
||||
define void @asyncmbarrier(ptr %a) {
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
|
||||
; The distinction between PTX32/PTX64 here is only to capture pointer register type
|
||||
; in R to be used in subsequent tests.
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}];
|
||||
tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: asyncmbarriershared
|
||||
define void @asyncmbarriershared(ptr addrspace(3) %a) {
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
|
||||
; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}];
|
||||
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: asyncmbarriernoinc
|
||||
define void @asyncmbarriernoinc(ptr %a) {
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}];
|
||||
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: asyncmbarriernoincshared
|
||||
define void @asyncmbarriernoincshared(ptr addrspace(3) %a) {
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
|
||||
; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}];
|
||||
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
|
||||
|
||||
; CHECK-LABEL: asynccasharedglobal4i8
|
||||
define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
|
||||
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
|
||||
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
|
||||
define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4;
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, %r{{[0-9]+}};
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, 1;
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b)
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.8.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
|
||||
|
||||
; CHECK-LABEL: asynccasharedglobal8i8
|
||||
define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
|
||||
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8;
|
||||
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8;
|
||||
define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8;
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, %r{{[0-9]+}};
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, 1;
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b)
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.8.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.8.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b)
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
|
||||
|
||||
; CHECK-LABEL: asynccasharedglobal16i8
|
||||
define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
|
||||
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
|
||||
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
|
||||
define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16;
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}};
|
||||
; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1;
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b)
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b)
|
||||
declare void @llvm.nvvm.cp.async.cg.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
|
||||
|
||||
; CHECK-LABEL: asynccgsharedglobal16i8
|
||||
define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
|
||||
; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
|
||||
; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
|
||||
define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
|
||||
; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16;
|
||||
; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}};
|
||||
; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1;
|
||||
tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b)
|
||||
tail call void @llvm.nvvm.cp.async.cg.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
|
||||
tail call void @llvm.nvvm.cp.async.cg.shared.global.16.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1)
|
||||
ret void
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user