[NVPTX] Add TMA bulk tensor reduction intrinsics (#116854)
This patch adds NVVM intrinsics and NVPTX codegen for:

* cp.async.bulk.tensor.reduce.1D -> 5D variants, supporting both Tile and Im2Col modes.
* These intrinsics optionally support cache_hints as indicated by the boolean flag argument.
* Lit tests are added for all combinations of these intrinsics in cp-async-bulk-tensor-reduce.ll.
* The generated PTX is verified with a 12.3 ptxas executable.
* Added docs for these intrinsics in NVPTXUsage.rst file.

PTX Spec reference:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>

This commit is contained in:
parent 0d15d46362, commit 40d0058e6a
llvm/docs/NVPTXUsage.rst
@@ -633,7 +633,7 @@ specified by the ``i32 %d0 ... i32 %d4`` arguments.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.

'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``'
'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
@@ -648,7 +648,7 @@ Syntax:

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' intrinsics
The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``' intrinsics
correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
of PTX instructions. These instructions initiate an asynchronous prefetch
of tensor data from global memory to the L2 cache. In im2col mode, some
@@ -663,6 +663,76 @@ the same functionality as described in the ``tile`` mode intrinsics above.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.

'``llvm.nvvm.cp.async.bulk.tensor.reduce.[red_op].tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)

  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.2d(..., i32 %d0, i32 %d1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.[1-5]d``' intrinsics
correspond to the ``cp.reduce.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous reduction operation of tensor data
in global memory with the tensor data in shared{::cta} memory, using ``tile`` mode.
The dimension of the tensor data ranges from 1d to 5d with the coordinates
specified by the ``i32 %d0 ... i32 %d4`` arguments. The supported reduction
operations are {add, min, max, inc, dec, and, or, xor}, as described for the
``tile.1d`` intrinsics.

* The last argument to these intrinsics is a boolean flag
  indicating support for cache_hint. This flag argument must
  be a compile-time constant. When set, it indicates a valid
  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
  variant of the PTX instruction.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor>`_.
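
A minimal usage sketch (illustrative only, not part of the PTX ISA text; the function
and value names here are assumptions for the example):

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3), ptr, i32, i32, i64, i1)

  ; %src points to the source tile in shared::cta memory; %tmap is the
  ; tensor-map descriptor for the destination tensor in global memory.
  define void @reduce_add_tile_2d_example(ptr addrspace(3) %src, ptr %tmap, i32 %x, i32 %y, i64 %ch) {
    ; flag_ch = 1: %ch carries a valid cache hint, so this lowers to the .L2::cache_hint form.
    tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %x, i32 %y, i64 %ch, i1 1)
    ; flag_ch = 0: no cache hint, the i64 operand is ignored.
    tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %x, i32 %y, i64 0, i1 0)
    ret void
  }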

'``llvm.nvvm.cp.async.bulk.tensor.reduce.[red_op].im2col.[3-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.[3-5]d``' intrinsics
correspond to the ``cp.reduce.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous reduction operation of tensor data
in global memory with the tensor data in shared{::cta} memory, using ``im2col`` mode.
In this mode, the tensor has to be at least three-dimensional. The supported reduction
operations are the same as the ones in ``tile`` mode. The last argument to
these intrinsics is a boolean flag, with the same functionality as described in the
``tile`` mode intrinsics above.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor>`_.
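
For example (a hedged sketch; the value names are illustrative), a 3-D im2col
reduction with a cache hint could be issued as:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3), ptr, i32, i32, i32, i64, i1)

  define void @reduce_max_im2col_3d_example(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
    ; %src resides in shared::cta memory; %tmap describes the global tensor.
    tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
    ret void
  }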

Other Intrinsics
----------------
llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -635,6 +635,25 @@ class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
                                       ImmArg<ArgIndex<FlagsStartIdx>>];
}

class CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, string mode, string op> {
  string Suffix = op # "_" # mode # "_" # dim # "d";
  string Name = "int_nvvm_cp_async_bulk_tensor_reduce_" # Suffix;

  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
  list<LLVMType> ArgsTy = !listconcat(
      [llvm_shared_ptr_ty,  // src_smem_ptr
       llvm_ptr_ty],        // tensormap_ptr
      TensorDimsTy,         // actual tensor dims
      [llvm_i64_ty,         // cache_hint
       llvm_i1_ty]          // Flag for cache_hint
  );
  int FlagsStartIdx = !add(dim, 3);
  list<IntrinsicProperty> IntrProp = [IntrConvergent,
      ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
      NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
      ImmArg<ArgIndex<FlagsStartIdx>>];
}
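
For illustration, instantiating this class with dim = 2, mode = "tile" and op = "add"
describes the reduce intrinsic documented above, whose IR-level declaration is
(sketch only, the argument names are illustrative):

  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i64 %ch, i1 %flag_ch)

Here FlagsStartIdx evaluates to !add(2, 3) = 5, which marks the trailing i1 flag
(argument index 5, counting from zero) as a required immediate via ImmArg.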

let TargetPrefix = "nvvm" in {
  def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4936,4 +4955,14 @@ foreach dim = [1, 2, 3, 4, 5] in {
  }
}

// Intrinsics for TMA Copy with reduction
foreach dim = [1, 2, 3, 4, 5] in {
  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
    foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in {
      foreach reduce = [CP_ASYNC_BULK_TENSOR_REDUCE_INTR<dim, mode, red_op>] in
        def reduce.Name : DefaultAttrsIntrinsic<[], reduce.ArgsTy, reduce.IntrProp>;
    }
  }
}

} // let TargetPrefix = "nvvm"
llvm/include/llvm/IR/NVVMIntrinsicFlags.h (new file, 37 lines)
@@ -0,0 +1,37 @@
//===--- NVVMIntrinsicFlags.h -----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the definitions of the enumerations and flags
/// associated with NVVM Intrinsics.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_IR_NVVMINTRINSICFLAGS_H
#define LLVM_IR_NVVMINTRINSICFLAGS_H

#include <cstdint> // uint8_t, the enum's underlying type

namespace llvm {
namespace nvvm {

// Reduction Ops supported with TMA Copy from Shared
// to Global Memory for the "cp.reduce.async.bulk.tensor.*"
// family of PTX instructions.
enum class TMAReductionOp : uint8_t {
  ADD = 0,
  MIN = 1,
  MAX = 2,
  INC = 3,
  DEC = 4,
  AND = 5,
  OR = 6,
  XOR = 7,
};

} // namespace nvvm
} // namespace llvm
#endif // LLVM_IR_NVVMINTRINSICFLAGS_H
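
As context for how these values are used later in this patch: instruction selection
maps the reduction kind named in the intrinsic onto one of these enum values and
attaches it as an i32 immediate operand, which the MC printer then renders as the
PTX op suffix. A hedged sketch of the correspondence (virtual names stand in for the
registers the compiler actually allocates):

  ; AND = 5 in TMAReductionOp; the printer emits the ".and" suffix.
  call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i64 0, i1 0)
  ; The selected machine instruction carries the immediate 5 and prints as:
  ;   cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%tm, {%d0}], [%s];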
llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -14,6 +14,7 @@
#include "NVPTX.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/NVVMIntrinsicFlags.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -416,3 +417,40 @@ void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum,
    return;
  }
}

void NVPTXInstPrinter::printTmaReductionMode(const MCInst *MI, int OpNum,
                                             raw_ostream &O,
                                             const char *Modifier) {
  const MCOperand &MO = MI->getOperand(OpNum);
  using RedTy = llvm::nvvm::TMAReductionOp;

  switch (static_cast<RedTy>(MO.getImm())) {
  case RedTy::ADD:
    O << ".add";
    return;
  case RedTy::MIN:
    O << ".min";
    return;
  case RedTy::MAX:
    O << ".max";
    return;
  case RedTy::INC:
    O << ".inc";
    return;
  case RedTy::DEC:
    O << ".dec";
    return;
  case RedTy::AND:
    O << ".and";
    return;
  case RedTy::OR:
    O << ".or";
    return;
  case RedTy::XOR:
    O << ".xor";
    return;
  default:
    llvm_unreachable("Invalid Reduction Op in printTmaReductionMode");
  }
}

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -54,6 +54,8 @@ public:
                    raw_ostream &O, const char *Modifier = nullptr);
  void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O,
                     const char *Modifier = nullptr);
  void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O,
                             const char *Modifier = nullptr);
};

}
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/NVVMIntrinsicFlags.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
@@ -4152,40 +4153,51 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {

bool NVPTXScopes::empty() const { return Scopes.size() == 0; }

#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, suffix) \
  (IsShared32 \
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
  (is_s32 \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode) \
  (IsCacheHint ? (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, _CH)) \
               : (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, )))
#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode) \
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
                                            is_s32) \
  (is_reduce \
       ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
       : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
                                               is_s32)))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
  [&]() -> auto { \
    if (IsMultiCast && IsCacheHint) \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _MC_CH); \
    if (IsCacheHint) \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _CH); \
    if (IsMultiCast) \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _MC); \
    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, ); \
    if (is_mc && is_ch) \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
    if (is_ch) \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
    if (is_mc) \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
  }()

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode) \
  (IsCacheHint ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
               : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
  (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
         : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)

static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
                                              bool IsCacheHint, bool IsIm2Col) {
                                              bool IsCacheHint, bool IsIm2Col,
                                              bool IsReduce = false) {
  if (IsIm2Col) {
    switch (Dim) {
    case 3:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
                                                 IsCacheHint, IsShared32);
    case 4:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
                                                 IsCacheHint, IsShared32);
    case 5:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
                                                 IsCacheHint, IsShared32);
    default:
      llvm_unreachable("Invalid Dimension in im2col mode for "
                       "GetCpAsyncBulkTensorS2GOpcode.");
@@ -4193,15 +4205,20 @@ static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
  } else {
    switch (Dim) {
    case 1:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
                                                 IsCacheHint, IsShared32);
    case 2:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
                                                 IsCacheHint, IsShared32);
    case 3:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
                                                 IsCacheHint, IsShared32);
    case 4:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
                                                 IsCacheHint, IsShared32);
    case 5:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
                                                 IsCacheHint, IsShared32);
    default:
      llvm_unreachable(
          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
@@ -4215,11 +4232,14 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
  if (IsIm2Col) {
    switch (Dim) {
    case 3:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
                                                 IsCacheHint, IsShared32);
    case 4:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
                                                 IsCacheHint, IsShared32);
    case 5:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
                                                 IsCacheHint, IsShared32);
    default:
      llvm_unreachable("Invalid Dimension in im2col mode for "
                       "GetCpAsyncBulkTensorG2SOpcode.");
@@ -4227,15 +4247,20 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
  } else {
    switch (Dim) {
    case 1:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
                                                 IsCacheHint, IsShared32);
    case 2:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
                                                 IsCacheHint, IsShared32);
    case 3:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
                                                 IsCacheHint, IsShared32);
    case 4:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
                                                 IsCacheHint, IsShared32);
    case 5:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
                                                 IsCacheHint, IsShared32);
    default:
      llvm_unreachable(
          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
@@ -4248,11 +4273,11 @@ static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
  if (IsIm2Col) {
    switch (Dim) {
    case 3:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
    case 4:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
    case 5:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
    default:
      llvm_unreachable("Invalid Dimension in im2col mode for "
                       "GetCpAsyncBulkTensorPrefetchOpcode.");
@@ -4260,15 +4285,15 @@ static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
  } else {
    switch (Dim) {
    case 1:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
    case 2:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
    case 3:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
    case 4:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
    case 5:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE);
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
    default:
      llvm_unreachable("Invalid Dimension in tile mode for "
                       "GetCpAsyncBulkTensorPrefetchOpcode.");
@@ -4377,8 +4402,34 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                            unsigned RedOp,
                                                            bool IsIm2Col) {
  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
  // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
  // NumOperands = {Chain, IID} + {Actual intrinsic args}
  //             = {2} + {4 + dims}
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint

  SDLoc DL(N);
  SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
  Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op
  Ops.push_back(N->getOperand(0));     // Chain operand

  bool IsShared32 =
      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
  unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
      NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
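
As a worked example of the operand arithmetic above (illustrative IR, not taken from
the patch itself): for a 3-D tile reduction such as

  call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)

the selection-DAG node carries {Chain, IID} plus seven intrinsic arguments, so NumOps
is 9 and NumDims = 9 - 6 = 3. With the flag set, NumArgs = 3 + 3 = 6, i.e. the slice
keeps %src, %tmap, the three coordinates and %ch; with the flag clear it keeps only
five operands and the cache-hint value is dropped before the reduction-op immediate
and the chain are appended.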

bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  using TMARedTy = llvm::nvvm::TMAReductionOp;
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  switch (IID) {
  default:
    return false;
@@ -4418,5 +4469,109 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
    SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
                                        /*IsIm2Col=*/true);
    return true;
  }
}
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -95,6 +95,9 @@ private:
  void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
  void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
  void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
  void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp,
                                           bool IsIm2Col = false);

  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
  }
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -564,17 +564,19 @@ foreach dim = [1, 2, 3, 4, 5] in {
}

// From Shared to Global memory (S2G)
class S2G_STRINGS<int dim, string mode, bit ch, bit is_shared32 = 0> {
  string prefix = "cp.async.bulk.tensor";
class S2G_STRINGS<int dim, string mode, bit ch,
                  bit is_shared32 = 0, bit is_reduce = 0> {
  string dir = "global.shared::cta";
  string completion = "bulk_group";
  string inst_name = prefix
  string inst_name = !if(is_reduce, "cp.reduce", "cp")
                     # ".async.bulk.tensor"
                     # "." # dim # "d"
                     # "." # dir
                     # "." # mode
                     # "." # completion
                     # !if(ch, ".L2::cache_hint", "");
  string intr_name = "CP_ASYNC_BULK_TENSOR_S2G_"
  string intr_name = "CP_ASYNC_BULK_TENSOR_"
                     # !if(is_reduce, "RED_", "S2G_")
                     # dim # "D"
                     # !if(is_shared32, "_SHARED32", "")
                     # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
@@ -596,11 +598,37 @@ multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, bit shared32, string mode> {
                     Requires<[hasPTX<80>, hasSM<90>]>;
}

def TMAReductionFlags : Operand<i32> {
  let PrintMethod = "printTmaReductionMode";
}

// TMA Copy from Shared to Global memory with Reduction
multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, bit shared32, string mode> {
  defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
  defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
  defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
  defvar rc = !if(shared32, Int32Regs, Int64Regs);

  defvar prefix = "cp.reduce.async.bulk.tensor" # "." # dim # "d" # ".global.shared::cta";
  defvar suffix = "." # mode # ".bulk_group";

  def NAME: NVPTXInst<(outs),
            !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)),
            !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>,
            Requires<[hasPTX<80>, hasSM<90>]>;
  def NAME # _CH: NVPTXInst<(outs),
                  !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch, TMAReductionFlags:$red_op)),
                  !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>,
                  Requires<[hasPTX<80>, hasSM<90>]>;
}

foreach dim = [1, 2, 3, 4, 5] in {
  foreach shared32 = [true, false] in {
    foreach mode = !if(!ge(dim, 3), ["tile", "im2col_no_offs"], ["tile"]) in {
      defm S2G_STRINGS<dim, mode, 0, shared32>.intr_name :
        CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, shared32, mode>;
      defm S2G_STRINGS<dim, mode, 0, shared32, 1>.intr_name :
        CP_ASYNC_BULK_TENSOR_REDUCE_INTR<dim, shared32, mode>;
    }
  }
}
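
To illustrate what these records expand to: for dim = 1 in tile mode, the instruction's
assembly template is "cp.reduce.async.bulk.tensor.1d.global.shared::cta${red_op}.tile.bulk_group [$tmap, {{$d0}}], [$src];",
so a call such as the sketch below (value names illustrative) selects the non-cache-hint
variant and, with the reduction-op operand printed as ".add", produces PTX of the form
checked by the lit test added in this patch:

  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 0, i1 0)
  ; -> cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1}], [%rd1];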

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll (new file, 426 lines)
@@ -0,0 +1,426 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK-PTX %s
; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.1d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i64 %ch, i1 %flag_ch);
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %flag_ch);
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch);
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag_ch);
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag_ch);

declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch);
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag_ch);
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag_ch);

; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_1d
define void @cp_async_bulk_tensor_reduce_tile_1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) {
; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_1d(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0:
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0];
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1];
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2];
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1}], [%rd1];
; CHECK-PTX-NEXT: ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)

  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
  ret void
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_2d
|
||||
define void @cp_async_bulk_tensor_reduce_tile_2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) {
|
||||
; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_2d(
|
||||
; CHECK-PTX: {
|
||||
; CHECK-PTX-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
|
||||
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_3d
|
||||
define void @cp_async_bulk_tensor_reduce_tile_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
|
||||
; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_3d(
|
||||
; CHECK-PTX: {
|
||||
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
|
||||
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_4d
|
||||
define void @cp_async_bulk_tensor_reduce_tile_4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
|
||||
; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_4d(
|
||||
; CHECK-PTX: {
|
||||
; CHECK-PTX-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
|
||||
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_5d
|
||||
define void @cp_async_bulk_tensor_reduce_tile_5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
|
||||
; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_5d(
|
||||
; CHECK-PTX: {
|
||||
; CHECK-PTX-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
ret void
}
; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_3d
define void @cp_async_bulk_tensor_reduce_im2col_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_3d(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0:
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0];
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1];
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2];
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3];
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4];
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
; CHECK-PTX-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
ret void
}
; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_4d
define void @cp_async_bulk_tensor_reduce_im2col_4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_4d(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .reg .b32 %r<5>;
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0:
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0];
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1];
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2];
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3];
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4];
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5];
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
; CHECK-PTX-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
ret void
}
; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_5d
define void @cp_async_bulk_tensor_reduce_im2col_5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_5d(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0:
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0];
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1];
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2];
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3];
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4];
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5];
; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6];
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
ret void
}
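
; ---------------------------------------------------------------------------
; Illustrative sketch (not part of the FileCheck-verified tests above): a
; minimal standalone module showing how the trailing i1 flag of these
; intrinsics selects the cache-hint lowering. When the flag is 1, the i64
; operand carries the L2 cache hint and the ".L2::cache_hint" form of the PTX
; instruction is emitted; when it is 0, the i64 operand is not read, which is
; why the tests above pass undef in that position. The example function name
; below is hypothetical.

declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3), ptr, i32, i32, i32, i32, i32, i64, i1)

define void @reduce_add_tile_5d_sketch(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
  ; Cache-hint form: %ch is forwarded as the L2::cache_hint operand.
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
  ; Plain form: the flag is 0, so the i64 operand is ignored.
  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
  ret void
}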