mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-16 00:16:30 +00:00
[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers (#131631)
Summary: This patch ports the runtime to use `gpuintrin.h` instead of calling the builtins for most things. The `lanemask_gt` stuff was left for now with a fallback. AMD version for Ron https://gist.github.com/jhuber6/42014d635b9a8158727640876bf47226.
This commit is contained in:
parent
c02b935a9b
commit
206f78dfec
@ -19,9 +19,9 @@ namespace ompx {
|
||||
namespace mapping {
|
||||
|
||||
enum {
|
||||
DIM_X = 0,
|
||||
DIM_Y = 1,
|
||||
DIM_Z = 2,
|
||||
DIM_X = __GPU_X_DIM,
|
||||
DIM_Y = __GPU_Y_DIM,
|
||||
DIM_Z = __GPU_Z_DIM,
|
||||
};
|
||||
|
||||
inline constexpr uint32_t MaxThreadsPerTeam = 1024;
|
||||
|
@ -14,198 +14,12 @@
|
||||
#include "DeviceUtils.h"
|
||||
#include "Interface.h"
|
||||
#include "State.h"
|
||||
#include "gpuintrin.h"
|
||||
|
||||
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
||||
|
||||
using namespace ompx;
|
||||
|
||||
namespace ompx {
|
||||
namespace impl {
|
||||
|
||||
/// AMDGCN Implementation
|
||||
///
|
||||
///{
|
||||
#ifdef __AMDGPU__
|
||||
|
||||
uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
|
||||
|
||||
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
|
||||
switch (Dim) {
|
||||
case 0:
|
||||
return __builtin_amdgcn_workgroup_size_x();
|
||||
case 1:
|
||||
return __builtin_amdgcn_workgroup_size_y();
|
||||
case 2:
|
||||
return __builtin_amdgcn_workgroup_size_z();
|
||||
};
|
||||
UNREACHABLE("Dim outside range!");
|
||||
}
|
||||
|
||||
LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
|
||||
|
||||
LaneMaskTy lanemaskLT() {
|
||||
uint32_t Lane = mapping::getThreadIdInWarp();
|
||||
int64_t Ballot = mapping::activemask();
|
||||
uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
|
||||
return Mask & Ballot;
|
||||
}
|
||||
|
||||
LaneMaskTy lanemaskGT() {
|
||||
uint32_t Lane = mapping::getThreadIdInWarp();
|
||||
if (Lane == (mapping::getWarpSize() - 1))
|
||||
return 0;
|
||||
int64_t Ballot = mapping::activemask();
|
||||
uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
|
||||
return Mask & Ballot;
|
||||
}
|
||||
|
||||
uint32_t getThreadIdInWarp() {
|
||||
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
|
||||
}
|
||||
|
||||
uint32_t getThreadIdInBlock(int32_t Dim) {
|
||||
switch (Dim) {
|
||||
case 0:
|
||||
return __builtin_amdgcn_workitem_id_x();
|
||||
case 1:
|
||||
return __builtin_amdgcn_workitem_id_y();
|
||||
case 2:
|
||||
return __builtin_amdgcn_workitem_id_z();
|
||||
};
|
||||
UNREACHABLE("Dim outside range!");
|
||||
}
|
||||
|
||||
uint32_t getNumberOfThreadsInKernel() {
|
||||
return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
|
||||
__builtin_amdgcn_grid_size_z();
|
||||
}
|
||||
|
||||
uint32_t getBlockIdInKernel(int32_t Dim) {
|
||||
switch (Dim) {
|
||||
case 0:
|
||||
return __builtin_amdgcn_workgroup_id_x();
|
||||
case 1:
|
||||
return __builtin_amdgcn_workgroup_id_y();
|
||||
case 2:
|
||||
return __builtin_amdgcn_workgroup_id_z();
|
||||
};
|
||||
UNREACHABLE("Dim outside range!");
|
||||
}
|
||||
|
||||
uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
|
||||
switch (Dim) {
|
||||
case 0:
|
||||
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
|
||||
case 1:
|
||||
return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
|
||||
case 2:
|
||||
return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
|
||||
};
|
||||
UNREACHABLE("Dim outside range!");
|
||||
}
|
||||
|
||||
uint32_t getWarpIdInBlock() {
|
||||
return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
|
||||
}
|
||||
|
||||
uint32_t getNumberOfWarpsInBlock() {
|
||||
return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
|
||||
}
|
||||
|
||||
#endif
|
||||
///}
|
||||
|
||||
/// NVPTX Implementation
|
||||
///
|
||||
///{
|
||||
#ifdef __NVPTX__
|
||||
|
||||
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
|
||||
switch (Dim) {
|
||||
case 0:
|
||||
return __nvvm_read_ptx_sreg_ntid_x();
|
||||
case 1:
|
||||
return __nvvm_read_ptx_sreg_ntid_y();
|
||||
case 2:
|
||||
return __nvvm_read_ptx_sreg_ntid_z();
|
||||
};
|
||||
UNREACHABLE("Dim outside range!");
|
||||
}
|
||||
|
||||
uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
|
||||
|
||||
LaneMaskTy activemask() { return __nvvm_activemask(); }
|
||||
|
||||
LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
|
||||
|
||||
LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
|
||||
|
||||
uint32_t getThreadIdInBlock(int32_t Dim) {
|
||||
switch (Dim) {
|
||||
case 0:
|
||||
return __nvvm_read_ptx_sreg_tid_x();
|
||||
case 1:
|
||||
return __nvvm_read_ptx_sreg_tid_y();
|
||||
case 2:
|
||||
return __nvvm_read_ptx_sreg_tid_z();
|
||||
};
|
||||
UNREACHABLE("Dim outside range!");
|
||||
}
|
||||
|
||||
uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
|
||||
|
||||
uint32_t getBlockIdInKernel(int32_t Dim) {
|
||||
switch (Dim) {
|
||||
case 0:
|
||||
return __nvvm_read_ptx_sreg_ctaid_x();
|
||||
case 1:
|
||||
return __nvvm_read_ptx_sreg_ctaid_y();
|
||||
case 2:
|
||||
return __nvvm_read_ptx_sreg_ctaid_z();
|
||||
};
|
||||
UNREACHABLE("Dim outside range!");
|
||||
}
|
||||
|
||||
uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
|
||||
switch (Dim) {
|
||||
case 0:
|
||||
return __nvvm_read_ptx_sreg_nctaid_x();
|
||||
case 1:
|
||||
return __nvvm_read_ptx_sreg_nctaid_y();
|
||||
case 2:
|
||||
return __nvvm_read_ptx_sreg_nctaid_z();
|
||||
};
|
||||
UNREACHABLE("Dim outside range!");
|
||||
}
|
||||
|
||||
uint32_t getNumberOfThreadsInKernel() {
|
||||
return impl::getNumberOfThreadsInBlock(0) *
|
||||
impl::getNumberOfBlocksInKernel(0) *
|
||||
impl::getNumberOfThreadsInBlock(1) *
|
||||
impl::getNumberOfBlocksInKernel(1) *
|
||||
impl::getNumberOfThreadsInBlock(2) *
|
||||
impl::getNumberOfBlocksInKernel(2);
|
||||
}
|
||||
|
||||
uint32_t getWarpIdInBlock() {
|
||||
return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
|
||||
}
|
||||
|
||||
uint32_t getNumberOfWarpsInBlock() {
|
||||
return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
|
||||
mapping::getWarpSize();
|
||||
}
|
||||
|
||||
#endif
|
||||
///}
|
||||
|
||||
} // namespace impl
|
||||
} // namespace ompx
|
||||
|
||||
/// We have to be deliberate about the distinction of `mapping::` and `impl::`
|
||||
/// below to avoid repeating assumptions or including irrelevant ones.
|
||||
///{
|
||||
|
||||
static bool isInLastWarp() {
|
||||
uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
|
||||
~(mapping::getWarpSize() - 1);
|
||||
@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() {
|
||||
return utils::popc(Active & LaneMaskLT) == 0;
|
||||
}
|
||||
|
||||
LaneMaskTy mapping::activemask() { return impl::activemask(); }
|
||||
LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
|
||||
|
||||
LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
|
||||
LaneMaskTy mapping::lanemaskLT() {
|
||||
#ifdef __NVPTX__
|
||||
return __nvvm_read_ptx_sreg_lanemask_lt();
|
||||
#else
|
||||
uint32_t Lane = mapping::getThreadIdInWarp();
|
||||
int64_t Ballot = mapping::activemask();
|
||||
uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
|
||||
return Mask & Ballot;
|
||||
#endif
|
||||
}
|
||||
|
||||
LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
|
||||
LaneMaskTy mapping::lanemaskGT() {
|
||||
#ifdef __NVPTX__
|
||||
return __nvvm_read_ptx_sreg_lanemask_gt();
|
||||
#else
|
||||
uint32_t Lane = mapping::getThreadIdInWarp();
|
||||
if (Lane == (mapping::getWarpSize() - 1))
|
||||
return 0;
|
||||
int64_t Ballot = mapping::activemask();
|
||||
uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
|
||||
return Mask & Ballot;
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t mapping::getThreadIdInWarp() {
|
||||
uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
|
||||
ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
|
||||
uint32_t ThreadIdInWarp = __gpu_lane_id();
|
||||
ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
|
||||
return ThreadIdInWarp;
|
||||
}
|
||||
|
||||
uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
|
||||
uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
|
||||
uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
|
||||
return ThreadIdInBlock;
|
||||
}
|
||||
|
||||
uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
|
||||
uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
|
||||
|
||||
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
|
||||
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
|
||||
// If we are in SPMD mode, remove one warp.
|
||||
return BlockSize - (!IsSPMD * impl::getWarpSize());
|
||||
return BlockSize - (!IsSPMD * mapping::getWarpSize());
|
||||
}
|
||||
uint32_t mapping::getMaxTeamThreads() {
|
||||
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
|
||||
}
|
||||
|
||||
uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
|
||||
return impl::getNumberOfThreadsInBlock(Dim);
|
||||
return __gpu_num_threads(Dim);
|
||||
}
|
||||
|
||||
uint32_t mapping::getNumberOfThreadsInKernel() {
|
||||
return impl::getNumberOfThreadsInKernel();
|
||||
return mapping::getNumberOfThreadsInBlock(0) *
|
||||
mapping::getNumberOfBlocksInKernel(0) *
|
||||
mapping::getNumberOfThreadsInBlock(1) *
|
||||
mapping::getNumberOfBlocksInKernel(1) *
|
||||
mapping::getNumberOfThreadsInBlock(2) *
|
||||
mapping::getNumberOfBlocksInKernel(2);
|
||||
}
|
||||
|
||||
uint32_t mapping::getWarpIdInBlock() {
|
||||
uint32_t WarpID = impl::getWarpIdInBlock();
|
||||
ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
|
||||
uint32_t WarpID =
|
||||
mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
|
||||
ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
|
||||
return WarpID;
|
||||
}
|
||||
|
||||
uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
|
||||
uint32_t BlockId = impl::getBlockIdInKernel(Dim);
|
||||
ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
|
||||
uint32_t BlockId = __gpu_block_id(Dim);
|
||||
ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
|
||||
return BlockId;
|
||||
}
|
||||
|
||||
uint32_t mapping::getNumberOfWarpsInBlock() {
|
||||
uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
|
||||
ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
|
||||
return NumberOfWarpsInBlocks;
|
||||
return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
|
||||
mapping::getWarpSize();
|
||||
}
|
||||
|
||||
uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
|
||||
uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
|
||||
ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
|
||||
return NumberOfBlocks;
|
||||
return __gpu_num_blocks(Dim);
|
||||
}
|
||||
|
||||
uint32_t mapping::getNumberOfProcessorElements() {
|
||||
@ -326,11 +163,11 @@ extern "C" {
|
||||
}
|
||||
|
||||
[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
|
||||
return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
|
||||
return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
|
||||
}
|
||||
|
||||
[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
|
||||
return impl::getWarpSize();
|
||||
return mapping::getWarpSize();
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user