mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-29 08:16:08 +00:00
[OPENMP][NVPTX] Add support for lastprivates/reductions handling in SPMD constructs with lightweight runtime.
Summary: We need the support for per-team shared variables to support codegen for lastprivates/reductions. Patch adds this support by using shared memory if the total size of the reductions/lastprivates is <= 128 bytes, then pre-allocated buffer in global memory if size is <= 4K bytes,or uses malloc/free, otherwise. Reviewers: gtbercea, kkwli0, grokos Subscribers: guansong, openmp-commits Differential Revision: https://reviews.llvm.org/D51875 llvm-svn: 342737
This commit is contained in:
parent
8171bd8e0f
commit
022bf16b41
@ -378,6 +378,12 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
|
||||
// as long as the size requested fits the pre-allocated size.
|
||||
EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
|
||||
int16_t UseSharedMemory) {
|
||||
if (isRuntimeUninitialized()) {
|
||||
ASSERT0(LT_FUSSY, isSPMDMode(),
|
||||
"Expected SPMD mode with uninitialized runtime.");
|
||||
return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
|
||||
}
|
||||
|
||||
// Frame pointer must be visible to all workers in the same warp.
|
||||
unsigned WID = getWarpId();
|
||||
void *&FrameP = DataSharingState.FramePtr[WID];
|
||||
@ -456,6 +462,12 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
|
||||
// reclaim all outstanding global memory slots since it is
|
||||
// likely we have reached the end of the kernel.
|
||||
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
|
||||
if (isRuntimeUninitialized()) {
|
||||
ASSERT0(LT_FUSSY, isSPMDMode(),
|
||||
"Expected SPMD mode with uninitialized runtime.");
|
||||
return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
|
||||
}
|
||||
|
||||
if (IsWarpMasterActiveThread()) {
|
||||
unsigned WID = getWarpId();
|
||||
|
||||
|
@ -38,6 +38,8 @@ __device__ __shared__
|
||||
__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
|
||||
*omptarget_nvptx_simpleThreadPrivateContext;
|
||||
|
||||
__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// The team master sets the outlined parallel function in this variable to
|
||||
// communicate with the workers. Since it is in shared memory, there is one
|
||||
|
@ -25,13 +25,23 @@ extern __device__ omptarget_nvptx_Queue<
|
||||
omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
|
||||
omptarget_nvptx_device_simpleState[MAX_SM];
|
||||
|
||||
extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// init entry points
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE unsigned nsmid() {
|
||||
unsigned n;
|
||||
asm("mov.u32 %0, %%nsmid;" : "=r"(n));
|
||||
return n;
|
||||
}
|
||||
|
||||
INLINE unsigned smid() {
|
||||
unsigned id;
|
||||
asm("mov.u32 %0, %%smid;" : "=r"(id));
|
||||
ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
|
||||
"Expected number of SMs is less than reported.");
|
||||
return id;
|
||||
}
|
||||
|
||||
@ -108,6 +118,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
|
||||
int slot = smid() % MAX_SM;
|
||||
omptarget_nvptx_simpleThreadPrivateContext =
|
||||
omptarget_nvptx_device_simpleState[slot].Dequeue();
|
||||
// Reuse the memory allocated for the full runtime as the preallocated
|
||||
// global memory buffer for the lightweight runtime.
|
||||
omptarget_nvptx_simpleGlobalData =
|
||||
omptarget_nvptx_device_State[slot].Dequeue();
|
||||
}
|
||||
__syncthreads();
|
||||
omptarget_nvptx_simpleThreadPrivateContext->Init();
|
||||
@ -177,6 +191,10 @@ EXTERN void __kmpc_spmd_kernel_deinit() {
|
||||
int slot = smid() % MAX_SM;
|
||||
omptarget_nvptx_device_simpleState[slot].Enqueue(
|
||||
omptarget_nvptx_simpleThreadPrivateContext);
|
||||
// Enqueue global memory back.
|
||||
omptarget_nvptx_device_State[slot].Enqueue(
|
||||
reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
|
||||
omptarget_nvptx_simpleGlobalData));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -113,6 +113,8 @@ enum DATA_SHARING_SIZES {
|
||||
DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
|
||||
// The maximum number of warps in use
|
||||
DS_Max_Warp_Number = 32,
|
||||
// The size of the preallocated shared memory buffer per team
|
||||
DS_Shared_Memory_Size = 128,
|
||||
};
|
||||
|
||||
// Data structure to keep in shared memory that traces the current slot, stack,
|
||||
@ -386,12 +388,15 @@ struct omptarget_device_environmentTy {
|
||||
|
||||
class omptarget_nvptx_SimpleThreadPrivateContext {
|
||||
uint16_t par_level[MAX_THREADS_PER_TEAM];
|
||||
|
||||
public:
|
||||
INLINE void Init() {
|
||||
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
|
||||
"Expected SPMD + uninitialized runtime modes.");
|
||||
par_level[GetThreadIdInBlock()] = 0;
|
||||
}
|
||||
static INLINE void *Allocate(size_t DataSize);
|
||||
static INLINE void Deallocate(void *Ptr);
|
||||
INLINE void IncParLevel() {
|
||||
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
|
||||
"Expected SPMD + uninitialized runtime modes.");
|
||||
|
@ -202,3 +202,36 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
|
||||
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
|
||||
return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Lightweight runtime functions.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Shared memory buffer for globalization support.
|
||||
static __align__(16) __device__ __shared__ char
|
||||
omptarget_static_buffer[DS_Shared_Memory_Size];
|
||||
static __device__ __shared__ void *omptarget_spmd_allocated;
|
||||
|
||||
extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
|
||||
|
||||
INLINE void *
|
||||
omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
|
||||
if (DataSize <= DS_Shared_Memory_Size)
|
||||
return ::omptarget_static_buffer;
|
||||
if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
|
||||
return ::omptarget_nvptx_simpleGlobalData;
|
||||
if (threadIdx.x == 0)
|
||||
omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
|
||||
__syncthreads();
|
||||
return omptarget_spmd_allocated;
|
||||
}
|
||||
|
||||
INLINE void
|
||||
omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
|
||||
if (Ptr != ::omptarget_static_buffer &&
|
||||
Ptr != ::omptarget_nvptx_simpleGlobalData) {
|
||||
__syncthreads();
|
||||
if (threadIdx.x == 0)
|
||||
SafeFree(Ptr, "SPMD teams dealloc");
|
||||
}
|
||||
}
|
||||
|
@ -34,7 +34,10 @@
|
||||
|
||||
// Maximum number of omp state objects per SM allocated statically in global
|
||||
// memory.
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#define OMP_STATE_COUNT 32
|
||||
#define MAX_SM 84
|
||||
#elif __CUDA_ARCH__ >= 600
|
||||
#define OMP_STATE_COUNT 32
|
||||
#define MAX_SM 56
|
||||
#else
|
||||
|
Loading…
x
Reference in New Issue
Block a user