mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-26 22:36:06 +00:00
[OpenMP] Basic BumpAllocator for (AMD)GPUs (#69806)
The patch contains a basic BumpAllocator for (AMD)GPUs to allow us to run more tests. The allocator implements `malloc`, both internally and externally, while we continue to default to the NVIDIA `malloc` when we target NVIDIA GPUs. Once we have smarter or customizable allocators we should consider this choice, for now, this allocator is better than none. It traps if it is out of memory, making it easy to debug. Heap size is configured via `LIBOMPTARGET_HEAP_SIZE` and defaults to 512MB. It allows to track allocation statistics via `LIBOMPTARGET_DEVICE_RTL_DEBUG=8` (together with `-fopenmp-target-debug=8`). Two tests were added, and one was enabled. This is the next step towards fixing https://github.com/llvm/llvm-project/issues/66708
This commit is contained in:
parent
d571af7f62
commit
d3921e4670
@ -1465,3 +1465,4 @@ debugging features are supported.
|
|||||||
|
|
||||||
* Enable debugging assertions in the device. ``0x01``
|
* Enable debugging assertions in the device. ``0x01``
|
||||||
* Enable diagnosing common problems during offloading . ``0x4``
|
* Enable diagnosing common problems during offloading . ``0x4``
|
||||||
|
* Enable device malloc statistics (amdgpu only). ``0x8``
|
||||||
|
@ -83,6 +83,7 @@ endif()
|
|||||||
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
|
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
|
||||||
|
|
||||||
set(include_files
|
set(include_files
|
||||||
|
${include_directory}/Allocator.h
|
||||||
${include_directory}/Configuration.h
|
${include_directory}/Configuration.h
|
||||||
${include_directory}/Debug.h
|
${include_directory}/Debug.h
|
||||||
${include_directory}/Interface.h
|
${include_directory}/Interface.h
|
||||||
@ -95,6 +96,7 @@ set(include_files
|
|||||||
)
|
)
|
||||||
|
|
||||||
set(src_files
|
set(src_files
|
||||||
|
${source_directory}/Allocator.cpp
|
||||||
${source_directory}/Configuration.cpp
|
${source_directory}/Configuration.cpp
|
||||||
${source_directory}/Debug.cpp
|
${source_directory}/Debug.cpp
|
||||||
${source_directory}/Kernel.cpp
|
${source_directory}/Kernel.cpp
|
||||||
|
44
openmp/libomptarget/DeviceRTL/include/Allocator.h
Normal file
44
openmp/libomptarget/DeviceRTL/include/Allocator.h
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef OMPTARGET_ALLOCATOR_H
|
||||||
|
#define OMPTARGET_ALLOCATOR_H
|
||||||
|
|
||||||
|
#include "Types.h"
|
||||||
|
|
||||||
|
// Forward declaration.
|
||||||
|
struct KernelEnvironmentTy;
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
namespace ompx {
|
||||||
|
|
||||||
|
namespace allocator {
|
||||||
|
|
||||||
|
static uint64_t constexpr ALIGNMENT = 16;
|
||||||
|
|
||||||
|
/// Initialize the allocator according to \p KernelEnvironment
|
||||||
|
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
|
||||||
|
|
||||||
|
/// Allocate \p Size bytes.
|
||||||
|
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
|
||||||
|
alloc(uint64_t Size);
|
||||||
|
|
||||||
|
/// Free the allocation pointed to by \p Ptr.
|
||||||
|
void free(void *Ptr);
|
||||||
|
|
||||||
|
} // namespace allocator
|
||||||
|
|
||||||
|
} // namespace ompx
|
||||||
|
|
||||||
|
#pragma omp end declare target
|
||||||
|
|
||||||
|
#endif
|
80
openmp/libomptarget/DeviceRTL/src/Allocator.cpp
Normal file
80
openmp/libomptarget/DeviceRTL/src/Allocator.cpp
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "Allocator.h"
|
||||||
|
#include "Configuration.h"
|
||||||
|
#include "Environment.h"
|
||||||
|
#include "Mapping.h"
|
||||||
|
#include "Synchronization.h"
|
||||||
|
#include "Types.h"
|
||||||
|
#include "Utils.h"
|
||||||
|
|
||||||
|
using namespace ompx;
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
[[gnu::used, gnu::retain, gnu::weak,
|
||||||
|
gnu::visibility(
|
||||||
|
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
|
||||||
|
[[gnu::used, gnu::retain, gnu::weak,
|
||||||
|
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
|
||||||
|
__omp_rtl_device_memory_pool_tracker;
|
||||||
|
|
||||||
|
/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
|
||||||
|
/// directly.
|
||||||
|
struct BumpAllocatorTy final {
|
||||||
|
|
||||||
|
void *alloc(uint64_t Size) {
|
||||||
|
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
|
||||||
|
|
||||||
|
if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
|
||||||
|
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
|
||||||
|
atomic::seq_cst);
|
||||||
|
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
|
||||||
|
atomic::seq_cst);
|
||||||
|
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
|
||||||
|
atomic::seq_cst);
|
||||||
|
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
|
||||||
|
atomic::seq_cst);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t *Data =
|
||||||
|
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
|
||||||
|
uint64_t End =
|
||||||
|
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
|
||||||
|
|
||||||
|
uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
|
||||||
|
if (OldData + Size > End)
|
||||||
|
__builtin_trap();
|
||||||
|
|
||||||
|
return reinterpret_cast<void *>(OldData);
|
||||||
|
}
|
||||||
|
|
||||||
|
void free(void *) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
BumpAllocatorTy BumpAllocator;
|
||||||
|
|
||||||
|
/// allocator namespace implementation
|
||||||
|
///
|
||||||
|
///{
|
||||||
|
|
||||||
|
void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
|
||||||
|
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
|
||||||
|
// more than one.
|
||||||
|
}
|
||||||
|
|
||||||
|
void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
|
||||||
|
|
||||||
|
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
|
||||||
|
|
||||||
|
///}
|
||||||
|
|
||||||
|
#pragma omp end declare target
|
@ -10,6 +10,7 @@
|
|||||||
//
|
//
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "Allocator.h"
|
||||||
#include "Debug.h"
|
#include "Debug.h"
|
||||||
#include "Environment.h"
|
#include "Environment.h"
|
||||||
#include "Interface.h"
|
#include "Interface.h"
|
||||||
@ -30,6 +31,7 @@ static void inititializeRuntime(bool IsSPMD,
|
|||||||
synchronize::init(IsSPMD);
|
synchronize::init(IsSPMD);
|
||||||
mapping::init(IsSPMD);
|
mapping::init(IsSPMD);
|
||||||
state::init(IsSPMD, KernelEnvironment);
|
state::init(IsSPMD, KernelEnvironment);
|
||||||
|
allocator::init(IsSPMD, KernelEnvironment);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Simple generic state machine for worker threads.
|
/// Simple generic state machine for worker threads.
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
#include "State.h"
|
#include "State.h"
|
||||||
|
#include "Allocator.h"
|
||||||
|
#include "Configuration.h"
|
||||||
#include "Debug.h"
|
#include "Debug.h"
|
||||||
#include "Environment.h"
|
#include "Environment.h"
|
||||||
#include "Interface.h"
|
#include "Interface.h"
|
||||||
@ -26,18 +28,16 @@ using namespace ompx;
|
|||||||
///
|
///
|
||||||
///{
|
///{
|
||||||
|
|
||||||
/// Add worst-case padding so that future allocations are properly aligned.
|
|
||||||
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
|
|
||||||
/// passed in as an argument and the stack rewritten to support it.
|
|
||||||
constexpr const uint32_t Alignment = 16;
|
|
||||||
|
|
||||||
/// External symbol to access dynamic shared memory.
|
/// External symbol to access dynamic shared memory.
|
||||||
[[gnu::aligned(Alignment)]] extern unsigned char DynamicSharedBuffer[];
|
[[gnu::aligned(
|
||||||
|
allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
|
||||||
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
|
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
|
||||||
|
|
||||||
/// The kernel environment passed to the init method by the compiler.
|
/// The kernel environment passed to the init method by the compiler.
|
||||||
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
|
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
|
||||||
|
|
||||||
|
///}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
/// Fallback implementations are missing to trigger a link time error.
|
/// Fallback implementations are missing to trigger a link time error.
|
||||||
@ -45,29 +45,19 @@ namespace {
|
|||||||
/// dedicated begin/end declare variant.
|
/// dedicated begin/end declare variant.
|
||||||
///
|
///
|
||||||
///{
|
///{
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
#ifdef __AMDGPU__
|
||||||
|
|
||||||
|
[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
|
||||||
|
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
|
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
|
||||||
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
|
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
///}
|
|
||||||
|
|
||||||
/// AMDGCN implementations of the shuffle sync idiom.
|
|
||||||
///
|
|
||||||
///{
|
|
||||||
#pragma omp begin declare variant match(device = {arch(amdgcn)})
|
|
||||||
|
|
||||||
extern "C" {
|
|
||||||
void *malloc(uint64_t Size) {
|
|
||||||
// TODO: Use some preallocated space for dynamic malloc.
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void free(void *Ptr) {}
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma omp end declare variant
|
|
||||||
///}
|
///}
|
||||||
|
|
||||||
/// A "smart" stack in shared memory.
|
/// A "smart" stack in shared memory.
|
||||||
@ -96,7 +86,7 @@ private:
|
|||||||
uint32_t computeThreadStorageTotal() {
|
uint32_t computeThreadStorageTotal() {
|
||||||
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
|
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
|
||||||
return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
|
return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
|
||||||
Alignment);
|
allocator::ALIGNMENT);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the top address of the warp data stack, that is the first address
|
/// Return the top address of the warp data stack, that is the first address
|
||||||
@ -106,8 +96,10 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// The actual storage, shared among all warps.
|
/// The actual storage, shared among all warps.
|
||||||
[[gnu::aligned(Alignment)]] unsigned char Data[state::SharedScratchpadSize];
|
[[gnu::aligned(
|
||||||
[[gnu::aligned(Alignment)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
|
allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
|
||||||
|
[[gnu::aligned(
|
||||||
|
allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
|
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
|
||||||
@ -122,7 +114,9 @@ void SharedMemorySmartStackTy::init(bool IsSPMD) {
|
|||||||
|
|
||||||
void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
|
void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
|
||||||
// First align the number of requested bytes.
|
// First align the number of requested bytes.
|
||||||
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
|
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
|
||||||
|
/// be passed in as an argument and the stack rewritten to support it.
|
||||||
|
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
|
||||||
|
|
||||||
uint32_t StorageTotal = computeThreadStorageTotal();
|
uint32_t StorageTotal = computeThreadStorageTotal();
|
||||||
|
|
||||||
@ -150,7 +144,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
|
void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
|
||||||
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
|
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
|
||||||
if (utils::isSharedMemPtr(Ptr)) {
|
if (utils::isSharedMemPtr(Ptr)) {
|
||||||
int TId = mapping::getThreadIdInBlock();
|
int TId = mapping::getThreadIdInBlock();
|
||||||
Usage[TId] -= AlignedBytes;
|
Usage[TId] -= AlignedBytes;
|
||||||
|
@ -43,6 +43,27 @@ struct DeviceEnvironmentTy {
|
|||||||
uint64_t HardwareParallelism;
|
uint64_t HardwareParallelism;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct DeviceMemoryPoolTy {
|
||||||
|
void *Ptr;
|
||||||
|
uint64_t Size;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct DeviceMemoryPoolTrackingTy {
|
||||||
|
uint64_t NumAllocations;
|
||||||
|
uint64_t AllocationTotal;
|
||||||
|
uint64_t AllocationMin;
|
||||||
|
uint64_t AllocationMax;
|
||||||
|
|
||||||
|
void combine(DeviceMemoryPoolTrackingTy &Other) {
|
||||||
|
NumAllocations += Other.NumAllocations;
|
||||||
|
AllocationTotal += Other.AllocationTotal;
|
||||||
|
AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
|
||||||
|
: AllocationMin;
|
||||||
|
AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
|
||||||
|
: AllocationMax;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// NOTE: Please don't change the order of those members as their indices are
|
// NOTE: Please don't change the order of those members as their indices are
|
||||||
// used in the middle end. Always add the new data member at the end.
|
// used in the middle end. Always add the new data member at the end.
|
||||||
// Different from KernelEnvironmentTy below, this structure contains members
|
// Different from KernelEnvironmentTy below, this structure contains members
|
||||||
|
@ -2529,10 +2529,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
|||||||
return Plugin::success();
|
return Plugin::success();
|
||||||
}
|
}
|
||||||
Error getDeviceHeapSize(uint64_t &Value) override {
|
Error getDeviceHeapSize(uint64_t &Value) override {
|
||||||
Value = 0;
|
Value = DeviceMemoryPoolSize;
|
||||||
|
return Plugin::success();
|
||||||
|
}
|
||||||
|
Error setDeviceHeapSize(uint64_t Value) override {
|
||||||
|
for (DeviceImageTy *Image : LoadedImages)
|
||||||
|
if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value))
|
||||||
|
return Err;
|
||||||
|
DeviceMemoryPoolSize = Value;
|
||||||
return Plugin::success();
|
return Plugin::success();
|
||||||
}
|
}
|
||||||
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
|
|
||||||
|
|
||||||
/// AMDGPU-specific function to get device attributes.
|
/// AMDGPU-specific function to get device attributes.
|
||||||
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
|
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
|
||||||
@ -2625,6 +2631,9 @@ private:
|
|||||||
|
|
||||||
/// Reference to the host device.
|
/// Reference to the host device.
|
||||||
AMDHostDeviceTy &HostDevice;
|
AMDHostDeviceTy &HostDevice;
|
||||||
|
|
||||||
|
/// The current size of the global device memory pool (managed by us).
|
||||||
|
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
|
||||||
};
|
};
|
||||||
|
|
||||||
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
|
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
|
||||||
|
@ -590,6 +590,35 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
|
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
|
||||||
|
|
||||||
|
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
|
||||||
|
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
|
||||||
|
for (auto *Image : LoadedImages) {
|
||||||
|
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
|
||||||
|
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
|
||||||
|
sizeof(DeviceMemoryPoolTrackingTy),
|
||||||
|
&ImageDeviceMemoryPoolTracking);
|
||||||
|
if (auto Err =
|
||||||
|
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal))
|
||||||
|
return Err;
|
||||||
|
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Write this by default into a file.
|
||||||
|
printf("\n\n|-----------------------\n"
|
||||||
|
"| Device memory tracker:\n"
|
||||||
|
"|-----------------------\n"
|
||||||
|
"| #Allocations: %lu\n"
|
||||||
|
"| Byes allocated: %lu\n"
|
||||||
|
"| Minimal allocation: %lu\n"
|
||||||
|
"| Maximal allocation: %lu\n"
|
||||||
|
"|-----------------------\n\n\n",
|
||||||
|
DeviceMemoryPoolTracking.NumAllocations,
|
||||||
|
DeviceMemoryPoolTracking.AllocationTotal,
|
||||||
|
DeviceMemoryPoolTracking.AllocationMin,
|
||||||
|
DeviceMemoryPoolTracking.AllocationMax);
|
||||||
|
}
|
||||||
|
|
||||||
// Delete the memory manager before deinitializing the device. Otherwise,
|
// Delete the memory manager before deinitializing the device. Otherwise,
|
||||||
// we may delete device allocations after the device is deinitialized.
|
// we may delete device allocations after the device is deinitialized.
|
||||||
if (MemoryManager)
|
if (MemoryManager)
|
||||||
@ -648,6 +677,17 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
|
|||||||
if (auto Err = setupDeviceEnvironment(Plugin, *Image))
|
if (auto Err = setupDeviceEnvironment(Plugin, *Image))
|
||||||
return std::move(Err);
|
return std::move(Err);
|
||||||
|
|
||||||
|
// Setup the global device memory pool if needed.
|
||||||
|
if (shouldSetupDeviceMemoryPool()) {
|
||||||
|
uint64_t HeapSize;
|
||||||
|
auto SizeOrErr = getDeviceHeapSize(HeapSize);
|
||||||
|
if (SizeOrErr) {
|
||||||
|
REPORT("No global device memory pool due to error: %s\n",
|
||||||
|
toString(std::move(SizeOrErr)).data());
|
||||||
|
} else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
|
||||||
|
return std::move(Err);
|
||||||
|
}
|
||||||
|
|
||||||
// Register all offload entries of the image.
|
// Register all offload entries of the image.
|
||||||
if (auto Err = registerOffloadEntries(*Image))
|
if (auto Err = registerOffloadEntries(*Image))
|
||||||
return std::move(Err);
|
return std::move(Err);
|
||||||
@ -713,6 +753,45 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
|
|||||||
return Plugin::success();
|
return Plugin::success();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
|
||||||
|
DeviceImageTy &Image,
|
||||||
|
uint64_t PoolSize) {
|
||||||
|
// Free the old pool, if any.
|
||||||
|
if (DeviceMemoryPool.Ptr) {
|
||||||
|
if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
|
||||||
|
TargetAllocTy::TARGET_ALLOC_DEVICE))
|
||||||
|
return Err;
|
||||||
|
}
|
||||||
|
|
||||||
|
DeviceMemoryPool.Size = PoolSize;
|
||||||
|
auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
|
||||||
|
TargetAllocTy::TARGET_ALLOC_DEVICE);
|
||||||
|
if (AllocOrErr) {
|
||||||
|
DeviceMemoryPool.Ptr = *AllocOrErr;
|
||||||
|
} else {
|
||||||
|
auto Err = AllocOrErr.takeError();
|
||||||
|
REPORT("Failure to allocate device memory for global memory pool: %s\n",
|
||||||
|
toString(std::move(Err)).data());
|
||||||
|
DeviceMemoryPool.Ptr = nullptr;
|
||||||
|
DeviceMemoryPool.Size = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the metainfo of the device environment global.
|
||||||
|
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
|
||||||
|
sizeof(DeviceMemoryPoolTrackingTy),
|
||||||
|
&DeviceMemoryPoolTracking);
|
||||||
|
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
|
||||||
|
if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
|
||||||
|
return Err;
|
||||||
|
|
||||||
|
// Create the metainfo of the device environment global.
|
||||||
|
GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
|
||||||
|
sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
|
||||||
|
|
||||||
|
// Write device environment values to the device.
|
||||||
|
return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
|
||||||
|
}
|
||||||
|
|
||||||
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
|
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
|
||||||
DeviceImageTy &Image) {
|
DeviceImageTy &Image) {
|
||||||
// The plugin either does not need an RPC server or it is unavailible.
|
// The plugin either does not need an RPC server or it is unavailible.
|
||||||
@ -1327,10 +1406,6 @@ Error GenericPluginTy::init() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Error GenericPluginTy::deinit() {
|
Error GenericPluginTy::deinit() {
|
||||||
// There is no global handler if no device is available.
|
|
||||||
if (GlobalHandler)
|
|
||||||
delete GlobalHandler;
|
|
||||||
|
|
||||||
// Deinitialize all active devices.
|
// Deinitialize all active devices.
|
||||||
for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
|
for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
|
||||||
if (Devices[DeviceId]) {
|
if (Devices[DeviceId]) {
|
||||||
@ -1340,6 +1415,10 @@ Error GenericPluginTy::deinit() {
|
|||||||
assert(!Devices[DeviceId] && "Device was not deinitialized");
|
assert(!Devices[DeviceId] && "Device was not deinitialized");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// There is no global handler if no device is available.
|
||||||
|
if (GlobalHandler)
|
||||||
|
delete GlobalHandler;
|
||||||
|
|
||||||
if (RPCServer)
|
if (RPCServer)
|
||||||
delete RPCServer;
|
delete RPCServer;
|
||||||
|
|
||||||
|
@ -625,6 +625,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
|
|||||||
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
|
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
|
||||||
Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
|
Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
|
||||||
|
|
||||||
|
/// Setup the global device memory pool, if the plugin requires one.
|
||||||
|
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
|
||||||
|
uint64_t PoolSize);
|
||||||
|
|
||||||
// Setup the RPC server for this device if needed. This may not run on some
|
// Setup the RPC server for this device if needed. This may not run on some
|
||||||
// plugins like the CPU targets. By default, it will not be executed so it is
|
// plugins like the CPU targets. By default, it will not be executed so it is
|
||||||
// up to the target to override this using the shouldSetupRPCServer function.
|
// up to the target to override this using the shouldSetupRPCServer function.
|
||||||
@ -831,6 +835,10 @@ private:
|
|||||||
/// setupDeviceEnvironment() function.
|
/// setupDeviceEnvironment() function.
|
||||||
virtual bool shouldSetupDeviceEnvironment() const { return true; }
|
virtual bool shouldSetupDeviceEnvironment() const { return true; }
|
||||||
|
|
||||||
|
/// Indicate whether the device should setup the global device memory pool. If
|
||||||
|
/// false is return the value on the device will be uninitialized.
|
||||||
|
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
|
||||||
|
|
||||||
/// Indicate whether or not the device should setup the RPC server. This is
|
/// Indicate whether or not the device should setup the RPC server. This is
|
||||||
/// only necessary for unhosted targets like the GPU.
|
/// only necessary for unhosted targets like the GPU.
|
||||||
virtual bool shouldSetupRPCServer() const { return false; }
|
virtual bool shouldSetupRPCServer() const { return false; }
|
||||||
@ -911,6 +919,9 @@ private:
|
|||||||
/// Return the kernel environment object for kernel \p Name.
|
/// Return the kernel environment object for kernel \p Name.
|
||||||
Expected<KernelEnvironmentTy>
|
Expected<KernelEnvironmentTy>
|
||||||
getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
|
getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
|
||||||
|
|
||||||
|
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
|
||||||
|
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Class implementing common functionalities of offload plugins. Each plugin
|
/// Class implementing common functionalities of offload plugins. Each plugin
|
||||||
|
@ -843,6 +843,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
|
|||||||
return Plugin::success();
|
return Plugin::success();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual bool shouldSetupDeviceMemoryPool() const override {
|
||||||
|
/// We use the CUDA malloc for now.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/// Getters and setters for stack and heap sizes.
|
/// Getters and setters for stack and heap sizes.
|
||||||
Error getDeviceStackSize(uint64_t &Value) override {
|
Error getDeviceStackSize(uint64_t &Value) override {
|
||||||
return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
|
return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
|
||||||
|
@ -307,8 +307,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
|
|||||||
return Plugin::success();
|
return Plugin::success();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This plugin should not setup the device environment.
|
/// This plugin should not setup the device environment or memory pool.
|
||||||
virtual bool shouldSetupDeviceEnvironment() const override { return false; };
|
virtual bool shouldSetupDeviceEnvironment() const override { return false; };
|
||||||
|
virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
|
||||||
|
|
||||||
/// Getters and setters for stack size and heap size not relevant.
|
/// Getters and setters for stack size and heap size not relevant.
|
||||||
Error getDeviceStackSize(uint64_t &Value) override {
|
Error getDeviceStackSize(uint64_t &Value) override {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
// On AMDGPU we don't have malloc support yet. We need optimizations
|
// Unonptimized, we need 24000000 bytes heap
|
||||||
// to avoid a thread state which requires malloc.
|
// RUN: %libomptarget-compilexx-generic
|
||||||
//
|
// RUN: env LIBOMPTARGET_HEAP_SIZE=24000000 \
|
||||||
// XUN: %libomptarget-compilexx-run-and-check-generic
|
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
|
||||||
// RUN: %libomptarget-compileoptxx-run-and-check-generic
|
// RUN: %libomptarget-compileoptxx-run-and-check-generic
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
37
openmp/libomptarget/test/offloading/malloc.c
Normal file
37
openmp/libomptarget/test/offloading/malloc.c
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
// RUN: %libomptarget-compile-generic && %libomptarget-run-generic
|
||||||
|
// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
long unsigned *DP = 0;
|
||||||
|
int N = 128;
|
||||||
|
int Threads = 128;
|
||||||
|
int Teams = 440;
|
||||||
|
|
||||||
|
// Allocate ~55MB on the device.
|
||||||
|
#pragma omp target map(from : DP)
|
||||||
|
DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);
|
||||||
|
|
||||||
|
#pragma omp target teams distribute parallel for num_teams(Teams) \
|
||||||
|
thread_limit(Threads) is_device_ptr(DP)
|
||||||
|
for (int i = 0; i < Threads * Teams; ++i) {
|
||||||
|
for (int j = 0; j < N; ++j) {
|
||||||
|
DP[i * N + j] = i + j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
long unsigned s = 0;
|
||||||
|
#pragma omp target teams distribute parallel for num_teams(Teams) \
|
||||||
|
thread_limit(Threads) reduction(+ : s)
|
||||||
|
for (int i = 0; i < Threads * Teams; ++i) {
|
||||||
|
for (int j = 0; j < N; ++j) {
|
||||||
|
s += DP[i * N + j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK: Sum: 203458478080
|
||||||
|
printf("Sum: %li\n", s);
|
||||||
|
return 0;
|
||||||
|
}
|
42
openmp/libomptarget/test/offloading/malloc_parallel.c
Normal file
42
openmp/libomptarget/test/offloading/malloc_parallel.c
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
// RUN: %libomptarget-compile-generic && %libomptarget-run-generic
|
||||||
|
// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic
|
||||||
|
|
||||||
|
#include <omp.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
long unsigned **DP = 0;
|
||||||
|
int N = 128;
|
||||||
|
int Threads = 128;
|
||||||
|
int Teams = 440;
|
||||||
|
|
||||||
|
#pragma omp target map(from : DP)
|
||||||
|
DP = (long unsigned **)malloc(sizeof(long unsigned *) * Threads * Teams);
|
||||||
|
|
||||||
|
#pragma omp target teams distribute parallel for num_teams(Teams) \
|
||||||
|
thread_limit(Threads)
|
||||||
|
for (int i = 0; i < Threads * Teams; ++i)
|
||||||
|
DP[i] = (long unsigned *)malloc(sizeof(long unsigned) * N);
|
||||||
|
|
||||||
|
#pragma omp target teams distribute parallel for num_teams(Teams) \
|
||||||
|
thread_limit(Threads)
|
||||||
|
for (int i = 0; i < Threads * Teams; ++i) {
|
||||||
|
for (int j = 0; j < N; ++j) {
|
||||||
|
DP[i][j] = i + j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
long unsigned s = 0;
|
||||||
|
#pragma omp target teams distribute parallel for num_teams(Teams) \
|
||||||
|
thread_limit(Threads) reduction(+ : s)
|
||||||
|
for (int i = 0; i < Threads * Teams; ++i) {
|
||||||
|
for (int j = 0; j < N; ++j) {
|
||||||
|
s += DP[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK: Sum: 203458478080
|
||||||
|
printf("Sum: %li\n", s);
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user