mirror of https://github.com/llvm/llvm-project.git
synced 2025-04-17 17:26:41 +00:00

Summary: We conditionally allocate the implicit arguments, so they may be null. The flang compiler seems to hit this case, even though it shouldn't if it conforms to the HSA code object. For now, guard this to fix the regression and to cover a future case where someone rolls a fully custom implementation. Fixes: https://github.com/llvm/llvm-project/issues/132982
3576 lines
131 KiB
C++
//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL NextGen for AMDGPU machine
//
//===----------------------------------------------------------------------===//

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <functional>
#include <mutex>
#include <string>
#include <system_error>
#include <unistd.h>
#include <unordered_map>

#include "ErrorReporting.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/RefCnt.h"
#include "Shared/Utils.h"
#include "Utils/ELF.h"

#include "GlobalHandler.h"
#include "OpenMP/OMPT/Callback.h"
#include "PluginInterface.h"
#include "UtilitiesRTL.h"
#include "omptarget.h"

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"

#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||          \
    !defined(__ORDER_BIG_ENDIAN__)
#error "Missing preprocessor definitions for endianness detection."
#endif

// The HSA headers require these definitions.
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define LITTLEENDIAN_CPU
#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define BIGENDIAN_CPU
#endif

#if defined(__has_include)
#if __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#elif __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

namespace llvm {
namespace omp {
namespace target {
namespace plugin {

/// Forward declarations for all specialized data structures.
struct AMDGPUKernelTy;
struct AMDGPUDeviceTy;
struct AMDGPUPluginTy;
struct AMDGPUStreamTy;
struct AMDGPUEventTy;
struct AMDGPUStreamManagerTy;
struct AMDGPUEventManagerTy;
struct AMDGPUDeviceImageTy;
struct AMDGPUMemoryManagerTy;
struct AMDGPUMemoryPoolTy;

namespace hsa_utils {

/// Iterate elements using an HSA iterate function. Do not use this function
/// directly; use the specialized ones below instead.
template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem);
  };
  return Func(L, static_cast<void *>(&Cb));
}

/// Iterate elements using an HSA iterate function passing a parameter. Do not
/// use this function directly; use the specialized ones below instead.
template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
          typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem);
  };
  return Func(FuncArg, L, static_cast<void *>(&Cb));
}

/// Iterate elements using an HSA iterate function passing a parameter. Do not
/// use this function directly; use the specialized ones below instead.
template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
          typename IterFuncArgTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
  auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem1, Elem2);
  };
  return Func(FuncArg, L, static_cast<void *>(&Cb));
}

/// Iterate agents.
template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
  hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
  return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
}

/// Iterate ISAs of an agent.
template <typename CallbackTy>
Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
  hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
  return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
}

/// Iterate memory pools of an agent.
template <typename CallbackTy>
Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
  hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
      hsa_amd_agent_iterate_memory_pools, Agent, Cb);
  return Plugin::check(Status,
                       "Error in hsa_amd_agent_iterate_memory_pools: %s");
}

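// Illustrative usage sketch (not part of the upstream code): collect all GPU
// agents via iterateAgents. The callback returns an hsa_status_t so it can
// stop the iteration early; HSA_STATUS_SUCCESS continues it. The `GPUAgents`
// vector below is hypothetical.
//
//   llvm::SmallVector<hsa_agent_t> GPUAgents;
//   auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
//     hsa_device_type_t Type;
//     if (auto S = hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &Type))
//       return S;
//     if (Type == HSA_DEVICE_TYPE_GPU)
//       GPUAgents.push_back(Agent);
//     return HSA_STATUS_SUCCESS;
//   });
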
/// Dispatches an asynchronous memory copy.
/// Enables different SDMA engines for the dispatch in a round-robin fashion.
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
                   hsa_signal_t CompletionSignal) {
  if (!UseMultipleSdmaEngines) {
    hsa_status_t S =
        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
                                  NumDepSignals, DepSignals, CompletionSignal);
    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
  }

// This solution is probably not the best.
#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 &&                                  \
      HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
#else
  static std::atomic<int> SdmaEngine{1};

  // This atomics solution is probably not the best, but should be sufficient
  // for now.
  // In a worst case scenario, in which threads read the same value, they will
  // dispatch to the same SDMA engine. This may result in sub-optimal
  // performance. However, the probability of this should be fairly low.
  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
  // This call is only available in ROCm >= 5.7.
  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
      CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
      /*force_copy_on_sdma=*/true);
  // Increment to use one of two SDMA engines: 0x1, 0x2.
  LocalSdmaEngine = (LocalSdmaEngine << 1) % 3;
  SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);

  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
#endif
}

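// A short worked note on the round-robin update above: starting from engine
// id 0x1, (1 << 1) % 3 == 2 and (2 << 1) % 3 == 1, so LocalSdmaEngine simply
// alternates between the two engine ids 0x1 and 0x2 on successive copies.
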
Error getTargetTripleAndFeatures(hsa_agent_t Agent,
                                 SmallVector<SmallString<32>> &Targets) {
  auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
    uint32_t Length;
    hsa_status_t Status;
    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
    if (Status != HSA_STATUS_SUCCESS)
      return Status;

    llvm::SmallVector<char> ISAName(Length);
    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
    if (Status != HSA_STATUS_SUCCESS)
      return Status;

    llvm::StringRef TripleTarget(ISAName.begin(), Length);
    if (TripleTarget.consume_front("amdgcn-amd-amdhsa")) {
      auto Target = TripleTarget.ltrim('-').rtrim('\0');
      Targets.push_back(Target);
    }
    return HSA_STATUS_SUCCESS;
  });
  return Err;
}

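// For reference, an ISA name reported by HSA typically looks like
// "amdgcn-amd-amdhsa--gfx90a:xnack-" (the exact form can vary by ROCm
// version), so the code above would push "gfx90a:xnack-" into Targets.
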
} // namespace hsa_utils

/// Utility class representing generic resource references to AMDGPU resources.
template <typename ResourceTy>
struct AMDGPUResourceRef : public GenericDeviceResourceRef {
  /// The underlying handle type for resources.
  using HandleTy = ResourceTy *;

  /// Create an empty reference to an invalid resource.
  AMDGPUResourceRef() : Resource(nullptr) {}

  /// Create a reference to an existing resource.
  AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}

  virtual ~AMDGPUResourceRef() {}

  /// Create a new resource and save the reference. The reference must be
  /// empty before calling this function.
  Error create(GenericDeviceTy &Device) override;

  /// Destroy the referenced resource and invalidate the reference. The
  /// reference must be to a valid resource before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Resource)
      return Plugin::error("Destroying an invalid resource");

    if (auto Err = Resource->deinit())
      return Err;

    delete Resource;

    Resource = nullptr;
    return Plugin::success();
  }

  /// Get the underlying resource handle.
  operator HandleTy() const { return Resource; }

private:
  /// The handle to the actual resource.
  HandleTy Resource;
};

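// Illustrative lifecycle sketch (hypothetical driver code, not part of the
// plugin): a reference starts empty, create() allocates and initializes the
// underlying resource, and destroy() deinitializes and frees it.
//
//   AMDGPUResourceRef<AMDGPUSignalTy> Ref;
//   if (auto Err = Ref.create(Device))   // allocates and inits a signal
//     return Err;
//   AMDGPUSignalTy *Signal = Ref;        // implicit conversion to HandleTy
//   // ... use the signal ...
//   if (auto Err = Ref.destroy(Device))  // deinits and deletes the signal
//     return Err;
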
/// Class holding an HSA memory pool.
struct AMDGPUMemoryPoolTy {
  /// Create a memory pool from an HSA memory pool.
  AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
      : MemoryPool(MemoryPool), GlobalFlags(0) {}

  /// Initialize the memory pool retrieving its properties.
  Error init() {
    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
      return Err;

    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
      return Err;

    return Plugin::success();
  }

  /// Getter of the HSA memory pool.
  hsa_amd_memory_pool_t get() const { return MemoryPool; }

  /// Indicate the segment to which the memory pool belongs.
  bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
  bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
  bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
  bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }

  /// Indicate if it is fine-grained memory. Valid only for global.
  bool isFineGrained() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
  }

  /// Indicate if it is coarse-grained memory. Valid only for global.
  bool isCoarseGrained() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
  }

  /// Indicate if it supports storing kernel arguments. Valid only for global.
  bool supportsKernelArgs() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
  }

  /// Allocate memory on the memory pool.
  Error allocate(size_t Size, void **PtrStorage) {
    hsa_status_t Status =
        hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
  }

  /// Return memory to the memory pool.
  Error deallocate(void *Ptr) {
    hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
  }

  /// Returns if the \p Agent can access the memory pool.
  bool canAccess(hsa_agent_t Agent) {
    hsa_amd_memory_pool_access_t Access;
    if (hsa_amd_agent_memory_pool_get_info(
            Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access))
      return false;
    return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
  }

  /// Allow the device to access a specific allocation.
  Error enableAccess(void *Ptr, int64_t Size,
                     const llvm::SmallVector<hsa_agent_t> &Agents) const {
#ifdef OMPTARGET_DEBUG
    for (hsa_agent_t Agent : Agents) {
      hsa_amd_memory_pool_access_t Access;
      if (auto Err =
              getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
        return Err;

      // The agent is not allowed to access the memory pool in any case. Do
      // not continue because otherwise it results in undefined behavior.
      if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
        return Plugin::error("An agent is not allowed to access a memory pool");
    }
#endif

    // The agents can access the memory pool, but the access is disabled by
    // default. Enable it explicitly.
    hsa_status_t Status =
        hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
    return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
  }

  /// Get attribute from the memory pool.
  template <typename Ty>
  Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
    hsa_status_t Status;
    Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
  }

  template <typename Ty>
  hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
    return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
  }

  /// Get attribute from the memory pool relating to an agent.
  template <typename Ty>
  Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
                Ty &Value) const {
    hsa_status_t Status;
    Status =
        hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
    return Plugin::check(Status,
                         "Error in hsa_amd_agent_memory_pool_get_info: %s");
  }

private:
  /// The HSA memory pool.
  hsa_amd_memory_pool_t MemoryPool;

  /// The segment to which the memory pool belongs.
  hsa_amd_segment_t Segment;

  /// The global flags of the memory pool. Only valid if the memory pool
  /// belongs to the global segment.
  uint32_t GlobalFlags;
};

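// Illustrative usage sketch (hypothetical, not part of the plugin): allocate
// from a pool and make the allocation visible to a set of agents before
// handing the pointer to the device.
//
//   void *Ptr = nullptr;
//   if (auto Err = Pool.allocate(Size, &Ptr))
//     return Err;
//   if (auto Err = Pool.enableAccess(Ptr, Size, Agents))
//     return Err;
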
/// Class that implements a memory manager that gets memory from a specific
/// memory pool.
struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {

  /// Create an empty memory manager.
  AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin)
      : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {}

  /// Initialize the memory manager from a memory pool.
  Error init(AMDGPUMemoryPoolTy &MemoryPool) {
    const uint32_t Threshold = 1 << 30;
    this->MemoryManager = new MemoryManagerTy(*this, Threshold);
    this->MemoryPool = &MemoryPool;
    return Plugin::success();
  }

  /// Deinitialize the memory manager and free its allocations.
  Error deinit() {
    assert(MemoryManager && "Invalid memory manager");

    // Delete and invalidate the memory manager. At this point, the memory
    // manager will deallocate all its allocations.
    delete MemoryManager;
    MemoryManager = nullptr;

    return Plugin::success();
  }

  /// Reuse or allocate memory through the memory manager.
  Error allocate(size_t Size, void **PtrStorage) {
    assert(MemoryManager && "Invalid memory manager");
    assert(PtrStorage && "Invalid pointer storage");

    *PtrStorage = MemoryManager->allocate(Size, nullptr);
    if (*PtrStorage == nullptr)
      return Plugin::error("Failure to allocate from AMDGPU memory manager");

    return Plugin::success();
  }

  /// Release an allocation to be reused.
  Error deallocate(void *Ptr) {
    assert(Ptr && "Invalid pointer");

    if (MemoryManager->free(Ptr))
      return Plugin::error("Failure to deallocate from AMDGPU memory manager");

    return Plugin::success();
  }

private:
  /// Allocation callback that will be called once the memory manager has no
  /// more previously allocated buffers.
  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;

  /// Deallocation callback that will be called by the memory manager.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (auto Err = MemoryPool->deallocate(TgtPtr)) {
      consumeError(std::move(Err));
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  /// The underlying plugin that owns this memory manager.
  AMDGPUPluginTy &Plugin;

  /// The memory pool used to allocate memory.
  AMDGPUMemoryPoolTy *MemoryPool;

  /// Reference to the actual memory manager.
  MemoryManagerTy *MemoryManager;
};

/// Class implementing the AMDGPU device images' properties.
struct AMDGPUDeviceImageTy : public DeviceImageTy {
  /// Create the AMDGPU image with the id and the target image pointer.
  AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
                      const __tgt_device_image *TgtImage)
      : DeviceImageTy(ImageId, Device, TgtImage) {}

  /// Prepare and load the executable corresponding to the image.
  Error loadExecutable(const AMDGPUDeviceTy &Device);

  /// Unload the executable.
  Error unloadExecutable() {
    hsa_status_t Status = hsa_executable_destroy(Executable);
    return Plugin::check(Status, "Error in hsa_executable_destroy: %s");
  }

  /// Get the executable.
  hsa_executable_t getExecutable() const { return Executable; }

  /// Get the code object version of the ELF.
  uint16_t getELFABIVersion() const { return ELFABIVersion; }

  /// Find an HSA device symbol by its name on the executable.
  Expected<hsa_executable_symbol_t>
  findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;

  /// Get additional info for kernel, e.g., register spill counts.
  std::optional<offloading::amdgpu::AMDGPUKernelMetaData>
  getKernelInfo(StringRef Identifier) const {
    auto It = KernelInfoMap.find(Identifier);

    if (It == KernelInfoMap.end())
      return {};

    return It->second;
  }

private:
  /// The executable loaded on the agent.
  hsa_executable_t Executable;
  StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap;
  uint16_t ELFABIVersion;
};

/// Class implementing the AMDGPU kernel functionalities which derives from the
/// generic kernel class.
struct AMDGPUKernelTy : public GenericKernelTy {
  /// Create an AMDGPU kernel with a name and an execution mode.
  AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}

  /// Initialize the AMDGPU kernel.
  Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);

    // Kernel symbols have a ".kd" suffix.
    std::string KernelName(getName());
    KernelName += ".kd";

    // Find the symbol on the device executable.
    auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
    if (!SymbolOrErr)
      return SymbolOrErr.takeError();

    hsa_executable_symbol_t Symbol = *SymbolOrErr;
    hsa_symbol_kind_t SymbolType;
    hsa_status_t Status;

    // Retrieve different properties of the kernel symbol.
    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};

    for (auto &Info : RequiredInfos) {
      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
      if (auto Err = Plugin::check(
              Status, "Error in hsa_executable_symbol_get_info: %s"))
        return Err;
    }

    // Make sure it is a kernel symbol.
    if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
      return Plugin::error("Symbol %s is not a kernel function");

    // TODO: Read the kernel descriptor for the max threads per block. May be
    // read from the image.

    ImplicitArgsSize =
        hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
    DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());

    // Get additional kernel info read from the image.
    KernelInfo = AMDImage.getKernelInfo(getName());
    if (!KernelInfo.has_value())
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
           "Could not read extra information for kernel %s.", getName());

    return Plugin::success();
  }

  /// Launch the AMDGPU kernel function.
  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
                   uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
                   KernelLaunchParamsTy LaunchParams,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

  /// Print more elaborate kernel launch info for AMDGPU.
  Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                               KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
                               uint32_t NumBlocks[3]) const override;

  /// Get group and private segment kernel size.
  uint32_t getGroupSize() const { return GroupSize; }
  uint32_t getPrivateSize() const { return PrivateSize; }

  /// Get the HSA kernel object representing the kernel function.
  uint64_t getKernelObject() const { return KernelObject; }

  /// Get the size of the implicit arguments based on the code object version.
  uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }

  /// Indicates whether or not we need to set up our own private segment size.
  bool usesDynamicStack() const { return DynamicStack; }

private:
  /// The kernel object to execute.
  uint64_t KernelObject;

  /// The args, group and private segments sizes required by a kernel instance.
  uint32_t ArgsSize;
  uint32_t GroupSize;
  uint32_t PrivateSize;
  bool DynamicStack;

  /// The size of implicit kernel arguments.
  uint32_t ImplicitArgsSize;

  /// Additional info for the AMDGPU kernel.
  std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo;
};

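// Note on getImplicitArgsSize() (defined outside this file): the implicit
// argument block depends on the code object version; for reference, COV4
// uses a 56-byte block while COV5 uses a 256-byte one per the AMDGPU ABI
// documentation. The exact values are an assumption here and live with the
// helper's definition, not in this file.
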
/// Class representing an HSA signal. Signals are used to define dependencies
/// between asynchronous operations: kernel launches and memory transfers.
struct AMDGPUSignalTy {
  /// Create an empty signal.
  AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
  AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}

  /// Initialize the signal with an initial value.
  Error init(uint32_t InitialValue = 1) {
    hsa_status_t Status =
        hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
    return Plugin::check(Status, "Error in hsa_signal_create: %s");
  }

  /// Deinitialize the signal.
  Error deinit() {
    hsa_status_t Status = hsa_signal_destroy(HSASignal);
    return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
  }

  /// Wait until the signal gets a zero value.
  Error wait(const uint64_t ActiveTimeout = 0,
             GenericDeviceTy *Device = nullptr) const {
    if (ActiveTimeout) {
      hsa_signal_value_t Got = 1;
      Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                      ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
      if (Got == 0)
        return Plugin::success();
    }

    // If there is an RPC device attached to this stream we run it as a server.
    uint64_t Timeout = UINT64_MAX;
    auto WaitState = HSA_WAIT_STATE_BLOCKED;
    while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                     Timeout, WaitState) != 0)
      ;
    return Plugin::success();
  }

  /// Load the value on the signal.
  hsa_signal_value_t load() const {
    return hsa_signal_load_scacquire(HSASignal);
  }

  /// Signal decrementing by one.
  void signal() {
    assert(load() > 0 && "Invalid signal value");
    hsa_signal_subtract_screlease(HSASignal, 1);
  }

  /// Reset the signal value before reusing the signal. Do not call this
  /// function if the signal is currently being used by any watcher, such as a
  /// plugin thread or the HSA runtime.
  void reset() { hsa_signal_store_screlease(HSASignal, 1); }

  /// Increase the number of concurrent uses.
  void increaseUseCount() { UseCount.increase(); }

  /// Decrease the number of concurrent uses and return whether it was the
  /// last use.
  bool decreaseUseCount() { return UseCount.decrease(); }

  hsa_signal_t get() const { return HSASignal; }

private:
  /// The underlying HSA signal.
  hsa_signal_t HSASignal;

  /// Reference counter for tracking the concurrent use count. This is mainly
  /// used for knowing how many streams are using the signal.
  RefCountTy<> UseCount;
};

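// Illustrative lifecycle sketch (hypothetical, not part of the plugin): a
// signal is initialized to 1, handed to an asynchronous operation as its
// completion signal, and waited on; reset() must be called before the signal
// can be reused for another operation.
//
//   AMDGPUSignalTy Signal;
//   if (auto Err = Signal.init())   // value starts at 1
//     return Err;
//   // ... submit async work that decrements the signal to 0 on completion
//   if (auto Err = Signal.wait())   // returns once the value reaches 0
//     return Err;
//   Signal.reset();                 // store 1 again before reuse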
/// Classes for holding AMDGPU signals and managing signals.
using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;

/// Class holding an HSA queue to submit kernel and barrier packets.
struct AMDGPUQueueTy {
  /// Create an empty queue.
  AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}

  /// Lazily initialize a new queue belonging to a specific agent.
  Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
    if (Queue)
      return Plugin::success();
    hsa_status_t Status =
        hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
                         &Device, UINT32_MAX, UINT32_MAX, &Queue);
    return Plugin::check(Status, "Error in hsa_queue_create: %s");
  }

  /// Deinitialize the queue and destroy its resources.
  Error deinit() {
    std::lock_guard<std::mutex> Lock(Mutex);
    if (!Queue)
      return Plugin::success();
    hsa_status_t Status = hsa_queue_destroy(Queue);
    return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
  }

  /// Returns the number of streams this queue is currently assigned to.
  uint32_t getUserCount() const { return NumUsers; }

  /// Returns if the underlying HSA queue is initialized.
  bool isInitialized() { return Queue != nullptr; }

  /// Decrement the user count of the queue object.
  void removeUser() { --NumUsers; }

  /// Increase the user count of the queue object.
  void addUser() { ++NumUsers; }

  /// Push a kernel launch to the queue. The kernel launch requires an output
  /// signal and can define an optional input signal (nullptr if none).
  Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                         uint32_t NumThreads[3], uint32_t NumBlocks[3],
                         uint32_t GroupSize, uint64_t StackSize,
                         AMDGPUSignalTy *OutputSignal,
                         AMDGPUSignalTy *InputSignal) {
    assert(OutputSignal && "Invalid kernel output signal");

    // Lock the queue during the packet publishing process. Notice this blocks
    // the addition of other packets to the queue. The following piece of code
    // should be lightweight; do not block the thread, allocate memory, etc.
    std::lock_guard<std::mutex> Lock(Mutex);
    assert(Queue && "Interacted with a non-initialized queue!");

    // Add a barrier packet before the kernel packet in case there is a pending
    // preceding operation. The barrier packet will delay the processing of
    // subsequent packets in the queue until the barrier's input signals are
    // satisfied. No output signal is needed because the dependency is already
    // guaranteed by the queue barrier itself.
    if (InputSignal && InputSignal->load())
      if (auto Err = pushBarrierImpl(nullptr, InputSignal))
        return Err;

    // Now prepare the kernel packet.
    uint64_t PacketId;
    hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
    assert(Packet && "Invalid packet");

    // The first 32 bits of the packet are written after the other fields.
    uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1
                        ? 3
                        : 1 + (NumBlocks[1] * NumThreads[1] != 1);
    uint16_t Setup = UINT16_C(Dims)
                     << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    Packet->workgroup_size_x = NumThreads[0];
    Packet->workgroup_size_y = NumThreads[1];
    Packet->workgroup_size_z = NumThreads[2];
    Packet->reserved0 = 0;
    Packet->grid_size_x = NumBlocks[0] * NumThreads[0];
    Packet->grid_size_y = NumBlocks[1] * NumThreads[1];
    Packet->grid_size_z = NumBlocks[2] * NumThreads[2];
    Packet->private_segment_size =
        Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
    Packet->group_segment_size = GroupSize;
    Packet->kernel_object = Kernel.getKernelObject();
    Packet->kernarg_address = KernelArgs;
    Packet->reserved2 = 0;
    Packet->completion_signal = OutputSignal->get();

    // Publish the packet. Do not modify the packet after this point.
    publishKernelPacket(PacketId, Setup, Packet);

    return Plugin::success();
  }

  /// Push a barrier packet that will wait on up to two input signals. All
  /// signals are optional (nullptr if none).
  Error pushBarrier(AMDGPUSignalTy *OutputSignal,
                    const AMDGPUSignalTy *InputSignal1,
                    const AMDGPUSignalTy *InputSignal2) {
    // Lock the queue during the packet publishing process.
    std::lock_guard<std::mutex> Lock(Mutex);
    assert(Queue && "Interacted with a non-initialized queue!");

    // Push the barrier with the lock acquired.
    return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
  }

private:
  /// Push a barrier packet that will wait on up to two input signals. Assumes
  /// the queue lock is acquired.
  Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
                        const AMDGPUSignalTy *InputSignal1,
                        const AMDGPUSignalTy *InputSignal2 = nullptr) {
    // Add a queue barrier waiting on both the other stream's operation and the
    // last operation on the current stream (if any).
    uint64_t PacketId;
    hsa_barrier_and_packet_t *Packet =
        (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
    assert(Packet && "Invalid packet");

    Packet->reserved0 = 0;
    Packet->reserved1 = 0;
    Packet->dep_signal[0] = {0};
    Packet->dep_signal[1] = {0};
    Packet->dep_signal[2] = {0};
    Packet->dep_signal[3] = {0};
    Packet->dep_signal[4] = {0};
    Packet->reserved2 = 0;
    Packet->completion_signal = {0};

    // Set input and output dependencies if needed.
    if (OutputSignal)
      Packet->completion_signal = OutputSignal->get();
    if (InputSignal1)
      Packet->dep_signal[0] = InputSignal1->get();
    if (InputSignal2)
      Packet->dep_signal[1] = InputSignal2->get();

    // Publish the packet. Do not modify the packet after this point.
    publishBarrierPacket(PacketId, Packet);

    return Plugin::success();
  }

  /// Acquire a packet from the queue. This call may block the thread if there
  /// is no space in the underlying HSA queue. It may need to wait until the
  /// HSA runtime processes some packets. Assumes the queue lock is acquired.
  hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
    // Increase the queue index with relaxed memory order. Notice this will
    // need another subsequent atomic operation with acquire order.
    PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);

    // Wait for the packet to be available. Notice the atomic operation uses
    // the acquire memory order.
    while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
      ;

    // Return the packet reference.
    const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
    return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
           (PacketId & Mask);
  }

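  // A short worked note on the ring indexing above: HSA queue sizes are
  // powers of two, so with Queue->size == 1024 the mask is 1023 and, e.g.,
  // write index 1025 maps to slot 1025 & 1023 == 1. The busy-wait guarantees
  // the producer never gets more than Queue->size packets ahead of the
  // consumer.
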
  /// Publish the kernel packet so that the HSA runtime can start processing
  /// the kernel launch. Do not modify the packet once this function is called.
  /// Assumes the queue lock is acquired.
  void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
                           hsa_kernel_dispatch_packet_t *Packet) {
    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);

    uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);

    // Signal the doorbell about the published packet.
    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
  }

  /// Publish the barrier packet so that the HSA runtime can start processing
  /// the barrier. Next packets in the queue will not be processed until all
  /// barrier dependencies (signals) are satisfied. Assumes the queue is
  /// locked.
  void publishBarrierPacket(uint64_t PacketId,
                            hsa_barrier_and_packet_t *Packet) {
    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
    uint16_t Setup = 0;
    uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);

    // Signal the doorbell about the published packet.
    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
  }

  /// Callback that will be called when an error is detected on the HSA queue.
  static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
                            void *Data);

  /// The HSA queue.
  hsa_queue_t *Queue;

  /// Mutex to protect the acquiring and publishing of packets. For the moment,
  /// we need this mutex to prevent publishing packets that are not ready to be
  /// published in a multi-thread scenario. Without a queue lock, a thread T1
  /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
  /// publish its packet P+1 (signaling the queue's doorbell) before packet P
  /// from T1 is ready to be processed. That scenario should be invalid. Thus,
  /// we use the following mutex to make packet acquiring and publishing
  /// atomic.
  /// TODO: There are other more advanced approaches to avoid this mutex using
  /// atomic operations. We can further investigate it if this is a bottleneck.
  std::mutex Mutex;

  /// The number of streams this queue is currently assigned to. A queue is
  /// considered idle when this is zero and busy otherwise.
  uint32_t NumUsers;
};

/// Struct that implements a stream of asynchronous operations for AMDGPU
/// devices. This class relies on signals to implement streams and define the
/// dependencies between asynchronous operations.
struct AMDGPUStreamTy {
private:
  /// Utility struct holding arguments for async H2H memory copies.
  struct MemcpyArgsTy {
    void *Dst;
    const void *Src;
    size_t Size;
  };

  /// Utility struct holding arguments for freeing buffers to memory managers.
  struct ReleaseBufferArgsTy {
    void *Buffer;
    AMDGPUMemoryManagerTy *MemoryManager;
  };

  /// Utility struct holding arguments for releasing signals to signal
  /// managers.
  struct ReleaseSignalArgsTy {
    AMDGPUSignalTy *Signal;
    AMDGPUSignalManagerTy *SignalManager;
  };

  using AMDGPUStreamCallbackTy = Error(void *Data);

  /// The stream is composed of N slots. The struct below represents the
  /// fields of each slot. Each slot has a signal and an optional action
  /// function. When appending an HSA asynchronous operation to the stream, one
  /// slot is consumed and used to store the operation's information. The
  /// operation's output signal is set to the consumed slot's signal. If there
  /// is a previous asynchronous operation on the previous slot, the HSA async
  /// operation's input signal is set to the signal of the previous slot. This
  /// way, we obtain a chain of dependent async operations. The action is a
  /// function that will be executed eventually after the operation is
  /// completed, e.g., for releasing a buffer.
  struct StreamSlotTy {
    /// The output signal of the stream operation. May be used by the
    /// subsequent operation as input signal.
    AMDGPUSignalTy *Signal;

    /// The actions that must be performed after the operation's completion.
    /// Set to nullptr when there is no action to perform.
    llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks;

    /// Space for the action's arguments. A pointer to these arguments is
    /// passed to the action function. Notice the space of arguments is
    /// limited.
    union ActionArgsTy {
      MemcpyArgsTy MemcpyArgs;
      ReleaseBufferArgsTy ReleaseBufferArgs;
      ReleaseSignalArgsTy ReleaseSignalArgs;
      void *CallbackArgs;
    };

    llvm::SmallVector<ActionArgsTy> ActionArgs;

    /// Create an empty slot.
    StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}

    /// Schedule a host memory copy action on the slot.
    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
      Callbacks.emplace_back(memcpyAction);
      ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
      return Plugin::success();
    }

    /// Schedule a release buffer action on the slot.
    Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
      Callbacks.emplace_back(releaseBufferAction);
      ActionArgs.emplace_back().ReleaseBufferArgs =
          ReleaseBufferArgsTy{Buffer, &Manager};
      return Plugin::success();
    }

    /// Schedule a signal release action on the slot.
    Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
                             AMDGPUSignalManagerTy *SignalManager) {
      Callbacks.emplace_back(releaseSignalAction);
      ActionArgs.emplace_back().ReleaseSignalArgs =
          ReleaseSignalArgsTy{SignalToRelease, SignalManager};
      return Plugin::success();
    }

    /// Register a callback to be called on completion.
    Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) {
      Callbacks.emplace_back(Func);
      ActionArgs.emplace_back().CallbackArgs = Data;

      return Plugin::success();
    }

    // Perform the action if needed.
    Error performAction() {
      if (Callbacks.empty())
        return Plugin::success();

      assert(Callbacks.size() == ActionArgs.size() && "Size mismatch");
      for (auto [Callback, ActionArg] : llvm::zip(Callbacks, ActionArgs)) {
        // Perform the action.
        if (Callback == memcpyAction) {
          if (auto Err = memcpyAction(&ActionArg))
            return Err;
        } else if (Callback == releaseBufferAction) {
          if (auto Err = releaseBufferAction(&ActionArg))
            return Err;
        } else if (Callback == releaseSignalAction) {
          if (auto Err = releaseSignalAction(&ActionArg))
            return Err;
        } else if (Callback) {
          if (auto Err = Callback(ActionArg.CallbackArgs))
            return Err;
        }
      }

      // Invalidate the action.
      Callbacks.clear();
      ActionArgs.clear();

      return Plugin::success();
    }
  };

  /// The device agent where the stream was created.
  hsa_agent_t Agent;

  /// The queue that the stream uses to launch kernels.
  AMDGPUQueueTy *Queue;

  /// The manager of signals to reuse signals.
  AMDGPUSignalManagerTy &SignalManager;

  /// A reference to the associated device.
  GenericDeviceTy &Device;

  /// Array of stream slots. Use std::deque because it can dynamically grow
  /// without invalidating the already inserted elements. For instance,
  /// std::vector may invalidate its elements by reallocating the internal
  /// array if there is not enough space on new insertions.
  std::deque<StreamSlotTy> Slots;

  /// The next available slot on the queue. This is reset to zero each time the
  /// stream is synchronized. It also indicates the current number of consumed
  /// slots at a given time.
  uint32_t NextSlot;

  /// The synchronization id. This number is increased each time the stream is
  /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
  /// operation that was already finalized in a previous stream synchronize.
  uint32_t SyncCycle;

  /// Mutex to protect stream's management.
  mutable std::mutex Mutex;

  /// Timeout hint for HSA actively waiting for the signal value to change.
  const uint64_t StreamBusyWaitMicroseconds;

  /// Indicates whether to spread data transfers across all available SDMA
  /// engines.
  bool UseMultipleSdmaEngines;

  /// Return the current number of asynchronous operations on the stream.
  uint32_t size() const { return NextSlot; }

  /// Return the last valid slot on the stream.
  uint32_t last() const { return size() - 1; }

  /// Consume one slot from the stream. Since the stream uses signals on demand
  /// and releases them once the slot is no longer used, the function requires
  /// an idle signal for the new consumed slot.
  std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
    // Double the stream size if needed. Since we use std::deque, this
    // operation does not invalidate the already added slots.
    if (Slots.size() == NextSlot)
      Slots.resize(Slots.size() * 2);

    // Update the next available slot and the stream size.
    uint32_t Curr = NextSlot++;

    // Retrieve the input signal, if any, of the current operation.
    AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;

    // Set the output signal of the current slot.
    Slots[Curr].Signal = OutputSignal;

    return std::make_pair(Curr, InputSignal);
  }

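  // A short worked example of the slot chaining above (hypothetical values):
  // consuming slot 0 returns no input signal; consuming slot 1 returns slot
  // 0's output signal as its input, so each operation waits on its
  // predecessor and the slots form a dependency chain S0 -> S1 -> S2 -> ...
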
  /// Complete all pending post actions and reset the stream after
  /// synchronizing or positively querying the stream.
  Error complete() {
    for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
      // Take the post action of the operation if any.
      if (auto Err = Slots[Slot].performAction())
        return Err;

      // Release the slot's signal if possible. Otherwise, another user will.
      if (Slots[Slot].Signal->decreaseUseCount())
        if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
          return Err;

      Slots[Slot].Signal = nullptr;
    }

    // Reset the stream slots to zero.
    NextSlot = 0;

    // Increase the synchronization id since the stream completed a sync cycle.
    SyncCycle += 1;

    return Plugin::success();
  }

  /// Make the current stream wait on a specific operation of another stream.
  /// The idea is to make the current stream wait on two signals: 1) the last
  /// signal of the current stream, and 2) the last signal of the other stream.
  /// Use a barrier packet with two input signals.
  Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
    if (Queue == nullptr)
      return Plugin::error("Target queue was nullptr");

    /// The signal of the other stream that we must wait on.
    AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;

    // Prevent the release of the other stream's signal.
    OtherSignal->increaseUseCount();

    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Setup the post action to release the signal.
    if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
      return Err;

    // Push a barrier into the queue with both input signals.
    return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
  }

  /// Callback for running a specific asynchronous operation. This callback is
  /// used for hsa_amd_signal_async_handler. The argument is the operation that
  /// should be executed. Notice we use the post action mechanism to codify the
  /// asynchronous operation.
  static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
    StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
    assert(Slot && "Invalid slot");
    assert(Slot->Signal && "Invalid signal");

    // This thread is outside the stream mutex. Make sure the thread sees the
    // changes on the slot.
    std::atomic_thread_fence(std::memory_order_acquire);

    // Perform the operation.
    if (auto Err = Slot->performAction())
      FATAL_MESSAGE(1, "Error performing post action: %s",
                    toString(std::move(Err)).data());

    // Signal the output signal to notify the asynchronous operation finalized.
    Slot->Signal->signal();

    // Unregister the callback.
    return false;
  }

  // Callback for host-to-host memory copies. This is an asynchronous action.
  static Error memcpyAction(void *Data) {
    MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->Dst && "Invalid destination buffer");
    assert(Args->Src && "Invalid source buffer");

    std::memcpy(Args->Dst, Args->Src, Args->Size);

    return Plugin::success();
  }

  /// Releasing a memory buffer to a memory manager. This is a post completion
  /// action. There are three kinds of memory buffers:
  /// 1. For kernel arguments. This buffer can be freed after receiving the
  ///    kernel completion signal.
  /// 2. For H2D transfers that need pinned memory space for staging. This
  ///    buffer can be freed after receiving the transfer completion signal.
  /// 3. For D2H transfers that need pinned memory space for staging. This
  ///    buffer cannot be freed after receiving the transfer completion signal
  ///    because of the following asynchronous H2H callback.
  ///    For this reason, this action can only be taken at
  ///    AMDGPUStreamTy::complete().
  /// Because of case 3, all releaseBufferActions are taken at
  /// AMDGPUStreamTy::complete() in the current implementation.
  static Error releaseBufferAction(void *Data) {
    ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->MemoryManager && "Invalid memory manager");
    assert(Args->Buffer && "Invalid buffer");

    // Release the allocation to the memory manager.
    return Args->MemoryManager->deallocate(Args->Buffer);
  }

  /// Releasing a signal object back to the signal manager. This is a post
  /// completion action. This action can only be taken at
  /// AMDGPUStreamTy::complete().
  static Error releaseSignalAction(void *Data) {
    ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->Signal && "Invalid signal");
    assert(Args->SignalManager && "Invalid signal manager");

    // Release the signal if needed.
    if (Args->Signal->decreaseUseCount())
      if (auto Err = Args->SignalManager->returnResource(Args->Signal))
        return Err;

    return Plugin::success();
  }

public:
  /// Create an empty stream associated with a specific device.
  AMDGPUStreamTy(AMDGPUDeviceTy &Device);

  /// Initialize the stream's signals.
  Error init() { return Plugin::success(); }

  /// Deinitialize the stream's signals.
  Error deinit() { return Plugin::success(); }

  /// Push an asynchronous kernel to the stream. The kernel arguments must be
  /// placed in a special allocation for kernel args and must be kept alive
  /// until the kernel finishes. Once the kernel is finished, the stream will
  /// release the kernel args buffer to the specified memory manager.
  Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                         uint32_t NumThreads[3], uint32_t NumBlocks[3],
                         uint32_t GroupSize, uint64_t StackSize,
                         AMDGPUMemoryManagerTy &MemoryManager) {
    if (Queue == nullptr)
      return Plugin::error("Target queue was nullptr");

    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    std::lock_guard<std::mutex> StreamLock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Setup the post action to release the kernel args buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
      return Err;

    // If we are running an RPC server we want to wake up the server thread
    // whenever there is a kernel running and let it sleep otherwise.
    if (Device.getRPCServer())
      Device.Plugin.getRPCServer().Thread->notify();

    // Push the kernel with the output signal and an input signal (optional).
    if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
                                           NumBlocks, GroupSize, StackSize,
                                           OutputSignal, InputSignal))
      return Err;

    // Register a callback to indicate when the kernel is complete.
    if (Device.getRPCServer()) {
      if (auto Err = Slots[Curr].schedCallback(
              [](void *Data) -> llvm::Error {
                GenericPluginTy &Plugin =
                    *reinterpret_cast<GenericPluginTy *>(Data);
                Plugin.getRPCServer().Thread->finish();
                return Error::success();
              },
              &Device.Plugin))
        return Err;
    }
    return Plugin::success();
  }

  /// Push an asynchronous memory copy between pinned memory buffers.
  Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
                                  uint64_t CopySize) {
    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Issue the async memory copy.
    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
                                     Agent, CopySize, 1, &InputSignalRaw,
                                     OutputSignal->get());
    }

    return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
                                   Agent, CopySize, 0, nullptr,
                                   OutputSignal->get());
  }

  /// Push an asynchronous memory copy device-to-host involving an unpinned
  /// memory buffer. The operation consists of a two-step copy from the
  /// device buffer to an intermediate pinned host buffer, and then, to an
  /// unpinned host buffer. Both operations are asynchronous and dependent.
  /// The intermediate pinned buffer will be released to the specified memory
  /// manager once the operation completes.
  Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
                               uint64_t CopySize,
                               AMDGPUMemoryManagerTy &MemoryManager) {
    // Retrieve available signals for the operation's outputs.
    AMDGPUSignalTy *OutputSignals[2] = {};
    if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
      return Err;
    for (auto *Signal : OutputSignals) {
      Signal->reset();
      Signal->increaseUseCount();
    }

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignals[0]);

    // Setup the post action for releasing the intermediate buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
      return Err;

    // Issue the first step: device to host transfer. Avoid defining the input
    // dependency if already satisfied.
    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      if (auto Err = hsa_utils::asyncMemCopy(
              UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
              &InputSignalRaw, OutputSignals[0]->get()))
        return Err;
    } else {
      if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter,
                                             Agent, Src, Agent, CopySize, 0,
                                             nullptr, OutputSignals[0]->get()))
        return Err;
    }

    // Consume another stream slot and compute dependencies.
    std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
    assert(InputSignal && "Invalid input signal");

    // The std::memcpy is done asynchronously using an async handler. We store
    // the function's information in the action but it is not actually an
    // action.
    if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
      return Err;

    // Make changes on this slot visible to the async handler's thread.
    std::atomic_thread_fence(std::memory_order_release);

    // Issue the second step: host to host transfer.
    hsa_status_t Status = hsa_amd_signal_async_handler(
        InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
        (void *)&Slots[Curr]);

    return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
  }

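  // The resulting D2H pipeline, as a sketch:
  //
  //   Src (device) --SDMA copy--> Inter (pinned host)
  //     --async std::memcpy--> Dst (unpinned host)
  //
  // where the second step runs from hsa_amd_signal_async_handler once the
  // first step's signal reaches zero, and Inter is released on stream
  // completion.
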
  /// Push an asynchronous memory copy host-to-device involving an unpinned
  /// memory buffer. The operation consists of a two-step copy from the
  /// unpinned host buffer to an intermediate pinned host buffer, and then,
  /// from the pinned host buffer to the device. Both operations are
  /// asynchronous and dependent. The intermediate pinned buffer will be
  /// released to the specified memory manager once the operation completes.
  Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
                               uint64_t CopySize,
                               AMDGPUMemoryManagerTy &MemoryManager) {
    // Retrieve available signals for the operation's outputs.
    AMDGPUSignalTy *OutputSignals[2] = {};
    if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
      return Err;
    for (auto *Signal : OutputSignals) {
      Signal->reset();
      Signal->increaseUseCount();
    }

    AMDGPUSignalTy *OutputSignal = OutputSignals[0];

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Issue the first step: host to host transfer.
    if (InputSignal && InputSignal->load()) {
      // The std::memcpy is done asynchronously using an async handler. We
      // store the function's information in the action but it is not actually
      // a post action.
      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
        return Err;

      // Make changes on this slot visible to the async handler's thread.
      std::atomic_thread_fence(std::memory_order_release);

      hsa_status_t Status = hsa_amd_signal_async_handler(
          InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
          (void *)&Slots[Curr]);

      if (auto Err = Plugin::check(
              Status, "Error in hsa_amd_signal_async_handler: %s"))
        return Err;

      // Now use the second output signal.
      OutputSignal = OutputSignals[1];

      // Consume another stream slot and compute dependencies.
      std::tie(Curr, InputSignal) = consume(OutputSignal);
    } else {
      // All preceding operations completed, copy the memory synchronously.
      std::memcpy(Inter, Src, CopySize);

      // Return the second signal because it will not be used.
      OutputSignals[1]->decreaseUseCount();
      if (auto Err = SignalManager.returnResource(OutputSignals[1]))
        return Err;
    }

    // Setup the post action to release the intermediate pinned buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
      return Err;

    // Issue the second step: host to device transfer. Avoid defining the input
    // dependency if already satisfied.
    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
                                     Agent, CopySize, 1, &InputSignalRaw,
                                     OutputSignal->get());
    }
    return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
                                   Agent, CopySize, 0, nullptr,
                                   OutputSignal->get());
  }

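  // A minimal usage sketch of the H2D path (illustrative; it mirrors
  // dataSubmitImpl further below, with `PinnedMgr` standing in for the host
  // device's pinned memory manager):
  //
  //   void *Inter = nullptr;
  //   if (auto Err = PinnedMgr.allocate(Size, &Inter))
  //     return Err;
  //   if (auto Err = Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, Inter,
  //                                                 Size, PinnedMgr))
  //     return Err;
  //   // `Inter` is handed back to PinnedMgr by the slot's post action once
  //   // the copy completes; the caller must not free it manually.
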
  // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead.
  Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
                               hsa_agent_t SrcAgent, uint64_t CopySize) {
    AMDGPUSignalTy *OutputSignal;
    if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // The agents need to have access to the corresponding memory. This is
    // presently only true if the pointers were originally allocated by this
    // runtime or the caller made the appropriate access calls.

    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
                                     SrcAgent, CopySize, 1, &InputSignalRaw,
                                     OutputSignal->get());
    }
    return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
                                   SrcAgent, CopySize, 0, nullptr,
                                   OutputSignal->get());
  }

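  // Note on the dependency arguments used throughout these pushers: passing
  // `1, &InputSignalRaw` asks the HSA copy to wait until the previous slot's
  // signal reaches zero, while `0, nullptr` starts the copy immediately.
  // hsa_utils::asyncMemCopy forwards to the HSA runtime's asynchronous copy,
  // selecting the multi-SDMA-engine variant when UseMultipleSdmaEngines is
  // set.
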
  /// Synchronize with the stream. The current thread waits until all operations
  /// are finalized and it performs the pending post actions (i.e., releasing
  /// intermediate buffers).
  Error synchronize() {
    std::lock_guard<std::mutex> Lock(Mutex);

    // No need to synchronize anything.
    if (size() == 0)
      return Plugin::success();

    // Wait until all previous operations on the stream have completed.
    if (auto Err =
            Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device))
      return Err;

    // Reset the stream and perform all pending post actions.
    return complete();
  }

  /// Query the stream and complete pending post actions if operations finished.
  /// Return whether all the operations completed. This operation does not block
  /// the calling thread.
  Expected<bool> query() {
    std::lock_guard<std::mutex> Lock(Mutex);

    // No need to query anything.
    if (size() == 0)
      return true;

    // The last operation did not complete yet. Return directly.
    if (Slots[last()].Signal->load())
      return false;

    // Reset the stream and perform all pending post actions.
    if (auto Err = complete())
      return std::move(Err);

    return true;
  }

  /// Get the HSA queue attached to this stream.
  const AMDGPUQueueTy *getQueue() const { return Queue; }

  /// Record the state of the stream on an event.
  Error recordEvent(AMDGPUEventTy &Event) const;

  /// Make the stream wait on an event.
  Error waitEvent(const AMDGPUEventTy &Event);

  friend struct AMDGPUStreamManagerTy;
};

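// Usage sketch (mirrors synchronizeImpl and queryAsyncImpl further below):
//
//   if (auto Err = Stream->synchronize())  // Blocks, then runs post actions.
//     return Err;
//
//   auto DoneOrErr = Stream->query();      // Non-blocking completion probe.
//   if (!DoneOrErr)
//     return DoneOrErr.takeError();
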
/// Class representing an event on AMDGPU. The event basically stores some
/// information regarding the state of the recorded stream.
struct AMDGPUEventTy {
  /// Create an empty event.
  AMDGPUEventTy(AMDGPUDeviceTy &Device)
      : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}

  /// Initialize and deinitialize.
  Error init() { return Plugin::success(); }
  Error deinit() { return Plugin::success(); }

  /// Record the state of a stream on the event.
  Error record(AMDGPUStreamTy &Stream) {
    std::lock_guard<std::mutex> Lock(Mutex);

    // Disregard the previously recorded stream, if any.
    RecordedStream = &Stream;

    return Stream.recordEvent(*this);
  }

  /// Make a stream wait on the current event.
  Error wait(AMDGPUStreamTy &Stream) {
    std::lock_guard<std::mutex> Lock(Mutex);

    if (!RecordedStream)
      return Plugin::error("Event does not have any recorded stream");

    // Synchronizing the same stream. Do nothing.
    if (RecordedStream == &Stream)
      return Plugin::success();

    // No need to wait for anything, the recorded stream already finished the
    // corresponding operation.
    if (RecordedSlot < 0)
      return Plugin::success();

    return Stream.waitEvent(*this);
  }

protected:
  /// The stream registered in this event.
  AMDGPUStreamTy *RecordedStream;

  /// The recorded operation on the recorded stream.
  int64_t RecordedSlot;

  /// The sync cycle when the stream was recorded. Used to detect stale events.
  int64_t RecordedSyncCycle;

  /// Mutex to safely access event fields.
  mutable std::mutex Mutex;

  friend struct AMDGPUStreamTy;
};

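// Usage sketch for events (illustrative): record the state of one stream,
// then make another stream wait on it before issuing further work.
//
//   AMDGPUEventTy Event(Device);
//   if (auto Err = Event.record(StreamA))  // Snapshot StreamA's last slot.
//     return Err;
//   if (auto Err = Event.wait(StreamB))    // StreamB waits on that slot.
//     return Err;
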
Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
  std::lock_guard<std::mutex> Lock(Mutex);

  if (size() > 0) {
    // Record the synchronize identifier (to detect stale recordings) and
    // the last valid stream's operation.
    Event.RecordedSyncCycle = SyncCycle;
    Event.RecordedSlot = last();

    assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
    assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
  } else {
    // The stream is empty, everything already completed, record nothing.
    Event.RecordedSyncCycle = -1;
    Event.RecordedSlot = -1;
  }
  return Plugin::success();
}

Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
  // Retrieve the recorded stream on the event.
  AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;

  std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);

  // The recorded stream already completed the operation because the
  // synchronize identifier is already outdated.
  if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
    return Plugin::success();

  // Again, the recorded stream already completed the operation, the last
  // operation's output signal is satisfied.
  if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
    return Plugin::success();

  // Otherwise, make the current stream wait on the other stream's operation.
  return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
}

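// Staleness note: each completed synchronization cycle bumps the stream's
// SyncCycle counter, so a recording taken before the last cycle carries an
// outdated identifier. waitEvent() exploits this: a mismatched SyncCycle
// proves the recorded slot was already recycled and no waiting is required.
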
struct AMDGPUStreamManagerTy final
    : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
  using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
  using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;

  AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
      : GenericDeviceResourceManagerTy(Device), Device(Device),
        OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
        NextQueue(0), Agent(HSAAgent) {}

  Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
    Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
    QueueSize = HSAQueueSize;
    MaxNumQueues = NumHSAQueues;
    // Initialize one queue eagerly.
    if (auto Err = Queues.front().init(Device, Agent, QueueSize))
      return Err;

    return GenericDeviceResourceManagerTy::init(InitialSize);
  }

  /// Deinitialize the resource pool and delete all resources. This function
  /// must be called before the destructor.
  Error deinit() override {
    // De-init all queues.
    for (AMDGPUQueueTy &Queue : Queues) {
      if (auto Err = Queue.deinit())
        return Err;
    }

    return GenericDeviceResourceManagerTy::deinit();
  }

  /// Get a single stream from the pool or create new resources.
  virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
    return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
      return assignNextQueue(Handle);
    });
  }

  /// Return stream to the pool.
  virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
    return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
      Handle->Queue->removeUser();
      return Plugin::success();
    });
  }

private:
  /// Search for and assign a preferably idle queue to the given Stream. If
  /// there is no queue without current users, choose the queue with the lowest
  /// user count. If utilization is ignored: use round robin selection.
  inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
    // Start from zero when tracking utilization, otherwise: round robin policy.
    uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;

    if (OMPX_QueueTracking) {
      // Find the least used queue.
      for (uint32_t I = 0; I < MaxNumQueues; ++I) {
        // Early exit when an initialized queue is idle.
        if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
          Index = I;
          break;
        }

        // Update the least used queue.
        if (Queues[Index].getUserCount() > Queues[I].getUserCount())
          Index = I;
      }
    }

    // Make sure the queue is initialized, then add user & assign.
    if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
      return Err;
    Queues[Index].addUser();
    Stream->Queue = &Queues[Index];

    return Plugin::success();
  }

  /// The device associated with this stream.
  GenericDeviceTy &Device;

  /// Envar for controlling the tracking of busy HSA queues.
  BoolEnvar OMPX_QueueTracking;

  /// The next queue index to use for round robin selection.
  uint32_t NextQueue;

  /// The queues which are assigned to requested streams.
  std::vector<AMDGPUQueueTy> Queues;

  /// The corresponding device as HSA agent.
  hsa_agent_t Agent;

  /// The maximum number of queues.
  uint32_t MaxNumQueues;

  /// The size of created queues.
  uint32_t QueueSize;
};

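// Selection policy illustrated (hypothetical user counts): with queue
// tracking enabled and counts {2, 0, 1, 3}, the scan stops at queue 1 since
// it is initialized and idle; with counts {2, 1, 1, 3} it settles on queue 1
// as the first least-used queue. With tracking disabled,
// NextQueue++ % MaxNumQueues simply cycles 0, 1, 2, 3, 0, ...
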
/// Abstract class that holds the common members of the actual kernel devices
/// and the host device. Both types should inherit from this class.
struct AMDGenericDeviceTy {
  AMDGenericDeviceTy() {}

  virtual ~AMDGenericDeviceTy() {}

  /// Create all memory pools which the device has access to and classify them.
  Error initMemoryPools() {
    // Retrieve all memory pools from the device agent(s).
    Error Err = retrieveAllMemoryPools();
    if (Err)
      return Err;

    for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
      // Initialize the memory pool and retrieve some basic info.
      Error Err = MemoryPool->init();
      if (Err)
        return Err;

      if (!MemoryPool->isGlobal())
        continue;

      // Classify the memory pools depending on their properties.
      if (MemoryPool->isFineGrained()) {
        FineGrainedMemoryPools.push_back(MemoryPool);
        if (MemoryPool->supportsKernelArgs())
          ArgsMemoryPools.push_back(MemoryPool);
      } else if (MemoryPool->isCoarseGrained()) {
        CoarseGrainedMemoryPools.push_back(MemoryPool);
      }
    }
    return Plugin::success();
  }

  /// Destroy all memory pools.
  Error deinitMemoryPools() {
    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
      delete Pool;

    AllMemoryPools.clear();
    FineGrainedMemoryPools.clear();
    CoarseGrainedMemoryPools.clear();
    ArgsMemoryPools.clear();

    return Plugin::success();
  }

  /// Retrieve and construct all memory pools from the device agent(s).
  virtual Error retrieveAllMemoryPools() = 0;

  /// Get the device agent.
  virtual hsa_agent_t getAgent() const = 0;

protected:
  /// Array of all memory pools available to the host agents.
  llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;

  /// Array of fine-grained memory pools available to the host agents.
  llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;

  /// Array of coarse-grained memory pools available to the host agents.
  llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;

  /// Array of kernel args memory pools available to the host agents.
  llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
};

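// Classification recap: every pool is kept in AllMemoryPools, but only global
// pools are categorized further. A global pool lands in FineGrainedMemoryPools
// (and additionally in ArgsMemoryPools when it accepts kernel arguments) or in
// CoarseGrainedMemoryPools. Read-only, private, and group pools only surface
// in obtainInfoImpl's report further below.
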
/// Class representing the host device. This host device may have more than one
/// HSA host agent. We aggregate all its resources into the same instance.
struct AMDHostDeviceTy : public AMDGenericDeviceTy {
  /// Create a host device from an array of host agents.
  AMDHostDeviceTy(AMDGPUPluginTy &Plugin,
                  const llvm::SmallVector<hsa_agent_t> &HostAgents)
      : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin),
        PinnedMemoryManager(Plugin) {
    assert(HostAgents.size() && "No host agent found");
  }

  /// Initialize the host device memory pools and the memory managers for
  /// kernel args and host pinned memory allocations.
  Error init() {
    if (auto Err = initMemoryPools())
      return Err;

    if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
      return Err;

    if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
      return Err;

    return Plugin::success();
  }

  /// Deinitialize memory pools and managers.
  Error deinit() {
    if (auto Err = deinitMemoryPools())
      return Err;

    if (auto Err = ArgsMemoryManager.deinit())
      return Err;

    if (auto Err = PinnedMemoryManager.deinit())
      return Err;

    return Plugin::success();
  }

  /// Retrieve and construct all memory pools from the host agents.
  Error retrieveAllMemoryPools() override {
    // Iterate through the available pools across the host agents.
    for (hsa_agent_t Agent : Agents) {
      Error Err = hsa_utils::iterateAgentMemoryPools(
          Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
            AMDGPUMemoryPoolTy *MemoryPool =
                new AMDGPUMemoryPoolTy(HSAMemoryPool);
            AllMemoryPools.push_back(MemoryPool);
            return HSA_STATUS_SUCCESS;
          });
      if (Err)
        return Err;
    }
    return Plugin::success();
  }

  /// Get one of the host agents. Always return the first agent.
  hsa_agent_t getAgent() const override { return Agents[0]; }

  /// Get a memory pool for fine-grained allocations.
  AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
    assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
    // Retrieve any memory pool.
    return *FineGrainedMemoryPools[0];
  }

  /// Get a memory pool for coarse-grained allocations.
  AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
    assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
    // Retrieve any memory pool.
    return *CoarseGrainedMemoryPools[0];
  }

  /// Get a memory pool for kernel args allocations.
  AMDGPUMemoryPoolTy &getArgsMemoryPool() {
    assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
    // Retrieve any memory pool.
    return *ArgsMemoryPools[0];
  }

  /// Getters for kernel args and host pinned memory managers.
  AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
  AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
    return PinnedMemoryManager;
  }

private:
  /// Array of agents on the host side.
  const llvm::SmallVector<hsa_agent_t> Agents;

  /// Memory manager for kernel arguments.
  AMDGPUMemoryManagerTy ArgsMemoryManager;

  /// Memory manager for pinned memory.
  AMDGPUMemoryManagerTy PinnedMemoryManager;
};

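// Design note: the pinned-memory manager is deliberately backed by a
// fine-grained host pool. The staging buffers it hands out must be coherently
// visible to both the CPU (for the std::memcpy step) and the GPU DMA engines
// (for the async copy step), which fine-grained memory provides.
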
/// Class implementing the AMDGPU device functionalities which derives from the
/// generic device class.
struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
  // Create an AMDGPU device with a device id and default AMDGPU grid values.
  AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
                 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
      : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(),
        OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
        OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
        OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
        OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
                               1 * 1024 * 1024), // 1MB
        OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
                               64),
        OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
        OMPX_UseMultipleSdmaEngines(
            "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
        OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
        AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
        HostDevice(HostDevice) {}

  ~AMDGPUDeviceTy() {}

  /// Initialize the device, its resources and get its properties.
  Error initImpl(GenericPluginTy &Plugin) override {
    // First setup all the memory pools.
    if (auto Err = initMemoryPools())
      return Err;

    char GPUName[64];
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
      return Err;
    ComputeUnitKind = GPUName;

    // Get the wavefront size.
    uint32_t WavefrontSize = 0;
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
      return Err;
    GridValues.GV_Warp_Size = WavefrontSize;

    // Get the frequency of the steady clock. If the attribute is missing,
    // assume running on an older libhsa and default to 0; omp_get_wtime
    // will be inaccurate but otherwise programs can still run.
    if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
                         ClockFrequency) != HSA_STATUS_SUCCESS)
      ClockFrequency = 0;

    // Load the grid values depending on the wavefront.
    if (WavefrontSize == 32)
      GridValues = getAMDGPUGridValues<32>();
    else if (WavefrontSize == 64)
      GridValues = getAMDGPUGridValues<64>();
    else
      return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);

    // Get maximum number of workitems per workgroup.
    uint16_t WorkgroupMaxDim[3];
    if (auto Err =
            getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
      return Err;
    GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];

    // Get maximum number of workgroups.
    hsa_dim3_t GridMaxDim;
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
      return Err;

    GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
    if (GridValues.GV_Max_Teams == 0)
      return Plugin::error("Maximum number of teams cannot be zero");

    // Compute the default number of teams.
    uint32_t ComputeUnits = 0;
    if (auto Err =
            getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
      return Err;
    GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;

    uint32_t WavesPerCU = 0;
    if (auto Err =
            getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
      return Err;
    HardwareParallelism = ComputeUnits * WavesPerCU;

    // Get maximum size of any device queues and maximum number of queues.
    uint32_t MaxQueueSize;
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
      return Err;

    uint32_t MaxQueues;
    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
      return Err;

    // Compute the number of queues and their size.
    OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
    OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);

    // Initialize stream pool.
    if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
                                            OMPX_NumQueues, OMPX_QueueSize))
      return Err;

    // Initialize event pool.
    if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
      return Err;

    // Initialize signal pool.
    if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
      return Err;

    // Detect if XNACK is enabled.
    SmallVector<SmallString<32>> Targets;
    if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets))
      return Err;
    if (!Targets.empty() && Targets[0].str().contains("xnack+"))
      IsXnackEnabled = true;

    // Detect if the device is an APU.
    if (auto Err = checkIfAPU())
      return Err;

    return Plugin::success();
  }

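  // Worked example (hypothetical device): with 304 compute units, the default
  // LIBOMPTARGET_AMDGPU_TEAMS_PER_CU=4, and 32 waves per CU, the code above
  // yields GV_Default_Num_Teams = 304 * 4 = 1216 and HardwareParallelism =
  // 304 * 32 = 9728 concurrently resident waves.
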
  /// Deinitialize the device and release its resources.
  Error deinitImpl() override {
    // Deinitialize the stream and event pools.
    if (auto Err = AMDGPUStreamManager.deinit())
      return Err;

    if (auto Err = AMDGPUEventManager.deinit())
      return Err;

    if (auto Err = AMDGPUSignalManager.deinit())
      return Err;

    // Close modules if necessary.
    if (!LoadedImages.empty()) {
      // Each image has its own module.
      for (DeviceImageTy *Image : LoadedImages) {
        AMDGPUDeviceImageTy &AMDImage =
            static_cast<AMDGPUDeviceImageTy &>(*Image);

        // Unload the executable of the image.
        if (auto Err = AMDImage.unloadExecutable())
          return Err;
      }
    }

    // Invalidate agent reference.
    Agent = {0};

    return Plugin::success();
  }

  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) override {
    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
    if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
      Image.setPendingGlobalDtors();

    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
  }

  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
                                      DeviceImageTy &Image) override {
    if (Image.hasPendingGlobalDtors())
      return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
    return Plugin::success();
  }

  uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }

  Expected<std::unique_ptr<MemoryBuffer>>
  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
    // TODO: We should try to avoid materialization but there seems to be no
    // good linker interface w/o file i/o.
    SmallString<128> LinkerInputFilePath;
    std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit",
                                                      "o", LinkerInputFilePath);
    if (EC)
      return Plugin::error("Failed to create temporary file for linker");

    // Write the file's contents to the output file.
    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
        FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size());
    if (!OutputOrErr)
      return OutputOrErr.takeError();
    std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
    llvm::copy(MB->getBuffer(), Output->getBufferStart());
    if (Error E = Output->commit())
      return std::move(E);

    SmallString<128> LinkerOutputFilePath;
    EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so",
                                      LinkerOutputFilePath);
    if (EC)
      return Plugin::error("Failed to create temporary file for linker");

    const auto &ErrorOrPath = sys::findProgramByName("lld");
    if (!ErrorOrPath)
      return createStringError(inconvertibleErrorCode(),
                               "Failed to find `lld` on the PATH.");

    std::string LLDPath = ErrorOrPath.get();
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
         "Using `%s` to link JITed amdgcn output.", LLDPath.c_str());

    std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
    StringRef Args[] = {LLDPath,
                        "-flavor",
                        "gnu",
                        "--no-undefined",
                        "-shared",
                        MCPU,
                        "-o",
                        LinkerOutputFilePath.data(),
                        LinkerInputFilePath.data()};

    std::string Error;
    int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
    if (RC)
      return Plugin::error("Linking optimized bitcode failed: %s",
                           Error.c_str());

    auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath);
    if (!BufferOrErr)
      return Plugin::error("Failed to open temporary file for lld");

    // Clean up the temporary files afterwards.
    if (sys::fs::remove(LinkerOutputFilePath))
      return Plugin::error("Failed to remove temporary output file for lld");
    if (sys::fs::remove(LinkerInputFilePath))
      return Plugin::error("Failed to remove temporary input file for lld");

    return std::move(*BufferOrErr);
  }

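  // The ExecuteAndWait above is roughly equivalent to the following command
  // line (paths and the gfx90a target are illustrative):
  //
  //   lld -flavor gnu --no-undefined -shared -plugin-opt=mcpu=gfx90a \
  //       -o /tmp/amdgpu-pre-link-jit-XXXXXX.so \
  //       /tmp/amdgpu-pre-link-jit-XXXXXX.o
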
  /// See GenericDeviceTy::getComputeUnitKind().
  std::string getComputeUnitKind() const override { return ComputeUnitKind; }

  /// Returns the clock frequency for the given AMDGPU device.
  uint64_t getClockFrequency() const override { return ClockFrequency; }

  /// Allocate and construct an AMDGPU kernel.
  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
    // Allocate and construct the AMDGPU kernel.
    AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
    if (!AMDGPUKernel)
      return Plugin::error("Failed to allocate memory for AMDGPU kernel");

    new (AMDGPUKernel) AMDGPUKernelTy(Name);

    return *AMDGPUKernel;
  }

  /// Set the current context to this device's context. Do nothing since the
  /// AMDGPU devices do not have the concept of contexts.
  Error setContext() override { return Plugin::success(); }

  /// AMDGPU returns the product of the number of compute units and the waves
  /// per compute unit.
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override { return true; }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }

  /// Get the stream of the asynchronous info structure or get a new one.
  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                  AMDGPUStreamTy *&Stream) {
    // Get the stream (if any) from the async info.
    Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
    if (!Stream) {
      // There was no stream; get an idle one.
      if (auto Err = AMDGPUStreamManager.getResource(Stream))
        return Err;

      // Modify the async info's stream.
      AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
    }
    return Plugin::success();
  }

  /// Load the binary image into the device and allocate an image object.
  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                           int32_t ImageId) override {
    // Allocate and initialize the image object.
    AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
    new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);

    // Load the HSA executable.
    if (Error Err = AMDImage->loadExecutable(*this))
      return std::move(Err);

    return AMDImage;
  }

  /// Allocate memory on the device or related to the device.
  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;

  /// Deallocate memory on the device or related to the device.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (TgtPtr == nullptr)
      return OFFLOAD_SUCCESS;

    AMDGPUMemoryPoolTy *MemoryPool = nullptr;
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
      MemoryPool = CoarseGrainedMemoryPools[0];
      break;
    case TARGET_ALLOC_HOST:
      MemoryPool = &HostDevice.getFineGrainedMemoryPool();
      break;
    case TARGET_ALLOC_SHARED:
      MemoryPool = &HostDevice.getFineGrainedMemoryPool();
      break;
    }

    if (!MemoryPool) {
      REPORT("No memory pool for the specified allocation kind\n");
      return OFFLOAD_FAIL;
    }

    if (Error Err = MemoryPool->deallocate(TgtPtr)) {
      REPORT("%s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

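  // Mapping recap: device-side kinds (DEFAULT, DEVICE, DEVICE_NON_BLOCKING)
  // are freed into the device's coarse-grained pool, while TARGET_ALLOC_HOST
  // and TARGET_ALLOC_SHARED are freed into the host device's fine-grained
  // pool. This presumably mirrors the pool selection performed by allocate(),
  // defined out of line.
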
  /// Synchronize current thread with the pending operations on the async info.
  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
    AMDGPUStreamTy *Stream =
        reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
    assert(Stream && "Invalid stream");

    if (auto Err = Stream->synchronize())
      return Err;

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    AsyncInfo.Queue = nullptr;
    return AMDGPUStreamManager.returnResource(Stream);
  }

  /// Query for the completion of the pending operations on the async info.
  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
    AMDGPUStreamTy *Stream =
        reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
    assert(Stream && "Invalid stream");

    auto CompletedOrErr = Stream->query();
    if (!CompletedOrErr)
      return CompletedOrErr.takeError();

    // Return if the stream did not complete yet.
    if (!(*CompletedOrErr))
      return Plugin::success();

    // Once the stream is completed, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    AsyncInfo.Queue = nullptr;
    return AMDGPUStreamManager.returnResource(Stream);
  }

  /// Pin the host buffer and return the device pointer that should be used for
  /// device transfers.
  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
    void *PinnedPtr = nullptr;

    hsa_status_t Status =
        hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
    if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
      return std::move(Err);

    return PinnedPtr;
  }

  /// Unpin the host buffer.
  Error dataUnlockImpl(void *HstPtr) override {
    hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
    return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
  }

  /// Check through the HSA runtime whether the \p HstPtr buffer is pinned.
  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                 void *&BaseDevAccessiblePtr,
                                 size_t &BaseSize) const override {
    hsa_amd_pointer_info_t Info;
    Info.size = sizeof(hsa_amd_pointer_info_t);

    hsa_status_t Status = hsa_amd_pointer_info(
        HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr,
        /*accessible=*/nullptr);
    if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s"))
      return std::move(Err);

    // The buffer may be locked or allocated through HSA allocators. Assume
    // that the buffer is host pinned if the runtime reports an HSA type.
    if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
        Info.type != HSA_EXT_POINTER_TYPE_HSA)
      return false;

    assert(Info.hostBaseAddress && "Invalid host pinned address");
    assert(Info.agentBaseAddress && "Invalid agent pinned address");
    assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");

    // Save the allocation info in the output parameters.
    BaseHstPtr = Info.hostBaseAddress;
    BaseDevAccessiblePtr = Info.agentBaseAddress;
    BaseSize = Info.sizeInBytes;

    return true;
  }

  /// Submit data to the device (host to device transfer).
  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUStreamTy *Stream = nullptr;
    void *PinnedPtr = nullptr;

    // Use one-step asynchronous operation when host memory is already pinned.
    if (void *PinnedPtr =
            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
      if (auto Err = getStream(AsyncInfoWrapper, Stream))
        return Err;
      return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
    }

    // For large transfers use synchronous behavior.
    if (Size >= OMPX_MaxAsyncCopyBytes) {
      if (AsyncInfoWrapper.hasQueue())
        if (auto Err = synchronize(AsyncInfoWrapper))
          return Err;

      hsa_status_t Status;
      Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
                                   &PinnedPtr);
      if (auto Err =
              Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
        return Err;

      AMDGPUSignalTy Signal;
      if (auto Err = Signal.init())
        return Err;

      if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
                                             Agent, PinnedPtr, Agent, Size, 0,
                                             nullptr, Signal.get()))
        return Err;

      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
        return Err;

      if (auto Err = Signal.deinit())
        return Err;

      Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
      return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
    }

    // Otherwise, use two-step copy with an intermediate pinned host buffer.
    AMDGPUMemoryManagerTy &PinnedMemoryManager =
        HostDevice.getPinnedMemoryManager();
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
      return Err;

    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
                                          PinnedMemoryManager);
  }

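  // Path selection recap for dataSubmitImpl: (1) an already-pinned source
  // buffer takes a single one-step async copy; (2) transfers of at least
  // LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES (1 MB by default) are done
  // synchronously through a temporarily locked buffer; (3) everything else
  // uses the two-step async copy via a pooled pinned staging buffer.
  // dataRetrieveImpl below mirrors the same three paths in the
  // device-to-host direction.
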
  /// Retrieve data from the device (device to host transfer).
  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUStreamTy *Stream = nullptr;
    void *PinnedPtr = nullptr;

    // Use one-step asynchronous operation when host memory is already pinned.
    if (void *PinnedPtr =
            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
      if (auto Err = getStream(AsyncInfoWrapper, Stream))
        return Err;

      return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
    }

    // For large transfers use synchronous behavior.
    if (Size >= OMPX_MaxAsyncCopyBytes) {
      if (AsyncInfoWrapper.hasQueue())
        if (auto Err = synchronize(AsyncInfoWrapper))
          return Err;

      hsa_status_t Status;
      Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
                                   &PinnedPtr);
      if (auto Err =
              Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
        return Err;

      AMDGPUSignalTy Signal;
      if (auto Err = Signal.init())
        return Err;

      if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(),
                                             PinnedPtr, Agent, TgtPtr, Agent,
                                             Size, 0, nullptr, Signal.get()))
        return Err;

      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
        return Err;

      if (auto Err = Signal.deinit())
        return Err;

      Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
      return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
    }

    // Otherwise, use two-step copy with an intermediate pinned host buffer.
    AMDGPUMemoryManagerTy &PinnedMemoryManager =
        HostDevice.getPinnedMemoryManager();
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
      return Err;

    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
                                          PinnedMemoryManager);
  }

  /// Exchange data between two devices within the plugin.
  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                         void *DstPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);

    // For large transfers use synchronous behavior.
    if (Size >= OMPX_MaxAsyncCopyBytes) {
      if (AsyncInfoWrapper.hasQueue())
        if (auto Err = synchronize(AsyncInfoWrapper))
          return Err;

      AMDGPUSignalTy Signal;
      if (auto Err = Signal.init())
        return Err;

      if (auto Err = hsa_utils::asyncMemCopy(
              useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
              getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
        return Err;

      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
        return Err;

      return Signal.deinit();
    }

    AMDGPUStreamTy *Stream = nullptr;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;
    if (Size <= 0)
      return Plugin::success();

    return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
                                          getAgent(), (uint64_t)Size);
  }

  /// Initialize the async info for interoperability purposes.
  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    // TODO: Implement this function.
    return Plugin::success();
  }

  /// Initialize the device info for interoperability purposes.
  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
    DeviceInfo->Context = nullptr;

    if (!DeviceInfo->Device)
      DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);

    return Plugin::success();
  }

  /// Create an event.
  Error createEventImpl(void **EventPtrStorage) override {
    AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
    return AMDGPUEventManager.getResource(*Event);
  }

  /// Destroy a previously created event.
  Error destroyEventImpl(void *EventPtr) override {
    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
    return AMDGPUEventManager.returnResource(Event);
  }

  /// Record the event.
  Error recordEventImpl(void *EventPtr,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
    assert(Event && "Invalid event");

    AMDGPUStreamTy *Stream = nullptr;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Event->record(*Stream);
  }

  /// Make the stream wait on the event.
  Error waitEventImpl(void *EventPtr,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);

    AMDGPUStreamTy *Stream = nullptr;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Event->wait(*Stream);
  }

  /// Synchronize the current thread with the event.
  Error syncEventImpl(void *EventPtr) override {
    return Plugin::error("Synchronize event not implemented");
  }

  /// Print information about the device.
  Error obtainInfoImpl(InfoQueueTy &Info) override {
    char TmpChar[1000];
    const char *TmpCharPtr = "Unknown";
    uint16_t Major, Minor;
    uint32_t TmpUInt, TmpUInt2;
    uint32_t CacheSize[4];
    size_t TmpSt;
    bool TmpBool;
    uint16_t WorkgrpMaxDim[3];
    hsa_dim3_t GridMaxDim;
    hsa_status_t Status, Status2;

    Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
    Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
    if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
      Info.add("HSA Runtime Version",
               std::to_string(Major) + "." + std::to_string(Minor));

    Info.add("HSA OpenMP Device Number", DeviceId);

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Product Name", TmpChar);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Device Name", TmpChar);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Vendor Name", TmpChar);

    hsa_device_type_t DevType;
    Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
    if (Status == HSA_STATUS_SUCCESS) {
      switch (DevType) {
      case HSA_DEVICE_TYPE_CPU:
        TmpCharPtr = "CPU";
        break;
      case HSA_DEVICE_TYPE_GPU:
        TmpCharPtr = "GPU";
        break;
      case HSA_DEVICE_TYPE_DSP:
        TmpCharPtr = "DSP";
        break;
      }
      Info.add("Device Type", TmpCharPtr);
    }

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Max Queues", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Queue Min Size", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Queue Max Size", TmpUInt);

    // FIXME: This is deprecated according to HSA documentation. But using
    // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
    // runtime.
    Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
    if (Status == HSA_STATUS_SUCCESS) {
      Info.add("Cache");

      for (int I = 0; I < 4; I++)
        if (CacheSize[I])
          Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
    }

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Cacheline Size", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Max Clock Freq", TmpUInt, "MHz");

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Compute Units", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("SIMD per CU", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Fast F16 Operation", TmpBool);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Wavefront Size", TmpUInt2);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Workgroup Max Size", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
    if (Status == HSA_STATUS_SUCCESS) {
      Info.add("Workgroup Max Size per Dimension");
      Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
      Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
      Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
    }

    Status = getDeviceAttrRaw(
        (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS) {
      Info.add("Max Waves Per CU", TmpUInt);
      Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
    }

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Grid Max Size", TmpUInt);

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
    if (Status == HSA_STATUS_SUCCESS) {
      Info.add("Grid Max Size per Dimension");
      Info.add<InfoLevel2>("x", GridMaxDim.x);
      Info.add<InfoLevel2>("y", GridMaxDim.y);
      Info.add<InfoLevel2>("z", GridMaxDim.z);
    }

    Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
    if (Status == HSA_STATUS_SUCCESS)
      Info.add("Max fbarriers/Workgrp", TmpUInt);

    Info.add("Memory Pools");
    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
      std::string TmpStr, TmpStr2;

      if (Pool->isGlobal())
        TmpStr = "Global";
      else if (Pool->isReadOnly())
        TmpStr = "ReadOnly";
      else if (Pool->isPrivate())
        TmpStr = "Private";
      else if (Pool->isGroup())
        TmpStr = "Group";
      else
        TmpStr = "Unknown";

      Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);

      if (Pool->isGlobal()) {
        if (Pool->isFineGrained())
          TmpStr2 += "Fine Grained ";
        if (Pool->isCoarseGrained())
          TmpStr2 += "Coarse Grained ";
        if (Pool->supportsKernelArgs())
          TmpStr2 += "Kernarg ";

        Info.add<InfoLevel3>("Flags", TmpStr2);
      }

      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Size", TmpSt, "bytes");

      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
                                TmpBool);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Allocatable", TmpBool);

      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
                                TmpSt);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");

      Status = Pool->getAttrRaw(
          HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");

      Status =
          Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Accessible by all", TmpBool);
    }

    Info.add("ISAs");
    auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
      Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
      if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel2>("Name", TmpChar);

      return Status;
    });

    // Silently consume the error.
    if (Err)
      consumeError(std::move(Err));

    return Plugin::success();
  }

  /// Returns true if auto zero-copy is the best configuration for the current
  /// arch.
  /// On AMDGPUs, automatic zero-copy is turned on
  /// when running on an APU with XNACK (unified memory) support
  /// enabled. On discrete GPUs, automatic zero-copy is triggered
  /// if the user sets the environment variable OMPX_APU_MAPS=1
  /// and if XNACK is enabled. The rationale is that zero-copy
  /// is the best configuration (performance, memory footprint) on APUs,
  /// while it is often not the best on discrete GPUs.
  /// XNACK can be enabled with a kernel boot parameter or with
  /// the HSA_XNACK environment variable.
  bool useAutoZeroCopyImpl() override {
    return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
  }

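  // Decision recap: zero-copy is auto-enabled iff XNACK is on AND (the device
  // is an APU OR the user forced it via OMPX_APU_MAPS=1). A discrete GPU with
  // XNACK disabled therefore never auto-enables zero-copy.
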
  /// Getters and setters for stack and heap sizes.
  Error getDeviceStackSize(uint64_t &Value) override {
    Value = StackSize;
    return Plugin::success();
  }
  Error setDeviceStackSize(uint64_t Value) override {
    StackSize = Value;
    return Plugin::success();
  }
  Error getDeviceHeapSize(uint64_t &Value) override {
    Value = DeviceMemoryPoolSize;
    return Plugin::success();
  }
  Error setDeviceHeapSize(uint64_t Value) override {
    for (DeviceImageTy *Image : LoadedImages)
      if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
        return Err;
    DeviceMemoryPoolSize = Value;
    return Plugin::success();
  }
  Error getDeviceMemorySize(uint64_t &Value) override {
    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
      if (Pool->isGlobal()) {
        hsa_status_t Status =
            Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
        return Plugin::check(Status, "Error in getting device memory size: %s");
      }
    }
    return Plugin::error("getDeviceMemorySize:: no global pool");
  }

  /// AMDGPU-specific function to get device attributes.
  template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
    hsa_status_t Status =
        hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
    return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
  }

  /// Raw variant of getDeviceAttr() that returns the HSA status instead of
  /// wrapping it in an error.
  template <typename Ty>
  hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
    return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
  }

  /// Get the device agent.
  hsa_agent_t getAgent() const override { return Agent; }

  /// Get the signal manager.
  AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }

  /// Retrieve and construct all memory pools of the device agent.
  Error retrieveAllMemoryPools() override {
    // Iterate through the available pools of the device agent.
    return hsa_utils::iterateAgentMemoryPools(
        Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
          AMDGPUMemoryPoolTy *MemoryPool =
              Plugin.allocate<AMDGPUMemoryPoolTy>();
          new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
          AllMemoryPools.push_back(MemoryPool);
          return HSA_STATUS_SUCCESS;
        });
  }

  /// Whether to use the ROCm 5.7+ interface for multiple SDMA engines.
  bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }

private:
  using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
  using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;

  /// Common method to invoke a single threaded constructor or destructor
  /// kernel by name.
  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
                                 bool IsCtor) {
    const char *KernelName =
        IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
    // Perform a quick check for the named kernel in the image. The kernel
    // should be created by the 'amdgpu-lower-ctor-dtor' pass.
    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
    if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
      return Plugin::success();

    // Allocate and construct the AMDGPU kernel.
    AMDGPUKernelTy AMDGPUKernel(KernelName);
    if (auto Err = AMDGPUKernel.init(*this, Image))
      return Err;

    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);

    KernelArgsTy KernelArgs = {};
    uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
    if (auto Err = AMDGPUKernel.launchImpl(
            *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
            KernelLaunchParamsTy{}, AsyncInfoWrapper))
      return Err;

    Error Err = Plugin::success();
    AsyncInfoWrapper.finalize(Err);

    return Err;
  }

  /// Detect if the current architecture is an APU.
  Error checkIfAPU() {
    // TODO: replace with ROCr API once it becomes available.
    llvm::StringRef StrGfxName(ComputeUnitKind);
    bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
                        .Case("gfx942", true)
                        .Default(false);
    if (!MayBeAPU)
      return Plugin::success();

    // Can be an MI300A or an MI300X.
    uint32_t ChipID = 0;
    if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
      return Err;

    if (!(ChipID & 0x1)) {
      IsAPU = true;
      return Plugin::success();
    }
    return Plugin::success();
  }

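  // The parity test above encodes the gfx942 variant split: an even chip ID
  // is treated as the MI300A APU, an odd one as the discrete MI300X.
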
  /// Envar for controlling the number of HSA queues per device. A high number
  /// of queues may degrade performance.
  UInt32Envar OMPX_NumQueues;

  /// Envar for controlling the size of each HSA queue. The size is the number
  /// of HSA packets a queue is expected to hold. It is also the number of HSA
  /// packets that can be pushed into each queue without waiting for the driver
  /// to process them.
  UInt32Envar OMPX_QueueSize;

  /// Envar for controlling the default number of teams relative to the number
  /// of compute units (CUs) the device has:
  ///   #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
  UInt32Envar OMPX_DefaultTeamsPerCU;

  /// Envar specifying the maximum size in bytes where the memory copies are
  /// asynchronous operations. Up to this transfer size, the memory copies are
  /// asynchronous operations pushed to the corresponding stream. For larger
  /// transfers, they are synchronous transfers.
  UInt32Envar OMPX_MaxAsyncCopyBytes;

  /// Envar controlling the initial number of HSA signals per device. There is
  /// one manager of signals per device managing several pre-allocated signals.
  /// These signals are mainly used by AMDGPU streams. If needed, more signals
  /// will be created.
  UInt32Envar OMPX_InitialNumSignals;

  /// Environment variable to set the time to wait in active state before
  /// switching to blocked state. The default 2000000 busy-waits for 2 seconds
  /// before going into a blocking HSA wait state. The unit for this variable
  /// is microseconds.
  UInt32Envar OMPX_StreamBusyWait;

  /// Use the ROCm 5.7 interface for multiple SDMA engines.
  BoolEnvar OMPX_UseMultipleSdmaEngines;

  /// Value of the OMPX_APU_MAPS env var used to force
  /// automatic zero-copy behavior on non-APU GPUs.
  BoolEnvar OMPX_ApuMaps;

  /// Stream manager for AMDGPU streams.
  AMDGPUStreamManagerTy AMDGPUStreamManager;

  /// Event manager for AMDGPU events.
  AMDGPUEventManagerTy AMDGPUEventManager;

  /// Signal manager for AMDGPU signals.
  AMDGPUSignalManagerTy AMDGPUSignalManager;

  /// The agent handle corresponding to the device.
  hsa_agent_t Agent;

  /// The GPU architecture.
  std::string ComputeUnitKind;

  /// The frequency of the steady clock inside the device.
  uint64_t ClockFrequency;

  /// The total number of concurrent work items that can be running on the GPU.
  uint64_t HardwareParallelism;

  /// Reference to the host device.
  AMDHostDeviceTy &HostDevice;

  /// The current size of the global device memory pool (managed by us).
  uint64_t DeviceMemoryPoolSize = 1L << 29L; /* 512 MB */

  /// The current size of the stack that will be used in cases where it could
  /// not be statically determined.
  uint64_t StackSize = 16 * 1024 /* 16 KB */;

  /// Is the plugin associated with an APU?
  bool IsAPU = false;

  /// True if the system is configured with XNACK enabled, false otherwise.
  bool IsXnackEnabled = false;
};

Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
  hsa_code_object_reader_t Reader;
  hsa_status_t Status =
      hsa_code_object_reader_create_from_memory(getStart(), getSize(), &Reader);
  if (auto Err = Plugin::check(
          Status, "Error in hsa_code_object_reader_create_from_memory: %s"))
    return Err;

  Status = hsa_executable_create_alt(
      HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
  if (auto Err =
          Plugin::check(Status, "Error in hsa_executable_create_alt: %s"))
    return Err;

  hsa_loaded_code_object_t Object;
  Status = hsa_executable_load_agent_code_object(Executable, Device.getAgent(),
                                                 Reader, "", &Object);
  if (auto Err = Plugin::check(
          Status, "Error in hsa_executable_load_agent_code_object: %s"))
    return Err;

  Status = hsa_executable_freeze(Executable, "");
  if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s"))
    return Err;

  uint32_t Result;
  Status = hsa_executable_validate(Executable, &Result);
  if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s"))
    return Err;

  if (Result)
    return Plugin::error("Loaded HSA executable does not validate");

  Status = hsa_code_object_reader_destroy(Reader);
  if (auto Err =
          Plugin::check(Status, "Error in hsa_code_object_reader_destroy: %s"))
    return Err;

  if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage(
          getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
    return Err;

  return Plugin::success();
}

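// Loading lifecycle recap: create a code-object reader over the in-memory
// image, create an executable, load the agent code object, freeze and
// validate the executable, destroy the reader, and finally parse the AMDGPU
// metadata (per-kernel info and the ELF/code-object ABI version) out of the
// image buffer.
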
Expected<hsa_executable_symbol_t>
AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
                                      StringRef SymbolName) const {
  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
  hsa_agent_t Agent = AMDGPUDevice.getAgent();

  hsa_executable_symbol_t Symbol;
  hsa_status_t Status = hsa_executable_get_symbol_by_name(
      Executable, SymbolName.data(), &Agent, &Symbol);
  if (auto Err = Plugin::check(
          Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
          SymbolName.data()))
    return std::move(Err);

  return Symbol;
}

template <typename ResourceTy>
Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
  if (Resource)
    return Plugin::error("Creating an existing resource");

  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);

  Resource = new ResourceTy(AMDGPUDevice);

  return Resource->init();
}

AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
    : Agent(Device.getAgent()), Queue(nullptr),
      SignalManager(Device.getSignalManager()), Device(Device),
      // Initialize the std::deque with some empty positions.
      Slots(32), NextSlot(0), SyncCycle(0),
      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
      UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}

/// Class implementing the AMDGPU-specific functionalities of the global
/// handler.
struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
  /// Get the metadata of a global from the device. The name and size of the
  /// global are read from DeviceGlobal and the address of the global is
  /// written to DeviceGlobal.
  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                    DeviceImageTy &Image,
                                    GlobalTy &DeviceGlobal) override {
    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);

    // Find the symbol on the device executable.
    auto SymbolOrErr =
        AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
    if (!SymbolOrErr)
      return SymbolOrErr.takeError();

    hsa_executable_symbol_t Symbol = *SymbolOrErr;
    hsa_symbol_kind_t SymbolType;
    hsa_status_t Status;
    uint64_t SymbolAddr;
    uint32_t SymbolSize;

    // Retrieve the type, address and size of the symbol.
    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};

    for (auto &Info : RequiredInfos) {
      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
      if (auto Err = Plugin::check(
              Status, "Error in hsa_executable_symbol_get_info: %s"))
        return Err;
    }

    // Check the size of the symbol.
    if (SymbolSize != DeviceGlobal.getSize())
      return Plugin::error(
          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
          DeviceGlobal.getName().data(), (size_t)SymbolSize,
          (size_t)DeviceGlobal.getSize());

    // Store the symbol address on the device global metadata.
    DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));

    return Plugin::success();
  }
};

/// Class implementing the AMDGPU-specific functionalities of the plugin.
struct AMDGPUPluginTy final : public GenericPluginTy {
  /// Create an AMDGPU plugin and initialize the AMDGPU driver.
  AMDGPUPluginTy()
      : GenericPluginTy(getTripleArch()), Initialized(false),
        HostDevice(nullptr) {}

  /// This class should not be copied.
  AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
  AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  Expected<int32_t> initImpl() override {
    hsa_status_t Status = hsa_init();
    if (Status != HSA_STATUS_SUCCESS) {
      // We cannot call hsa_status_string if the runtime failed to initialize.
      DP("Failed to initialize AMDGPU's HSA library\n");
      return 0;
    }

    // The initialization of HSA was successful. It should be safe to call
    // HSA functions from now on, e.g., hsa_shut_down.
    Initialized = true;

    // Register event handler to detect memory errors on the devices.
    Status = hsa_amd_register_system_event_handler(eventHandler, this);
    if (auto Err = Plugin::check(
            Status, "Error in hsa_amd_register_system_event_handler: %s"))
      return std::move(Err);

    // List of host (CPU) agents.
    llvm::SmallVector<hsa_agent_t> HostAgents;

    // Iterate over all available agents and classify them.
    auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
      // Get the device type of the agent.
      hsa_device_type_t DeviceType;
      hsa_status_t Status =
          hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
      if (Status != HSA_STATUS_SUCCESS)
        return Status;

      // Classify the agents into kernel (GPU) and host (CPU) agents.
      if (DeviceType == HSA_DEVICE_TYPE_GPU) {
        // Ensure that the GPU agent supports kernel dispatch packets.
        hsa_agent_feature_t Features;
        Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
        if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          KernelAgents.push_back(Agent);
      } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
        HostAgents.push_back(Agent);
      }
      return HSA_STATUS_SUCCESS;
    });

    if (Err)
      return std::move(Err);

    int32_t NumDevices = KernelAgents.size();
    if (NumDevices == 0) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting AMDGPU.\n");
      return 0;
    }

    // There are kernel agents but there is no host agent. That should be
    // treated as an error.
    if (HostAgents.empty())
      return Plugin::error("No AMDGPU host agents");

    // Initialize the host device using the host agents.
    HostDevice = allocate<AMDHostDeviceTy>();
    new (HostDevice) AMDHostDeviceTy(*this, HostAgents);

    // Set up the memory pools available to the host.
    if (auto Err = HostDevice->init())
      return std::move(Err);

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override {
    // The HSA runtime was not initialized, so nothing from the plugin was
    // actually initialized.
    if (!Initialized)
      return Plugin::success();

    if (HostDevice)
      if (auto Err = HostDevice->deinit())
        return Err;

    // Finalize the HSA runtime.
    hsa_status_t Status = hsa_shut_down();
    return Plugin::check(Status, "Error in hsa_shut_down: %s");
  }

  /// Creates an AMDGPU device.
  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
                                int32_t NumDevices) override {
    return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
                              getKernelAgent(DeviceId));
  }

  /// Creates an AMDGPU global handler.
  GenericGlobalHandlerTy *createGlobalHandler() override {
    return new AMDGPUGlobalHandlerTy();
  }

  Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }

  const char *getName() const override { return GETNAME(TARGET_NAME); }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }

  /// Check whether the image is compatible with an AMDGPU device.
  Expected<bool> isELFCompatible(uint32_t DeviceId,
                                 StringRef Image) const override {
    // Get the associated architecture and flags from the ELF.
    auto ElfOrErr = ELF64LEObjectFile::create(
        MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false);
    if (!ElfOrErr)
      return ElfOrErr.takeError();
    std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
    if (!Processor)
      return false;

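    // The image is compatible if its processor and platform flags match one
    // of the target IDs reported for the device's kernel agent.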
    SmallVector<SmallString<32>> Targets;
    if (auto Err = hsa_utils::getTargetTripleAndFeatures(
            getKernelAgent(DeviceId), Targets))
      return Err;
    for (auto &Target : Targets)
      if (offloading::amdgpu::isImageCompatibleWithEnv(
              Processor ? *Processor : "", ElfOrErr->getPlatformFlags(),
              Target.str()))
        return true;
    return false;
  }

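  /// Indicate that any pair of AMDGPU devices may exchange data directly.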
  bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
    return true;
  }

  /// Get the host device instance.
  AMDHostDeviceTy &getHostDevice() {
    assert(HostDevice && "Host device not initialized");
    return *HostDevice;
  }

  /// Get the kernel agent with the corresponding agent id.
  hsa_agent_t getKernelAgent(int32_t AgentId) const {
    assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
    return KernelAgents[AgentId];
  }

  /// Get the list of the available kernel agents.
  const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
    return KernelAgents;
  }

private:
  /// Event handler that will be called by ROCr if an event is detected.
  static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
                                   void *PluginPtr) {
    if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
      return HSA_STATUS_SUCCESS;

    SmallVector<std::string> Reasons;
    uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
      Reasons.emplace_back("Page not present or supervisor privilege");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
      Reasons.emplace_back("Write access to a read-only page");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
      Reasons.emplace_back("Execute access to a page marked NX");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
      Reasons.emplace_back("GPU attempted access to a host only page");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
      Reasons.emplace_back("DRAM ECC failure");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
      Reasons.emplace_back("Can't determine the exact fault address");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
      Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
      Reasons.emplace_back("GPU reset following unspecified hang");

    // If we do not know the reason, report the raw mask value.
    if (Reasons.empty())
      Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");

    uint32_t Node = -1;
    hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);

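    // Find the device whose agent node matches the faulting agent, then
    // report its recent kernel launch traces and the faulting address.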
    AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
    for (uint32_t I = 0, E = Plugin.getNumDevices();
         Node != uint32_t(-1) && I < E; ++I) {
      AMDGPUDeviceTy &AMDGPUDevice =
          reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
      auto KernelTraceInfoRecord =
          AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();

      uint32_t DeviceNode = -1;
      if (auto Err =
              AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
        consumeError(std::move(Err));
        continue;
      }
      if (DeviceNode != Node)
        continue;
      void *DevicePtr = (void *)Event->memory_fault.virtual_address;
      std::string S;
      llvm::raw_string_ostream OS(S);
      OS << llvm::format("Memory access fault by GPU %" PRIu32
                         " (agent 0x%" PRIx64
                         ") at virtual address %p. Reasons: %s",
                         Node, Event->memory_fault.agent.handle,
                         (void *)Event->memory_fault.virtual_address,
                         llvm::join(Reasons, ", ").c_str());
      ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
      ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
                                             /*Abort*/ true);
    }

    // Abort the execution since we do not recover from this error.
    FATAL_MESSAGE(1,
                  "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
                  ") at virtual address %p. Reasons: %s",
                  Node, Event->memory_fault.agent.handle,
                  (void *)Event->memory_fault.virtual_address,
                  llvm::join(Reasons, ", ").c_str());

    return HSA_STATUS_ERROR;
  }

  /// Indicate whether the HSA runtime was correctly initialized. Even if
  /// there are no available devices, this boolean will be true. It indicates
  /// whether we can safely call HSA functions (e.g., hsa_shut_down).
  bool Initialized;

  /// Arrays of the available GPU and CPU agents. These arrays of handles
  /// should not be here but in the AMDGPUDeviceTy structures directly.
  /// However, the HSA standard does not provide API functions to retrieve
  /// agents directly, only iterating functions. We cache the agents here for
  /// convenience.
  llvm::SmallVector<hsa_agent_t> KernelAgents;

  /// The device representing all HSA host agents.
  AMDHostDeviceTy *HostDevice;
};

Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                 uint32_t NumThreads[3], uint32_t NumBlocks[3],
                                 KernelArgsTy &KernelArgs,
                                 KernelLaunchParamsTy LaunchParams,
                                 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
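  // The kernel either takes only the explicit arguments or the explicit
  // arguments followed by the target's implicit (hidden) arguments; any other
  // size is a mismatch.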
  if (ArgsSize != LaunchParams.Size &&
      ArgsSize != LaunchParams.Size + getImplicitArgsSize())
    return Plugin::error("Mismatch of kernel arguments size");

  AMDGPUPluginTy &AMDGPUPlugin =
      static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
  AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
  AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();

  void *AllArgs = nullptr;
  if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
    return Err;

  // Account for user requested dynamic shared memory.
  uint32_t GroupSize = getGroupSize();
  if (uint32_t MaxDynCGroupMem = std::max(
          KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
    GroupSize += MaxDynCGroupMem;
  }

  uint64_t StackSize;
  if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
    return Err;

  // Copy the explicit arguments.
  // TODO: We should expose the args memory manager alloc to the common part as
  // alternative to copying them twice.
  if (LaunchParams.Size)
    std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);

  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);

  AMDGPUStreamTy *Stream = nullptr;
  if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
    return Err;

  hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
  if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
    ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
        utils::advancePtr(AllArgs, LaunchParams.Size));

    // Set the COV5+ implicit arguments to the appropriate values.
    std::memset(ImplArgs, 0, getImplicitArgsSize());
    ImplArgs->BlockCountX = NumBlocks[0];
    ImplArgs->BlockCountY = NumBlocks[1];
    ImplArgs->BlockCountZ = NumBlocks[2];
    ImplArgs->GroupSizeX = NumThreads[0];
    ImplArgs->GroupSizeY = NumThreads[1];
    ImplArgs->GroupSizeZ = NumThreads[2];
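    // GridDims holds the number of grid dimensions in use: 3 if the Z extent
    // is non-trivial, otherwise 2 if the Y extent is non-trivial, otherwise 1.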
    ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
                             ? 3
                             : 1 + (NumBlocks[1] * NumThreads[1] != 1);
    ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
  }

  // Push the kernel launch into the stream.
  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                  GroupSize, StackSize, ArgsMemoryManager);
}

Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                             KernelArgsTy &KernelArgs,
                                             uint32_t NumThreads[3],
                                             uint32_t NumBlocks[3]) const {
  // Only do all this when the output is requested.
  if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
    return Plugin::success();

  // We don't have data to print additional info; this is not a hard error.
  if (!KernelInfo.has_value())
    return Plugin::success();

  // General info.
  auto NumGroups = NumBlocks;
  auto ThreadsPerGroup = NumThreads;

  // Kernel arguments info.
  auto ArgNum = KernelArgs.NumArgs;
  auto LoopTripCount = KernelArgs.Tripcount;

  // Details for AMDGPU kernels (read from the image).
  // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
  auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
  auto SGPRCount = (*KernelInfo).SGPRCount;
  auto VGPRCount = (*KernelInfo).VGPRCount;
  auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
  auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
  auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;

  // Prints additional launch info that contains the following.
  // Num Args: The number of kernel arguments
  // Teams x Thrds: The number of teams and the number of threads actually
  // running.
  // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the
  // kernel in work-items
  // LDS Usage: Amount of bytes used in LDS storage
  // S/VGPR Count: the number of S/V GPRs occupied by the kernel
  // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
  // Tripcount: loop tripcount for the kernel
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
       "#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS "
       "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
       "%lu\n",
       ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2],
       ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2],
       MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount,
       SGPRSpillCount, VGPRSpillCount, LoopTripCount);

  return Plugin::success();
}

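/// Translate an HSA status code into an llvm::Error. The format string is
/// expected to end with a %s that receives the textual HSA error description,
/// e.g., Plugin::check(Status, "Error in hsa_executable_freeze: %s").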
template <typename... ArgsTy>
static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
  hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
  if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
    return Error::success();

  const char *Desc = "Unknown error";
  hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
  if (Ret != HSA_STATUS_SUCCESS)
    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);

  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
                                                    ErrFmt, Args..., Desc);
}

void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
                                      TargetAllocTy Kind) {
  // Allocate memory from the pool.
  void *Ptr = nullptr;
  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
    consumeError(std::move(Err));
    return nullptr;
  }
  assert(Ptr && "Invalid pointer");

  // Get a list of agents that can access this memory pool.
  llvm::SmallVector<hsa_agent_t> Agents;
  llvm::copy_if(
      Plugin.getKernelAgents(), std::back_inserter(Agents),
      [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); });

  // Allow all valid kernel agents to access the allocation.
  if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) {
    REPORT("%s\n", toString(std::move(Err)).data());
    return nullptr;
  }
  return Ptr;
}

void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
  if (Size == 0)
    return nullptr;

  // Find the correct memory pool.
  AMDGPUMemoryPoolTy *MemoryPool = nullptr;
  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE:
  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
    MemoryPool = CoarseGrainedMemoryPools[0];
    break;
  case TARGET_ALLOC_HOST:
    MemoryPool = &HostDevice.getFineGrainedMemoryPool();
    break;
  case TARGET_ALLOC_SHARED:
    MemoryPool = &HostDevice.getFineGrainedMemoryPool();
    break;
  }

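  // Note that host and shared allocations are currently both served from the
  // host device's fine-grained memory pool (see the switch above).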
  if (!MemoryPool) {
    REPORT("No memory pool for the specified allocation kind\n");
    return nullptr;
  }

  // Allocate from the corresponding memory pool.
  void *Alloc = nullptr;
  if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
    REPORT("%s\n", toString(std::move(Err)).data());
    return nullptr;
  }

  if (Alloc) {
    // Get a list of agents that can access this memory pool. This is
    // inherently necessary for host or shared allocations; it is also enabled
    // for device memory to allow device-to-device memcpy.
    llvm::SmallVector<hsa_agent_t> Agents;
    llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(),
                  std::back_inserter(Agents), [&](hsa_agent_t Agent) {
                    return MemoryPool->canAccess(Agent);
                  });

    // Enable all valid kernel agents to access the buffer.
    if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) {
      REPORT("%s\n", toString(std::move(Err)).data());
      return nullptr;
    }
  }

  return Alloc;
}

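/// Queue error handler registered with ROCr. For kernel exceptions, report
/// the traces of kernels launched on the failing queue before aborting.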
void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
                                  void *Data) {
  auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);

  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
    auto KernelTraceInfoRecord =
        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
        [=](__tgt_async_info &AsyncInfo) {
          auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
          if (!Stream || !Stream->getQueue())
            return false;
          return Stream->getQueue()->Queue == Source;
        };
    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
                                      AsyncInfoWrapperMatcher);
  }

  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}

} // namespace plugin
} // namespace target
} // namespace omp
} // namespace llvm

extern "C" {
llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() {
  return new llvm::omp::target::plugin::AMDGPUPluginTy();
}
}