mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-25 07:56:06 +00:00
[OpenMP][libomptarget][AMDGPU] Enable active HSA wait state
Adds HSA timeout hint of 2 seconds to the AMDGPU nextgen-plugin to improve performance of small kernels. The HSA runtime may stay in HSA_WAIT_STATE_ACTIVE for up to the timeout value before switching to HSA_WAIT_STATE_BLOCKED. This can improve latency from which small kernels can benefit. The value was determined via experimentation w/ different benchmarks. The timeout value can be overridden using the environment variable LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT with a value in microseconds. Original author: Greg Rodgers <Gregory.Rodgers@amd.com> Contributions from: JP Lehr <JanPatrick.Lehr@amd.com> Differential Revision: https://reviews.llvm.org/D148808
This commit is contained in:
parent
f3dcd3ad99
commit
f238a98e84
@ -1160,6 +1160,7 @@ There are several environment variables to change the behavior of the plugins:
|
||||
* ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU``
|
||||
* ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES``
|
||||
* ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS``
|
||||
* ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT``
|
||||
|
||||
The environment variables ``LIBOMPTARGET_SHARED_MEMORY_SIZE``,
|
||||
``LIBOMPTARGET_STACK_SIZE`` and ``LIBOMPTARGET_HEAP_SIZE`` are described in
|
||||
@ -1238,6 +1239,14 @@ managing several pre-created signals. These signals are mainly used by AMDGPU
|
||||
streams. More HSA signals will be created dynamically throughout the execution
|
||||
if needed. The default value is ``64``.
|
||||
|
||||
LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT
|
||||
"""""""""""""""""""""""""""""""""""
|
||||
|
||||
This environment variable controls the timeout hint in microseconds for the
|
||||
HSA wait state within the AMDGPU plugin. For the duration of this value
|
||||
the HSA runtime may busy wait. This can reduce overall latency.
|
||||
The default value is ``2000000``.
|
||||
|
||||
.. _remote_offloading_plugin:
|
||||
|
||||
Remote Offloading Plugin:
|
||||
|
@ -511,8 +511,14 @@ struct AMDGPUSignalTy {
|
||||
}
|
||||
|
||||
/// Wait until the signal gets a zero value.
|
||||
Error wait() const {
|
||||
// TODO: Is it better to use busy waiting or blocking the thread?
|
||||
Error wait(const uint64_t ActiveTimeout = 0) const {
|
||||
if (ActiveTimeout) {
|
||||
hsa_signal_value_t Got = 1;
|
||||
Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
|
||||
ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
|
||||
if (Got == 0)
|
||||
return Plugin::success();
|
||||
}
|
||||
while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
|
||||
UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0)
|
||||
;
|
||||
@ -884,6 +890,9 @@ private:
|
||||
/// Mutex to protect stream's management.
|
||||
mutable std::mutex Mutex;
|
||||
|
||||
/// Timeout hint for HSA actively waiting for signal value to change
|
||||
const uint64_t StreamBusyWaitMicroseconds;
|
||||
|
||||
/// Return the current number of asynchronous operations on the stream.
|
||||
uint32_t size() const { return NextSlot; }
|
||||
|
||||
@ -1247,7 +1256,7 @@ public:
|
||||
return Plugin::success();
|
||||
|
||||
// Wait until all previous operations on the stream have completed.
|
||||
if (auto Err = Slots[last()].Signal->wait())
|
||||
if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds))
|
||||
return Err;
|
||||
|
||||
// Reset the stream and perform all pending post actions.
|
||||
@ -1555,6 +1564,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
||||
1 * 1024 * 1024), // 1MB
|
||||
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
|
||||
64),
|
||||
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
|
||||
AMDGPUStreamManager(*this), AMDGPUEventManager(*this),
|
||||
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice),
|
||||
Queues() {}
|
||||
@ -1679,6 +1689,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
||||
return Plugin::success();
|
||||
}
|
||||
|
||||
/// Return the value of OMPX_StreamBusyWait, i.e. the timeout hint (in
/// microseconds) taken from LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT, which is
/// passed to signal waits as the active-wait duration.
const uint64_t getStreamBusyWaitMicroseconds() const {
|
||||
return OMPX_StreamBusyWait;
|
||||
}
|
||||
|
||||
Expected<std::unique_ptr<MemoryBuffer>>
|
||||
doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
|
||||
|
||||
@ -1941,7 +1955,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
||||
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
|
||||
return Err;
|
||||
|
||||
if (auto Err = Signal.wait())
|
||||
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
|
||||
return Err;
|
||||
|
||||
if (auto Err = Signal.deinit())
|
||||
@ -1998,7 +2012,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
||||
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
|
||||
return Err;
|
||||
|
||||
if (auto Err = Signal.wait())
|
||||
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
|
||||
return Err;
|
||||
|
||||
if (auto Err = Signal.deinit())
|
||||
@ -2173,6 +2187,12 @@ private:
|
||||
/// will be created.
|
||||
UInt32Envar OMPX_InitialNumSignals;
|
||||
|
||||
/// Environment variable to set the time to wait in active state before
|
||||
/// switching to blocked state. The default 2000000 busywaits for 2 seconds
|
||||
/// before going into a blocking HSA wait state. The unit for this variable
|
||||
/// is microseconds.
|
||||
UInt32Envar OMPX_StreamBusyWait;
|
||||
|
||||
/// Stream manager for AMDGPU streams.
|
||||
AMDGPUStreamManagerTy AMDGPUStreamManager;
|
||||
|
||||
@ -2267,7 +2287,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
|
||||
: Agent(Device.getAgent()), Queue(Device.getNextQueue()),
|
||||
SignalManager(Device.getSignalManager()),
|
||||
// Initialize the std::deque with some empty positions.
|
||||
Slots(32), NextSlot(0), SyncCycle(0) {}
|
||||
Slots(32), NextSlot(0), SyncCycle(0),
|
||||
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
|
||||
|
||||
/// Class implementing the AMDGPU-specific functionalities of the global
|
||||
/// handler.
|
||||
|
Loading…
x
Reference in New Issue
Block a user