diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index b2ede888b542..ed575f2213f2 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3363,16 +3363,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) return Err; - hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; - if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) { - // Initialize implicit arguments. - ImplArgs = reinterpret_cast( - utils::advancePtr(AllArgs, LaunchParams.Size)); - - // Initialize the implicit arguments to zero. - std::memset(ImplArgs, 0, getImplicitArgsSize()); - } - // Copy the explicit arguments. // TODO: We should expose the args memory manager alloc to the common part as // alternative to copying them twice. @@ -3385,17 +3375,24 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream)) return Err; - // Set the COV5+ implicit arguments to the appropriate values. - ImplArgs->BlockCountX = NumBlocks[0]; - ImplArgs->BlockCountY = NumBlocks[1]; - ImplArgs->BlockCountZ = NumBlocks[2]; - ImplArgs->GroupSizeX = NumThreads[0]; - ImplArgs->GroupSizeY = NumThreads[1]; - ImplArgs->GroupSizeZ = NumThreads[2]; - ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1 - ? 3 - : 1 + (NumBlocks[1] * NumThreads[1] != 1); - ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem; + hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; + if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) { + ImplArgs = reinterpret_cast( + utils::advancePtr(AllArgs, LaunchParams.Size)); + + // Set the COV5+ implicit arguments to the appropriate values. + std::memset(ImplArgs, 0, getImplicitArgsSize()); + ImplArgs->BlockCountX = NumBlocks[0]; + ImplArgs->BlockCountY = NumBlocks[1]; + ImplArgs->BlockCountZ = NumBlocks[2]; + ImplArgs->GroupSizeX = NumThreads[0]; + ImplArgs->GroupSizeY = NumThreads[1]; + ImplArgs->GroupSizeZ = NumThreads[2]; + ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1 + ? 3 + : 1 + (NumBlocks[1] * NumThreads[1] != 1); + ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem; + } // Push the kernel launch into the stream. return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,