[Offload] Guard HSA implicit arguments if they aren't created (#133073)

Summary: We conditionally allocate the implicit arguments, so they possibly are null. The flang compiler seems to hit this case, even though it shouldn't when it's supposed to conform to the HSA code object. For now guard this to fix the regression and cover a case in the future where someone rolls a fully custom implementatation. Fixes: https://github.com/llvm/llvm-project/issues/132982
2025-04-14 17:06:38 +00:00 · 2025-03-26 08:54:33 -05:00 · 2025-03-26 08:54:33 -05:00 · 75f810e025
commit 75f810e025
parent 1b07e865a1
1 changed files with 18 additions and 21 deletions
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@ -3363,16 +3363,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
  if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
    return Err;

-  hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
-  if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
-    // Initialize implicit arguments.
-    ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
-        utils::advancePtr(AllArgs, LaunchParams.Size));
-
-    // Initialize the implicit arguments to zero.
-    std::memset(ImplArgs, 0, getImplicitArgsSize());
-  }
-
  // Copy the explicit arguments.
  // TODO: We should expose the args memory manager alloc to the common part as
  // 	   alternative to copying them twice.
@ -3385,17 +3375,24 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
  if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
    return Err;

-  // Set the COV5+ implicit arguments to the appropriate values.
-  ImplArgs->BlockCountX = NumBlocks[0];
-  ImplArgs->BlockCountY = NumBlocks[1];
-  ImplArgs->BlockCountZ = NumBlocks[2];
-  ImplArgs->GroupSizeX = NumThreads[0];
-  ImplArgs->GroupSizeY = NumThreads[1];
-  ImplArgs->GroupSizeZ = NumThreads[2];
-  ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
-                           ? 3
-                           : 1 + (NumBlocks[1] * NumThreads[1] != 1);
-  ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
+  hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
+  if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
+    ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
+        utils::advancePtr(AllArgs, LaunchParams.Size));
+
+    // Set the COV5+ implicit arguments to the appropriate values.
+    std::memset(ImplArgs, 0, getImplicitArgsSize());
+    ImplArgs->BlockCountX = NumBlocks[0];
+    ImplArgs->BlockCountY = NumBlocks[1];
+    ImplArgs->BlockCountZ = NumBlocks[2];
+    ImplArgs->GroupSizeX = NumThreads[0];
+    ImplArgs->GroupSizeY = NumThreads[1];
+    ImplArgs->GroupSizeZ = NumThreads[2];
+    ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
+                             ? 3
+                             : 1 + (NumBlocks[1] * NumThreads[1] != 1);
+    ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
+  }

  // Push the kernel launch into the stream.
  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,