[AMDGPU] Add support for point sample accel out of order returns (#127991)

Add target feature for point sample acceleration and enable it for relevant targets. Also add support to insert waitcnts where required when point sample accel may have occurred. This has implications for out of order returns, which is why extra waitcnts are required. Add a VMEM_NOSAMPLER bit in the register masks to determine when waitcnt is required.
2025-04-18 15:36:58 +00:00 · 2025-04-10 15:33:48 +01:00 · 2025-04-10 15:33:48 +01:00 · 15428e0d78
commit 15428e0d78
parent f989db5745
7 changed files with 334 additions and 29 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@ -1100,6 +1100,12 @@ def FeatureBVHDualAndBVH8Insts : SubtargetFeature<"bvh-dual-bvh-8-insts",
  "Has image_bvh_dual_intersect_ray and image_bvh8_intersect_ray instructions"
 >;

+def FeaturePointSampleAccel : SubtargetFeature<"point-sample-accel",
+  "HasPointSampleAccel",
+  "true",
+  "Has point sample acceleration feature"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@ -1811,20 +1817,23 @@ def FeatureISAVersion11_5_0 : FeatureSet<
  !listconcat(FeatureISAVersion11_Common.Features,
    [FeatureSALUFloatInsts,
     FeatureDPPSrc1SGPR,
-     FeatureRequiredExportPriority])>;
+     FeatureRequiredExportPriority,
+     FeaturePointSampleAccel])>;

 def FeatureISAVersion11_5_1 : FeatureSet<
  !listconcat(FeatureISAVersion11_Common.Features,
    [FeatureSALUFloatInsts,
     FeatureDPPSrc1SGPR,
     Feature1_5xVGPRs,
-     FeatureRequiredExportPriority])>;
+     FeatureRequiredExportPriority,
+     FeaturePointSampleAccel])>;

 def FeatureISAVersion11_5_2 : FeatureSet<
  !listconcat(FeatureISAVersion11_Common.Features,
    [FeatureSALUFloatInsts,
     FeatureDPPSrc1SGPR,
-     FeatureRequiredExportPriority])>;
+     FeatureRequiredExportPriority,
+     FeaturePointSampleAccel])>;

 def FeatureISAVersion11_5_3 : FeatureSet<
  !listconcat(FeatureISAVersion11_Common.Features,
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@ -259,6 +259,7 @@ protected:
  bool HasMinimum3Maximum3F16 = false;
  bool HasMinimum3Maximum3PKF16 = false;
  bool HasLshlAddU64Inst = false;
+  bool HasPointSampleAccel = false;

  bool RequiresCOV6 = false;

@ -1363,6 +1364,8 @@ public:
    return HasMinimum3Maximum3PKF16;
  }

+  bool hasPointSampleAccel() const { return HasPointSampleAccel; }
+
  /// \returns The maximum number of instructions that can be enclosed in an
  /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
  /// instruction.
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@ -52,6 +52,7 @@ class MIMGBaseOpcode : PredicateControl {
  bit BVH = 0;
  bit A16 = 0;
  bit NoReturn = 0;
+  bit PointSampleAccel = 0; // Opcode eligible for gfx11.5 point sample acceleration
 }

 def MIMGBaseOpcode : GenericEnum {
@ -63,7 +64,8 @@ def MIMGBaseOpcodesTable : GenericTable {
  let CppTypeName = "MIMGBaseOpcodeInfo";
  let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
                "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
-                "LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16", "NoReturn"];
+                "LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16", "NoReturn",
+                "PointSampleAccel"];
  string TypeOf_BaseOpcode = "MIMGBaseOpcode";

  let PrimaryKey = ["BaseOpcode"];
@ -1458,13 +1460,14 @@ multiclass MIMG_Sampler_NoReturn <mimgopc op, AMDGPUSampleVariant sample, bit wq
  }
 }

-multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
-                         bit isG16 = 0, bit isGetLod = 0,
+multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit isPointSampleAccel = 0,
+                         bit wqm = 0, bit isG16 = 0, bit isGetLod = 0,
                         string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""),
                         bit ExtendedImageInst = !ne(sample.LowerCaseMod, "")> {
  def "" : MIMG_Sampler_BaseOpcode<sample> {
    let HasD16 = !not(isGetLod);
    let G16 = isG16;
+    let PointSampleAccel = isPointSampleAccel;
  }

  let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@ -1485,8 +1488,8 @@ multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
  defm "_nortn" : MIMG_Sampler_NoReturn <op, sample, wqm, isG16, asm>;
 }

-multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample>
-    : MIMG_Sampler<op, sample, 1>;
+multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample, bit isPointSampleAccel = 0>
+    : MIMG_Sampler<op, sample, isPointSampleAccel, 1>;

 multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
                        string asm = "image_gather4"#sample.LowerCaseMod> {
@ -1684,15 +1687,15 @@ let AssemblerPredicate = isGFX12Plus in {
  def : AMDGPUMnemonicAlias<"image_atomic_fmax", "image_atomic_max_flt">;
 }

-defm IMAGE_SAMPLE               : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>;
+defm IMAGE_SAMPLE               : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample, 1>;
 let OtherPredicates = [HasImageInsts, HasExtendedImageInsts] in {
 defm IMAGE_SAMPLE_CL            : MIMG_Sampler_WQM <mimgopc<0x40, 0x40, 0x21>, AMDGPUSample_cl>;
 defm IMAGE_SAMPLE_D             : MIMG_Sampler <mimgopc<0x1c, 0x1c, 0x22>, AMDGPUSample_d>;
 defm IMAGE_SAMPLE_D_CL          : MIMG_Sampler <mimgopc<0x41, 0x41, 0x23>, AMDGPUSample_d_cl>;
-defm IMAGE_SAMPLE_L             : MIMG_Sampler <mimgopc<0x1d, 0x1d, 0x24>, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_L             : MIMG_Sampler <mimgopc<0x1d, 0x1d, 0x24>, AMDGPUSample_l, 1>;
 defm IMAGE_SAMPLE_B             : MIMG_Sampler_WQM <mimgopc<0x1e, 0x1e, 0x25>, AMDGPUSample_b>;
 defm IMAGE_SAMPLE_B_CL          : MIMG_Sampler_WQM <mimgopc<0x42, 0x42, 0x26>, AMDGPUSample_b_cl>;
-defm IMAGE_SAMPLE_LZ            : MIMG_Sampler <mimgopc<0x1f, 0x1f, 0x27>, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_LZ            : MIMG_Sampler <mimgopc<0x1f, 0x1f, 0x27>, AMDGPUSample_lz, 1>;
 defm IMAGE_SAMPLE_C             : MIMG_Sampler_WQM <mimgopc<0x20, 0x20, 0x28>, AMDGPUSample_c>;
 defm IMAGE_SAMPLE_C_CL          : MIMG_Sampler_WQM <mimgopc<0x43, 0x43, 0x29>, AMDGPUSample_c_cl>;
 defm IMAGE_SAMPLE_C_D           : MIMG_Sampler <mimgopc<0x21, 0x21, 0x2a>, AMDGPUSample_c_d>;
@ -1745,7 +1748,7 @@ defm IMAGE_GATHER4_C_LZ_O       : MIMG_Gather <mimgopc<0x37, 0x37, 0x5f>, AMDGPU
 let OtherPredicates = [HasImageInsts, HasExtendedImageInsts, isGFX9Plus] in
 defm IMAGE_GATHER4H             : MIMG_Gather <mimgopc<0x90, 0x90, 0x61, 0x42>, AMDGPUSample, 1, "image_gather4h">;

-defm IMAGE_GET_LOD              : MIMG_Sampler <mimgopc<0x38, 0x38, 0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
+defm IMAGE_GET_LOD              : MIMG_Sampler <mimgopc<0x38, 0x38, 0x60>, AMDGPUSample, 0, 1, 0, 1, "image_get_lod">;

 defm IMAGE_SAMPLE_CD            : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x68>, AMDGPUSample_cd>;
 defm IMAGE_SAMPLE_CD_CL         : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>;
@ -1758,22 +1761,22 @@ defm IMAGE_SAMPLE_C_CD_CL_O     : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6f
 } // End OtherPredicates = [HasImageInsts, HasExtendedImageInsts]

 let OtherPredicates = [HasImageInsts, HasExtendedImageInsts, HasG16] in {
-defm IMAGE_SAMPLE_D_G16         : MIMG_Sampler <mimgopc<0x39, 0x39, 0xa2>, AMDGPUSample_d, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_G16      : MIMG_Sampler <mimgopc<0x5f, 0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_D_G16       : MIMG_Sampler <mimgopc<0x3a, 0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_G16    : MIMG_Sampler <mimgopc<0x54, 0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_D_O_G16       : MIMG_Sampler <mimgopc<0x3b, 0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_O_G16    : MIMG_Sampler <mimgopc<0x55, 0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_D_O_G16     : MIMG_Sampler <mimgopc<0x3c, 0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_O_G16  : MIMG_Sampler <mimgopc<0x56, 0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_CD_G16        : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_G16     : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_G16      : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_G16   : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_CD_O_G16      : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_O_G16   : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_O_G16    : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_D_G16         : MIMG_Sampler <mimgopc<0x39, 0x39, 0xa2>, AMDGPUSample_d, 0, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_G16      : MIMG_Sampler <mimgopc<0x5f, 0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 0, 1>;
+defm IMAGE_SAMPLE_C_D_G16       : MIMG_Sampler <mimgopc<0x3a, 0x3a, 0xaa>, AMDGPUSample_c_d, 0, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_G16    : MIMG_Sampler <mimgopc<0x54, 0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 0, 1>;
+defm IMAGE_SAMPLE_D_O_G16       : MIMG_Sampler <mimgopc<0x3b, 0x3b, 0xb2>, AMDGPUSample_d_o, 0, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_O_G16    : MIMG_Sampler <mimgopc<0x55, 0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 0, 1>;
+defm IMAGE_SAMPLE_C_D_O_G16     : MIMG_Sampler <mimgopc<0x3c, 0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_O_G16  : MIMG_Sampler <mimgopc<0x56, 0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 0, 1>;
+defm IMAGE_SAMPLE_CD_G16        : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_G16     : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_G16      : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_G16   : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 0, 1>;
+defm IMAGE_SAMPLE_CD_O_G16      : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_O_G16   : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_O_G16    : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 0, 1>;
 } // End OtherPredicates = [HasImageInsts, HasExtendedImageInsts, HasG16]

 //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>;
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@ -387,6 +387,10 @@ public:
    return LDSDMAStores;
  }

+  bool hasPointSampleAccel(const MachineInstr &MI) const;
+  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
+                                      RegInterval Interval) const;
+
  void print(raw_ostream &) const;
  void dump() const { print(dbgs()); }

@ -826,6 +830,34 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
  setScoreByInterval(Interval, CntTy, Score);
 }

+// Return true if the subtarget is one that enables Point Sample Acceleration
+// and the MachineInstr passed in is one to which it might be applied (the
+// hardware makes this decision based on several factors, but we can't determine
+// this at compile time, so we have to assume it might be applied if the
+// instruction supports it).
+bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
+  if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
+    return false;
+
+  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
+      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+  return BaseInfo->PointSampleAccel;
+}
+
+// Return true if the subtarget enables Point Sample Acceleration, the supplied
+// MachineInstr is one to which it might be applied and the supplied interval is
+// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
+// (this is the type that a point sample accelerated instruction effectively
+// becomes)
+bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
+    const MachineInstr &MI, RegInterval Interval) const {
+  if (!hasPointSampleAccel(MI))
+    return false;
+
+  return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
+}
+
 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
@ -942,8 +974,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
          // defs. That's required for a sane index into `VgprMemTypes` below
          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
+          unsigned char TypesMask = 1 << V;
+          // If instruction can have Point Sample Accel applied, we have to flag
+          // this with another potential dependency
+          if (hasPointSampleAccel(Inst))
+            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
-            VgprVmemTypes[RegNo] |= 1 << V;
+            VgprVmemTypes[RegNo] |= TypesMask;
        }
      }
      setScoreByInterval(Interval, T, CurrScore);
@ -1813,9 +1850,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
          // previous write and this write are the same type of VMEM
          // instruction, in which case they are (in some architectures)
          // guaranteed to write their results in order anyway.
+          // Additionally check instructions where Point Sample Acceleration
+          // might be applied.
          if (Op.isUse() || !updateVMCntOnly(MI) ||
              ScoreBrackets.hasOtherPendingVmemTypes(Interval,
                                                     getVmemType(MI)) ||
+              ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
              !ST->hasVmemWriteVgprInOrder()) {
            ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
            ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@ -420,6 +420,7 @@ struct MIMGBaseOpcodeInfo {
  bool BVH;
  bool A16;
  bool NoReturn;
+  bool PointSampleAccel;
 };

 LLVM_READONLY
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+
+define amdgpu_ps <3 x float> @gather_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, <4 x i32> inreg %samp2, float %s, float %t) {
+; GFX11-LABEL: gather_sample:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1150-LABEL: gather_sample:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: gather_sample:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    image_gather4_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    ; return to shader part epilog
+
+  %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc2, <4 x i32> %samp2, i1 false, i32 0, i32 0)
+  %t0 = extractelement <4 x float> %v, i32 0
+  %res0 = insertelement <3 x float> poison, float %t0, i32 0
+  %t1 = extractelement <4 x float> %v, i32 1
+  %res1 = insertelement <3 x float> %res0, float %t1, i32 1
+  %t2 = extractelement <4 x float> %w, i32 0
+  %res2 = insertelement <3 x float> %res1, float %t2, i32 2
+  ret <3 x float> %res2
+}
+
+define amdgpu_ps <3 x float> @sample_gather(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, <4 x i32> inreg %samp2, float %s, float %t) {
+; GFX11-LABEL: sample_gather:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1150-LABEL: sample_gather:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: sample_gather:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    image_gather4_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    ; return to shader part epilog
+
+  %w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc2, <4 x i32> %samp2, i1 false, i32 0, i32 0)
+  %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %t0 = extractelement <4 x float> %v, i32 0
+  %res0 = insertelement <3 x float> poison, float %t0, i32 0
+  %t1 = extractelement <4 x float> %v, i32 1
+  %res1 = insertelement <3 x float> %res0, float %t1, i32 1
+  %t2 = extractelement <4 x float> %w, i32 0
+  %res2 = insertelement <3 x float> %res1, float %t2, i32 2
+  ret <3 x float> %res2
+}
+
+define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) {
+; GFX11-LABEL: sample_load:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1150-LABEL: sample_load:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: sample_load:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    ; return to shader part epilog
+
+  %w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i16 %s.16, i16 %t.16, i16 %fragid, <8 x i32> %rsrc2, i32 0, i32 0)
+  %t0 = extractelement <4 x float> %v, i32 0
+  %res0 = insertelement <3 x float> poison, float %t0, i32 0
+  %t1 = extractelement <4 x float> %v, i32 1
+  %res1 = insertelement <3 x float> %res0, float %t1, i32 1
+  %t2 = extractelement <4 x float> %w, i32 0
+  %res2 = insertelement <3 x float> %res1, float %t2, i32 2
+  ret <3 x float> %res2
+}
+
+define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) {
+; GFX11-LABEL: load_sample:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1150-LABEL: load_sample:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: load_sample:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    ; return to shader part epilog
+
+  %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i16 %s.16, i16 %t.16, i16 %fragid, <8 x i32> %rsrc2, i32 0, i32 0)
+  %w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %t0 = extractelement <4 x float> %v, i32 0
+  %res0 = insertelement <3 x float> poison, float %t0, i32 0
+  %t1 = extractelement <4 x float> %v, i32 1
+  %res1 = insertelement <3 x float> %res0, float %t1, i32 1
+  %t2 = extractelement <4 x float> %w, i32 0
+  %res2 = insertelement <3 x float> %res1, float %t2, i32 2
+  ret <3 x float> %res2
+}
+
+
+declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32)
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir
@ -0,0 +1,80 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN,GFX1150 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+---
+name: waitcnt-gather-sample
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; GCN-LABEL: name: waitcnt-gather-sample
+    ; GCN: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; GFX1150-NEXT: S_WAITCNT 1015
+    ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
+    ; GCN-NEXT: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    S_ENDPGM 0
+...
+---
+name: waitcnt-gather-sample-o
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; GCN-LABEL: name: waitcnt-gather-sample-o
+    ; GCN: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
+    ; GCN-NEXT:     $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_O_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_O_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    S_ENDPGM 0
+...
+---
+name: waitcnt-sample-gather
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; GCN-LABEL: name: waitcnt-sample-gather
+    ; GCN: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    ; GFX1150-NEXT: S_WAITCNT 1015
+    ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
+    ; GCN-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    S_ENDPGM 0
+...
+---
+name: waitcnt-sample-load
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; GCN-LABEL: name: waitcnt-sample-load
+    ; GCN: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    ; GFX11: S_WAITCNT 1015
+    ; GFX1150-NEXT: S_WAITCNT 1015
+    ; GFX12-NEXT: S_WAIT_SAMPLECNT 0
+    ; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+    S_ENDPGM 0
+...
+---
+name: waitcnt-load-sample
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; GCN-LABEL: name: waitcnt-load-sample
+    ; GCN: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+    ; GFX11: S_WAITCNT 1015
+    ; GFX1150-NEXT: S_WAITCNT 1015
+    ; GFX12-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    S_ENDPGM 0
+...