mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-18 19:16:43 +00:00
[VPlan] Consistently use (Part, 0) for first lane scalar values (#80271)
At the moment, some VPInstructions create only a single scalar value, but use VPTransformState's 'vector' storage for this value. Those values are effectively uniform-per-VF (or in some cases uniform-across-VF-and-UF). Using the vector/per-part storage doesn't interact well with other recipes that more accurately use (Part, Lane) to look up scalar values, and prevents VPInstructions creating scalars from interacting with other recipes working with scalars. This PR tries to unify handling of scalars by using (Part, 0) for scalar values where only the first lane is demanded. This allows using VPInstructions with other recipes like VPScalarCastRecipe and is also needed when using VPInstructions in more cases outside the vector loop region to generate scalars. Depends on https://github.com/llvm/llvm-project/pull/80269
This commit is contained in:
parent
1865c7ea85
commit
911055e34f
@ -9127,7 +9127,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
|
||||
"Unexpected type.");
|
||||
|
||||
auto *IVR = getParent()->getPlan()->getCanonicalIV();
|
||||
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
|
||||
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
|
||||
|
||||
if (onlyScalarsGenerated(State.VF.isScalable())) {
|
||||
// This is the normalized GEP that starts counting at zero.
|
||||
@ -9243,7 +9243,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
|
||||
|
||||
void VPReductionRecipe::execute(VPTransformState &State) {
|
||||
assert(!State.Instance && "Reduction being replicated.");
|
||||
Value *PrevInChain = State.get(getChainOp(), 0);
|
||||
Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
|
||||
RecurKind Kind = RdxDesc.getRecurrenceKind();
|
||||
bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
|
||||
// Propagate the fast-math flags carried by the underlying instruction.
|
||||
@ -9252,8 +9252,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
|
||||
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
||||
Value *NewVecOp = State.get(getVecOp(), Part);
|
||||
if (VPValue *Cond = getCondOp()) {
|
||||
Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
|
||||
: State.get(Cond, {Part, 0});
|
||||
Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
|
||||
VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
|
||||
Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
|
||||
Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
|
||||
@ -9278,7 +9277,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
|
||||
NewVecOp);
|
||||
PrevInChain = NewRed;
|
||||
} else {
|
||||
PrevInChain = State.get(getChainOp(), Part);
|
||||
PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
|
||||
NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
|
||||
}
|
||||
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
|
||||
@ -9289,7 +9288,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
|
||||
else
|
||||
NextInChain = State.Builder.CreateBinOp(
|
||||
(Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
|
||||
State.set(this, NextInChain, Part);
|
||||
State.set(this, NextInChain, Part, /*IsScalar*/ true);
|
||||
}
|
||||
}
|
||||
|
||||
@ -9404,7 +9403,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
|
||||
// We don't want to update the value in the map as it might be used in
|
||||
// another expression. So don't call resetVectorValue(StoredVal).
|
||||
}
|
||||
auto *VecPtr = State.get(getAddr(), Part);
|
||||
auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
|
||||
if (isMaskRequired)
|
||||
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
|
||||
BlockInMaskParts[Part]);
|
||||
@ -9428,7 +9427,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
|
||||
nullptr, "wide.masked.gather");
|
||||
State.addMetadata(NewLI, LI);
|
||||
} else {
|
||||
auto *VecPtr = State.get(getAddr(), Part);
|
||||
auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
|
||||
if (isMaskRequired)
|
||||
NewLI = Builder.CreateMaskedLoad(
|
||||
DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
|
||||
|
@ -242,7 +242,16 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
|
||||
return Extract;
|
||||
}
|
||||
|
||||
Value *VPTransformState::get(VPValue *Def, unsigned Part) {
|
||||
Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
|
||||
if (NeedsScalar) {
|
||||
assert((VF.isScalar() || Def->isLiveIn() ||
|
||||
(hasScalarValue(Def, VPIteration(Part, 0)) &&
|
||||
Data.PerPartScalars[Def][Part].size() == 1)) &&
|
||||
"Trying to access a single scalar per part but has multiple scalars "
|
||||
"per part.");
|
||||
return get(Def, VPIteration(Part, 0));
|
||||
}
|
||||
|
||||
// If Values have been set for this Def return the one relevant for \p Part.
|
||||
if (hasVectorValue(Def, Part))
|
||||
return Data.PerPartOutput[Def][Part];
|
||||
@ -789,21 +798,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
|
||||
auto *TCMO = Builder.CreateSub(TripCountV,
|
||||
ConstantInt::get(TripCountV->getType(), 1),
|
||||
"trip.count.minus.1");
|
||||
auto VF = State.VF;
|
||||
Value *VTCMO =
|
||||
VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
|
||||
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
|
||||
State.set(BackedgeTakenCount, VTCMO, Part);
|
||||
BackedgeTakenCount->setUnderlyingValue(TCMO);
|
||||
}
|
||||
|
||||
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
|
||||
State.set(&VectorTripCount, VectorTripCountV, Part);
|
||||
VectorTripCount.setUnderlyingValue(VectorTripCountV);
|
||||
|
||||
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
|
||||
// FIXME: Model VF * UF computation completely in VPlan.
|
||||
State.set(&VFxUF,
|
||||
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF),
|
||||
0);
|
||||
VFxUF.setUnderlyingValue(
|
||||
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
|
||||
|
||||
// When vectorizing the epilogue loop, the canonical induction start value
|
||||
// needs to be changed from zero to the value after the main vector loop.
|
||||
@ -884,12 +887,16 @@ void VPlan::execute(VPTransformState *State) {
|
||||
isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
|
||||
(isa<VPReductionPHIRecipe>(PhiR) &&
|
||||
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
|
||||
bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
|
||||
(isa<VPReductionPHIRecipe>(PhiR) &&
|
||||
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
|
||||
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
|
||||
|
||||
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
|
||||
Value *Phi = State->get(PhiR, Part);
|
||||
Value *Val = State->get(PhiR->getBackedgeValue(),
|
||||
SinglePartNeeded ? State->UF - 1 : Part);
|
||||
Value *Phi = State->get(PhiR, Part, NeedsScalar);
|
||||
Value *Val =
|
||||
State->get(PhiR->getBackedgeValue(),
|
||||
SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
|
||||
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
|
||||
}
|
||||
}
|
||||
|
@ -259,9 +259,10 @@ struct VPTransformState {
|
||||
DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
|
||||
} Data;
|
||||
|
||||
/// Get the generated Value for the given VPValue \p Def and the given \p Part.
|
||||
/// \see set.
|
||||
Value *get(VPValue *Def, unsigned Part);
|
||||
/// Get the generated vector Value for a given VPValue \p Def and a given \p
|
||||
/// Part if \p IsScalar is false, otherwise return the generated scalar
|
||||
/// for \p Part. \See set.
|
||||
Value *get(VPValue *Def, unsigned Part, bool IsScalar = false);
|
||||
|
||||
/// Get the generated Value for a given VPValue and given Part and Lane.
|
||||
Value *get(VPValue *Def, const VPIteration &Instance);
|
||||
@ -282,14 +283,22 @@ struct VPTransformState {
|
||||
I->second[Instance.Part][CacheIdx];
|
||||
}
|
||||
|
||||
/// Set the generated Value for a given VPValue and a given Part.
|
||||
void set(VPValue *Def, Value *V, unsigned Part) {
|
||||
/// Set the generated vector Value for a given VPValue and a given Part, if \p
|
||||
/// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0).
|
||||
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) {
|
||||
if (IsScalar) {
|
||||
set(Def, V, VPIteration(Part, 0));
|
||||
return;
|
||||
}
|
||||
assert((VF.isScalar() || V->getType()->isVectorTy()) &&
|
||||
"scalar values must be stored as (Part, 0)");
|
||||
if (!Data.PerPartOutput.count(Def)) {
|
||||
DataState::PerPartValuesTy Entry(UF);
|
||||
Data.PerPartOutput[Def] = Entry;
|
||||
}
|
||||
Data.PerPartOutput[Def][Part] = V;
|
||||
}
|
||||
|
||||
/// Reset an existing vector value for \p Def and a given \p Part.
|
||||
void reset(VPValue *Def, Value *V, unsigned Part) {
|
||||
auto Iter = Data.PerPartOutput.find(Def);
|
||||
@ -1376,6 +1385,13 @@ public:
|
||||
|
||||
/// Returns the result type of the cast.
|
||||
Type *getResultType() const { return ResultTy; }
|
||||
|
||||
bool onlyFirstLaneUsed(const VPValue *Op) const override {
|
||||
// At the moment, only uniform codegen is implemented.
|
||||
assert(is_contained(operands(), Op) &&
|
||||
"Op must be an operand of the recipe");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
/// A recipe for widening Call instructions.
|
||||
|
@ -279,11 +279,12 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
|
||||
Builder.SetCurrentDebugLocation(getDebugLoc());
|
||||
|
||||
if (Instruction::isBinaryOp(getOpcode())) {
|
||||
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
|
||||
if (Part != 0 && vputils::onlyFirstPartUsed(this))
|
||||
return State.get(this, 0);
|
||||
return State.get(this, 0, OnlyFirstLaneUsed);
|
||||
|
||||
Value *A = State.get(getOperand(0), Part);
|
||||
Value *B = State.get(getOperand(1), Part);
|
||||
Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
|
||||
Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
|
||||
auto *Res =
|
||||
Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
|
||||
if (auto *I = dyn_cast<Instruction>(Res))
|
||||
@ -385,8 +386,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
|
||||
if (Part != 0)
|
||||
return nullptr;
|
||||
// First create the compare.
|
||||
Value *IV = State.get(getOperand(0), Part);
|
||||
Value *TC = State.get(getOperand(1), Part);
|
||||
Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
|
||||
Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
|
||||
Value *Cond = Builder.CreateICmpEQ(IV, TC);
|
||||
|
||||
// Now create the branch.
|
||||
@ -407,7 +408,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
|
||||
}
|
||||
case VPInstruction::ComputeReductionResult: {
|
||||
if (Part != 0)
|
||||
return State.get(this, 0);
|
||||
return State.get(this, 0, /*IsScalar*/ true);
|
||||
|
||||
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
|
||||
// and will be removed by breaking up the recipe further.
|
||||
@ -424,7 +425,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
|
||||
Type *PhiTy = OrigPhi->getType();
|
||||
VectorParts RdxParts(State.UF);
|
||||
for (unsigned Part = 0; Part < State.UF; ++Part)
|
||||
RdxParts[Part] = State.get(LoopExitingDef, Part);
|
||||
RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
|
||||
|
||||
// If the vector reduction can be performed in a smaller type, we truncate
|
||||
// then extend the loop exit value to enable InstCombine to evaluate the
|
||||
@ -512,9 +513,15 @@ void VPInstruction::execute(VPTransformState &State) {
|
||||
if (!hasResult())
|
||||
continue;
|
||||
assert(GeneratedValue && "generateInstruction must produce a value");
|
||||
State.set(this, GeneratedValue, Part);
|
||||
|
||||
bool IsVector = GeneratedValue->getType()->isVectorTy();
|
||||
State.set(this, GeneratedValue, Part, !IsVector);
|
||||
assert((IsVector || getOpcode() == VPInstruction::ComputeReductionResult ||
|
||||
State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
|
||||
"scalar value but not only first lane used");
|
||||
}
|
||||
}
|
||||
|
||||
bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
|
||||
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
|
||||
if (Instruction::isBinaryOp(getOpcode()))
|
||||
@ -530,8 +537,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
|
||||
case VPInstruction::CalculateTripCountMinusVF:
|
||||
case VPInstruction::CanonicalIVIncrementForPart:
|
||||
case VPInstruction::BranchOnCount:
|
||||
// TODO: Cover additional operands.
|
||||
return getOperand(0) == Op;
|
||||
return true;
|
||||
};
|
||||
llvm_unreachable("switch should return");
|
||||
}
|
||||
@ -1344,7 +1350,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
|
||||
PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
|
||||
}
|
||||
|
||||
State.set(this, PartPtr, Part);
|
||||
State.set(this, PartPtr, Part, /*IsScalar*/ true);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1640,7 +1646,7 @@ void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
|
||||
EntryPart->addIncoming(Start, VectorPH);
|
||||
EntryPart->setDebugLoc(getDebugLoc());
|
||||
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
|
||||
State.set(this, EntryPart, Part);
|
||||
State.set(this, EntryPart, Part, /*IsScalar*/ true);
|
||||
}
|
||||
|
||||
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
||||
@ -1711,7 +1717,7 @@ void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
|
||||
#endif
|
||||
|
||||
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
|
||||
Value *CanonicalIV = State.get(getOperand(0), 0);
|
||||
Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
|
||||
Type *STy = CanonicalIV->getType();
|
||||
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
|
||||
ElementCount VF = State.VF;
|
||||
@ -1801,7 +1807,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
|
||||
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
|
||||
Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
|
||||
EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
|
||||
State.set(this, EntryPart, Part);
|
||||
State.set(this, EntryPart, Part, IsInLoop);
|
||||
}
|
||||
|
||||
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
|
||||
@ -1833,7 +1839,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
|
||||
}
|
||||
|
||||
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
|
||||
Value *EntryPart = State.get(this, Part);
|
||||
Value *EntryPart = State.get(this, Part, IsInLoop);
|
||||
// Make sure to add the reduction start value only to the
|
||||
// first unroll part.
|
||||
Value *StartVal = (Part == 0) ? StartV : Iden;
|
||||
|
@ -73,12 +73,6 @@ protected:
|
||||
// for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
|
||||
// back-end and analysis information for the new IR.
|
||||
|
||||
// Set \p Val as the underlying Value of this VPValue.
|
||||
void setUnderlyingValue(Value *Val) {
|
||||
assert(!UnderlyingVal && "Underlying Value is already set.");
|
||||
UnderlyingVal = Val;
|
||||
}
|
||||
|
||||
public:
|
||||
/// Return the underlying Value attached to this VPValue.
|
||||
Value *getUnderlyingValue() { return UnderlyingVal; }
|
||||
@ -192,6 +186,12 @@ public:
|
||||
/// is a live-in value.
|
||||
/// TODO: Also handle recipes defined in pre-header blocks.
|
||||
bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); }
|
||||
|
||||
// Set \p Val as the underlying Value of this VPValue.
|
||||
void setUnderlyingValue(Value *Val) {
|
||||
assert(!UnderlyingVal && "Underlying Value is already set.");
|
||||
UnderlyingVal = Val;
|
||||
}
|
||||
};
|
||||
|
||||
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
|
||||
|
@ -117,10 +117,10 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
|
||||
; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
|
||||
; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
|
||||
; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
|
||||
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
|
||||
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
|
||||
; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
|
||||
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
|
||||
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
|
||||
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
|
||||
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
|
||||
; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
|
||||
|
@ -142,7 +142,7 @@ define void @example2(i32 %n, i32 %x) optsize {
|
||||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_116]], i64 0
|
||||
; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
|
||||
; CHECK-NEXT: br label [[VECTOR_BODY19:%.*]]
|
||||
; CHECK: vector.body19:
|
||||
; CHECK: vector.body17:
|
||||
; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT31:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
|
||||
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX20]]
|
||||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0
|
||||
|
Loading…
x
Reference in New Issue
Block a user