[Cost] Add CostKind to getVectorInstrCost and its related users

LoopUnroll estimates the loop size via getInstructionCost(), but
getInstructionCost() cannot pass CostKind down to getVectorInstrCost().
The same is true of getShuffleCost(), which cannot pass CostKind to
getBroadcastShuffleOverhead(), getPermuteShuffleOverhead(),
getExtractSubvectorOverhead(), and getInsertSubvectorOverhead().

To address this, this patch adds a CostKind argument to these functions.
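
For example, with the updated signature a caller can forward its own cost
kind instead of relying on an implicit default. A minimal sketch (not part
of this patch; it assumes a TargetTransformInfo reference TTI and a
VectorType *VecTy are in scope):

  // A size-oriented caller such as LoopUnroll can now request TCK_CodeSize.
  InstructionCost ExtractCost = TTI.getVectorInstrCost(
      Instruction::ExtractElement, VecTy, TTI::TCK_CodeSize,
      /*Index=*/0, /*Op0=*/nullptr, /*Op1=*/nullptr);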

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D142116
ShihPo Hung 2023-01-21 05:29:05 -08:00
parent 97a1c98f8e
commit 5fb3a57ea7
29 changed files with 383 additions and 260 deletions

View File

@ -752,13 +752,16 @@ public:
/// extracted from vectors.
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) const;
bool Insert, bool Extract,
TTI::TargetCostKind CostKind) const;
/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The (potentially vector) types to use for each
/// argument are passed via Tys.
InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys) const;
InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) const;
/// If target has efficient vector element load/store instructions, it can
/// return true here so that insertion/extraction costs are not added to
@ -1193,6 +1196,7 @@ public:
/// case is to provision the cost of vectorization/scalarization in
/// vectorizer passes.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index = -1, Value *Op0 = nullptr,
Value *Op1 = nullptr) const;
@ -1203,6 +1207,7 @@ public:
/// A typical suitable use case is cost estimation when vector instruction
/// exists (e.g., from basic blocks during transformation).
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index = -1) const;
/// \return The cost of replication shuffle of \p VF elements typed \p EltTy
@ -1675,11 +1680,12 @@ public:
virtual bool useColdCCForColdCall(Function &F) = 0;
virtual InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert,
bool Extract) = 0;
bool Insert, bool Extract,
TargetCostKind CostKind) = 0;
virtual InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys) = 0;
ArrayRef<Type *> Tys,
TargetCostKind CostKind) = 0;
virtual bool supportsEfficientVectorElementLoadStore() = 0;
virtual bool supportsTailCalls() = 0;
virtual bool supportsTailCallFor(const CallBase *CB) = 0;
@ -1787,9 +1793,11 @@ public:
TTI::TargetCostKind CostKind,
const Instruction *I) = 0;
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) = 0;
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) = 0;
virtual InstructionCost
@ -2150,13 +2158,16 @@ public:
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) override {
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
bool Insert, bool Extract,
TargetCostKind CostKind) override {
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
CostKind);
}
InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys) override {
return Impl.getOperandsScalarizationOverhead(Args, Tys);
ArrayRef<Type *> Tys,
TargetCostKind CostKind) override {
return Impl.getOperandsScalarizationOverhead(Args, Tys, CostKind);
}
bool supportsEfficientVectorElementLoadStore() override {
@ -2360,13 +2371,16 @@ public:
const Instruction *I) override {
return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1) override {
return Impl.getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) override {
return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) override {
return Impl.getVectorInstrCost(I, Val, Index);
return Impl.getVectorInstrCost(I, Val, CostKind, Index);
}
InstructionCost
getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,

View File

@ -333,12 +333,15 @@ public:
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) const {
bool Insert, bool Extract,
TTI::TargetCostKind CostKind) const {
return 0;
}
InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys) const {
InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) const {
return 0;
}
@ -585,12 +588,15 @@ public:
return 1;
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1) const {
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) const {
return 1;
}
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) const {
return 1;
}
@ -1176,7 +1182,7 @@ public:
if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
return TargetTTI->getVectorInstrCost(*IE, Ty, Idx);
return TargetTTI->getVectorInstrCost(*IE, Ty, CostKind, Idx);
}
case Instruction::ShuffleVector: {
auto *Shuffle = dyn_cast<ShuffleVectorInst>(U);
@ -1272,7 +1278,7 @@ public:
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
Type *DstTy = U->getOperand(0)->getType();
return TargetTTI->getVectorInstrCost(*EEI, DstTy, Idx);
return TargetTTI->getVectorInstrCost(*EEI, DstTy, CostKind, Idx);
}
}

View File

@ -86,23 +86,25 @@ private:
/// Estimate a cost of Broadcast as an extract and sequence of insert
/// operations.
InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
TTI::TargetCostKind CostKind) {
InstructionCost Cost = 0;
// Broadcast cost is equal to the cost of extracting the zero'th element
// plus the cost of inserting it into every element of the result vector.
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0,
nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
CostKind, 0, nullptr, nullptr);
for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i,
nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
CostKind, i, nullptr, nullptr);
}
return Cost;
}
/// Estimate a cost of shuffle as a sequence of extract and insert
/// operations.
InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
TTI::TargetCostKind CostKind) {
InstructionCost Cost = 0;
// Shuffle cost is equal to the cost of extracting element from its argument
// plus the cost of inserting them onto the result vector.
@ -112,18 +114,20 @@ private:
// vector and finally index 3 of second vector and insert them at index
// <0,1,2,3> of result vector.
for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i,
nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i,
nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
CostKind, i, nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
CostKind, i, nullptr, nullptr);
}
return Cost;
}
/// Estimate a cost of subvector extraction as a sequence of extract and
/// insert operations.
InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
FixedVectorType *SubVTy) {
InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
TTI::TargetCostKind CostKind,
int Index,
FixedVectorType *SubVTy) {
assert(VTy && SubVTy &&
"Can only extract subvectors from vectors");
int NumSubElts = SubVTy->getNumElements();
@ -137,18 +141,21 @@ private:
// the source type plus the cost of inserting them into the result vector
// type.
for (int i = 0; i != NumSubElts; ++i) {
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
i + Index, nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i,
nullptr, nullptr);
Cost +=
thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
CostKind, i + Index, nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
CostKind, i, nullptr, nullptr);
}
return Cost;
}
/// Estimate a cost of subvector insertion as a sequence of extract and
/// insert operations.
InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
FixedVectorType *SubVTy) {
InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
TTI::TargetCostKind CostKind,
int Index,
FixedVectorType *SubVTy) {
assert(VTy && SubVTy &&
"Can only insert subvectors into vectors");
int NumSubElts = SubVTy->getNumElements();
@ -163,9 +170,10 @@ private:
// type.
for (int i = 0; i != NumSubElts; ++i) {
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
i, nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
i + Index, nullptr, nullptr);
CostKind, i, nullptr, nullptr);
Cost +=
thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
i + Index, nullptr, nullptr);
}
return Cost;
}
@ -216,7 +224,7 @@ private:
FixedVectorType::get(
PointerType::get(VT->getElementType(), 0),
VT->getNumElements()),
-1, nullptr, nullptr)
CostKind, -1, nullptr, nullptr)
: 0;
InstructionCost LoadCost =
VT->getNumElements() *
@ -224,8 +232,9 @@ private:
getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
// Next, compute the cost of packing the result in a vector.
InstructionCost PackingCost = getScalarizationOverhead(
VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
InstructionCost PackingCost =
getScalarizationOverhead(VT, Opcode != Instruction::Store,
Opcode == Instruction::Store, CostKind);
InstructionCost ConditionalCost = 0;
if (VariableMask) {
@ -241,7 +250,7 @@ private:
Instruction::ExtractElement,
FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
VT->getNumElements()),
-1, nullptr, nullptr) +
CostKind, -1, nullptr, nullptr) +
getCFInstrCost(Instruction::Br, CostKind) +
getCFInstrCost(Instruction::PHI, CostKind));
}
@ -710,7 +719,8 @@ public:
/// extracted from vectors.
InstructionCost getScalarizationOverhead(VectorType *InTy,
const APInt &DemandedElts,
bool Insert, bool Extract) {
bool Insert, bool Extract,
TTI::TargetCostKind CostKind) {
/// FIXME: a bitfield is not a reasonable abstraction for talking about
/// which elements are needed from a scalable vector
if (isa<ScalableVectorType>(InTy))
@ -726,11 +736,11 @@ public:
if (!DemandedElts[i])
continue;
if (Insert)
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i,
nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
CostKind, i, nullptr, nullptr);
if (Extract)
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i,
nullptr, nullptr);
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
CostKind, i, nullptr, nullptr);
}
return Cost;
@ -738,20 +748,24 @@ public:
/// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
bool Extract) {
bool Extract,
TTI::TargetCostKind CostKind) {
if (isa<ScalableVectorType>(InTy))
return InstructionCost::getInvalid();
auto *Ty = cast<FixedVectorType>(InTy);
APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
CostKind);
}
/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The (potentially vector) types to use for each
/// argument are passed via Tys.
InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys) {
InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) {
assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
InstructionCost Cost = 0;
@ -766,7 +780,8 @@ public:
if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
if (auto *VecTy = dyn_cast<VectorType>(Ty))
Cost += getScalarizationOverhead(VecTy, false, true);
Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
/*Extract*/ true, CostKind);
}
}
@ -779,14 +794,17 @@ public:
/// added as a heuristic.
InstructionCost getScalarizationOverhead(VectorType *RetTy,
ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys) {
InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) {
InstructionCost Cost = getScalarizationOverhead(
RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
if (!Args.empty())
Cost += getOperandsScalarizationOverhead(Args, Tys);
Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
else
// When no information on arguments is provided, we add the cost
// associated with one argument as a heuristic.
Cost += getScalarizationOverhead(RetTy, false, true);
Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
/*Extract*/ true, CostKind);
return Cost;
}
@ -898,7 +916,7 @@ public:
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
SmallVector<Type *> Tys(Args.size(), Ty);
return getScalarizationOverhead(VTy, Args, Tys) +
return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
VTy->getNumElements() * Cost;
}
@ -951,7 +969,7 @@ public:
switch (improveShuffleKindFromMask(Kind, Mask)) {
case TTI::SK_Broadcast:
if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
return getBroadcastShuffleOverhead(FVT);
return getBroadcastShuffleOverhead(FVT, CostKind);
return InstructionCost::getInvalid();
case TTI::SK_Select:
case TTI::SK_Splice:
@ -960,13 +978,13 @@ public:
case TTI::SK_PermuteSingleSrc:
case TTI::SK_PermuteTwoSrc:
if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
return getPermuteShuffleOverhead(FVT);
return getPermuteShuffleOverhead(FVT, CostKind);
return InstructionCost::getInvalid();
case TTI::SK_ExtractSubvector:
return getExtractSubvectorOverhead(Tp, Index,
return getExtractSubvectorOverhead(Tp, CostKind, Index,
cast<FixedVectorType>(SubTp));
case TTI::SK_InsertSubvector:
return getInsertSubvectorOverhead(Tp, Index,
return getInsertSubvectorOverhead(Tp, CostKind, Index,
cast<FixedVectorType>(SubTp));
}
llvm_unreachable("Unknown TTI::ShuffleKind");
@ -1110,7 +1128,9 @@ public:
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
CostKind) +
Num * Cost;
}
// We already handled vector-to-vector and scalar-to-scalar conversions.
@ -1119,8 +1139,12 @@ public:
// that the conversion is scalarized in one way or another.
if (Opcode == Instruction::BitCast) {
// Illegal bitcasts are done by storing and loading from a stack slot.
return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
(DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
/*Extract*/ true, CostKind)
: 0) +
(DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
/*Extract*/ false, CostKind)
: 0);
}
llvm_unreachable("Unhandled cast");
@ -1128,11 +1152,11 @@ public:
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
Index, nullptr, nullptr) +
CostKind, Index, nullptr, nullptr) +
thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
TTI::CastContextHint::None,
TTI::TCK_RecipThroughput);
TTI::CastContextHint::None, CostKind);
}
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
@ -1183,19 +1207,23 @@ public:
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
return getScalarizationOverhead(ValVTy, /*Insert*/ true,
/*Extract*/ false, CostKind) +
Num * Cost;
}
// Unknown scalar opcode.
return 1;
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1) {
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1) {
return getRegUsageForType(Val->getScalarType());
}
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) {
Value *Op0 = nullptr;
Value *Op1 = nullptr;
@ -1203,7 +1231,8 @@ public:
Op0 = IE->getOperand(0);
Op1 = IE->getOperand(1);
}
return thisT()->getVectorInstrCost(I.getOpcode(), Val, Index, Op0, Op1);
return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
Op1);
}
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
@ -1231,10 +1260,10 @@ public:
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
/*Insert*/ false,
/*Extract*/ true);
Cost +=
thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
/*Insert*/ true, /*Extract*/ false);
/*Extract*/ true, CostKind);
Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
/*Insert*/ true,
/*Extract*/ false, CostKind);
return Cost;
}
@ -1275,9 +1304,9 @@ public:
if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
// This is a vector load/store for some illegal type that is scalarized.
// We must account for the cost of building or decomposing the vector.
Cost += getScalarizationOverhead(cast<VectorType>(Src),
Opcode != Instruction::Store,
Opcode == Instruction::Store);
Cost += getScalarizationOverhead(
cast<VectorType>(Src), Opcode != Instruction::Store,
Opcode == Instruction::Store, CostKind);
}
}
@ -1389,13 +1418,13 @@ public:
// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
// The cost is estimated as extract elements at 0, 2, 4, 6 from the
// <8 x i32> vector and insert them into a <4 x i32> vector.
InstructionCost InsSubCost =
thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
/*Insert*/ true, /*Extract*/ false);
InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
SubVT, DemandedAllSubElts,
/*Insert*/ true, /*Extract*/ false, CostKind);
Cost += Indices.size() * InsSubCost;
Cost +=
thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
/*Insert*/ false, /*Extract*/ true);
Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
/*Insert*/ false,
/*Extract*/ true, CostKind);
} else {
// The interleave cost is extract elements from sub vectors, and
// insert them into the wide vector.
@ -1410,13 +1439,13 @@ public:
// The cost is estimated as extract all elements (of actual members,
// excluding gaps) from both <4 x i32> vectors and insert into the <12 x
// i32> vector.
InstructionCost ExtSubCost =
thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
/*Insert*/ false, /*Extract*/ true);
InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
SubVT, DemandedAllSubElts,
/*Insert*/ false, /*Extract*/ true, CostKind);
Cost += ExtSubCost * Indices.size();
Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
/*Insert*/ true,
/*Extract*/ false);
/*Extract*/ false, CostKind);
}
if (!UseMaskForCond)
@ -1649,10 +1678,11 @@ public:
if (RetVF.isVector() && !RetVF.isScalable()) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost += getScalarizationOverhead(
cast<VectorType>(RetTy),
/*Insert*/ true, /*Extract*/ false, CostKind);
ScalarizationCost +=
getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
}
IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
@ -1704,7 +1734,8 @@ public:
Type *ScalarRetTy = RetTy;
if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
if (!SkipScalarizationCost)
ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
ScalarizationCost = getScalarizationOverhead(
RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
ScalarCalls = std::max(ScalarCalls,
cast<FixedVectorType>(RetVTy)->getNumElements());
ScalarRetTy = RetTy->getScalarType();
@ -1714,7 +1745,8 @@ public:
Type *Ty = Tys[i];
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
if (!SkipScalarizationCost)
ScalarizationCost += getScalarizationOverhead(VTy, false, true);
ScalarizationCost += getScalarizationOverhead(
VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
ScalarCalls = std::max(ScalarCalls,
cast<FixedVectorType>(VTy)->getNumElements());
Ty = Ty->getScalarType();
@ -2124,8 +2156,10 @@ public:
return InstructionCost::getInvalid();
InstructionCost ScalarizationCost =
SkipScalarizationCost ? ScalarizationCostPassed
: getScalarizationOverhead(RetVTy, true, false);
SkipScalarizationCost
? ScalarizationCostPassed
: getScalarizationOverhead(RetVTy, /*Insert*/ true,
/*Extract*/ false, CostKind);
unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
SmallVector<Type *, 4> ScalarTys;
@ -2141,7 +2175,8 @@ public:
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
if (!ICA.skipScalarizationCost())
ScalarizationCost += getScalarizationOverhead(VTy, false, true);
ScalarizationCost += getScalarizationOverhead(
VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
ScalarCalls = std::max(ScalarCalls,
cast<FixedVectorType>(VTy)->getNumElements());
}
@ -2258,8 +2293,8 @@ public:
ArithCost +=
NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
return ShuffleCost + ArithCost +
thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
nullptr, nullptr);
thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
CostKind, 0, nullptr, nullptr);
}
/// Try to calculate the cost of performing strict (in-order) reductions,
@ -2286,8 +2321,8 @@ public:
return InstructionCost::getInvalid();
auto *VTy = cast<FixedVectorType>(Ty);
InstructionCost ExtractCost =
getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
InstructionCost ExtractCost = getScalarizationOverhead(
VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
Opcode, VTy->getElementType(), CostKind);
ArithCost *= VTy->getNumElements();
@ -2366,8 +2401,8 @@ public:
// The last min/max should be in vector registers and we counted it above.
// So just need a single extractelement.
return ShuffleCost + MinMaxCost +
thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
nullptr, nullptr);
thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
CostKind, 0, nullptr, nullptr);
}
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,

View File

@ -513,16 +513,17 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
return TTIImpl->useColdCCForColdCall(F);
}
InstructionCost
TargetTransformInfo::getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract) const {
return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
InstructionCost TargetTransformInfo::getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
TTI::TargetCostKind CostKind) const {
return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
CostKind);
}
InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
ArrayRef<const Value *> Args, ArrayRef<Type *> Tys) const {
return TTIImpl->getOperandsScalarizationOverhead(Args, Tys);
ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) const {
return TTIImpl->getOperandsScalarizationOverhead(Args, Tys, CostKind);
}
bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
@ -898,23 +899,25 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
unsigned Opcode, Type *Val, unsigned Index, Value *Op0, Value *Op1) const {
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
Value *Op0, Value *Op1) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
InstructionCost Cost =
TTIImpl->getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
TTIImpl->getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
InstructionCost TargetTransformInfo::getVectorInstrCost(const Instruction &I,
Type *Val,
unsigned Index) const {
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
InstructionCost Cost = TTIImpl->getVectorInstrCost(I, Val, Index);
InstructionCost Cost = TTIImpl->getVectorInstrCost(I, Val, CostKind, Index);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}

View File

@ -7383,11 +7383,11 @@ class VectorPromoteHelper {
// The scalar chain of computation has to pay for the transition
// scalar to vector.
// The vector chain has to account for the combining cost.
InstructionCost ScalarCost =
TTI.getVectorInstrCost(*Transition, PromotedType, Index);
InstructionCost VectorCost = StoreExtractCombineCost;
enum TargetTransformInfo::TargetCostKind CostKind =
TargetTransformInfo::TCK_RecipThroughput;
InstructionCost ScalarCost =
TTI.getVectorInstrCost(*Transition, PromotedType, CostKind, Index);
InstructionCost VectorCost = StoreExtractCombineCost;
for (const auto &Inst : InstsToBePromoted) {
// Compute the cost.
// By construction, all instructions being promoted are arithmetic ones.

View File

@ -2131,14 +2131,14 @@ InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
// Get the cost for the extract. We compute the cost (if any) for the extend
// below.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
Index, nullptr, nullptr);
CostKind, Index, nullptr, nullptr);
// Legalize the types.
auto VecLT = getTypeLegalizationCost(VecTy);
auto DstVT = TLI->getValueType(DL, Dst);
auto SrcVT = TLI->getValueType(DL, Src);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If the resulting type is still a vector and the destination type is legal,
// we may get the extension for free. If not, get the default cost for the
@ -2225,13 +2225,16 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val, unsigned Index) {
Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) {
return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
}

View File

@ -169,9 +169,11 @@ public:
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index);
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,

View File

@ -790,6 +790,7 @@ GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
}
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
switch (Opcode) {
@ -800,7 +801,8 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
if (EltSize < 32) {
if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
return 0;
return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
Op1);
}
// Extracts are just reads of a subregister, so are free. Inserts are
@ -811,7 +813,7 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return Index == ~0u ? 2 : 0;
}
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
}
}

View File

@ -162,6 +162,7 @@ public:
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;

View File

@ -108,6 +108,7 @@ InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
}
InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
switch (Opcode) {
@ -116,7 +117,8 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned EltSize =
DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
if (EltSize < 32) {
return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
Op1);
}
// Extracts are just reads of a subregister, so are free. Inserts are
@ -127,7 +129,7 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return Index == ~0u ? 2 : 0;
}
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
}
}

View File

@ -62,6 +62,7 @@ public:
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
};

View File

@ -874,6 +874,7 @@ InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
}
InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
// Penalize inserting into an D-subregister. We end up with a three times
@ -894,7 +895,8 @@ InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
if (ValTy->isVectorTy() &&
ValTy->getScalarSizeInBits() <= 32)
return std::max<InstructionCost>(
BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1), 2U);
BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
2U);
}
if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
@ -907,7 +909,7 @@ InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
}
return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
}
InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
@ -1021,12 +1023,14 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
// One scalarization insert, one scalarization extract and the cost of the
// fcmps.
return BaseT::getScalarizationOverhead(VecValTy, false, true) +
BaseT::getScalarizationOverhead(VecCondTy, true, false) +
return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
/*Extract*/ true, CostKind) +
BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
/*Extract*/ false, CostKind) +
VecValTy->getNumElements() *
getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
VecCondTy->getScalarType(), VecPred, CostKind,
I);
VecCondTy->getScalarType(), VecPred,
CostKind, I);
}
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
@ -1039,7 +1043,8 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
if (LT.first > 1)
return LT.first * BaseCost +
BaseT::getScalarizationOverhead(VecCondTy, true, false);
BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
/*Extract*/ false, CostKind);
return BaseCost;
}
}
@ -1442,7 +1447,8 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
SmallVector<Type *> Tys(Args.size(), Ty);
return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
Num * Cost;
}
return BaseCost;
@ -1581,8 +1587,11 @@ InstructionCost ARMTTIImpl::getGatherScatterOpCost(
// The scalarization cost should be a lot higher. We use the number of vector
// elements plus the scalarization overhead.
InstructionCost ScalarCost =
NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
BaseT::getScalarizationOverhead(VTy, false, true);
NumElems * LT.first +
BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
CostKind) +
BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
CostKind);
if (EltSize < 8 || Alignment < EltSize / 8)
return ScalarCost;

View File

@ -240,8 +240,9 @@ public:
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
const SCEV *Ptr);

View File

@ -139,14 +139,17 @@ ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth,
}
InstructionCost HexagonTTIImpl::getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) {
return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
TTI::TargetCostKind CostKind) {
return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
CostKind);
}
InstructionCost
HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys) {
return BaseT::getOperandsScalarizationOverhead(Args, Tys);
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) {
return BaseT::getOperandsScalarizationOverhead(Args, Tys, CostKind);
}
InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
@ -329,6 +332,7 @@ InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
}
InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
@ -339,8 +343,8 @@ InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (ElemTy->isIntegerTy(32))
return Cost;
// If it's not a 32-bit value, there will need to be an extract.
return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index,
Op0, Op1);
return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, CostKind,
Index, Op0, Op1);
}
if (Opcode == Instruction::ExtractElement)

View File

@ -107,9 +107,12 @@ public:
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract);
InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys);
bool Insert, bool Extract,
TTI::TargetCostKind CostKind);
InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind);
InstructionCost getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind);
@ -154,8 +157,9 @@ public:
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) {

View File

@ -675,6 +675,7 @@ InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
}
InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
assert(Val->isVectorTy() && "This must be a vector type");
@ -687,7 +688,7 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return InstructionCost::getMax();
InstructionCost Cost =
BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
Cost *= CostFactor;
if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
@ -829,8 +830,8 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
if (Src->isVectorTy() && Opcode == Instruction::Store)
for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
++i)
Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i, nullptr,
nullptr);
Cost += getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, i,
nullptr, nullptr);
return Cost;
}

View File

@ -126,8 +126,9 @@ public:
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
InstructionCost
getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, TTI::TargetCostKind CostKind,

View File

@ -1198,13 +1198,14 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
}
InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
assert(Val->isVectorTy() && "This must be a vector type");
if (Opcode != Instruction::ExtractElement &&
Opcode != Instruction::InsertElement)
return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@ -1218,7 +1219,7 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return LT.first;
if (!isTypeLegal(Val))
return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
// In RVV, we could use vslidedown + vmv.x.s to extract element from vector
// and vslideup + vmv.s.x to insert element to vector.

View File

@ -157,8 +157,9 @@ public:
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,

View File

@ -532,7 +532,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
if (DivRemConst) {
SmallVector<Type *> Tys(Args.size(), Ty);
return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
return VF * DivMulSeqCost +
getScalarizationOverhead(VTy, Args, Tys, CostKind);
}
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
@ -558,7 +559,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
SmallVector<Type *> Tys(Args.size(), Ty);
InstructionCost Cost =
(VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
(VF * ScalarCost) +
getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@ -576,8 +578,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
SmallVector<Type *> Tys(Args.size(), Ty);
InstructionCost Cost =
(VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
InstructionCost Cost = (VF * LIBCALL_COST) +
getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
@ -865,8 +867,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);
TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
NeedsExtracts, CostKind);
TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
/*Extract*/ false, CostKind);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@ -878,7 +882,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (Opcode == Instruction::FPTrunc) {
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
return VF /*ldxbr/lexbr*/ +
getScalarizationOverhead(DstVecTy, true, false);
getScalarizationOverhead(DstVecTy, /*Insert*/ true,
/*Extract*/ false, CostKind);
else // double -> float
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
}
@ -891,7 +896,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return VF * 2;
}
// -> fp128. VF * lxdb/lxeb + extraction of elements.
return VF + getScalarizationOverhead(SrcVecTy, false, true);
return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
/*Extract*/ true, CostKind);
}
}
@ -996,6 +1002,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
}
InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
// vlvgp will insert two grs into a vector register, so only count half the
@ -1013,7 +1020,7 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return Cost;
}
return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}
// Check if a load may be folded as a memory operand in its user.

View File

@ -107,8 +107,9 @@ public:
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
InstructionCost
getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,

View File

@ -80,12 +80,12 @@ InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
return Cost;
}
InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode,
Type *Val,
unsigned Index,
Value *Op0, Value *Op1) {
InstructionCost Cost =
BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
InstructionCost
WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1) {
InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
Opcode, Val, CostKind, Index, Op0, Op1);
// SIMD128's insert/extract currently only take constant indices.
if (Index == -1u)

View File

@ -66,8 +66,9 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
/// @}

View File

@ -4257,6 +4257,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
static const CostTblEntry SLMCostTbl[] = {
@ -4269,7 +4270,6 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
assert(Val->isVectorTy() && "This must be a vector type");
Type *ScalarType = Val->getScalarType();
InstructionCost RegisterFileMoveCost = 0;
TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput;
// Non-immediate extraction/insertion can be handled as a sequence of
// aliased loads+stores via the stack.
@ -4401,14 +4401,14 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
RegisterFileMoveCost += 1;
return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1) +
return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
RegisterFileMoveCost;
}
InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert,
bool Extract) {
InstructionCost
X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
bool Insert, bool Extract,
TTI::TargetCostKind CostKind) {
assert(DemandedElts.getBitWidth() ==
cast<FixedVectorType>(Ty)->getNumElements() &&
"Vector size mismatch");
@ -4416,7 +4416,6 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
MVT MScalarTy = LT.second.getScalarType();
unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput;
InstructionCost Cost = 0;
constexpr unsigned LaneBitWidth = 128;
@ -4436,8 +4435,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
// For types we can insert directly, insertion into 128-bit sub vectors is
// cheap, followed by a cheap chain of concatenations.
if (LegalVectorBitWidth <= LaneBitWidth) {
Cost +=
BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
/*Extract*/ false, CostKind);
} else {
// In each 128-lane, if at least one index is demanded but not all
// indices are demanded and this 128-lane is not the first 128-lane of
@ -4477,7 +4476,7 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
CostKind, I * NumEltsPerLane, LaneTy);
Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
false);
/*Extract*/ false, CostKind);
}
APInt AffectedLanes =
@ -4554,8 +4553,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
continue;
Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
CostKind, I * NumEltsPerLane, LaneTy);
Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, false,
Extract);
Cost += BaseT::getScalarizationOverhead(
LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
}
return Cost;
@ -4563,7 +4562,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
}
// Fallback to default extraction.
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
Extract, CostKind);
}
return Cost;
@ -4815,7 +4815,7 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
!IsLoad);
!IsLoad, CostKind);
}
// This isn't exactly right. We're using slow unaligned 32-byte accesses
@ -4856,15 +4856,15 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
(IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
// Scalarization
APInt DemandedElts = APInt::getAllOnes(NumElem);
InstructionCost MaskSplitCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
InstructionCost MaskSplitCost = getScalarizationOverhead(
MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
InstructionCost ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
InstructionCost ValueSplitCost =
getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
InstructionCost ValueSplitCost = getScalarizationOverhead(
SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
InstructionCost MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace, CostKind);
@ -5174,8 +5174,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
}
// Add the final extract element to the cost.
return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
nullptr, nullptr);
return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
CostKind, 0, nullptr, nullptr);
}
InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
@ -5475,8 +5475,8 @@ X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
}
// Add the final extract element to the cost.
return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
nullptr, nullptr);
return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
CostKind, 0, nullptr, nullptr);
}
/// Calculate the cost of materializing a 64-bit value. This helper
@ -5781,7 +5781,7 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
auto *MaskTy =
FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost = getScalarizationOverhead(
MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true);
MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
InstructionCost ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
@ -5791,7 +5791,7 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
InstructionCost AddressUnpackCost = getScalarizationOverhead(
FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
/*Insert=*/false, /*Extract=*/true);
/*Insert=*/false, /*Extract=*/true, CostKind);
// The cost of the scalar loads/stores.
InstructionCost MemoryOpCost =
@ -5800,10 +5800,10 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
// The cost of forming the vector from loaded scalars/
// scalarizing the vector to perform scalar stores.
InstructionCost InsertExtractCost =
getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts,
/*Insert=*/Opcode == Instruction::Load,
/*Extract=*/Opcode == Instruction::Store);
InstructionCost InsertExtractCost = getScalarizationOverhead(
cast<FixedVectorType>(SrcVTy), DemandedElts,
/*Insert=*/Opcode == Instruction::Load,
/*Extract=*/Opcode == Instruction::Store, CostKind);
return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}

View File

@ -147,11 +147,13 @@ public:
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
Value *Op0, Value *Op1);
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract);
bool Insert, bool Extract,
TTI::TargetCostKind CostKind);
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
int VF,
const APInt &DemandedDstElts,

View File

@ -1683,8 +1683,8 @@ private:
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I,
ElementCount VF) const;
InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
TTI::TargetCostKind CostKind) const;
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
@ -3443,8 +3443,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost ScalarCallCost =
TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
if (VF.isScalar())
return ScalarCallCost;
@ -3455,7 +3456,8 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
InstructionCost ScalarizationCost =
getScalarizationOverhead(CI, VF, CostKind);
InstructionCost Cost =
ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
@ -3471,7 +3473,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =
TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
Cost = VectorCallCost;
@ -4478,7 +4480,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// The cost of insertelement and extractelement instructions needed for
// scalarization.
ScalarizationCost += getScalarizationOverhead(I, VF);
ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
@ -6239,13 +6241,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), true, false);
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
/*Extract*/ false, CostKind);
ScalarCost +=
VF.getFixedValue() *
TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
// Compute the scalarization overhead of needed extractelement
@ -6261,7 +6264,8 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
else if (needsExtract(J, VF)) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(J->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), false, true);
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
/*Extract*/ true, CostKind);
}
}
@ -6390,14 +6394,15 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
const Align Alignment = getLoadStoreAlignment(I);
Cost += VF.getKnownMinValue() *
TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
AS, TTI::TCK_RecipThroughput);
Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
ValTy->getScalarType(),
Alignment, AS, CostKind);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
Cost += getScalarizationOverhead(I, VF);
Cost += getScalarizationOverhead(I, VF, CostKind);
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
@ -6410,8 +6415,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
Cost += TTI.getScalarizationOverhead(
Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
/*Insert=*/false, /*Extract=*/true);
Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
/*Insert=*/false, /*Extract=*/true, CostKind);
Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
if (useEmulatedMaskMemRefHack(I, VF))
// Artificially setting to a high enough value to practically disable
@ -6477,7 +6482,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
VF.getKnownMinValue() - 1));
CostKind, VF.getKnownMinValue() - 1));
}
InstructionCost
@ -6772,9 +6777,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return VectorizationCostTy(C, TypeNotScalarized);
}
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
ElementCount VF) const {
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
// There is no mechanism yet to create a scalable scalarization loop,
// so this is currently Invalid.
@ -6789,8 +6793,9 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
false);
cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
/*Insert*/ true,
/*Extract*/ false, CostKind);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@ -6810,7 +6815,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
for (auto *V : filterExtractingOperands(Ops, VF))
Tys.push_back(MaybeVectorizeType(V->getType(), VF));
return Cost + TTI.getOperandsScalarizationOverhead(
filterExtractingOperands(Ops, VF), Tys);
filterExtractingOperands(Ops, VF), Tys, CostKind);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
@ -7067,7 +7072,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (
TTI.getScalarizationOverhead(
Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
/*Insert*/ false, /*Extract*/ true, CostKind) +
(TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
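
The LoopVectorize hunks above all apply one mechanical change: a single CostKind local, still set to TTI::TCK_RecipThroughput, is declared near the top of each cost routine and threaded through every TTI query and through the pass's own helpers (getScalarizationOverhead, getVectorCallCost, and so on), so the computed costs stay the same. Below is a minimal sketch of that pattern for the predicated-memref overhead computed above; the helper and its parameters are illustrative assumptions.

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Sketch: extra cost of predicating a scalarized memory op across VF lanes.
static InstructionCost predicationOverhead(const TargetTransformInfo &TTI,
                                           VectorType *PredTy, unsigned VF) {
  // One CostKind per cost routine; behavior matches the old hard-coded
  // TCK_RecipThroughput call sites.
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  // Extract the i1 condition bit for every lane...
  InstructionCost Cost = TTI.getScalarizationOverhead(
      PredTy, APInt::getAllOnes(VF), /*Insert=*/false, /*Extract=*/true,
      CostKind);
  // ...and pay for one conditional branch per lane.
  Cost += TTI.getCFInstrCost(Instruction::Br, CostKind) * VF;
  return Cost;
}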

View File

@ -6664,7 +6664,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
continue;
}
}
Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), Idx);
Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
Idx);
}
// Add a cost for subvector extracts/inserts if required.
for (const auto &Data : ExtractVectorsTys) {
@ -6792,7 +6793,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
bool NeedShuffle =
VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof);
InstructionCost InsertCost =
TTI->getVectorInstrCost(Instruction::InsertElement, VecTy,
TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
/*Index=*/0, PoisonValue::get(VecTy), *It);
return InsertCost + (NeedShuffle
? TTI->getShuffleCost(
@ -7047,7 +7048,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
}
return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
*getExtractIndex(I));
CostKind, *getExtractIndex(I));
};
auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
return GetCostDiff(GetScalarCost, GetVectorCost);
@ -7116,7 +7117,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InstructionCost Cost = 0;
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
/*Insert*/ true, /*Extract*/ false);
/*Insert*/ true, /*Extract*/ false,
CostKind);
// First cost - resize to actual vector size if not identity shuffle or
// need to shift the vector.
@ -7995,6 +7997,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
@ -8004,8 +8007,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
ExtractCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, EU.Lane);
}
}
@ -8079,7 +8082,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
EstimateShufflesCost);
InstructionCost InsertCost = TTI->getScalarizationOverhead(
cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
/*Insert*/ true, /*Extract*/ false);
/*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
Cost -= InsertCost;
}
@ -8427,9 +8430,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
const APInt &ShuffledIndices,
bool NeedToShuffle) const {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost =
TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
/*Extract*/ false);
/*Extract*/ false, CostKind);
if (NeedToShuffle)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
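
In the getTreeCost hunk above, an external use of a vectorized scalar is costed either as an extract fused with a widening cast (when MinBWs narrowed the element) or as a plain extractelement, which now takes the explicit CostKind. A hedged sketch of that decision follows; the wrapper and the WasMinBWReduced/IsSigned parameters are illustrative assumptions.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Sketch: cost of handing one vectorized lane back to an external scalar user.
static InstructionCost externalExtractCost(const TargetTransformInfo &TTI,
                                           FixedVectorType *VecTy,
                                           Type *ScalarTy, unsigned Lane,
                                           bool WasMinBWReduced,
                                           bool IsSigned) {
  if (WasMinBWReduced) {
    // The vector holds narrowed elements, so the extract is paired with a
    // widening cast back to the original scalar type.
    unsigned ExtOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
    return TTI.getExtractWithExtendCost(ExtOpcode, ScalarTy, VecTy, Lane);
  }
  // Otherwise a plain extractelement, costed with an explicit CostKind.
  return TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                TargetTransformInfo::TCK_RecipThroughput,
                                Lane);
}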

View File

@ -230,8 +230,10 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
InstructionCost OldCost =
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
/* Insert */ true, HasExtract);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OldCost +=
TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
/* Insert */ true, HasExtract, CostKind);
// New pattern: load VecPtr
InstructionCost NewCost =
@ -346,9 +348,12 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
return nullptr;
Type *VecTy = Ext0->getVectorOperand()->getType();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
InstructionCost Cost0 = TTI.getVectorInstrCost(*Ext0, VecTy, Index0);
InstructionCost Cost1 = TTI.getVectorInstrCost(*Ext1, VecTy, Index1);
InstructionCost Cost0 =
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
InstructionCost Cost1 =
TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
// If both costs are invalid no shuffle is needed
if (!Cost0.isValid() && !Cost1.isValid())
@ -411,11 +416,12 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
// both sequences.
unsigned Ext0Index = Ext0IndexC->getZExtValue();
unsigned Ext1Index = Ext1IndexC->getZExtValue();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Extract0Cost =
TTI.getVectorInstrCost(*Ext0, VecTy, Ext0Index);
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
InstructionCost Extract1Cost =
TTI.getVectorInstrCost(*Ext1, VecTy, Ext1Index);
TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
// A more expensive extract will always be replaced by a splat shuffle.
// For example, if Ext0 is more expensive:
@ -645,15 +651,16 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
Mask[Index] = Index + NumElts;
Type *ScalarTy = VecTy->getScalarType();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
TTI.getVectorInstrCost(I, VecTy, Index);
TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
// If the extract has one use, it will be eliminated, so count it in the
// original cost. If it has more than one use, ignore the cost because it will
// be the same before/after.
if (Extract->hasOneUse())
OldCost += TTI.getVectorInstrCost(*Extract, VecTy, Index);
OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
InstructionCost NewCost =
TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
@ -801,8 +808,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// Get cost estimate for the insert element. This cost will factor into
// both sequences.
InstructionCost InsertCost =
TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost InsertCost = TTI.getVectorInstrCost(
Instruction::InsertElement, VecTy, CostKind, Index);
InstructionCost OldCost =
(IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
InstructionCost NewCost = ScalarOpCost + InsertCost +
@ -891,8 +899,10 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
if (!VecTy)
return false;
InstructionCost OldCost = TTI.getVectorInstrCost(*Ext0, VecTy, Index0);
OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, Index1);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost OldCost =
TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
OldCost +=
TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
CmpInst::makeCmpResultType(I0->getType()), Pred) *
@ -912,7 +922,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
ShufMask);
NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CheapIndex);
NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
// Aggressively form vector ops if the cost is equal because the transform
// may enable further optimization.
@ -1169,8 +1179,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
}
auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
OriginalCost +=
TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT,
TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind,
Index ? Index->getZExtValue() : -1);
ScalarizedCost +=
TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),

View File

@ -51,7 +51,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 undef>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
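
The AVX512 expectation above is regenerated for the cost computed under the new interface. More broadly, the point of threading CostKind through these hooks (per the commit message) is that size-oriented callers such as LoopUnroll's size estimate can now reach them with TCK_CodeSize instead of the throughput default used by the vectorizers. A hedged sketch of such a caller follows; the wrapper and its ForSize flag are illustrative assumptions.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Sketch: the same lane-insert query can now be asked for size or throughput.
static InstructionCost laneInsertCost(const TargetTransformInfo &TTI,
                                      FixedVectorType *VecTy, unsigned Index,
                                      bool ForSize) {
  TargetTransformInfo::TargetCostKind CostKind =
      ForSize ? TargetTransformInfo::TCK_CodeSize
              : TargetTransformInfo::TCK_RecipThroughput;
  return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                Index);
}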