[VP][RISCV] Add vp.ctlz/cttz and RISC-V support.

The patch also adds expandVPCTLZ and expandVPCTTZ to expand vp.ctlz/cttz
nodes, and adds cost modeling for vp.ctlz/cttz.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D140370
This commit is contained in:
Yeting Kuo 2022-12-20 13:24:01 +08:00
parent 63d46869ea
commit 1e9e1b9cf8
16 changed files with 25941 additions and 1 deletions

View File

@ -15394,6 +15394,8 @@ Semantics:
The '``llvm.ctpop``' intrinsic counts the 1's in a variable, or within
each element of a vector.
.. _int_ctlz:
'``llvm.ctlz.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -15438,6 +15440,8 @@ zeros in a variable, or within each element of the vector. If
if ``is_zero_poison == 0`` and ``poison`` otherwise. For example,
``llvm.ctlz(i32 2) = 30``.
.. _int_cttz:
'``llvm.cttz.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -22278,6 +22282,100 @@ Examples:
%also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
.. _int_vp_ctlz:
'``llvm.vp.ctlz.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic.
::
declare <16 x i32> @llvm.vp.ctlz.v16i32 (<16 x i32> <op>, <16 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>)
declare <vscale x 4 x i32> @llvm.vp.ctlz.nxv4i32 (<vscale x 4 x i32> <op>, <vscale x 4 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>)
declare <256 x i64> @llvm.vp.ctlz.v256i64 (<256 x i64> <op>, <256 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>)
Overview:
"""""""""
Predicated ctlz of a vector of integers.
Arguments:
""""""""""
The first operand and the result have the same vector of integer type. The
second operand is the vector mask and has the same number of elements as the
result vector type. The third operand is the explicit vector length of the
operation. The fourth operand is a constant flag that indicates whether the
result is a poison value if the first operand is zero.
Semantics:
""""""""""
The '``llvm.vp.ctlz``' intrinsic performs ctlz (:ref:`ctlz <int_ctlz>`) of the first operand on each
enabled lane. The result on disabled lanes is a :ref:`poison value <poisonvalues>`.
Examples:
"""""""""
.. code-block:: llvm
%r = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl, i1 false)
;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
%t = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
%also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
.. _int_vp_cttz:
'``llvm.vp.cttz.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic.
::
declare <16 x i32> @llvm.vp.cttz.v16i32 (<16 x i32> <op>, <16 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>)
declare <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32 (<vscale x 4 x i32> <op>, <vscale x 4 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>)
declare <256 x i64> @llvm.vp.cttz.v256i64 (<256 x i64> <op>, <256 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>)
Overview:
"""""""""
Predicated cttz of a vector of integers.
Arguments:
""""""""""
The first operand and the result have the same vector of integer type. The
second operand is the vector mask and has the same number of elements as the
result vector type. The third operand is the explicit vector length of the
operation. The fourth operand is a constant flag that indicates whether the
result is a poison value if the first operand is zero.
Semantics:
""""""""""
The '``llvm.vp.cttz``' intrinsic performs cttz (:ref:`cttz <int_cttz>`) of the first operand on each
enabled lane. The result on disabled lanes is a :ref:`poison value <poisonvalues>`.
Examples:
"""""""""
.. code-block:: llvm
%r = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl, i1 false)
;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
%t = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
%also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
.. _int_vp_fshl:
'``llvm.vp.fshl.*``' Intrinsics

View File

@ -4911,6 +4911,11 @@ public:
/// \returns The expansion result or SDValue() if it fails.
SDValue expandCTLZ(SDNode *N, SelectionDAG &DAG) const;
/// Expand VP_CTLZ/VP_CTLZ_ZERO_UNDEF nodes.
/// Expands to a VP popcount of the bit-smeared, inverted input
/// (ctlz(x) == popcount(~(x | x>>1 | x>>2 | ...))).
/// \param N Node to expand
/// \param DAG The SelectionDAG used to build the replacement nodes
/// \returns The expansion result or SDValue() if it fails.
SDValue expandVPCTLZ(SDNode *N, SelectionDAG &DAG) const;
/// Expand CTTZ via Table Lookup.
/// \param N Node to expand
/// \returns The expansion result or SDValue() if it fails.
@ -4923,6 +4928,11 @@ public:
/// \returns The expansion result or SDValue() if it fails.
SDValue expandCTTZ(SDNode *N, SelectionDAG &DAG) const;
/// Expand VP_CTTZ/VP_CTTZ_ZERO_UNDEF nodes.
/// Expands to a VP popcount of the trailing-zero mask
/// (cttz(x) == popcount(~x & (x - 1))).
/// \param N Node to expand
/// \param DAG The SelectionDAG used to build the replacement nodes
/// \returns The expansion result or SDValue() if it fails.
SDValue expandVPCTTZ(SDNode *N, SelectionDAG &DAG) const;
/// Expand ABS nodes. Expands vector/scalar ABS nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size))

View File

@ -1817,6 +1817,19 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
llvm_i32_ty]>;
}
// Predicated bit-counting intrinsics. Operand order is: source vector,
// mask (same element count as the source), explicit vector length (i32),
// and an i1 is_zero_poison flag. ImmArg<ArgIndex<3>> requires the
// is_zero_poison flag to be an immediate constant at each call site.
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<3>>] in {
// llvm.vp.ctlz(x, mask, evl, is_zero_poison): predicated count-leading-zeros.
def int_vp_ctlz : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
[ LLVMMatchType<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_i32_ty,
llvm_i1_ty]>;
// llvm.vp.cttz(x, mask, evl, is_zero_poison): predicated count-trailing-zeros.
def int_vp_cttz : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
[ LLVMMatchType<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_i32_ty,
llvm_i1_ty]>;
}
def int_get_active_lane_mask:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_anyint_ty, LLVMMatchType<1>],

View File

@ -228,6 +228,22 @@ END_REGISTER_VP(vp_bitreverse, VP_BITREVERSE)
BEGIN_REGISTER_VP(vp_ctpop, 1, 2, VP_CTPOP, -1)
END_REGISTER_VP(vp_ctpop, VP_CTPOP)
// llvm.vp.ctlz(x, mask, vlen, is_zero_poison)
// One intrinsic, two SDNodes: the is_zero_poison immediate selects
// VP_CTLZ_ZERO_UNDEF vs. VP_CTLZ during SelectionDAG building (see
// getISDForVPIntrinsic), so both nodes are registered under vp_ctlz.
// Mask is operand 1, explicit vector length is operand 2.
BEGIN_REGISTER_VP_INTRINSIC(vp_ctlz, 1, 2)
BEGIN_REGISTER_VP_SDNODE(VP_CTLZ, -1, vp_ctlz, 1, 2)
END_REGISTER_VP_SDNODE(VP_CTLZ)
BEGIN_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_UNDEF, -1, vp_ctlz_zero_undef, 1, 2)
END_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_UNDEF)
END_REGISTER_VP_INTRINSIC(vp_ctlz)
// llvm.vp.cttz(x, mask, vlen, is_zero_poison)
// As with vp.ctlz, is_zero_poison selects VP_CTTZ_ZERO_UNDEF vs. VP_CTTZ.
BEGIN_REGISTER_VP_INTRINSIC(vp_cttz, 1, 2)
BEGIN_REGISTER_VP_SDNODE(VP_CTTZ, -1, vp_cttz, 1, 2)
END_REGISTER_VP_SDNODE(VP_CTTZ)
BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ZERO_UNDEF, -1, vp_cttz_zero_undef, 1, 2)
END_REGISTER_VP_SDNODE(VP_CTTZ_ZERO_UNDEF)
END_REGISTER_VP_INTRINSIC(vp_cttz)
// llvm.vp.fshl(x,y,z,mask,vlen)
BEGIN_REGISTER_VP(vp_fshl, 3, 4, VP_FSHL, -1)
END_REGISTER_VP(vp_fshl, VP_FSHL)

View File

@ -813,6 +813,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
}
break;
case ISD::VP_CTLZ:
case ISD::VP_CTLZ_ZERO_UNDEF:
if (SDValue Expanded = TLI.expandVPCTLZ(Node, DAG)) {
Results.push_back(Expanded);
return;
}
break;
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
if (SDValue Expanded = TLI.expandCTTZ(Node, DAG)) {
@ -820,6 +827,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
}
break;
case ISD::VP_CTTZ:
case ISD::VP_CTTZ_ZERO_UNDEF:
if (SDValue Expanded = TLI.expandVPCTTZ(Node, DAG)) {
Results.push_back(Expanded);
return;
}
break;
case ISD::FSHL:
case ISD::VP_FSHL:
case ISD::FSHR:

View File

@ -1016,9 +1016,13 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::BSWAP:
case ISD::VP_BSWAP:
case ISD::CTLZ:
case ISD::VP_CTLZ:
case ISD::CTTZ:
case ISD::VP_CTTZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::VP_CTLZ_ZERO_UNDEF:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::VP_CTTZ_ZERO_UNDEF:
case ISD::CTPOP:
case ISD::VP_CTPOP:
case ISD::FABS: case ISD::VP_FABS:
@ -4097,11 +4101,15 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::BSWAP:
case ISD::VP_BSWAP:
case ISD::CTLZ:
case ISD::VP_CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::VP_CTLZ_ZERO_UNDEF:
case ISD::CTPOP:
case ISD::VP_CTPOP:
case ISD::CTTZ:
case ISD::VP_CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::VP_CTTZ_ZERO_UNDEF:
case ISD::FNEG: case ISD::VP_FNEG:
case ISD::VP_FABS:
case ISD::VP_SQRT:

View File

@ -7437,6 +7437,16 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) {
std::optional<unsigned> ResOPC;
switch (VPIntrin.getIntrinsicID()) {
case Intrinsic::vp_ctlz: {
bool IsZeroUndef = cast<ConstantInt>(VPIntrin.getArgOperand(3))->isOne();
ResOPC = IsZeroUndef ? ISD::VP_CTLZ_ZERO_UNDEF : ISD::VP_CTLZ;
break;
}
case Intrinsic::vp_cttz: {
bool IsZeroUndef = cast<ConstantInt>(VPIntrin.getArgOperand(3))->isOne();
ResOPC = IsZeroUndef ? ISD::VP_CTTZ_ZERO_UNDEF : ISD::VP_CTTZ;
break;
}
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
case Intrinsic::VPID: \
ResOPC = ISD::VPSD; \
@ -7771,6 +7781,16 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic(
setValue(&VPIntrin, N);
break;
}
case ISD::VP_CTLZ:
case ISD::VP_CTLZ_ZERO_UNDEF:
case ISD::VP_CTTZ:
case ISD::VP_CTTZ_ZERO_UNDEF: {
// Pop is_zero_poison operand.
OpValues.pop_back();
SDValue Result = DAG.getNode(Opcode, DL, VTs, OpValues);
setValue(&VPIntrin, Result);
break;
}
}
}

View File

@ -8382,6 +8382,33 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getNode(ISD::CTPOP, dl, VT, Op);
}
SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
  SDLoc DL(Node);
  EVT VT = Node->getValueType(0);
  EVT ShAmtVT = getShiftAmountTy(VT, DAG.getDataLayout());
  SDValue Res = Node->getOperand(0);
  SDValue Mask = Node->getOperand(1);
  SDValue EVL = Node->getOperand(2);
  unsigned BitWidth = VT.getScalarSizeInBits();
  // Smear the set bits rightward:
  //   x |= x >> 1; x |= x >> 2; ...; x |= x >> (BitWidth / 2);
  // Afterwards every bit at or below the highest set bit is one, so the
  // leading-zero count equals popcount(~x). All operations are performed
  // with the VP opcodes so the mask and EVL are honored on every lane.
  for (unsigned Shift = 1; Shift < BitWidth; Shift <<= 1) {
    SDValue ShAmt = DAG.getConstant(Shift, DL, ShAmtVT);
    SDValue Shifted = DAG.getNode(ISD::VP_LSHR, DL, VT, Res, ShAmt, Mask, EVL);
    Res = DAG.getNode(ISD::VP_OR, DL, VT, Res, Shifted, Mask, EVL);
  }
  // Invert (xor with all-ones) and count the remaining set bits.
  Res = DAG.getNode(ISD::VP_XOR, DL, VT, Res, DAG.getConstant(-1, DL, VT),
                    Mask, EVL);
  return DAG.getNode(ISD::VP_CTPOP, DL, VT, Res, Mask, EVL);
}
SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG,
const SDLoc &DL, EVT VT, SDValue Op,
unsigned BitWidth) const {
@ -8482,6 +8509,22 @@ SDValue TargetLowering::expandCTTZ(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
}
SDValue TargetLowering::expandVPCTTZ(SDNode *Node, SelectionDAG &DAG) const {
  SDLoc DL(Node);
  EVT VT = Node->getValueType(0);
  SDValue Src = Node->getOperand(0);
  SDValue Mask = Node->getOperand(1);
  SDValue EVL = Node->getOperand(2);
  // cttz(x) == popcount(~x & (x - 1)): the value ~x & (x - 1) has exactly
  // the trailing-zero bits of x set (and is all-ones when x == 0, giving
  // the bit width). Mirrors the vector path of expandCTTZ, but built from
  // VP opcodes so the mask and EVL apply lane-wise.
  SDValue Inverted = DAG.getNode(ISD::VP_XOR, DL, VT, Src,
                                 DAG.getConstant(-1, DL, VT), Mask, EVL);
  SDValue Decremented = DAG.getNode(ISD::VP_SUB, DL, VT, Src,
                                    DAG.getConstant(1, DL, VT), Mask, EVL);
  SDValue TrailMask =
      DAG.getNode(ISD::VP_AND, DL, VT, Inverted, Decremented, Mask, EVL);
  return DAG.getNode(ISD::VP_CTPOP, DL, VT, TrailMask, Mask, EVL);
}
SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
bool IsNegative) const {
SDLoc dl(N);

View File

@ -611,7 +611,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction({ISD::VP_BSWAP, ISD::VP_BITREVERSE}, VT, Expand);
setOperationAction({ISD::VP_FSHL, ISD::VP_FSHR}, VT, Expand);
setOperationAction(ISD::VP_CTPOP, VT, Expand);
setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
VT, Expand);
// Custom-lower extensions and truncations from/to mask types.
setOperationAction({ISD::ANY_EXTEND, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND},

View File

@ -746,6 +746,82 @@ static const CostTblEntry VectorIntrinsicCostTable[]{
{Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
{Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
{Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
{Intrinsic::vp_ctlz, MVT::v2i8, 19},
{Intrinsic::vp_ctlz, MVT::v4i8, 19},
{Intrinsic::vp_ctlz, MVT::v8i8, 19},
{Intrinsic::vp_ctlz, MVT::v16i8, 19},
{Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
{Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
{Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
{Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
{Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
{Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
{Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
{Intrinsic::vp_ctlz, MVT::v2i16, 28},
{Intrinsic::vp_ctlz, MVT::v4i16, 28},
{Intrinsic::vp_ctlz, MVT::v8i16, 28},
{Intrinsic::vp_ctlz, MVT::v16i16, 28},
{Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
{Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
{Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
{Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
{Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
{Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
{Intrinsic::vp_ctlz, MVT::v2i32, 31},
{Intrinsic::vp_ctlz, MVT::v4i32, 31},
{Intrinsic::vp_ctlz, MVT::v8i32, 31},
{Intrinsic::vp_ctlz, MVT::v16i32, 31},
{Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
{Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
{Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
{Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
{Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
{Intrinsic::vp_ctlz, MVT::v2i64, 35},
{Intrinsic::vp_ctlz, MVT::v4i64, 35},
{Intrinsic::vp_ctlz, MVT::v8i64, 35},
{Intrinsic::vp_ctlz, MVT::v16i64, 35},
{Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
{Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
{Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
{Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
{Intrinsic::vp_cttz, MVT::v2i8, 16},
{Intrinsic::vp_cttz, MVT::v4i8, 16},
{Intrinsic::vp_cttz, MVT::v8i8, 16},
{Intrinsic::vp_cttz, MVT::v16i8, 16},
{Intrinsic::vp_cttz, MVT::nxv1i8, 16},
{Intrinsic::vp_cttz, MVT::nxv2i8, 16},
{Intrinsic::vp_cttz, MVT::nxv4i8, 16},
{Intrinsic::vp_cttz, MVT::nxv8i8, 16},
{Intrinsic::vp_cttz, MVT::nxv16i8, 16},
{Intrinsic::vp_cttz, MVT::nxv32i8, 16},
{Intrinsic::vp_cttz, MVT::nxv64i8, 16},
{Intrinsic::vp_cttz, MVT::v2i16, 23},
{Intrinsic::vp_cttz, MVT::v4i16, 23},
{Intrinsic::vp_cttz, MVT::v8i16, 23},
{Intrinsic::vp_cttz, MVT::v16i16, 23},
{Intrinsic::vp_cttz, MVT::nxv1i16, 23},
{Intrinsic::vp_cttz, MVT::nxv2i16, 23},
{Intrinsic::vp_cttz, MVT::nxv4i16, 23},
{Intrinsic::vp_cttz, MVT::nxv8i16, 23},
{Intrinsic::vp_cttz, MVT::nxv16i16, 23},
{Intrinsic::vp_cttz, MVT::nxv32i16, 23},
{Intrinsic::vp_cttz, MVT::v2i32, 24},
{Intrinsic::vp_cttz, MVT::v4i32, 24},
{Intrinsic::vp_cttz, MVT::v8i32, 24},
{Intrinsic::vp_cttz, MVT::v16i32, 24},
{Intrinsic::vp_cttz, MVT::nxv1i32, 24},
{Intrinsic::vp_cttz, MVT::nxv2i32, 24},
{Intrinsic::vp_cttz, MVT::nxv4i32, 24},
{Intrinsic::vp_cttz, MVT::nxv8i32, 24},
{Intrinsic::vp_cttz, MVT::nxv16i32, 24},
{Intrinsic::vp_cttz, MVT::v2i64, 25},
{Intrinsic::vp_cttz, MVT::v4i64, 25},
{Intrinsic::vp_cttz, MVT::v8i64, 25},
{Intrinsic::vp_cttz, MVT::v16i64, 25},
{Intrinsic::vp_cttz, MVT::nxv1i64, 25},
{Intrinsic::vp_cttz, MVT::nxv2i64, 25},
{Intrinsic::vp_cttz, MVT::nxv4i64, 25},
{Intrinsic::vp_cttz, MVT::nxv8i64, 25},
};
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {

View File

@ -382,6 +382,216 @@ define void @vp_ctpop() {
ret void
}
; Cost-model coverage for llvm.vp.ctlz over fixed and scalable integer
; vector types (i8 through i64). The CHECK lines are FileCheck patterns
; matched against the cost-model analysis output, not against this input.
; NOTE(review): several call-site suffixes below are spelled "nvx" while the
; correct scalable-vector mangling is "nxv" (as the CHECK lines show); this
; appears to rely on the IR parser's intrinsic auto-remangling — confirm, and
; consider regenerating the calls (and their declare lines) with "nxv".
define void @vp_ctlz() {
; CHECK-LABEL: 'vp_ctlz'
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %1 = call <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %2 = call <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %3 = call <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %4 = call <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %5 = call <vscale x 1 x i8> @llvm.vp.ctlz.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %6 = call <vscale x 2 x i8> @llvm.vp.ctlz.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %7 = call <vscale x 4 x i8> @llvm.vp.ctlz.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %8 = call <vscale x 8 x i8> @llvm.vp.ctlz.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %9 = call <vscale x 16 x i8> @llvm.vp.ctlz.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %10 = call <vscale x 32 x i8> @llvm.vp.ctlz.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %11 = call <vscale x 64 x i8> @llvm.vp.ctlz.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %12 = call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %13 = call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %14 = call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %15 = call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %16 = call <vscale x 1 x i16> @llvm.vp.ctlz.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %17 = call <vscale x 2 x i16> @llvm.vp.ctlz.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %18 = call <vscale x 4 x i16> @llvm.vp.ctlz.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %19 = call <vscale x 8 x i16> @llvm.vp.ctlz.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %20 = call <vscale x 16 x i16> @llvm.vp.ctlz.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %21 = call <vscale x 32 x i16> @llvm.vp.ctlz.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %22 = call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %23 = call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %24 = call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %25 = call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %26 = call <vscale x 1 x i16> @llvm.vp.ctlz.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %27 = call <vscale x 2 x i16> @llvm.vp.ctlz.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %28 = call <vscale x 4 x i16> @llvm.vp.ctlz.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %29 = call <vscale x 8 x i16> @llvm.vp.ctlz.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %30 = call <vscale x 16 x i16> @llvm.vp.ctlz.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %31 = call <vscale x 32 x i16> @llvm.vp.ctlz.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %32 = call <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %33 = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %34 = call <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %35 = call <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %36 = call <vscale x 1 x i32> @llvm.vp.ctlz.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %37 = call <vscale x 2 x i32> @llvm.vp.ctlz.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %38 = call <vscale x 4 x i32> @llvm.vp.ctlz.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %39 = call <vscale x 8 x i32> @llvm.vp.ctlz.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %40 = call <vscale x 16 x i32> @llvm.vp.ctlz.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %41 = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %42 = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %43 = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %44 = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %45 = call <vscale x 1 x i64> @llvm.vp.ctlz.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %46 = call <vscale x 2 x i64> @llvm.vp.ctlz.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %47 = call <vscale x 4 x i64> @llvm.vp.ctlz.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %48 = call <vscale x 8 x i64> @llvm.vp.ctlz.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %49 = call <vscale x 16 x i64> @llvm.vp.ctlz.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
call <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i8> @llvm.vp.ctlz.nvx1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i8> @llvm.vp.ctlz.nvx2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i8> @llvm.vp.ctlz.nvx4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i8> @llvm.vp.ctlz.nvx8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i8> @llvm.vp.ctlz.nvx16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
call <vscale x 32 x i8> @llvm.vp.ctlz.nvx32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
call <vscale x 64 x i8> @llvm.vp.ctlz.nvx64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef, i1 false)
call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i16> @llvm.vp.ctlz.nvx1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i16> @llvm.vp.ctlz.nvx2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i16> @llvm.vp.ctlz.nvx4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i16> @llvm.vp.ctlz.nvx8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i16> @llvm.vp.ctlz.nvx16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
call <vscale x 32 x i16> @llvm.vp.ctlz.nvx32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i16> @llvm.vp.ctlz.nvx1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i16> @llvm.vp.ctlz.nvx2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i16> @llvm.vp.ctlz.nvx4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i16> @llvm.vp.ctlz.nvx8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i16> @llvm.vp.ctlz.nvx16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
call <vscale x 32 x i16> @llvm.vp.ctlz.nvx32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
call <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i32> @llvm.vp.ctlz.nvx1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i32> @llvm.vp.ctlz.nvx2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i32> @llvm.vp.ctlz.nvx4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i32> @llvm.vp.ctlz.nvx8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i32> @llvm.vp.ctlz.nvx16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i64> @llvm.vp.ctlz.nvx1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i64> @llvm.vp.ctlz.nvx2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i64> @llvm.vp.ctlz.nvx4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i64> @llvm.vp.ctlz.nvx8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i64> @llvm.vp.ctlz.nvx16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
ret void
}
define void @vp_cttz() {
; CHECK-LABEL: 'vp_cttz'
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %1 = call <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %2 = call <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %3 = call <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %5 = call <vscale x 1 x i8> @llvm.vp.cttz.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = call <vscale x 2 x i8> @llvm.vp.cttz.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %7 = call <vscale x 4 x i8> @llvm.vp.cttz.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %8 = call <vscale x 8 x i8> @llvm.vp.cttz.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %9 = call <vscale x 16 x i8> @llvm.vp.cttz.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %10 = call <vscale x 32 x i8> @llvm.vp.cttz.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %11 = call <vscale x 64 x i8> @llvm.vp.cttz.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %12 = call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %13 = call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %14 = call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %15 = call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %16 = call <vscale x 1 x i16> @llvm.vp.cttz.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %17 = call <vscale x 2 x i16> @llvm.vp.cttz.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %18 = call <vscale x 4 x i16> @llvm.vp.cttz.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %19 = call <vscale x 8 x i16> @llvm.vp.cttz.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %20 = call <vscale x 16 x i16> @llvm.vp.cttz.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %21 = call <vscale x 32 x i16> @llvm.vp.cttz.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %22 = call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %23 = call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %24 = call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %25 = call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %26 = call <vscale x 1 x i16> @llvm.vp.cttz.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %27 = call <vscale x 2 x i16> @llvm.vp.cttz.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %28 = call <vscale x 4 x i16> @llvm.vp.cttz.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %29 = call <vscale x 8 x i16> @llvm.vp.cttz.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %30 = call <vscale x 16 x i16> @llvm.vp.cttz.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %31 = call <vscale x 32 x i16> @llvm.vp.cttz.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %32 = call <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %33 = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %34 = call <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %35 = call <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %36 = call <vscale x 1 x i32> @llvm.vp.cttz.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %37 = call <vscale x 2 x i32> @llvm.vp.cttz.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %38 = call <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %39 = call <vscale x 8 x i32> @llvm.vp.cttz.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %40 = call <vscale x 16 x i32> @llvm.vp.cttz.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %41 = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %42 = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %43 = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %44 = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %45 = call <vscale x 1 x i64> @llvm.vp.cttz.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %46 = call <vscale x 2 x i64> @llvm.vp.cttz.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %47 = call <vscale x 4 x i64> @llvm.vp.cttz.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %48 = call <vscale x 8 x i64> @llvm.vp.cttz.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %49 = call <vscale x 16 x i64> @llvm.vp.cttz.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
call <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i8> @llvm.vp.cttz.nvx1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i8> @llvm.vp.cttz.nvx2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i8> @llvm.vp.cttz.nvx4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i8> @llvm.vp.cttz.nvx8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i8> @llvm.vp.cttz.nvx16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
call <vscale x 32 x i8> @llvm.vp.cttz.nvx32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
call <vscale x 64 x i8> @llvm.vp.cttz.nvx64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef, i1 false)
call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i16> @llvm.vp.cttz.nvx1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i16> @llvm.vp.cttz.nvx2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i16> @llvm.vp.cttz.nvx4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i16> @llvm.vp.cttz.nvx8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i16> @llvm.vp.cttz.nvx16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
call <vscale x 32 x i16> @llvm.vp.cttz.nvx32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i16> @llvm.vp.cttz.nvx1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i16> @llvm.vp.cttz.nvx2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i16> @llvm.vp.cttz.nvx4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i16> @llvm.vp.cttz.nvx8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i16> @llvm.vp.cttz.nvx16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
call <vscale x 32 x i16> @llvm.vp.cttz.nvx32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef, i1 false)
call <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i32> @llvm.vp.cttz.nvx1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i32> @llvm.vp.cttz.nvx2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i32> @llvm.vp.cttz.nvx4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i32> @llvm.vp.cttz.nvx8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i32> @llvm.vp.cttz.nvx16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef, i1 false)
call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef, i1 false)
call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef, i1 false)
call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef, i1 false)
call <vscale x 1 x i64> @llvm.vp.cttz.nvx1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef, i1 false)
call <vscale x 2 x i64> @llvm.vp.cttz.nvx2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef, i1 false)
call <vscale x 4 x i64> @llvm.vp.cttz.nvx4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef, i1 false)
call <vscale x 8 x i64> @llvm.vp.cttz.nvx8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef, i1 false)
call <vscale x 16 x i64> @llvm.vp.cttz.nvx16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef, i1 false)
ret void
}
declare i16 @llvm.bswap.i16(i16)
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
@ -559,3 +769,83 @@ declare <vscale x 2 x i64> @llvm.vp.ctpop.nvx2i64(<vscale x 2 x i64>, <vscale x
declare <vscale x 4 x i64> @llvm.vp.ctpop.nvx4i64(<vscale x 4 x i64>, <vscale x 4 x i1>, i32)
declare <vscale x 8 x i64> @llvm.vp.ctpop.nvx8i64(<vscale x 8 x i64>, <vscale x 8 x i1>, i32)
declare <vscale x 16 x i64> @llvm.vp.ctpop.nvx16i64(<vscale x 16 x i64>, <vscale x 16 x i1>, i32)
declare <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8>, <2 x i1>, i32, i1 immarg)
declare <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8>, <4 x i1>, i32, i1 immarg)
declare <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8>, <8 x i1>, i32, i1 immarg)
declare <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8>, <16 x i1>, i32, i1 immarg)
declare <vscale x 1 x i8> @llvm.vp.ctlz.nvx1i8(<vscale x 1 x i8>, <vscale x 1 x i1>, i32, i1 immarg)
declare <vscale x 2 x i8> @llvm.vp.ctlz.nvx2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32, i1 immarg)
declare <vscale x 4 x i8> @llvm.vp.ctlz.nvx4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i32, i1 immarg)
declare <vscale x 8 x i8> @llvm.vp.ctlz.nvx8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i32, i1 immarg)
declare <vscale x 16 x i8> @llvm.vp.ctlz.nvx16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i32, i1 immarg)
declare <vscale x 32 x i8> @llvm.vp.ctlz.nvx32i8(<vscale x 32 x i8>, <vscale x 32 x i1>, i32, i1 immarg)
declare <vscale x 64 x i8> @llvm.vp.ctlz.nvx64i8(<vscale x 64 x i8>, <vscale x 64 x i1>, i32, i1 immarg)
declare <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16>, <2 x i1>, i32, i1 immarg)
declare <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16>, <4 x i1>, i32, i1 immarg)
declare <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16>, <8 x i1>, i32, i1 immarg)
declare <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16>, <16 x i1>, i32, i1 immarg)
declare <vscale x 1 x i16> @llvm.vp.ctlz.nvx1i16(<vscale x 1 x i16>, <vscale x 1 x i1>, i32, i1 immarg)
declare <vscale x 2 x i16> @llvm.vp.ctlz.nvx2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32, i1 immarg)
declare <vscale x 4 x i16> @llvm.vp.ctlz.nvx4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i32, i1 immarg)
declare <vscale x 8 x i16> @llvm.vp.ctlz.nvx8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i32, i1 immarg)
declare <vscale x 16 x i16> @llvm.vp.ctlz.nvx16i16(<vscale x 16 x i16>, <vscale x 16 x i1>, i32, i1 immarg)
declare <vscale x 32 x i16> @llvm.vp.ctlz.nvx32i16(<vscale x 32 x i16>, <vscale x 32 x i1>, i32, i1 immarg)
declare <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32>, <2 x i1>, i32, i1 immarg)
declare <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32>, <4 x i1>, i32, i1 immarg)
declare <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32>, <8 x i1>, i32, i1 immarg)
declare <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32>, <16 x i1>, i32, i1 immarg)
declare <vscale x 1 x i32> @llvm.vp.ctlz.nvx1i32(<vscale x 1 x i32>, <vscale x 1 x i1>, i32, i1 immarg)
declare <vscale x 2 x i32> @llvm.vp.ctlz.nvx2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32, i1 immarg)
declare <vscale x 4 x i32> @llvm.vp.ctlz.nvx4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32, i1 immarg)
declare <vscale x 8 x i32> @llvm.vp.ctlz.nvx8i32(<vscale x 8 x i32>, <vscale x 8 x i1>, i32, i1 immarg)
declare <vscale x 16 x i32> @llvm.vp.ctlz.nvx16i32(<vscale x 16 x i32>, <vscale x 16 x i1>, i32, i1 immarg)
declare <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64>, <2 x i1>, i32, i1 immarg)
declare <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64>, <4 x i1>, i32, i1 immarg)
declare <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64>, <8 x i1>, i32, i1 immarg)
declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, <16 x i1>, i32, i1 immarg)
declare <vscale x 1 x i64> @llvm.vp.ctlz.nvx1i64(<vscale x 1 x i64>, <vscale x 1 x i1>, i32, i1 immarg)
declare <vscale x 2 x i64> @llvm.vp.ctlz.nvx2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i1 immarg)
declare <vscale x 4 x i64> @llvm.vp.ctlz.nvx4i64(<vscale x 4 x i64>, <vscale x 4 x i1>, i32, i1 immarg)
declare <vscale x 8 x i64> @llvm.vp.ctlz.nvx8i64(<vscale x 8 x i64>, <vscale x 8 x i1>, i32, i1 immarg)
declare <vscale x 16 x i64> @llvm.vp.ctlz.nvx16i64(<vscale x 16 x i64>, <vscale x 16 x i1>, i32, i1 immarg)
declare <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8>, <2 x i1>, i32, i1 immarg)
declare <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8>, <4 x i1>, i32, i1 immarg)
declare <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8>, <8 x i1>, i32, i1 immarg)
declare <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8>, <16 x i1>, i32, i1 immarg)
declare <vscale x 1 x i8> @llvm.vp.cttz.nvx1i8(<vscale x 1 x i8>, <vscale x 1 x i1>, i32, i1 immarg)
declare <vscale x 2 x i8> @llvm.vp.cttz.nvx2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32, i1 immarg)
declare <vscale x 4 x i8> @llvm.vp.cttz.nvx4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i32, i1 immarg)
declare <vscale x 8 x i8> @llvm.vp.cttz.nvx8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i32, i1 immarg)
declare <vscale x 16 x i8> @llvm.vp.cttz.nvx16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i32, i1 immarg)
declare <vscale x 32 x i8> @llvm.vp.cttz.nvx32i8(<vscale x 32 x i8>, <vscale x 32 x i1>, i32, i1 immarg)
declare <vscale x 64 x i8> @llvm.vp.cttz.nvx64i8(<vscale x 64 x i8>, <vscale x 64 x i1>, i32, i1 immarg)
declare <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16>, <2 x i1>, i32, i1 immarg)
declare <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16>, <4 x i1>, i32, i1 immarg)
declare <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16>, <8 x i1>, i32, i1 immarg)
declare <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16>, <16 x i1>, i32, i1 immarg)
declare <vscale x 1 x i16> @llvm.vp.cttz.nvx1i16(<vscale x 1 x i16>, <vscale x 1 x i1>, i32, i1 immarg)
declare <vscale x 2 x i16> @llvm.vp.cttz.nvx2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32, i1 immarg)
declare <vscale x 4 x i16> @llvm.vp.cttz.nvx4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i32, i1 immarg)
declare <vscale x 8 x i16> @llvm.vp.cttz.nvx8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i32, i1 immarg)
declare <vscale x 16 x i16> @llvm.vp.cttz.nvx16i16(<vscale x 16 x i16>, <vscale x 16 x i1>, i32, i1 immarg)
declare <vscale x 32 x i16> @llvm.vp.cttz.nvx32i16(<vscale x 32 x i16>, <vscale x 32 x i1>, i32, i1 immarg)
declare <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32>, <2 x i1>, i32, i1 immarg)
declare <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32>, <4 x i1>, i32, i1 immarg)
declare <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32>, <8 x i1>, i32, i1 immarg)
declare <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32>, <16 x i1>, i32, i1 immarg)
declare <vscale x 1 x i32> @llvm.vp.cttz.nvx1i32(<vscale x 1 x i32>, <vscale x 1 x i1>, i32, i1 immarg)
declare <vscale x 2 x i32> @llvm.vp.cttz.nvx2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32, i1 immarg)
declare <vscale x 4 x i32> @llvm.vp.cttz.nvx4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32, i1 immarg)
declare <vscale x 8 x i32> @llvm.vp.cttz.nvx8i32(<vscale x 8 x i32>, <vscale x 8 x i1>, i32, i1 immarg)
declare <vscale x 16 x i32> @llvm.vp.cttz.nvx16i32(<vscale x 16 x i32>, <vscale x 16 x i1>, i32, i1 immarg)
declare <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64>, <2 x i1>, i32, i1 immarg)
declare <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64>, <4 x i1>, i32, i1 immarg)
declare <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64>, <8 x i1>, i32, i1 immarg)
declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, <16 x i1>, i32, i1 immarg)
declare <vscale x 1 x i64> @llvm.vp.cttz.nvx1i64(<vscale x 1 x i64>, <vscale x 1 x i1>, i32, i1 immarg)
declare <vscale x 2 x i64> @llvm.vp.cttz.nvx2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i1 immarg)
declare <vscale x 4 x i64> @llvm.vp.cttz.nvx4i64(<vscale x 4 x i64>, <vscale x 4 x i1>, i32, i1 immarg)
declare <vscale x 8 x i64> @llvm.vp.cttz.nvx8i64(<vscale x 8 x i64>, <vscale x 8 x i1>, i32, i1 immarg)
declare <vscale x 16 x i64> @llvm.vp.cttz.nvx16i64(<vscale x 16 x i64>, <vscale x 16 x i1>, i32, i1 immarg)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -152,6 +152,10 @@ protected:
<< "(<8 x i16>, <8 x i1>, i32) ";
Str << " declare <8 x i16> @llvm.vp.ctpop.v8i16"
<< "(<8 x i16>, <8 x i1>, i32) ";
Str << " declare <8 x i16> @llvm.vp.ctlz.v8i16"
<< "(<8 x i16>, <8 x i1>, i32, i1 immarg) ";
Str << " declare <8 x i16> @llvm.vp.cttz.v8i16"
<< "(<8 x i16>, <8 x i1>, i32, i1 immarg) ";
Str << " declare <8 x i16> @llvm.vp.fshl.v8i16"
<< "(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i1>, i32) ";
Str << " declare <8 x i16> @llvm.vp.fshr.v8i16"