[AArch64] Add @llvm.experimental.vector.match (#101974)

This patch introduces an experimental intrinsic for matching the
elements of one vector against the elements of another.

For AArch64 targets that support SVE2, the intrinsic lowers to a MATCH
instruction for supported fixed and scalable vector types.
Ricardo Jesus 2024-11-14 09:00:19 +00:00 committed by GitHub
parent debfd7b0b4
commit e52238b59f
8 changed files with 735 additions and 0 deletions


@@ -20091,6 +20091,44 @@ are undefined.
}
'``llvm.experimental.vector.match.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic.
::
declare <<n> x i1> @llvm.experimental.vector.match(<<n> x <ty>> %op1, <<m> x <ty>> %op2, <<n> x i1> %mask)
declare <vscale x <n> x i1> @llvm.experimental.vector.match(<vscale x <n> x <ty>> %op1, <<m> x <ty>> %op2, <vscale x <n> x i1> %mask)
Overview:
"""""""""
Find active elements of the first argument matching any elements of the second.
Arguments:
""""""""""
The first argument is the search vector, the second argument is the vector of
elements we are searching for (i.e. for which we consider a match successful),
and the third argument is a mask that controls which elements of the first
argument are active. The first two arguments must be vectors of matching
integer element types. The first and third arguments and the result type must
have matching element counts (fixed or scalable). The second argument must be a
fixed vector, but its length may be different from the remaining arguments.
Semantics:
""""""""""
The '``llvm.experimental.vector.match``' intrinsic compares each active element
in the first argument against the elements of the second argument, placing
``1`` in the corresponding element of the output vector if any equality
comparison is successful, and ``0`` otherwise. Inactive elements in the mask
are set to ``0`` in the output.
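As a sketch of the semantics (example values chosen here for illustration, not
taken from the patch): with ``%op1 = <i8 1, i8 2, i8 3, i8 4>``,
``%op2 = <i8 3, i8 1>``, and an all-true mask,

  %r = call <4 x i1> @llvm.experimental.vector.match(<4 x i8> %op1, <2 x i8> %op2, <4 x i1> %mask)

returns ``<i1 1, i1 0, i1 1, i1 0>``, since lanes 0 and 2 of ``%op1`` are equal
to one of the search elements.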
Matrix Intrinsics
-----------------


@@ -483,6 +483,13 @@ public:
bool ZeroIsPoison,
const ConstantRange *VScaleRange) const;
/// Return true if the @llvm.experimental.vector.match intrinsic should be
/// expanded for vector type `VT' and search size `SearchSize' using generic
/// code in SelectionDAGBuilder.
virtual bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const {
return true;
}
// Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
// vecreduce(op(x, y)) for the reduction opcode RedOpc.
virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {


@@ -1920,6 +1920,14 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
[ IntrArgMemOnly ]>;
// Experimental match
def int_experimental_vector_match : DefaultAttrsIntrinsic<
[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
[ llvm_anyvector_ty,
llvm_anyvector_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], // Mask
[ IntrNoMem, IntrNoSync, IntrWillReturn ]>;
// Operators
let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
// Integer arithmetic


@@ -8175,6 +8175,36 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
return;
}
case Intrinsic::experimental_vector_match: {
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
SDValue Mask = getValue(I.getOperand(2));
EVT Op1VT = Op1.getValueType();
EVT Op2VT = Op2.getValueType();
EVT ResVT = Mask.getValueType();
unsigned SearchSize = Op2VT.getVectorNumElements();
// If the target has native support for this vector match operation, lower
// the intrinsic untouched; otherwise, expand it below.
if (!TLI.shouldExpandVectorMatch(Op1VT, SearchSize)) {
visitTargetIntrinsic(I, Intrinsic);
return;
}
SDValue Ret = DAG.getConstant(0, sdl, ResVT);
for (unsigned i = 0; i < SearchSize; ++i) {
SDValue Op2Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl,
Op2VT.getVectorElementType(), Op2,
DAG.getVectorIdxConstant(i, sdl));
SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, sdl, Op1VT, Op2Elem);
SDValue Cmp = DAG.getSetCC(sdl, ResVT, Op1, Splat, ISD::SETEQ);
Ret = DAG.getNode(ISD::OR, sdl, ResVT, Ret, Cmp);
}
setValue(&I, DAG.getNode(ISD::AND, sdl, ResVT, Ret, Mask));
return;
}
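For a search vector with two elements, this generic expansion computes roughly
the following IR (a sketch with illustrative value names, assuming <16 x i8>
operands and a <16 x i1> mask):

  %e0 = extractelement <2 x i8> %op2, i64 0
  %s0 = insertelement <16 x i8> poison, i8 %e0, i64 0
  %b0 = shufflevector <16 x i8> %s0, <16 x i8> poison, <16 x i32> zeroinitializer
  %c0 = icmp eq <16 x i8> %op1, %b0            ; compare against first search element
  %e1 = extractelement <2 x i8> %op2, i64 1
  %s1 = insertelement <16 x i8> poison, i8 %e1, i64 0
  %b1 = shufflevector <16 x i8> %s1, <16 x i8> poison, <16 x i32> zeroinitializer
  %c1 = icmp eq <16 x i8> %op1, %b1            ; compare against second search element
  %any = or <16 x i1> %c0, %c1                 ; any-match accumulator
  %res = and <16 x i1> %any, %mask             ; drop inactive lanes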
case Intrinsic::vector_reverse:
visitVectorReverse(I);
return;


@@ -6150,6 +6150,31 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
&Call);
break;
}
case Intrinsic::experimental_vector_match: {
Value *Op1 = Call.getArgOperand(0);
Value *Op2 = Call.getArgOperand(1);
Value *Mask = Call.getArgOperand(2);
VectorType *Op1Ty = dyn_cast<VectorType>(Op1->getType());
VectorType *Op2Ty = dyn_cast<VectorType>(Op2->getType());
VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
Check(Op1Ty && Op2Ty && MaskTy, "Operands must be vectors.", &Call);
Check(isa<FixedVectorType>(Op2Ty),
"Second operand must be a fixed length vector.", &Call);
Check(Op1Ty->getElementType()->isIntegerTy(),
"First operand must be a vector of integers.", &Call);
Check(Op1Ty->getElementType() == Op2Ty->getElementType(),
"First two operands must have the same element type.", &Call);
Check(Op1Ty->getElementCount() == MaskTy->getElementCount(),
"First operand and mask must have the same number of elements.",
&Call);
Check(MaskTy->getElementType()->isIntegerTy(1),
"Mask must be a vector of i1's.", &Call);
Check(Call.getType() == MaskTy, "Return type must match the mask type.",
&Call);
break;
}
case Intrinsic::vector_insert: {
Value *Vec = Call.getArgOperand(0);
Value *SubVec = Call.getArgOperand(1);


@@ -2059,6 +2059,19 @@ bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
VT != MVT::v4i1 && VT != MVT::v2i1;
}
bool AArch64TargetLowering::shouldExpandVectorMatch(EVT VT,
unsigned SearchSize) const {
// MATCH is SVE2 and only available in non-streaming mode.
if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
return true;
// Furthermore, we can only use it for 8-bit or 16-bit elements.
if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
return SearchSize != 8;
if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
return SearchSize != 8 && SearchSize != 16;
return true;
}
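Put differently, MATCH is used natively only when the elements are i8 with 8 or
16 search elements, or i16 with 8 search elements; everything else falls back to
the generic expansion. For instance (an illustrative pair of calls, mirroring
the tests added below):

  ; Lowers to a single MATCH instruction under +sve2 outside streaming mode.
  %a = call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask)
  ; Expanded generically: 32-bit elements are not supported by MATCH.
  %b = call <vscale x 4 x i1> @llvm.experimental.vector.match(<vscale x 4 x i32> %op1, <4 x i32> %op2, <vscale x 4 x i1> %mask)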
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
@@ -5780,6 +5793,72 @@ SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
}
SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue ID =
DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
auto Op1 = Op.getOperand(1);
auto Op2 = Op.getOperand(2);
auto Mask = Op.getOperand(3);
EVT Op1VT = Op1.getValueType();
EVT Op2VT = Op2.getValueType();
EVT ResVT = Op.getValueType();
assert((Op1VT.getVectorElementType() == MVT::i8 ||
Op1VT.getVectorElementType() == MVT::i16) &&
"Expected 8-bit or 16-bit characters.");
// Scalable vector type used to wrap operands.
// A single container is enough for both operands because ultimately the
// operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
EVT OpContainerVT = Op1VT.isScalableVector()
? Op1VT
: getContainerForFixedLengthVector(DAG, Op1VT);
if (Op2VT.is128BitVector()) {
// If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
// Further, if the result is scalable, broadcast Op2 to a full SVE register.
if (ResVT.isScalableVector())
Op2 = DAG.getNode(AArch64ISD::DUPLANE128, dl, OpContainerVT, Op2,
DAG.getTargetConstant(0, dl, MVT::i64));
} else {
// If Op2 is not a full 128-bit vector, we always need to broadcast it.
unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op2IntVT, Op2,
DAG.getConstant(0, dl, MVT::i64));
Op2 = DAG.getSplatVector(Op2PromotedVT, dl, Op2);
Op2 = DAG.getBitcast(OpContainerVT, Op2);
}
// If the result is scalable, we just need to carry out the MATCH.
if (ResVT.isScalableVector())
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResVT, ID, Mask, Op1, Op2);
// If the result is fixed, we can still use MATCH but we need to wrap the
// first operand and the mask in scalable vectors before doing so.
// Wrap the operands.
Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, Op1VT, Mask);
Mask = convertFixedMaskToScalableVector(Mask, DAG);
// Carry out the match.
SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Mask.getValueType(),
ID, Mask, Op1, Op2);
// Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
// (v16i8/v8i8).
Match = DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match);
Match = convertFromScalableVector(DAG, Op1VT, Match);
return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Match);
}
SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = Op.getConstantOperandVal(1);
@@ -6383,6 +6462,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
}
case Intrinsic::experimental_vector_match: {
return LowerVectorMatch(Op, DAG);
}
}
}
@@ -27153,6 +27235,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
case Intrinsic::experimental_vector_match:
case Intrinsic::get_active_lane_mask: {
if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
return;


@ -985,6 +985,8 @@ public:
bool shouldExpandCttzElements(EVT VT) const override;
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override;
/// If a change in streaming mode is required on entry to/return from a
/// function call it emits and returns the corresponding SMSTART or SMSTOP
/// node. \p Condition should be one of the enum values from


@@ -0,0 +1,542 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
define <vscale x 16 x i1> @match_nxv16i8_v1i8(<vscale x 16 x i8> %op1, <1 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv16i8_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: mov z1.b, b1
; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <1 x i8> %op2, <vscale x 16 x i1> %mask)
ret <vscale x 16 x i1> %r
}
define <vscale x 16 x i1> @match_nxv16i8_v2i8(<vscale x 16 x i8> %op1, <2 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv16i8_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov w8, v1.s[1]
; CHECK-NEXT: fmov w9, s1
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: mov z2.b, w9
; CHECK-NEXT: mov z1.b, w8
; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b
; CHECK-NEXT: cmpeq p1.b, p1/z, z0.b, z2.b
; CHECK-NEXT: sel p1.b, p1, p1.b, p2.b
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <2 x i8> %op2, <vscale x 16 x i1> %mask)
ret <vscale x 16 x i1> %r
}
define <vscale x 16 x i1> @match_nxv16i8_v4i8(<vscale x 16 x i8> %op1, <4 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv16i8_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umov w8, v1.h[1]
; CHECK-NEXT: umov w9, v1.h[0]
; CHECK-NEXT: umov w10, v1.h[2]
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: mov z2.b, w8
; CHECK-NEXT: mov z3.b, w9
; CHECK-NEXT: umov w8, v1.h[3]
; CHECK-NEXT: mov z1.b, w10
; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z2.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z2.b, w8
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z1.b
; CHECK-NEXT: cmpeq p1.b, p1/z, z0.b, z2.b
; CHECK-NEXT: mov p2.b, p3/m, p3.b
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov p1.b, p2/m, p2.b
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <4 x i8> %op2, <vscale x 16 x i1> %mask)
ret <vscale x 16 x i1> %r
}
define <vscale x 16 x i1> @match_nxv16i8_v8i8(<vscale x 16 x i8> %op1, <8 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv16i8_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: mov z1.d, d1
; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <8 x i8> %op2, <vscale x 16 x i1> %mask)
ret <vscale x 16 x i1> %r
}
define <vscale x 16 x i1> @match_nxv16i8_v16i8(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv16i8_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z1.q, q1
; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask)
ret <vscale x 16 x i1> %r
}
define <16 x i1> @match_v16i8_v1i8(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask) #0 {
; CHECK-LABEL: match_v16i8_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: dup v1.16b, v1.b[0]
; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
define <16 x i1> @match_v16i8_v2i8(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mask) #0 {
; CHECK-LABEL: match_v16i8_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: dup v3.16b, v1.b[4]
; CHECK-NEXT: dup v1.16b, v1.b[0]
; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
define <16 x i1> @match_v16i8_v4i8(<16 x i8> %op1, <4 x i8> %op2, <16 x i1> %mask) #0 {
; CHECK-LABEL: match_v16i8_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: dup v3.16b, v1.b[2]
; CHECK-NEXT: dup v4.16b, v1.b[0]
; CHECK-NEXT: dup v5.16b, v1.b[4]
; CHECK-NEXT: dup v1.16b, v1.b[6]
; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b
; CHECK-NEXT: cmeq v5.16b, v0.16b, v5.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
; CHECK-NEXT: orr v1.16b, v4.16b, v3.16b
; CHECK-NEXT: orr v0.16b, v5.16b, v0.16b
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <4 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
define <16 x i1> @match_v16i8_v8i8(<16 x i8> %op1, <8 x i8> %op2, <16 x i1> %mask) #0 {
; CHECK-LABEL: match_v16i8_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.16b, v2.16b, #7
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z1.d, d1
; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0
; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <8 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
define <16 x i1> @match_v16i8_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 {
; CHECK-LABEL: match_v16i8_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.16b, v2.16b, #7
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0
; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
define <8 x i1> @match_v8i8_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 {
; CHECK-LABEL: match_v8i8_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.8b, v2.8b, #7
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: mov z1.d, d1
; CHECK-NEXT: cmlt v2.8b, v2.8b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0
; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask)
ret <8 x i1> %r
}
define <vscale x 8 x i1> @match_nxv8i16_v8i16(<vscale x 8 x i16> %op1, <8 x i16> %op2, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv8i16_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z1.q, q1
; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
; CHECK-NEXT: ret
%r = tail call <vscale x 8 x i1> @llvm.experimental.vector.match(<vscale x 8 x i16> %op1, <8 x i16> %op2, <vscale x 8 x i1> %mask)
ret <vscale x 8 x i1> %r
}
define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 {
; CHECK-LABEL: match_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: shl v2.8h, v2.8h, #15
; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0
; CHECK-NEXT: match p0.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask)
ret <8 x i1> %r
}
; Cases where op2 has more elements than op1.
define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) #0 {
; CHECK-LABEL: match_v8i8_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.8b, v2.8b, #7
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: cmlt v2.8b, v2.8b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0
; CHECK-NEXT: match p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
ret <8 x i1> %r
}
define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv16i8_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z3.b, z1.b[1]
; CHECK-NEXT: mov z4.b, b1
; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: mov z5.b, z1.b[2]
; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z4.b
; CHECK-NEXT: mov z3.b, z1.b[3]
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z5.b
; CHECK-NEXT: mov z4.b, z1.b[4]
; CHECK-NEXT: mov p2.b, p3/m, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z1.b[5]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z4.b
; CHECK-NEXT: mov z4.b, z1.b[6]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z1.b[7]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z4.b
; CHECK-NEXT: mov z4.b, z1.b[8]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z1.b[9]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z4.b
; CHECK-NEXT: mov z4.b, z1.b[10]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z1.b[11]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z4.b
; CHECK-NEXT: mov z4.b, z1.b[12]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z1.b[13]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z4.b
; CHECK-NEXT: mov z4.b, z1.b[14]
; CHECK-NEXT: mov z1.b, z1.b[15]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, b2
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z4.b
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
; CHECK-NEXT: mov z1.b, z2.b[1]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z2.b[2]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
; CHECK-NEXT: mov z1.b, z2.b[3]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z2.b[4]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
; CHECK-NEXT: mov z1.b, z2.b[5]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z2.b[6]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
; CHECK-NEXT: mov z1.b, z2.b[7]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z2.b[8]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
; CHECK-NEXT: mov z1.b, z2.b[9]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z2.b[10]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
; CHECK-NEXT: mov z1.b, z2.b[11]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z2.b[12]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
; CHECK-NEXT: mov z1.b, z2.b[13]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
; CHECK-NEXT: mov z3.b, z2.b[14]
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: cmpeq p3.b, p1/z, z0.b, z1.b
; CHECK-NEXT: mov z1.b, z2.b[15]
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: cmpeq p4.b, p1/z, z0.b, z3.b
; CHECK-NEXT: cmpeq p1.b, p1/z, z0.b, z1.b
; CHECK-NEXT: sel p2.b, p2, p2.b, p3.b
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov p1.b, p2/m, p2.b
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
ret <vscale x 16 x i1> %r
}
define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) #0 {
; CHECK-LABEL: match_v16i8_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v4.16b, v1.b[1]
; CHECK-NEXT: dup v5.16b, v1.b[0]
; CHECK-NEXT: dup v6.16b, v1.b[2]
; CHECK-NEXT: dup v7.16b, v1.b[3]
; CHECK-NEXT: dup v16.16b, v1.b[4]
; CHECK-NEXT: dup v17.16b, v1.b[5]
; CHECK-NEXT: dup v18.16b, v1.b[6]
; CHECK-NEXT: dup v19.16b, v1.b[7]
; CHECK-NEXT: dup v20.16b, v1.b[8]
; CHECK-NEXT: cmeq v4.16b, v0.16b, v4.16b
; CHECK-NEXT: cmeq v5.16b, v0.16b, v5.16b
; CHECK-NEXT: cmeq v6.16b, v0.16b, v6.16b
; CHECK-NEXT: cmeq v7.16b, v0.16b, v7.16b
; CHECK-NEXT: cmeq v16.16b, v0.16b, v16.16b
; CHECK-NEXT: cmeq v17.16b, v0.16b, v17.16b
; CHECK-NEXT: dup v21.16b, v2.b[7]
; CHECK-NEXT: dup v22.16b, v1.b[10]
; CHECK-NEXT: orr v4.16b, v5.16b, v4.16b
; CHECK-NEXT: orr v5.16b, v6.16b, v7.16b
; CHECK-NEXT: orr v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmeq v7.16b, v0.16b, v18.16b
; CHECK-NEXT: cmeq v16.16b, v0.16b, v19.16b
; CHECK-NEXT: cmeq v17.16b, v0.16b, v20.16b
; CHECK-NEXT: dup v18.16b, v1.b[9]
; CHECK-NEXT: dup v19.16b, v1.b[11]
; CHECK-NEXT: dup v20.16b, v1.b[12]
; CHECK-NEXT: cmeq v22.16b, v0.16b, v22.16b
; CHECK-NEXT: orr v4.16b, v4.16b, v5.16b
; CHECK-NEXT: orr v5.16b, v6.16b, v7.16b
; CHECK-NEXT: orr v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmeq v7.16b, v0.16b, v18.16b
; CHECK-NEXT: dup v18.16b, v1.b[13]
; CHECK-NEXT: cmeq v16.16b, v0.16b, v19.16b
; CHECK-NEXT: cmeq v17.16b, v0.16b, v20.16b
; CHECK-NEXT: dup v19.16b, v2.b[0]
; CHECK-NEXT: dup v20.16b, v2.b[1]
; CHECK-NEXT: orr v4.16b, v4.16b, v5.16b
; CHECK-NEXT: dup v5.16b, v2.b[6]
; CHECK-NEXT: orr v6.16b, v6.16b, v7.16b
; CHECK-NEXT: orr v7.16b, v16.16b, v17.16b
; CHECK-NEXT: cmeq v16.16b, v0.16b, v18.16b
; CHECK-NEXT: cmeq v17.16b, v0.16b, v19.16b
; CHECK-NEXT: cmeq v18.16b, v0.16b, v20.16b
; CHECK-NEXT: dup v19.16b, v2.b[2]
; CHECK-NEXT: cmeq v5.16b, v0.16b, v5.16b
; CHECK-NEXT: cmeq v20.16b, v0.16b, v21.16b
; CHECK-NEXT: dup v21.16b, v2.b[8]
; CHECK-NEXT: orr v6.16b, v6.16b, v22.16b
; CHECK-NEXT: orr v7.16b, v7.16b, v16.16b
; CHECK-NEXT: dup v16.16b, v1.b[14]
; CHECK-NEXT: dup v1.16b, v1.b[15]
; CHECK-NEXT: orr v17.16b, v17.16b, v18.16b
; CHECK-NEXT: cmeq v18.16b, v0.16b, v19.16b
; CHECK-NEXT: dup v19.16b, v2.b[3]
; CHECK-NEXT: orr v5.16b, v5.16b, v20.16b
; CHECK-NEXT: cmeq v20.16b, v0.16b, v21.16b
; CHECK-NEXT: dup v21.16b, v2.b[9]
; CHECK-NEXT: cmeq v16.16b, v0.16b, v16.16b
; CHECK-NEXT: cmeq v1.16b, v0.16b, v1.16b
; CHECK-NEXT: orr v4.16b, v4.16b, v6.16b
; CHECK-NEXT: orr v17.16b, v17.16b, v18.16b
; CHECK-NEXT: cmeq v18.16b, v0.16b, v19.16b
; CHECK-NEXT: dup v19.16b, v2.b[4]
; CHECK-NEXT: orr v5.16b, v5.16b, v20.16b
; CHECK-NEXT: cmeq v20.16b, v0.16b, v21.16b
; CHECK-NEXT: dup v21.16b, v2.b[10]
; CHECK-NEXT: orr v7.16b, v7.16b, v16.16b
; CHECK-NEXT: orr v16.16b, v17.16b, v18.16b
; CHECK-NEXT: cmeq v17.16b, v0.16b, v19.16b
; CHECK-NEXT: dup v18.16b, v2.b[5]
; CHECK-NEXT: orr v5.16b, v5.16b, v20.16b
; CHECK-NEXT: cmeq v19.16b, v0.16b, v21.16b
; CHECK-NEXT: dup v20.16b, v2.b[11]
; CHECK-NEXT: orr v1.16b, v7.16b, v1.16b
; CHECK-NEXT: orr v6.16b, v16.16b, v17.16b
; CHECK-NEXT: cmeq v7.16b, v0.16b, v18.16b
; CHECK-NEXT: dup v17.16b, v2.b[12]
; CHECK-NEXT: orr v5.16b, v5.16b, v19.16b
; CHECK-NEXT: cmeq v16.16b, v0.16b, v20.16b
; CHECK-NEXT: dup v18.16b, v2.b[13]
; CHECK-NEXT: dup v19.16b, v2.b[14]
; CHECK-NEXT: orr v1.16b, v4.16b, v1.16b
; CHECK-NEXT: dup v2.16b, v2.b[15]
; CHECK-NEXT: orr v4.16b, v6.16b, v7.16b
; CHECK-NEXT: cmeq v6.16b, v0.16b, v17.16b
; CHECK-NEXT: orr v5.16b, v5.16b, v16.16b
; CHECK-NEXT: cmeq v7.16b, v0.16b, v18.16b
; CHECK-NEXT: cmeq v16.16b, v0.16b, v19.16b
; CHECK-NEXT: cmeq v0.16b, v0.16b, v2.16b
; CHECK-NEXT: orr v1.16b, v1.16b, v4.16b
; CHECK-NEXT: orr v4.16b, v5.16b, v6.16b
; CHECK-NEXT: orr v5.16b, v7.16b, v16.16b
; CHECK-NEXT: orr v1.16b, v1.16b, v4.16b
; CHECK-NEXT: orr v0.16b, v5.16b, v0.16b
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
ret <16 x i1> %r
}
; Data types not supported by MATCH.
; Note: The cases for SVE could be made tighter.
define <vscale x 4 x i1> @match_nxv4xi32_v4i32(<vscale x 4 x i32> %op1, <4 x i32> %op2, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv4xi32_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z2.s, z1.s[1]
; CHECK-NEXT: mov z3.s, s1
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z4.s, z1.s[2]
; CHECK-NEXT: mov z1.s, z1.s[3]
; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z2.s
; CHECK-NEXT: cmpeq p3.s, p1/z, z0.s, z3.s
; CHECK-NEXT: cmpeq p4.s, p1/z, z0.s, z4.s
; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s
; CHECK-NEXT: mov p2.b, p3/m, p3.b
; CHECK-NEXT: sel p2.b, p2, p2.b, p4.b
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: mov p1.b, p2/m, p2.b
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%r = tail call <vscale x 4 x i1> @llvm.experimental.vector.match(<vscale x 4 x i32> %op1, <4 x i32> %op2, <vscale x 4 x i1> %mask)
ret <vscale x 4 x i1> %r
}
define <vscale x 2 x i1> @match_nxv2xi64_v2i64(<vscale x 2 x i64> %op1, <2 x i64> %op2, <vscale x 2 x i1> %mask) #0 {
; CHECK-LABEL: match_nxv2xi64_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z2.d, z1.d[1]
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z1.d, d1
; CHECK-NEXT: cmpeq p2.d, p1/z, z0.d, z2.d
; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d
; CHECK-NEXT: sel p1.b, p1, p1.b, p2.b
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: ret
%r = tail call <vscale x 2 x i1> @llvm.experimental.vector.match(<vscale x 2 x i64> %op1, <2 x i64> %op2, <vscale x 2 x i1> %mask)
ret <vscale x 2 x i1> %r
}
define <4 x i1> @match_v4xi32_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #0 {
; CHECK-LABEL: match_v4xi32_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v3.4s, v1.s[1]
; CHECK-NEXT: dup v4.4s, v1.s[0]
; CHECK-NEXT: dup v5.4s, v1.s[2]
; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: cmeq v3.4s, v0.4s, v3.4s
; CHECK-NEXT: cmeq v4.4s, v0.4s, v4.4s
; CHECK-NEXT: cmeq v5.4s, v0.4s, v5.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT: orr v1.16b, v4.16b, v3.16b
; CHECK-NEXT: orr v0.16b, v5.16b, v0.16b
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: ret
%r = tail call <4 x i1> @llvm.experimental.vector.match(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask)
ret <4 x i1> %r
}
define <2 x i1> @match_v2xi64_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #0 {
; CHECK-LABEL: match_v2xi64_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v3.2d, v1.d[1]
; CHECK-NEXT: dup v1.2d, v1.d[0]
; CHECK-NEXT: cmeq v3.2d, v0.2d, v3.2d
; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d
; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: ret
%r = tail call <2 x i1> @llvm.experimental.vector.match(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask)
ret <2 x i1> %r
}
attributes #0 = { "target-features"="+sve2" }