mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-26 11:06:06 +00:00
Revert "[AArch64][SVE] Improve code quality of vector unsigned/signed add reductions. (#97339)"
This reverts commit b7b0071680e60c60da9d4d858f944fd95d76fd42. The change caused a regression in performance testing.
This commit is contained in:
parent
f2eb7c7344
commit
3f8d77bcc7
@ -17576,71 +17576,6 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
|
||||
return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
|
||||
}
|
||||
|
||||
// Turn [sign|zero]_extend(vecreduce_add()) into SVE's SADDV|UADDV
|
||||
// instructions.
|
||||
static SDValue
|
||||
performVecReduceAddExtCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
|
||||
const AArch64TargetLowering &TLI) {
|
||||
if (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
|
||||
N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND)
|
||||
return SDValue();
|
||||
bool IsSigned = N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND;
|
||||
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
|
||||
SDValue VecOp = N->getOperand(0).getOperand(0);
|
||||
EVT VecOpVT = VecOp.getValueType();
|
||||
SDLoc DL(N);
|
||||
|
||||
// Split the input vectors if not legal, e.g.
|
||||
// i32 (vecreduce_add (zext nxv32i8 %op to nxv32i32))
|
||||
// ->
|
||||
// i32 (add
|
||||
// (i32 vecreduce_add (zext nxv16i8 %op.lo to nxv16i32)),
|
||||
// (i32 vecreduce_add (zext nxv16i8 %op.hi to nxv16i32)))
|
||||
if (TLI.getTypeAction(*DAG.getContext(), VecOpVT) ==
|
||||
TargetLowering::TypeSplitVector) {
|
||||
SDValue Lo, Hi;
|
||||
std::tie(Lo, Hi) = DAG.SplitVector(VecOp, DL);
|
||||
unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
|
||||
EVT HalfVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
|
||||
*DAG.getContext());
|
||||
Lo = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0),
|
||||
DAG.getNode(ExtOpc, DL, HalfVT, Lo));
|
||||
Hi = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0),
|
||||
DAG.getNode(ExtOpc, DL, HalfVT, Hi));
|
||||
return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Lo, Hi);
|
||||
}
|
||||
|
||||
if (!TLI.isTypeLegal(VecOpVT))
|
||||
return SDValue();
|
||||
|
||||
if (VecOpVT.isFixedLengthVector() &&
|
||||
!TLI.useSVEForFixedLengthVectorVT(VecOpVT, !Subtarget.isNeonAvailable()))
|
||||
return SDValue();
|
||||
|
||||
// The input type is legal so map VECREDUCE_ADD to UADDV/SADDV, e.g.
|
||||
// i32 (vecreduce_add (zext nxv16i8 %op to nxv16i32))
|
||||
// ->
|
||||
// i32 (UADDV nxv16i8:%op)
|
||||
EVT ElemType = N->getValueType(0);
|
||||
SDValue Pg = getPredicateForVector(DAG, DL, VecOpVT);
|
||||
if (VecOpVT.isFixedLengthVector()) {
|
||||
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VecOpVT);
|
||||
VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
|
||||
}
|
||||
SDValue Res =
|
||||
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
|
||||
DAG.getConstant(IsSigned ? Intrinsic::aarch64_sve_saddv
|
||||
: Intrinsic::aarch64_sve_uaddv,
|
||||
DL, MVT::i64),
|
||||
Pg, VecOp);
|
||||
if (ElemType != MVT::i64)
|
||||
Res = DAG.getAnyExtOrTrunc(Res, DL, ElemType);
|
||||
|
||||
return Res;
|
||||
}
|
||||
|
||||
// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
|
||||
// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
|
||||
// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
|
||||
@ -25326,11 +25261,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
|
||||
return performInsertVectorEltCombine(N, DCI);
|
||||
case ISD::EXTRACT_VECTOR_ELT:
|
||||
return performExtractVectorEltCombine(N, DCI, Subtarget);
|
||||
case ISD::VECREDUCE_ADD: {
|
||||
if (SDValue Val = performVecReduceAddCombine(N, DCI.DAG, Subtarget))
|
||||
return Val;
|
||||
return performVecReduceAddExtCombine(N, DCI, *this);
|
||||
}
|
||||
case ISD::VECREDUCE_ADD:
|
||||
return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
|
||||
case AArch64ISD::UADDV:
|
||||
return performUADDVCombine(N, DAG);
|
||||
case AArch64ISD::SMULL:
|
||||
|
@ -145,10 +145,11 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
|
||||
define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
|
||||
; CHECK-LABEL: add_ext_v32i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: uaddlp v1.8h, v1.16b
|
||||
; CHECK-NEXT: uadalp v1.8h, v0.16b
|
||||
; CHECK-NEXT: uadalp v1.8h, v2.16b
|
||||
; CHECK-NEXT: addv h0, v1.8h
|
||||
; CHECK-NEXT: uaddl2 v3.8h, v0.16b, v1.16b
|
||||
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
|
||||
; CHECK-NEXT: add v0.8h, v0.8h, v3.8h
|
||||
; CHECK-NEXT: uadalp v0.8h, v2.16b
|
||||
; CHECK-NEXT: addv h0, v0.8h
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
%ae = zext <32 x i8> %a to <32 x i16>
|
||||
|
@ -103,12 +103,17 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
|
||||
define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
|
||||
; CHECK-LABEL: add_ext_i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ptrue p0.b
|
||||
; CHECK-NEXT: uaddv d0, p0, z0.b
|
||||
; CHECK-NEXT: uaddv d1, p0, z1.b
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: fmov w9, s1
|
||||
; CHECK-NEXT: add w0, w8, w9
|
||||
; CHECK-NEXT: uunpkhi z2.h, z0.b
|
||||
; CHECK-NEXT: uunpklo z0.h, z0.b
|
||||
; CHECK-NEXT: uunpkhi z3.h, z1.b
|
||||
; CHECK-NEXT: uunpklo z1.h, z1.b
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: add z0.h, z0.h, z2.h
|
||||
; CHECK-NEXT: add z1.h, z1.h, z3.h
|
||||
; CHECK-NEXT: add z0.h, z0.h, z1.h
|
||||
; CHECK-NEXT: uaddv d0, p0, z0.h
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
|
||||
; CHECK-NEXT: ret
|
||||
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
|
||||
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
|
||||
@ -121,15 +126,21 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
|
||||
define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
|
||||
; CHECK-LABEL: add_ext_v32i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ptrue p0.b
|
||||
; CHECK-NEXT: uaddv d1, p0, z1.b
|
||||
; CHECK-NEXT: uaddv d0, p0, z0.b
|
||||
; CHECK-NEXT: uaddv d2, p0, z2.b
|
||||
; CHECK-NEXT: fmov w8, s1
|
||||
; CHECK-NEXT: fmov w9, s0
|
||||
; CHECK-NEXT: add w8, w9, w8
|
||||
; CHECK-NEXT: fmov w9, s2
|
||||
; CHECK-NEXT: add w0, w8, w9
|
||||
; CHECK-NEXT: uunpklo z3.h, z1.b
|
||||
; CHECK-NEXT: uunpklo z4.h, z0.b
|
||||
; CHECK-NEXT: uunpkhi z1.h, z1.b
|
||||
; CHECK-NEXT: uunpkhi z0.h, z0.b
|
||||
; CHECK-NEXT: uunpkhi z5.h, z2.b
|
||||
; CHECK-NEXT: uunpklo z2.h, z2.b
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: add z0.h, z0.h, z1.h
|
||||
; CHECK-NEXT: add z1.h, z4.h, z3.h
|
||||
; CHECK-NEXT: add z0.h, z1.h, z0.h
|
||||
; CHECK-NEXT: add z1.h, z2.h, z5.h
|
||||
; CHECK-NEXT: add z0.h, z0.h, z1.h
|
||||
; CHECK-NEXT: uaddv d0, p0, z0.h
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
|
||||
; CHECK-NEXT: ret
|
||||
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
|
||||
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
|
||||
|
@ -1,4 +1,3 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
|
||||
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256
|
||||
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
|
||||
@ -7,31 +6,24 @@
|
||||
|
||||
define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) {
|
||||
; SVE256-LABEL: test:
|
||||
; SVE256: // %bb.0: // %L.entry
|
||||
; SVE256-NEXT: ptrue p0.h, vl16
|
||||
; SVE256-NEXT: mov w9, wzr
|
||||
; SVE256-NEXT: mov w10, wzr
|
||||
; SVE256-NEXT: mov w8, wzr
|
||||
; SVE256-NEXT: mov w11, #-16 // =0xfffffff0
|
||||
; SVE256-NEXT: .p2align 5, , 16
|
||||
; SVE256-NEXT: .LBB0_1: // %L1
|
||||
; SVE256-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; SVE256-NEXT: sxtw x12, w9
|
||||
; SVE256-NEXT: sxtw x13, w10
|
||||
; SVE256-NEXT: adds w11, w11, #1
|
||||
; SVE256-NEXT: add w10, w10, w3
|
||||
; SVE256-NEXT: ld1b { z0.h }, p0/z, [x0, x12]
|
||||
; SVE256-NEXT: ld1b { z1.h }, p0/z, [x2, x13]
|
||||
; SVE256-NEXT: add w9, w9, w1
|
||||
; SVE256-NEXT: sub z0.h, z0.h, z1.h
|
||||
; SVE256-NEXT: saddv d0, p0, z0.h
|
||||
; SVE256-NEXT: fmov w12, s0
|
||||
; SVE256-NEXT: add w8, w12, w8
|
||||
; SVE256-NEXT: b.lo .LBB0_1
|
||||
; SVE256-NEXT: // %bb.2: // %L2
|
||||
; SVE256-NEXT: mov w0, w8
|
||||
; SVE256-NEXT: ret
|
||||
; SVE256: ld1b { z0.h }, p0/z,
|
||||
; SVE256: ld1b { z1.h }, p0/z,
|
||||
; SVE256: sub z0.h, z0.h, z1.h
|
||||
; SVE256-NEXT: sunpklo z1.s, z0.h
|
||||
; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16
|
||||
; SVE256-NEXT: sunpklo z0.s, z0.h
|
||||
; SVE256-NEXT: add z0.s, z1.s, z0.s
|
||||
; SVE256-NEXT: uaddv d0, p1, z0.s
|
||||
|
||||
; NEON-LABEL: test:
|
||||
; NEON: ldr q0, [x0, w9, sxtw]
|
||||
; NEON: ldr q1, [x2, w10, sxtw]
|
||||
; NEON: usubl2 v2.8h, v0.16b, v1.16b
|
||||
; NEON-NEXT: usubl v0.8h, v0.8b, v1.8b
|
||||
; NEON: saddl2 v1.4s, v0.8h, v2.8h
|
||||
; NEON-NEXT: saddl v0.4s, v0.4h, v2.4h
|
||||
; NEON-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; NEON-NEXT: addv s0, v0.4s
|
||||
|
||||
L.entry:
|
||||
br label %L1
|
||||
@ -63,5 +55,3 @@ L2: ; preds = %L1
|
||||
}
|
||||
|
||||
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; NEON: {{.*}}
|
||||
|
@ -188,94 +188,6 @@ define i64 @uaddv_nxv2i64(<vscale x 2 x i64> %a) {
|
||||
ret i64 %res
|
||||
}
|
||||
|
||||
define i32 @uaddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
|
||||
; CHECK-LABEL: uaddv_nxv16i8_nxv16i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ptrue p0.b
|
||||
; CHECK-NEXT: uaddv d0, p0, z0.b
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
|
||||
; CHECK-NEXT: ret
|
||||
%1 = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
|
||||
%2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i64 @uaddv_nxv16i16_nxv16i64(<vscale x 16 x i16> %a) {
|
||||
; CHECK-LABEL: uaddv_nxv16i16_nxv16i64:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: uaddv d1, p0, z1.h
|
||||
; CHECK-NEXT: uaddv d0, p0, z0.h
|
||||
; CHECK-NEXT: fmov x8, d1
|
||||
; CHECK-NEXT: fmov x9, d0
|
||||
; CHECK-NEXT: add x0, x9, x8
|
||||
; CHECK-NEXT: ret
|
||||
%1 = zext <vscale x 16 x i16> %a to <vscale x 16 x i64>
|
||||
%2 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> %1)
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define i32 @uaddv_nxv16i16_nxv16i32(<vscale x 32 x i16> %a) {
|
||||
; CHECK-LABEL: uaddv_nxv16i16_nxv16i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: uaddv d3, p0, z3.h
|
||||
; CHECK-NEXT: uaddv d2, p0, z2.h
|
||||
; CHECK-NEXT: uaddv d1, p0, z1.h
|
||||
; CHECK-NEXT: uaddv d0, p0, z0.h
|
||||
; CHECK-NEXT: fmov w8, s3
|
||||
; CHECK-NEXT: fmov w9, s2
|
||||
; CHECK-NEXT: fmov w10, s1
|
||||
; CHECK-NEXT: fmov w11, s0
|
||||
; CHECK-NEXT: add w8, w9, w8
|
||||
; CHECK-NEXT: add w9, w11, w10
|
||||
; CHECK-NEXT: add w0, w9, w8
|
||||
; CHECK-NEXT: ret
|
||||
%1 = zext <vscale x 32 x i16> %a to <vscale x 32 x i32>
|
||||
%2 = call i32 @llvm.vector.reduce.add.nxv32i64(<vscale x 32 x i32> %1)
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @saddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
|
||||
; CHECK-LABEL: saddv_nxv16i8_nxv16i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ptrue p0.b
|
||||
; CHECK-NEXT: saddv d0, p0, z0.b
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
|
||||
; CHECK-NEXT: ret
|
||||
%1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
|
||||
%2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @uaddv_nxv32i16_nxv32i32(ptr %a) {
|
||||
; CHECK-LABEL: uaddv_nxv32i16_nxv32i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ptrue p0.h
|
||||
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, #3, mul vl]
|
||||
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #2, mul vl]
|
||||
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, #1, mul vl]
|
||||
; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0]
|
||||
; CHECK-NEXT: uaddv d0, p0, z0.h
|
||||
; CHECK-NEXT: uaddv d1, p0, z1.h
|
||||
; CHECK-NEXT: uaddv d2, p0, z2.h
|
||||
; CHECK-NEXT: uaddv d3, p0, z3.h
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: fmov w9, s1
|
||||
; CHECK-NEXT: fmov w10, s2
|
||||
; CHECK-NEXT: fmov w11, s3
|
||||
; CHECK-NEXT: add w8, w9, w8
|
||||
; CHECK-NEXT: add w9, w11, w10
|
||||
; CHECK-NEXT: add w0, w9, w8
|
||||
; CHECK-NEXT: ret
|
||||
%1 = load <vscale x 32 x i16>, ptr %a, align 16
|
||||
%2 = zext <vscale x 32 x i16> %1 to <vscale x 32 x i32>
|
||||
%3 = call i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32> %2)
|
||||
ret i32 %3
|
||||
}
|
||||
|
||||
; UMINV
|
||||
|
||||
define i8 @umin_nxv16i8(<vscale x 16 x i8> %a) {
|
||||
|
@ -1,102 +0,0 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,NO_STREAMING
|
||||
; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE_128
|
||||
; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE_MIN_256
|
||||
|
||||
target triple = "aarch64-unknown-linux-gnu"
|
||||
|
||||
define i32 @reduce_uadd_v16i8(<32 x i8> %a) #0 {
|
||||
; NO_STREAMING-LABEL: reduce_uadd_v16i8:
|
||||
; NO_STREAMING: // %bb.0:
|
||||
; NO_STREAMING-NEXT: ushll2 v2.8h, v1.16b, #0
|
||||
; NO_STREAMING-NEXT: ushll v1.8h, v1.8b, #0
|
||||
; NO_STREAMING-NEXT: ushll2 v3.8h, v0.16b, #0
|
||||
; NO_STREAMING-NEXT: ushll v0.8h, v0.8b, #0
|
||||
; NO_STREAMING-NEXT: uaddl2 v4.4s, v1.8h, v2.8h
|
||||
; NO_STREAMING-NEXT: uaddl v1.4s, v1.4h, v2.4h
|
||||
; NO_STREAMING-NEXT: uaddl2 v2.4s, v0.8h, v3.8h
|
||||
; NO_STREAMING-NEXT: uaddl v0.4s, v0.4h, v3.4h
|
||||
; NO_STREAMING-NEXT: add v1.4s, v1.4s, v4.4s
|
||||
; NO_STREAMING-NEXT: add v0.4s, v0.4s, v2.4s
|
||||
; NO_STREAMING-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; NO_STREAMING-NEXT: addv s0, v0.4s
|
||||
; NO_STREAMING-NEXT: fmov w0, s0
|
||||
; NO_STREAMING-NEXT: ret
|
||||
;
|
||||
; SVE_128-LABEL: reduce_uadd_v16i8:
|
||||
; SVE_128: // %bb.0:
|
||||
; SVE_128-NEXT: ptrue p0.b
|
||||
; SVE_128-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; SVE_128-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; SVE_128-NEXT: uaddv d1, p0, z1.b
|
||||
; SVE_128-NEXT: uaddv d0, p0, z0.b
|
||||
; SVE_128-NEXT: fmov x8, d1
|
||||
; SVE_128-NEXT: fmov x9, d0
|
||||
; SVE_128-NEXT: add w0, w9, w8
|
||||
; SVE_128-NEXT: ret
|
||||
;
|
||||
; SVE_MIN_256-LABEL: reduce_uadd_v16i8:
|
||||
; SVE_MIN_256: // %bb.0:
|
||||
; SVE_MIN_256-NEXT: ptrue p0.b, vl16
|
||||
; SVE_MIN_256-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; SVE_MIN_256-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; SVE_MIN_256-NEXT: splice z0.b, p0, z0.b, z1.b
|
||||
; SVE_MIN_256-NEXT: ptrue p0.b, vl32
|
||||
; SVE_MIN_256-NEXT: uaddv d0, p0, z0.b
|
||||
; SVE_MIN_256-NEXT: fmov x0, d0
|
||||
; SVE_MIN_256-NEXT: // kill: def $w0 killed $w0 killed $x0
|
||||
; SVE_MIN_256-NEXT: ret
|
||||
%1 = zext <32 x i8> %a to <32 x i32>
|
||||
%2 = call i32 @llvm.vector.reduce.add.v16i32(<32 x i32> %1)
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @reduce_sadd_v16i8(<32 x i8> %a) #0 {
|
||||
; NO_STREAMING-LABEL: reduce_sadd_v16i8:
|
||||
; NO_STREAMING: // %bb.0:
|
||||
; NO_STREAMING-NEXT: sshll2 v2.8h, v1.16b, #0
|
||||
; NO_STREAMING-NEXT: sshll v1.8h, v1.8b, #0
|
||||
; NO_STREAMING-NEXT: sshll2 v3.8h, v0.16b, #0
|
||||
; NO_STREAMING-NEXT: sshll v0.8h, v0.8b, #0
|
||||
; NO_STREAMING-NEXT: saddl2 v4.4s, v1.8h, v2.8h
|
||||
; NO_STREAMING-NEXT: saddl v1.4s, v1.4h, v2.4h
|
||||
; NO_STREAMING-NEXT: saddl2 v2.4s, v0.8h, v3.8h
|
||||
; NO_STREAMING-NEXT: saddl v0.4s, v0.4h, v3.4h
|
||||
; NO_STREAMING-NEXT: add v1.4s, v1.4s, v4.4s
|
||||
; NO_STREAMING-NEXT: add v0.4s, v0.4s, v2.4s
|
||||
; NO_STREAMING-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; NO_STREAMING-NEXT: addv s0, v0.4s
|
||||
; NO_STREAMING-NEXT: fmov w0, s0
|
||||
; NO_STREAMING-NEXT: ret
|
||||
;
|
||||
; SVE_128-LABEL: reduce_sadd_v16i8:
|
||||
; SVE_128: // %bb.0:
|
||||
; SVE_128-NEXT: ptrue p0.b
|
||||
; SVE_128-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; SVE_128-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; SVE_128-NEXT: saddv d1, p0, z1.b
|
||||
; SVE_128-NEXT: saddv d0, p0, z0.b
|
||||
; SVE_128-NEXT: fmov x8, d1
|
||||
; SVE_128-NEXT: fmov x9, d0
|
||||
; SVE_128-NEXT: add w0, w9, w8
|
||||
; SVE_128-NEXT: ret
|
||||
;
|
||||
; SVE_MIN_256-LABEL: reduce_sadd_v16i8:
|
||||
; SVE_MIN_256: // %bb.0:
|
||||
; SVE_MIN_256-NEXT: ptrue p0.b, vl16
|
||||
; SVE_MIN_256-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; SVE_MIN_256-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; SVE_MIN_256-NEXT: splice z0.b, p0, z0.b, z1.b
|
||||
; SVE_MIN_256-NEXT: ptrue p0.b, vl32
|
||||
; SVE_MIN_256-NEXT: saddv d0, p0, z0.b
|
||||
; SVE_MIN_256-NEXT: fmov x0, d0
|
||||
; SVE_MIN_256-NEXT: // kill: def $w0 killed $w0 killed $x0
|
||||
; SVE_MIN_256-NEXT: ret
|
||||
%1 = sext <32 x i8> %a to <32 x i32>
|
||||
%2 = call i32 @llvm.vector.reduce.add.v16i32(<32 x i32> %1)
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
attributes #0 = { "target-features"="+sve" }
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; CHECK: {{.*}}
|
@ -1968,9 +1968,10 @@ define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
|
||||
; CHECK-SD-BASE: // %bb.0: // %entry
|
||||
; CHECK-SD-BASE-NEXT: umull2 v2.8h, v1.16b, v0.16b
|
||||
; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
|
||||
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v2.8h
|
||||
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
|
||||
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
|
||||
; CHECK-SD-BASE-NEXT: uaddl2 v1.4s, v0.8h, v2.8h
|
||||
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h
|
||||
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
|
||||
; CHECK-SD-BASE-NEXT: fmov w0, s0
|
||||
; CHECK-SD-BASE-NEXT: ret
|
||||
;
|
||||
@ -2295,9 +2296,10 @@ define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
|
||||
; CHECK-SD-BASE: // %bb.0: // %entry
|
||||
; CHECK-SD-BASE-NEXT: smull2 v2.8h, v1.16b, v0.16b
|
||||
; CHECK-SD-BASE-NEXT: smull v0.8h, v1.8b, v0.8b
|
||||
; CHECK-SD-BASE-NEXT: saddlp v1.4s, v2.8h
|
||||
; CHECK-SD-BASE-NEXT: sadalp v1.4s, v0.8h
|
||||
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
|
||||
; CHECK-SD-BASE-NEXT: saddl2 v1.4s, v0.8h, v2.8h
|
||||
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h
|
||||
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
|
||||
; CHECK-SD-BASE-NEXT: fmov w0, s0
|
||||
; CHECK-SD-BASE-NEXT: ret
|
||||
;
|
||||
@ -3866,9 +3868,10 @@ entry:
|
||||
define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) {
|
||||
; CHECK-SD-LABEL: add_v32i8_v32i16_zext:
|
||||
; CHECK-SD: // %bb.0: // %entry
|
||||
; CHECK-SD-NEXT: uaddlp v1.8h, v1.16b
|
||||
; CHECK-SD-NEXT: uadalp v1.8h, v0.16b
|
||||
; CHECK-SD-NEXT: addv h0, v1.8h
|
||||
; CHECK-SD-NEXT: uaddl2 v2.8h, v0.16b, v1.16b
|
||||
; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
|
||||
; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h
|
||||
; CHECK-SD-NEXT: addv h0, v0.8h
|
||||
; CHECK-SD-NEXT: fmov w0, s0
|
||||
; CHECK-SD-NEXT: ret
|
||||
;
|
||||
@ -3991,9 +3994,10 @@ entry:
|
||||
define i16 @add_v32i8_v32i16_sext(<32 x i8> %x) {
|
||||
; CHECK-SD-LABEL: add_v32i8_v32i16_sext:
|
||||
; CHECK-SD: // %bb.0: // %entry
|
||||
; CHECK-SD-NEXT: saddlp v1.8h, v1.16b
|
||||
; CHECK-SD-NEXT: sadalp v1.8h, v0.16b
|
||||
; CHECK-SD-NEXT: addv h0, v1.8h
|
||||
; CHECK-SD-NEXT: saddl2 v2.8h, v0.16b, v1.16b
|
||||
; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
|
||||
; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h
|
||||
; CHECK-SD-NEXT: addv h0, v0.8h
|
||||
; CHECK-SD-NEXT: fmov w0, s0
|
||||
; CHECK-SD-NEXT: ret
|
||||
;
|
||||
@ -4234,14 +4238,14 @@ define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) {
|
||||
; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext:
|
||||
; CHECK-SD-BASE: // %bb.0: // %entry
|
||||
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v1.16b, #0
|
||||
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
|
||||
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v0.16b, #0
|
||||
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
|
||||
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
|
||||
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v1.8h, v2.8h
|
||||
; CHECK-SD-BASE-NEXT: uaddl v1.4s, v1.4h, v2.4h
|
||||
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v3.8h
|
||||
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v3.4h
|
||||
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v4.4s
|
||||
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v3.8h, v2.8h
|
||||
; CHECK-SD-BASE-NEXT: uaddl v2.4s, v3.4h, v2.4h
|
||||
; CHECK-SD-BASE-NEXT: uaddl2 v5.4s, v0.8h, v1.8h
|
||||
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
|
||||
; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s
|
||||
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
|
||||
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
|
||||
@ -4507,14 +4511,14 @@ define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) {
|
||||
; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext:
|
||||
; CHECK-SD-BASE: // %bb.0: // %entry
|
||||
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v1.16b, #0
|
||||
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
|
||||
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v0.16b, #0
|
||||
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
|
||||
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
|
||||
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v1.8h, v2.8h
|
||||
; CHECK-SD-BASE-NEXT: saddl v1.4s, v1.4h, v2.4h
|
||||
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v3.8h
|
||||
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v3.4h
|
||||
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v4.4s
|
||||
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v3.8h, v2.8h
|
||||
; CHECK-SD-BASE-NEXT: saddl v2.4s, v3.4h, v2.4h
|
||||
; CHECK-SD-BASE-NEXT: saddl2 v5.4s, v0.8h, v1.8h
|
||||
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
|
||||
; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s
|
||||
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
|
||||
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
|
||||
|
Loading…
x
Reference in New Issue
Block a user