From 3f8d77bcc7f3c6ada0650d848d76ea9827e78de4 Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov Date: Mon, 22 Jul 2024 17:22:56 +0000 Subject: [PATCH] Revert "[AArch64][SVE] Improve code quality of vector unsigned/signed add reductions. (#97339)" This reverts commit b7b0071680e60c60da9d4d858f944fd95d76fd42. The change caused a regression in performance testing. --- .../Target/AArch64/AArch64ISelLowering.cpp | 72 +------------ llvm/test/CodeGen/AArch64/double_reduct.ll | 9 +- llvm/test/CodeGen/AArch64/sve-doublereduct.ll | 41 ++++--- .../CodeGen/AArch64/sve-fixed-vector-zext.ll | 44 +++----- llvm/test/CodeGen/AArch64/sve-int-reduce.ll | 88 --------------- ...-streaming-mode-fixed-length-reductions.ll | 102 ------------------ llvm/test/CodeGen/AArch64/vecreduce-add.ll | 52 ++++----- 7 files changed, 78 insertions(+), 330 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index eae0200f37f0..99f9758a3716 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17576,71 +17576,6 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP); } -// Turn [sign|zero]_extend(vecreduce_add()) into SVE's SADDV|UADDV -// instructions. -static SDValue -performVecReduceAddExtCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AArch64TargetLowering &TLI) { - if (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && - N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) - return SDValue(); - bool IsSigned = N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND; - - SelectionDAG &DAG = DCI.DAG; - auto &Subtarget = DAG.getSubtarget(); - SDValue VecOp = N->getOperand(0).getOperand(0); - EVT VecOpVT = VecOp.getValueType(); - SDLoc DL(N); - - // Split the input vectors if not legal, e.g. 
- // i32 (vecreduce_add (zext nxv32i8 %op to nxv32i32)) - // -> - // i32 (add - // (i32 vecreduce_add (zext nxv16i8 %op.lo to nxv16i32)), - // (i32 vecreduce_add (zext nxv16i8 %op.hi to nxv16i32))) - if (TLI.getTypeAction(*DAG.getContext(), VecOpVT) == - TargetLowering::TypeSplitVector) { - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVector(VecOp, DL); - unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - EVT HalfVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( - *DAG.getContext()); - Lo = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), - DAG.getNode(ExtOpc, DL, HalfVT, Lo)); - Hi = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), - DAG.getNode(ExtOpc, DL, HalfVT, Hi)); - return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Lo, Hi); - } - - if (!TLI.isTypeLegal(VecOpVT)) - return SDValue(); - - if (VecOpVT.isFixedLengthVector() && - !TLI.useSVEForFixedLengthVectorVT(VecOpVT, !Subtarget.isNeonAvailable())) - return SDValue(); - - // The input type is legal so map VECREDUCE_ADD to UADDV/SADDV, e.g. - // i32 (vecreduce_add (zext nxv16i8 %op to nxv16i32)) - // -> - // i32 (UADDV nxv16i8:%op) - EVT ElemType = N->getValueType(0); - SDValue Pg = getPredicateForVector(DAG, DL, VecOpVT); - if (VecOpVT.isFixedLengthVector()) { - EVT ContainerVT = getContainerForFixedLengthVector(DAG, VecOpVT); - VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); - } - SDValue Res = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, - DAG.getConstant(IsSigned ? 
Intrinsic::aarch64_sve_saddv - : Intrinsic::aarch64_sve_uaddv, - DL, MVT::i64), - Pg, VecOp); - if (ElemType != MVT::i64) - Res = DAG.getAnyExtOrTrunc(Res, DL, ElemType); - - return Res; -} - // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one)) // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B)) @@ -25326,11 +25261,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performInsertVectorEltCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: return performExtractVectorEltCombine(N, DCI, Subtarget); - case ISD::VECREDUCE_ADD: { - if (SDValue Val = performVecReduceAddCombine(N, DCI.DAG, Subtarget)) - return Val; - return performVecReduceAddExtCombine(N, DCI, *this); - } + case ISD::VECREDUCE_ADD: + return performVecReduceAddCombine(N, DCI.DAG, Subtarget); case AArch64ISD::UADDV: return performUADDVCombine(N, DAG); case AArch64ISD::SMULL: diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll index cf5e15da0b17..b10114bc0ffa 100644 --- a/llvm/test/CodeGen/AArch64/double_reduct.ll +++ b/llvm/test/CodeGen/AArch64/double_reduct.ll @@ -145,10 +145,11 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: add_ext_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: uaddlp v1.8h, v1.16b -; CHECK-NEXT: uadalp v1.8h, v0.16b -; CHECK-NEXT: uadalp v1.8h, v2.16b -; CHECK-NEXT: addv h0, v1.8h +; CHECK-NEXT: uaddl2 v3.8h, v0.16b, v1.16b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: add v0.8h, v0.8h, v3.8h +; CHECK-NEXT: uadalp v0.8h, v2.16b +; CHECK-NEXT: addv h0, v0.8h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %ae = zext <32 x i8> %a to <32 x i16> diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll index b289dfbec527..7bc31d44bb65 100644 --- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll @@ -103,12 +103,17 @@ define i32 @add_i32( %a, %b) { define i16 @add_ext_i16( %a, %b) { ; CHECK-LABEL: add_ext_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uaddv d0, p0, z0.b -; CHECK-NEXT: uaddv d1, p0, z1.b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: uunpkhi z2.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z3.h, z1.b +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: add z1.h, z1.h, z3.h +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %ae = zext %a to %be = zext %b to @@ -121,15 +126,21 @@ define i16 @add_ext_i16( %a, %b) { define i16 @add_ext_v32i16( %a, %b) { ; CHECK-LABEL: add_ext_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uaddv d1, p0, z1.b -; CHECK-NEXT: uaddv d0, p0, z0.b -; CHECK-NEXT: uaddv d2, p0, z2.b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: uunpklo z3.h, z1.b +; CHECK-NEXT: uunpklo z4.h, z0.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z0.h, z0.b +; CHECK-NEXT: uunpkhi z5.h, z2.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: add z1.h, z4.h, z3.h +; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: add z1.h, z2.h, z5.h +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %ae = zext %a to %be = zext %b to diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll index 24c817d41030..1ab2589bccd5 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON @@ -7,31 +6,24 @@ define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) { ; SVE256-LABEL: test: -; SVE256: // %bb.0: // %L.entry -; SVE256-NEXT: ptrue p0.h, vl16 -; SVE256-NEXT: mov w9, wzr -; SVE256-NEXT: mov w10, wzr -; SVE256-NEXT: mov w8, wzr -; SVE256-NEXT: mov w11, #-16 // =0xfffffff0 -; SVE256-NEXT: .p2align 5, , 16 -; SVE256-NEXT: .LBB0_1: // %L1 -; SVE256-NEXT: // =>This Inner Loop Header: Depth=1 -; SVE256-NEXT: sxtw x12, w9 -; SVE256-NEXT: sxtw x13, w10 -; SVE256-NEXT: adds w11, w11, #1 -; SVE256-NEXT: add w10, w10, w3 -; SVE256-NEXT: ld1b { z0.h }, p0/z, [x0, x12] -; SVE256-NEXT: ld1b { z1.h }, p0/z, [x2, x13] -; SVE256-NEXT: add w9, w9, w1 -; SVE256-NEXT: sub z0.h, z0.h, z1.h -; SVE256-NEXT: saddv d0, p0, z0.h -; SVE256-NEXT: fmov w12, s0 -; SVE256-NEXT: add w8, w12, w8 -; SVE256-NEXT: b.lo .LBB0_1 -; SVE256-NEXT: // %bb.2: // %L2 -; SVE256-NEXT: mov w0, w8 -; SVE256-NEXT: ret +; SVE256: ld1b { z0.h }, p0/z, +; SVE256: ld1b { z1.h }, p0/z, +; SVE256: sub z0.h, z0.h, z1.h +; SVE256-NEXT: sunpklo z1.s, z0.h +; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16 +; SVE256-NEXT: sunpklo z0.s, z0.h +; SVE256-NEXT: add z0.s, z1.s, z0.s +; SVE256-NEXT: uaddv d0, p1, z0.s +; NEON-LABEL: test: +; NEON: ldr q0, [x0, w9, sxtw] +; NEON: ldr q1, [x2, w10, sxtw] +; NEON: usubl2 v2.8h, v0.16b, v1.16b +; NEON-NEXT: usubl v0.8h, v0.8b, v1.8b +; NEON: saddl2 v1.4s, v0.8h, v2.8h +; NEON-NEXT: 
saddl v0.4s, v0.4h, v2.4h +; NEON-NEXT: add v0.4s, v0.4s, v1.4s +; NEON-NEXT: addv s0, v0.4s L.entry: br label %L1 @@ -63,5 +55,3 @@ L2: ; preds = %L1 } declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; NEON: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll index c8dd719aa03c..8c1b5225b7f2 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll @@ -188,94 +188,6 @@ define i64 @uaddv_nxv2i64( %a) { ret i64 %res } -define i32 @uaddv_nxv16i8_nxv16i32( %a) { -; CHECK-LABEL: uaddv_nxv16i8_nxv16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uaddv d0, p0, z0.b -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 -; CHECK-NEXT: ret - %1 = zext %a to - %2 = call i32 @llvm.vector.reduce.add.nxv16i32( %1) - ret i32 %2 -} - -define i64 @uaddv_nxv16i16_nxv16i64( %a) { -; CHECK-LABEL: uaddv_nxv16i16_nxv16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: uaddv d1, p0, z1.h -; CHECK-NEXT: uaddv d0, p0, z0.h -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: add x0, x9, x8 -; CHECK-NEXT: ret - %1 = zext %a to - %2 = call i64 @llvm.vector.reduce.add.nxv16i64( %1) - ret i64 %2 -} - -define i32 @uaddv_nxv16i16_nxv16i32( %a) { -; CHECK-LABEL: uaddv_nxv16i16_nxv16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: uaddv d3, p0, z3.h -; CHECK-NEXT: uaddv d2, p0, z2.h -; CHECK-NEXT: uaddv d1, p0, z1.h -; CHECK-NEXT: uaddv d0, p0, z0.h -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: add w9, w11, w10 -; CHECK-NEXT: add w0, w9, w8 -; CHECK-NEXT: ret - %1 = zext %a to - %2 = call i32 @llvm.vector.reduce.add.nxv32i64( %1) - ret i32 %2 -} - -define i32 @saddv_nxv16i8_nxv16i32( 
%a) { -; CHECK-LABEL: saddv_nxv16i8_nxv16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: saddv d0, p0, z0.b -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 -; CHECK-NEXT: ret - %1 = sext %a to - %2 = call i32 @llvm.vector.reduce.add.nxv16i32( %1) - ret i32 %2 -} - -define i32 @uaddv_nxv32i16_nxv32i32(ptr %a) { -; CHECK-LABEL: uaddv_nxv32i16_nxv32i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0] -; CHECK-NEXT: uaddv d0, p0, z0.h -; CHECK-NEXT: uaddv d1, p0, z1.h -; CHECK-NEXT: uaddv d2, p0, z2.h -; CHECK-NEXT: uaddv d3, p0, z3.h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: fmov w11, s3 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: add w9, w11, w10 -; CHECK-NEXT: add w0, w9, w8 -; CHECK-NEXT: ret - %1 = load , ptr %a, align 16 - %2 = zext %1 to - %3 = call i32 @llvm.vector.reduce.add.nxv32i32( %2) - ret i32 %3 -} - ; UMINV define i8 @umin_nxv16i8( %a) { diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll deleted file mode 100644 index 608b3bdeac75..000000000000 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll +++ /dev/null @@ -1,102 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,NO_STREAMING -; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE_128 -; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE_MIN_256 - -target triple = 
"aarch64-unknown-linux-gnu" - -define i32 @reduce_uadd_v16i8(<32 x i8> %a) #0 { -; NO_STREAMING-LABEL: reduce_uadd_v16i8: -; NO_STREAMING: // %bb.0: -; NO_STREAMING-NEXT: ushll2 v2.8h, v1.16b, #0 -; NO_STREAMING-NEXT: ushll v1.8h, v1.8b, #0 -; NO_STREAMING-NEXT: ushll2 v3.8h, v0.16b, #0 -; NO_STREAMING-NEXT: ushll v0.8h, v0.8b, #0 -; NO_STREAMING-NEXT: uaddl2 v4.4s, v1.8h, v2.8h -; NO_STREAMING-NEXT: uaddl v1.4s, v1.4h, v2.4h -; NO_STREAMING-NEXT: uaddl2 v2.4s, v0.8h, v3.8h -; NO_STREAMING-NEXT: uaddl v0.4s, v0.4h, v3.4h -; NO_STREAMING-NEXT: add v1.4s, v1.4s, v4.4s -; NO_STREAMING-NEXT: add v0.4s, v0.4s, v2.4s -; NO_STREAMING-NEXT: add v0.4s, v0.4s, v1.4s -; NO_STREAMING-NEXT: addv s0, v0.4s -; NO_STREAMING-NEXT: fmov w0, s0 -; NO_STREAMING-NEXT: ret -; -; SVE_128-LABEL: reduce_uadd_v16i8: -; SVE_128: // %bb.0: -; SVE_128-NEXT: ptrue p0.b -; SVE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; SVE_128-NEXT: // kill: def $q0 killed $q0 def $z0 -; SVE_128-NEXT: uaddv d1, p0, z1.b -; SVE_128-NEXT: uaddv d0, p0, z0.b -; SVE_128-NEXT: fmov x8, d1 -; SVE_128-NEXT: fmov x9, d0 -; SVE_128-NEXT: add w0, w9, w8 -; SVE_128-NEXT: ret -; -; SVE_MIN_256-LABEL: reduce_uadd_v16i8: -; SVE_MIN_256: // %bb.0: -; SVE_MIN_256-NEXT: ptrue p0.b, vl16 -; SVE_MIN_256-NEXT: // kill: def $q0 killed $q0 def $z0 -; SVE_MIN_256-NEXT: // kill: def $q1 killed $q1 def $z1 -; SVE_MIN_256-NEXT: splice z0.b, p0, z0.b, z1.b -; SVE_MIN_256-NEXT: ptrue p0.b, vl32 -; SVE_MIN_256-NEXT: uaddv d0, p0, z0.b -; SVE_MIN_256-NEXT: fmov x0, d0 -; SVE_MIN_256-NEXT: // kill: def $w0 killed $w0 killed $x0 -; SVE_MIN_256-NEXT: ret - %1 = zext <32 x i8> %a to <32 x i32> - %2 = call i32 @llvm.vector.reduce.add.v16i32(<32 x i32> %1) - ret i32 %2 -} - -define i32 @reduce_sadd_v16i8(<32 x i8> %a) #0 { -; NO_STREAMING-LABEL: reduce_sadd_v16i8: -; NO_STREAMING: // %bb.0: -; NO_STREAMING-NEXT: sshll2 v2.8h, v1.16b, #0 -; NO_STREAMING-NEXT: sshll v1.8h, v1.8b, #0 -; NO_STREAMING-NEXT: sshll2 v3.8h, v0.16b, #0 -; 
NO_STREAMING-NEXT: sshll v0.8h, v0.8b, #0 -; NO_STREAMING-NEXT: saddl2 v4.4s, v1.8h, v2.8h -; NO_STREAMING-NEXT: saddl v1.4s, v1.4h, v2.4h -; NO_STREAMING-NEXT: saddl2 v2.4s, v0.8h, v3.8h -; NO_STREAMING-NEXT: saddl v0.4s, v0.4h, v3.4h -; NO_STREAMING-NEXT: add v1.4s, v1.4s, v4.4s -; NO_STREAMING-NEXT: add v0.4s, v0.4s, v2.4s -; NO_STREAMING-NEXT: add v0.4s, v0.4s, v1.4s -; NO_STREAMING-NEXT: addv s0, v0.4s -; NO_STREAMING-NEXT: fmov w0, s0 -; NO_STREAMING-NEXT: ret -; -; SVE_128-LABEL: reduce_sadd_v16i8: -; SVE_128: // %bb.0: -; SVE_128-NEXT: ptrue p0.b -; SVE_128-NEXT: // kill: def $q1 killed $q1 def $z1 -; SVE_128-NEXT: // kill: def $q0 killed $q0 def $z0 -; SVE_128-NEXT: saddv d1, p0, z1.b -; SVE_128-NEXT: saddv d0, p0, z0.b -; SVE_128-NEXT: fmov x8, d1 -; SVE_128-NEXT: fmov x9, d0 -; SVE_128-NEXT: add w0, w9, w8 -; SVE_128-NEXT: ret -; -; SVE_MIN_256-LABEL: reduce_sadd_v16i8: -; SVE_MIN_256: // %bb.0: -; SVE_MIN_256-NEXT: ptrue p0.b, vl16 -; SVE_MIN_256-NEXT: // kill: def $q0 killed $q0 def $z0 -; SVE_MIN_256-NEXT: // kill: def $q1 killed $q1 def $z1 -; SVE_MIN_256-NEXT: splice z0.b, p0, z0.b, z1.b -; SVE_MIN_256-NEXT: ptrue p0.b, vl32 -; SVE_MIN_256-NEXT: saddv d0, p0, z0.b -; SVE_MIN_256-NEXT: fmov x0, d0 -; SVE_MIN_256-NEXT: // kill: def $w0 killed $w0 killed $x0 -; SVE_MIN_256-NEXT: ret - %1 = sext <32 x i8> %a to <32 x i32> - %2 = call i32 @llvm.vector.reduce.add.v16i32(<32 x i32> %1) - ret i32 %2 -} - -attributes #0 = { "target-features"="+sve" } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 27e786eb1ced..c81fd26a7752 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -1968,9 +1968,10 @@ define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK-SD-BASE: // %bb.0: // %entry ; CHECK-SD-BASE-NEXT: umull2 v2.8h, v1.16b, v0.16b ; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b -; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v2.8h -; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h -; CHECK-SD-BASE-NEXT: addv s0, v1.4s +; CHECK-SD-BASE-NEXT: uaddl2 v1.4s, v0.8h, v2.8h +; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h +; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-BASE-NEXT: addv s0, v0.4s ; CHECK-SD-BASE-NEXT: fmov w0, s0 ; CHECK-SD-BASE-NEXT: ret ; @@ -2295,9 +2296,10 @@ define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK-SD-BASE: // %bb.0: // %entry ; CHECK-SD-BASE-NEXT: smull2 v2.8h, v1.16b, v0.16b ; CHECK-SD-BASE-NEXT: smull v0.8h, v1.8b, v0.8b -; CHECK-SD-BASE-NEXT: saddlp v1.4s, v2.8h -; CHECK-SD-BASE-NEXT: sadalp v1.4s, v0.8h -; CHECK-SD-BASE-NEXT: addv s0, v1.4s +; CHECK-SD-BASE-NEXT: saddl2 v1.4s, v0.8h, v2.8h +; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h +; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-BASE-NEXT: addv s0, v0.4s ; CHECK-SD-BASE-NEXT: fmov w0, s0 ; CHECK-SD-BASE-NEXT: ret ; @@ -3866,9 +3868,10 @@ entry: define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) { ; CHECK-SD-LABEL: add_v32i8_v32i16_zext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: uaddlp v1.8h, v1.16b -; CHECK-SD-NEXT: uadalp v1.8h, v0.16b -; CHECK-SD-NEXT: addv h0, v1.8h +; CHECK-SD-NEXT: uaddl2 v2.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: addv h0, v0.8h ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret ; @@ -3991,9 +3994,10 @@ entry: define i16 
@add_v32i8_v32i16_sext(<32 x i8> %x) { ; CHECK-SD-LABEL: add_v32i8_v32i16_sext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: saddlp v1.8h, v1.16b -; CHECK-SD-NEXT: sadalp v1.8h, v0.16b -; CHECK-SD-NEXT: addv h0, v1.8h +; CHECK-SD-NEXT: saddl2 v2.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: addv h0, v0.8h ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret ; @@ -4234,14 +4238,14 @@ define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) { ; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext: ; CHECK-SD-BASE: // %bb.0: // %entry ; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v1.16b, #0 -; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v0.16b, #0 +; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v1.8h, v2.8h -; CHECK-SD-BASE-NEXT: uaddl v1.4s, v1.4h, v2.4h -; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v3.8h -; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v3.4h -; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v4.4s +; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v3.8h, v2.8h +; CHECK-SD-BASE-NEXT: uaddl v2.4s, v3.4h, v2.4h +; CHECK-SD-BASE-NEXT: uaddl2 v5.4s, v0.8h, v1.8h +; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s ; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-SD-BASE-NEXT: addv s0, v0.4s @@ -4507,14 +4511,14 @@ define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) { ; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext: ; CHECK-SD-BASE: // %bb.0: // %entry ; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v1.16b, #0 -; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v0.16b, #0 +; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v1.8h, v2.8h -; CHECK-SD-BASE-NEXT: saddl v1.4s, v1.4h, v2.4h -; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v3.8h -; 
CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v3.4h -; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v4.4s +; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v3.8h, v2.8h +; CHECK-SD-BASE-NEXT: saddl v2.4s, v3.4h, v2.4h +; CHECK-SD-BASE-NEXT: saddl2 v5.4s, v0.8h, v1.8h +; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s ; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-SD-BASE-NEXT: addv s0, v0.4s