diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 790107b772fc..dbb9241fe8ca 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10328,6 +10328,50 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
     Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
   }
 
+  MVT XLenVT = Subtarget.getXLenVT();
+  auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+
+  // On some uarchs vrgather.vv will read from every input register for each
+  // output register, regardless of the indices. However, to reverse a vector,
+  // each output register only needs to read from one register. So decompose it
+  // into LMUL * M1 vrgather.vvs to get O(LMUL) performance instead of
+  // O(LMUL^2).
+  //
+  // vsetvli a1, zero, e64, m4, ta, ma
+  // vrgatherei16.vv v12, v8, v16
+  // ->
+  // vsetvli a1, zero, e64, m1, ta, ma
+  // vrgather.vv v15, v8, v16
+  // vrgather.vv v14, v9, v16
+  // vrgather.vv v13, v10, v16
+  // vrgather.vv v12, v11, v16
+  if (ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
+      ContainerVT.getVectorElementCount().isKnownMultipleOf(2)) {
+    auto [Lo, Hi] = DAG.SplitVector(Vec, DL);
+    Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, Lo.getSimpleValueType(), Lo);
+    Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, Hi.getSimpleValueType(), Hi);
+    SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ContainerVT, Hi, Lo);
+
+    // Fixed-length vectors might not fit exactly into their container, and so
+    // leave a gap at the front of the vector after being reversed. Slide this
+    // away.
+    //
+    // x x x x 3 2 1 0 <- v4i16 @ vlen=128
+    // 0 1 2 3 x x x x <- reverse
+    // x x x x 0 1 2 3 <- vslidedown.vx
+    if (VecVT.isFixedLengthVector()) {
+      SDValue Offset = DAG.getNode(
+          ISD::SUB, DL, XLenVT,
+          DAG.getElementCount(DL, XLenVT, ContainerVT.getVectorElementCount()),
+          DAG.getElementCount(DL, XLenVT, VecVT.getVectorElementCount()));
+      Concat =
+          getVSlidedown(DAG, Subtarget, DL, ContainerVT,
+                        DAG.getUNDEF(ContainerVT), Concat, Offset, Mask, VL);
+      Concat = convertFromScalableVector(VecVT, Concat, DAG, Subtarget);
+    }
+    return Concat;
+  }
+
   unsigned EltSize = ContainerVT.getScalarSizeInBits();
   unsigned MinSize = ContainerVT.getSizeInBits().getKnownMinValue();
   unsigned VectorBitsMax = Subtarget.getRealMaxVLen();
@@ -10375,9 +10419,6 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
     IntVT = IntVT.changeVectorElementType(MVT::i16);
   }
 
-  MVT XLenVT = Subtarget.getXLenVT();
-  auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
-
   // Calculate VLMAX-1 for the desired SEW.
SDValue VLMinus1 = DAG.getNode( ISD::SUB, DL, XLenVT, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index e773d93fad47..cbf9829826fb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -94,13 +94,21 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) { ; NO-ZVBB: # %bb.0: ; NO-ZVBB-NEXT: li a0, 32 ; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; NO-ZVBB-NEXT: vid.v v8 -; NO-ZVBB-NEXT: li a0, 31 -; NO-ZVBB-NEXT: vrsub.vx v8, v8, a0 -; NO-ZVBB-NEXT: vmv.v.i v10, 0 -; NO-ZVBB-NEXT: vmerge.vim v10, v10, 1, v0 -; NO-ZVBB-NEXT: vrgather.vv v12, v10, v8 -; NO-ZVBB-NEXT: vmsne.vi v0, v12, 0 +; NO-ZVBB-NEXT: vmv.v.i v8, 0 +; NO-ZVBB-NEXT: vmerge.vim v8, v8, 1, v0 +; NO-ZVBB-NEXT: csrr a1, vlenb +; NO-ZVBB-NEXT: addi a2, a1, -1 +; NO-ZVBB-NEXT: vsetvli a3, zero, e16, m2, ta, ma +; NO-ZVBB-NEXT: vid.v v10 +; NO-ZVBB-NEXT: vrsub.vx v10, v10, a2 +; NO-ZVBB-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; NO-ZVBB-NEXT: vrgatherei16.vv v13, v8, v10 +; NO-ZVBB-NEXT: vrgatherei16.vv v12, v9, v10 +; NO-ZVBB-NEXT: slli a1, a1, 1 +; NO-ZVBB-NEXT: addi a1, a1, -32 +; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; NO-ZVBB-NEXT: vslidedown.vx v8, v12, a1 +; NO-ZVBB-NEXT: vmsne.vi v0, v8, 0 ; NO-ZVBB-NEXT: ret ; ; ZVBB-LABEL: reverse_v32i1: @@ -117,13 +125,23 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) { ; NO-ZVBB: # %bb.0: ; NO-ZVBB-NEXT: li a0, 64 ; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; NO-ZVBB-NEXT: vmv.v.i v8, 0 +; NO-ZVBB-NEXT: vmerge.vim v12, v8, 1, v0 +; NO-ZVBB-NEXT: csrr a1, vlenb +; NO-ZVBB-NEXT: addi a2, a1, -1 +; NO-ZVBB-NEXT: vsetvli a3, zero, e16, m2, ta, ma ; NO-ZVBB-NEXT: vid.v v8 -; NO-ZVBB-NEXT: li a0, 63 -; NO-ZVBB-NEXT: vrsub.vx v8, v8, a0 -; NO-ZVBB-NEXT: vmv.v.i v12, 0 -; NO-ZVBB-NEXT: vmerge.vim v12, v12, 1, v0 -; NO-ZVBB-NEXT: vrgather.vv v16, v12, v8 -; NO-ZVBB-NEXT: vmsne.vi v0, v16, 0 +; NO-ZVBB-NEXT: vrsub.vx v16, v8, a2 +; NO-ZVBB-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; NO-ZVBB-NEXT: vrgatherei16.vv v11, v12, v16 +; NO-ZVBB-NEXT: vrgatherei16.vv v10, v13, v16 +; NO-ZVBB-NEXT: vrgatherei16.vv v9, v14, v16 +; NO-ZVBB-NEXT: vrgatherei16.vv v8, v15, v16 +; NO-ZVBB-NEXT: slli a1, a1, 2 +; NO-ZVBB-NEXT: addi a1, a1, -64 +; NO-ZVBB-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; NO-ZVBB-NEXT: vslidedown.vx v8, v8, a1 +; NO-ZVBB-NEXT: vmsne.vi v0, v8, 0 ; NO-ZVBB-NEXT: ret ; ; ZVBB-LABEL: reverse_v64i1: @@ -140,13 +158,27 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 -; CHECK-NEXT: vrgather.vv v24, v16, v8 -; CHECK-NEXT: vmsne.vi v0, v24, 0 +; CHECK-NEXT: vrsub.vx v24, v8, a2 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v15, v16, v24 +; CHECK-NEXT: vrgatherei16.vv v14, v17, v24 +; CHECK-NEXT: vrgatherei16.vv v13, v18, v24 +; CHECK-NEXT: vrgatherei16.vv v12, v19, v24 +; CHECK-NEXT: vrgatherei16.vv v11, v20, v24 +; CHECK-NEXT: vrgatherei16.vv v10, v21, v24 +; CHECK-NEXT: vrgatherei16.vv v9, v22, v24 +; CHECK-NEXT: vrgatherei16.vv v8, v23, 
v24 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: addi a1, a1, -128 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = shufflevector <128 x i1> %a, <128 x i1> poison, <128 x i32> ret <128 x i1> %res @@ -220,13 +252,19 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) { define <32 x i8> @reverse_v32i8(<32 x i8> %a) { ; CHECK-LABEL: reverse_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a1, a0, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: li a0, 31 -; CHECK-NEXT: vrsub.vx v12, v10, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v13, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v12, v9, v10 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> ret <32 x i8> %res @@ -235,13 +273,21 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) { define <64 x i8> @reverse_v64i8(<64 x i8> %a) { ; CHECK-LABEL: reverse_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 64 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a1, a0, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: li a0, 63 -; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vrgather.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vx v16, v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v15, v8, v16 +; CHECK-NEXT: vrgatherei16.vv v14, v9, v16 +; CHECK-NEXT: vrgatherei16.vv v13, v10, v16 +; CHECK-NEXT: vrgatherei16.vv v12, v11, v16 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -64 +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <64 x i8> %a, <64 x i8> poison, <64 x i32> ret <64 x i8> %res @@ -302,11 +348,17 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) { define <16 x i16> @reverse_v16i16(<16 x i16> %a) { ; CHECK-LABEL: reverse_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v12, v10, 15 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v10 +; CHECK-NEXT: vrgather.vv v12, v9, v10 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> ret <16 x i16> %res @@ -315,13 +367,21 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) { define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-LABEL: reverse_v32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; 
CHECK-NEXT: li a0, 31 -; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vrgather.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vx v16, v12, a1 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> ret <32 x i16> %res @@ -369,12 +429,18 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) { define <8 x i32> @reverse_v8i32(<8 x i32> %a) { ; CHECK-LABEL: reverse_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v12, v10, 7 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v10 +; CHECK-NEXT: vrgather.vv v12, v9, v10 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> ret <8 x i32> %res @@ -383,12 +449,19 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) { define <16 x i32> @reverse_v16i32(<16 x i32> %a) { ; CHECK-LABEL: reverse_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vi v16, v12, 15 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vx v16, v12, a1 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> ret <16 x i32> %res @@ -417,12 +490,18 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) { define <4 x i64> @reverse_v4i64(<4 x i64> %a) { ; CHECK-LABEL: reverse_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v12, v10, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v10 +; CHECK-NEXT: vrgather.vv v12, v9, v10 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> ret <4 x i64> %res @@ -431,12 +510,20 @@ define <4 x i64> @reverse_v4i64(<4 x i64> %a) { define <8 x i64> @reverse_v8i64(<8 x i64> %a) { ; CHECK-LABEL: reverse_v8i64: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vi v16, v12, 7 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vx v16, v12, a1 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> ret <8 x i64> %res @@ -498,11 +585,17 @@ define <8 x half> @reverse_v8f16(<8 x half> %a) { define <16 x half> @reverse_v16f16(<16 x half> %a) { ; CHECK-LABEL: reverse_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v12, v10, 15 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v10 +; CHECK-NEXT: vrgather.vv v12, v9, v10 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <16 x half> %a, <16 x half> poison, <16 x i32> ret <16 x half> %res @@ -511,13 +604,21 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) { define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-LABEL: reverse_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: li a0, 31 -; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vrgather.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vx v16, v12, a1 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <32 x half> %a, <32 x half> poison, <32 x i32> ret <32 x half> %res @@ -565,12 +666,18 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) { define <8 x float> @reverse_v8f32(<8 x float> %a) { ; CHECK-LABEL: reverse_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v12, v10, 7 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v10 +; CHECK-NEXT: vrgather.vv v12, v9, v10 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res 
= shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> ret <8 x float> %res @@ -579,12 +686,19 @@ define <8 x float> @reverse_v8f32(<8 x float> %a) { define <16 x float> @reverse_v16f32(<16 x float> %a) { ; CHECK-LABEL: reverse_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vi v16, v12, 15 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vx v16, v12, a1 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> ret <16 x float> %res @@ -613,12 +727,18 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) { define <4 x double> @reverse_v4f64(<4 x double> %a) { ; CHECK-LABEL: reverse_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v12, v10, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v10 +; CHECK-NEXT: vrgather.vv v12, v9, v10 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> ret <4 x double> %res @@ -627,12 +747,20 @@ define <4 x double> @reverse_v4f64(<4 x double> %a) { define <8 x double> @reverse_v8f64(<8 x double> %a) { ; CHECK-LABEL: reverse_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vi v16, v12, 7 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vx v16, v12, a1 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <8 x double> %a, <8 x double> poison, <8 x i32> ret <8 x double> %res @@ -835,21 +963,28 @@ define <16 x i8> @reverse_v16i8_2(<8 x i8> %a, <8 x i8> %b) { define <32 x i8> @reverse_v32i8_2(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: reverse_v32i8_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v12, v9 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: li a0, 31 -; CHECK-NEXT: vrsub.vx v16, v14, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v16 -; CHECK-NEXT: vrsub.vi v8, v14, 15 +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: 
addi a1, a0, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v9, v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v11, v12 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vi v12, v12, 15 ; CHECK-NEXT: lui a0, 16 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu -; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret %res = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> ret <32 x i8> %res @@ -898,16 +1033,23 @@ define <8 x i16> @reverse_v8i16_2(<4 x i16> %a, <4 x i16> %b) { define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: reverse_v16i16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vrsub.vx v9, v9, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v9 +; CHECK-NEXT: vrgather.vv v12, v8, v9 +; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: vrsub.vi v16, v14, 15 -; CHECK-NEXT: vrgather.vv v10, v8, v16 -; CHECK-NEXT: vrsub.vi v8, v14, 7 +; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vi v12, v12, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> ret <16 x i16> %res @@ -916,21 +1058,30 @@ define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) { define <32 x i16> @reverse_v32i16_2(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: reverse_v32i16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v16, v10 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vid.v v20 -; CHECK-NEXT: li a0, 31 -; CHECK-NEXT: vrsub.vx v24, v20, a0 -; CHECK-NEXT: vrgather.vv v12, v8, v24 -; CHECK-NEXT: vrsub.vi v8, v20, 15 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v16, v8, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv.v.v v17, v16 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vrsub.vi v16, v16, 15 ; CHECK-NEXT: lui a0, 16 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: vrgather.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> ret <32 x i16> %res @@ 
-963,18 +1114,25 @@ define <4 x i32> @reverse_v4i32_2(<2 x i32> %a, < 2 x i32> %b) { define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: reverse_v8i32_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v12, v9 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: vrsub.vi v13, v9, 7 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v13 +; CHECK-NEXT: vrsub.vx v9, v9, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v9 +; CHECK-NEXT: vrgather.vv v12, v8, v9 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vrsub.vi v8, v9, 3 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrsub.vi v12, v11, 3 ; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v10, v12, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret %res = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> ret <8 x i32> %res @@ -983,19 +1141,27 @@ define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) { define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: reverse_v16i32_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v16, v10 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v18, v10, 15 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v18 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v16, v8, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv.v.v v17, v16 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v8, v10, 7 +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrsub.vi v16, v14, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> ret <16 x i32> %res @@ -1004,17 +1170,28 @@ define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) { define <32 x i32> @reverse_v32i32_2(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: reverse_v32i32_2: ; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vrsub.vx v17, v16, a1 +; CHECK-NEXT: vrgather.vv v23, v8, v17 +; CHECK-NEXT: vrgather.vv v22, v9, v17 +; CHECK-NEXT: vrgather.vv v21, v10, v17 +; CHECK-NEXT: vrgather.vv v20, v11, v17 +; CHECK-NEXT: vrgather.vv v16, v8, v17 +; CHECK-NEXT: vmv.v.v v17, v16 ; CHECK-NEXT: vmv4r.v v24, v12 -; CHECK-NEXT: vmv4r.v v16, v8 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vid.v v20 
-; CHECK-NEXT: li a0, 31 -; CHECK-NEXT: vrsub.vx v28, v20, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v8, v16, v28 +; CHECK-NEXT: vmv2r.v v18, v16 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vrsub.vi v16, v20, 15 +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vrsub.vi v16, v16, 15 ; CHECK-NEXT: lui a0, 16 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu @@ -1043,18 +1220,27 @@ define <4 x i64> @reverse_v4i64_2(<2 x i64> %a, < 2 x i64> %b) { define <8 x i64> @reverse_v8i64_2(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: reverse_v8i64_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v16, v10 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v11, v10, 7 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v11 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v16, v8, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv.v.v v17, v16 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vrsub.vi v8, v10, 3 +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrsub.vi v16, v14, 3 ; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> ret <8 x i64> %res @@ -1103,16 +1289,23 @@ define <8 x half> @reverse_v8f16_2(<4 x half> %a, <4 x half> %b) { define <16 x half> @reverse_v16f16_2(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: reverse_v16f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vrsub.vx v9, v9, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v9 +; CHECK-NEXT: vrgather.vv v12, v8, v9 +; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: vrsub.vi v16, v14, 15 -; CHECK-NEXT: vrgather.vv v10, v8, v16 -; CHECK-NEXT: vrsub.vi v8, v14, 7 +; CHECK-NEXT: vslidedown.vx v8, v12, a0 +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vi v12, v12, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x half> %a, <8 x half> %b, <16 x i32> ret <16 x half> %res @@ -1121,13 +1314,21 @@ define <16 x half> @reverse_v16f16_2(<8 x half> %a, <8 x half> %b) { define <32 x half> @reverse_v32f16_2(<16 x half> %a) { ; CHECK-LABEL: reverse_v32f16_2: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: li a0, 31 -; CHECK-NEXT: vrsub.vx v16, v12, 
a0 -; CHECK-NEXT: vrgather.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrsub.vx v10, v10, a1 +; CHECK-NEXT: vrgather.vv v15, v8, v10 +; CHECK-NEXT: vrgather.vv v14, v9, v10 +; CHECK-NEXT: vrgather.vv v12, v8, v10 +; CHECK-NEXT: vmv.v.v v13, v12 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: ret %res = shufflevector <16 x half> %a, <16 x half> poison, <32 x i32> ret <32 x half> %res @@ -1160,18 +1361,25 @@ define <4 x float> @reverse_v4f32_2(<2 x float> %a, <2 x float> %b) { define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: reverse_v8f32_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v12, v9 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: vrsub.vi v13, v9, 7 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v13 +; CHECK-NEXT: vrsub.vx v9, v9, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v9 +; CHECK-NEXT: vrgather.vv v12, v8, v9 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vrsub.vi v8, v9, 3 +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vrsub.vi v12, v11, 3 ; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v10, v12, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret %res = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> ret <8 x float> %res @@ -1180,19 +1388,27 @@ define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) { define <16 x float> @reverse_v16f32_2(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: reverse_v16f32_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v16, v10 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v18, v10, 15 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v18 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v16, v8, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv.v.v v17, v16 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v8, v10, 7 +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrsub.vi v16, v14, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> ret <16 x float> %res @@ -1216,18 +1432,27 @@ define <4 x double> 
@reverse_v4f64_2(<2 x double> %a, < 2 x double> %b) { define <8 x double> @reverse_v8f64_2(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: reverse_v8f64_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v16, v10 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v11, v10, 7 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v11 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vx v12, v12, a1 +; CHECK-NEXT: vrgather.vv v19, v8, v12 +; CHECK-NEXT: vrgather.vv v18, v9, v12 +; CHECK-NEXT: vrgather.vv v16, v8, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv.v.v v17, v16 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v16, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vrsub.vi v8, v10, 3 +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrsub.vi v16, v14, 3 ; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> ret <8 x double> %res @@ -1242,11 +1467,48 @@ define <256 x i1> @reverse_v256i1(<256 x i1> %a) vscale_range(16, 1024) { ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v12, v8, v10 -; CHECK-NEXT: vmsne.vi v0, v12, 0 +; CHECK-NEXT: vrsub.vx v10, v10, a2 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v13, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v12, v9, v10 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: addi a1, a1, -256 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v12, a1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %res = shufflevector <256 x i1> %a, <256 x i1> poison, <256 x i32> ret <256 x i1> %res } + +define <8 x i32> @reverse_v8i32_exact_vlen_128(<8 x i32> %a) vscale_range(2, 2) { +; CHECK-LABEL: reverse_v8i32_exact_vlen_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrsub.vi v12, v10, 3 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> + ret <8 x i32> %res +} + +define <16 x i32> @reverse_v16i32_exact_vlen_256(<16 x i32> %a) vscale_range(4, 4) { +; CHECK-LABEL: reverse_v16i32_exact_vlen_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrsub.vi v12, v10, 7 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %res = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> + ret <16 x i32> %res +} diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll index 60a03f1d97e5..2a915529e61d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll @@ -301,94 +301,108 @@ define @reverse_nxv8i1( %a) { define @reverse_nxv16i1( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv16i1: ; RV32-BITS-UNKNOWN: # %bb.0: -; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: slli a0, a0, 1 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vid.v v12 -; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v12, v12, a0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v8, v12 -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v10, 1 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vid.v v8 +; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v8, v8, a0 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v10, 0 +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v8 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v8 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v12, 1 ; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv16i1: ; RV32-BITS-256: # %bb.0: -; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-256-NEXT: vmv.v.i v8, 0 -; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: slli a0, a0, 1 ; RV32-BITS-256-NEXT: addi a0, a0, -1 -; RV32-BITS-256-NEXT: vid.v v10 -; RV32-BITS-256-NEXT: vrsub.vx v10, v10, a0 -; RV32-BITS-256-NEXT: vrgather.vv v12, v8, v10 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vid.v v8 +; RV32-BITS-256-NEXT: vrsub.vx v8, v8, a0 +; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-BITS-256-NEXT: vmv.v.i v10, 0 +; RV32-BITS-256-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vrgather.vv v13, v10, v8 +; RV32-BITS-256-NEXT: vrgather.vv v12, v11, v8 +; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; RV32-BITS-256-NEXT: vand.vi v8, v12, 1 ; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv16i1: ; RV32-BITS-512: # %bb.0: -; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-512-NEXT: vmv.v.i v8, 0 -; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: slli a0, a0, 1 ; RV32-BITS-512-NEXT: addi a0, a0, -1 -; RV32-BITS-512-NEXT: vid.v v10 -; RV32-BITS-512-NEXT: vrsub.vx v10, v10, a0 -; RV32-BITS-512-NEXT: vrgather.vv v12, v8, v10 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vid.v v8 +; RV32-BITS-512-NEXT: vrsub.vx v8, v8, a0 +; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-BITS-512-NEXT: vmv.v.i v10, 0 +; RV32-BITS-512-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vrgather.vv v13, v10, v8 +; RV32-BITS-512-NEXT: vrgather.vv v12, v11, v8 +; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; RV32-BITS-512-NEXT: vand.vi v8, v12, 1 ; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-512-NEXT: ret ; ; 
RV64-BITS-UNKNOWN-LABEL: reverse_nxv16i1: ; RV64-BITS-UNKNOWN: # %bb.0: -; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: slli a0, a0, 1 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vid.v v12 -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v12, v12, a0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v8, v12 -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v10, 1 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vid.v v8 +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v8, v8, a0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v10, 0 +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v8 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v8 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v12, 1 ; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv16i1: ; RV64-BITS-256: # %bb.0: -; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-256-NEXT: vmv.v.i v8, 0 -; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: slli a0, a0, 1 ; RV64-BITS-256-NEXT: addi a0, a0, -1 -; RV64-BITS-256-NEXT: vid.v v10 -; RV64-BITS-256-NEXT: vrsub.vx v10, v10, a0 -; RV64-BITS-256-NEXT: vrgather.vv v12, v8, v10 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vid.v v8 +; RV64-BITS-256-NEXT: vrsub.vx v8, v8, a0 +; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-BITS-256-NEXT: vmv.v.i v10, 0 +; RV64-BITS-256-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vrgather.vv v13, v10, v8 +; RV64-BITS-256-NEXT: vrgather.vv v12, v11, v8 +; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; RV64-BITS-256-NEXT: vand.vi v8, v12, 1 ; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv16i1: ; RV64-BITS-512: # %bb.0: -; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-512-NEXT: vmv.v.i v8, 0 -; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: slli a0, a0, 1 ; RV64-BITS-512-NEXT: addi a0, a0, -1 -; RV64-BITS-512-NEXT: vid.v v10 -; RV64-BITS-512-NEXT: vrsub.vx v10, v10, a0 -; RV64-BITS-512-NEXT: vrgather.vv v12, v8, v10 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vid.v v8 +; RV64-BITS-512-NEXT: vrsub.vx v8, v8, a0 +; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-BITS-512-NEXT: vmv.v.i v10, 0 +; RV64-BITS-512-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vrgather.vv v13, v10, v8 +; RV64-BITS-512-NEXT: vrgather.vv v12, v11, v8 +; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; RV64-BITS-512-NEXT: vand.vi v8, v12, 1 ; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-512-NEXT: ret @@ -399,95 +413,121 @@ define @reverse_nxv16i1( %a) { define @reverse_nxv32i1( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv32i1: ; RV32-BITS-UNKNOWN: # %bb.0: 
+; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb +; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vid.v v8 +; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v12, v8, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: slli a0, a0, 2 -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m8, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vid.v v16 -; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v16, a0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v8, v16 -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v12, 1 +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v16, v8, 1, v0 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v16, v12 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v17, v12 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v18, v12 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v19, v12 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv32i1: ; RV32-BITS-256: # %bb.0: +; RV32-BITS-256-NEXT: csrr a0, vlenb +; RV32-BITS-256-NEXT: addi a0, a0, -1 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vid.v v8 +; RV32-BITS-256-NEXT: vrsub.vx v12, v8, a0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-256-NEXT: vmv.v.i v8, 0 -; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: slli a0, a0, 2 -; RV32-BITS-256-NEXT: addi a0, a0, -1 -; RV32-BITS-256-NEXT: vid.v v12 -; RV32-BITS-256-NEXT: vrsub.vx v12, v12, a0 -; RV32-BITS-256-NEXT: vrgather.vv v16, v8, v12 -; RV32-BITS-256-NEXT: vand.vi v8, v16, 1 +; RV32-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0 +; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vrgather.vv v11, v16, v12 +; RV32-BITS-256-NEXT: vrgather.vv v10, v17, v12 +; RV32-BITS-256-NEXT: vrgather.vv v9, v18, v12 +; RV32-BITS-256-NEXT: vrgather.vv v8, v19, v12 +; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; RV32-BITS-256-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv32i1: ; RV32-BITS-512: # %bb.0: +; RV32-BITS-512-NEXT: csrr a0, vlenb +; RV32-BITS-512-NEXT: addi a0, a0, -1 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vid.v v8 +; RV32-BITS-512-NEXT: vrsub.vx v12, v8, a0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-512-NEXT: vmv.v.i v8, 0 -; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: slli a0, a0, 2 -; RV32-BITS-512-NEXT: addi a0, a0, -1 -; RV32-BITS-512-NEXT: vid.v v12 -; RV32-BITS-512-NEXT: vrsub.vx v12, v12, a0 -; RV32-BITS-512-NEXT: vrgather.vv v16, v8, v12 -; RV32-BITS-512-NEXT: vand.vi v8, v16, 1 +; RV32-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0 +; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vrgather.vv v11, v16, v12 +; RV32-BITS-512-NEXT: vrgather.vv v10, v17, v12 +; RV32-BITS-512-NEXT: vrgather.vv v9, v18, v12 +; RV32-BITS-512-NEXT: vrgather.vv v8, v19, v12 +; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, 
ma +; RV32-BITS-512-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv32i1: ; RV64-BITS-UNKNOWN: # %bb.0: +; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb +; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vid.v v8 +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v12, v8, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: slli a0, a0, 2 -; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e16, m8, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vid.v v16 -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v16, a0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v8, v16 -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v12, 1 +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v16, v8, 1, v0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v16, v12 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v17, v12 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v18, v12 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v19, v12 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv32i1: ; RV64-BITS-256: # %bb.0: +; RV64-BITS-256-NEXT: csrr a0, vlenb +; RV64-BITS-256-NEXT: addi a0, a0, -1 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vid.v v8 +; RV64-BITS-256-NEXT: vrsub.vx v12, v8, a0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-256-NEXT: vmv.v.i v8, 0 -; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: slli a0, a0, 2 -; RV64-BITS-256-NEXT: addi a0, a0, -1 -; RV64-BITS-256-NEXT: vid.v v12 -; RV64-BITS-256-NEXT: vrsub.vx v12, v12, a0 -; RV64-BITS-256-NEXT: vrgather.vv v16, v8, v12 -; RV64-BITS-256-NEXT: vand.vi v8, v16, 1 +; RV64-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0 +; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vrgather.vv v11, v16, v12 +; RV64-BITS-256-NEXT: vrgather.vv v10, v17, v12 +; RV64-BITS-256-NEXT: vrgather.vv v9, v18, v12 +; RV64-BITS-256-NEXT: vrgather.vv v8, v19, v12 +; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; RV64-BITS-256-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv32i1: ; RV64-BITS-512: # %bb.0: +; RV64-BITS-512-NEXT: csrr a0, vlenb +; RV64-BITS-512-NEXT: addi a0, a0, -1 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vid.v v8 +; RV64-BITS-512-NEXT: vrsub.vx v12, v8, a0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-512-NEXT: vmv.v.i v8, 0 -; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: slli a0, a0, 2 -; RV64-BITS-512-NEXT: addi a0, a0, -1 -; RV64-BITS-512-NEXT: vid.v v12 -; RV64-BITS-512-NEXT: vrsub.vx v12, v12, a0 -; RV64-BITS-512-NEXT: vrgather.vv v16, v8, v12 -; RV64-BITS-512-NEXT: vand.vi v8, v16, 1 +; RV64-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0 +; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vrgather.vv v11, v16, v12 +; 
RV64-BITS-512-NEXT: vrgather.vv v10, v17, v12
+; RV64-BITS-512-NEXT: vrgather.vv v9, v18, v12
+; RV64-BITS-512-NEXT: vrgather.vv v8, v19, v12
+; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; RV64-BITS-512-NEXT: vand.vi v8, v8, 1
 ; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT: ret
 %res = call <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
@@ -498,106 +538,144 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv64i1:
 ; RV32-BITS-UNKNOWN: # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb
-; RV32-BITS-UNKNOWN-NEXT: slli a0, a0, 2
 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1
-; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT: vid.v v8
-; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v8, v8, a0
+; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v8, a0
 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v16, 0
-; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v16, v16, 1, v0
-; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v28, v16, v8
-; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v24, v20, v8
+; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0
+; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v24, v8, 1, v0
+; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v24, v16
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v25, v16
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v26, v16
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v27, v16
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v28, v16
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v29, v16
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v30, v16
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v31, v16
 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v24, 1
+; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1
 ; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0
 ; RV32-BITS-UNKNOWN-NEXT: ret
 ;
 ; RV32-BITS-256-LABEL: reverse_nxv64i1:
 ; RV32-BITS-256: # %bb.0:
+; RV32-BITS-256-NEXT: csrr a0, vlenb
+; RV32-BITS-256-NEXT: addi a0, a0, -1
+; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; RV32-BITS-256-NEXT: vid.v v8
+; RV32-BITS-256-NEXT: vrsub.vx v24, v8, a0
 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma
 ; RV32-BITS-256-NEXT: vmv.v.i v8, 0
-; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0
-; RV32-BITS-256-NEXT: csrr a0, vlenb
-; RV32-BITS-256-NEXT: slli a0, a0, 3
-; RV32-BITS-256-NEXT: addi a0, a0, -1
-; RV32-BITS-256-NEXT: vid.v v16
-; RV32-BITS-256-NEXT: vrsub.vx v16, v16, a0
-; RV32-BITS-256-NEXT: vrgather.vv v24, v8, v16
-; RV32-BITS-256-NEXT: vand.vi v8, v24, 1
+; RV32-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0
+; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV32-BITS-256-NEXT: vrgather.vv v15, v16, v24
+; RV32-BITS-256-NEXT: vrgather.vv v14, v17, v24
+; RV32-BITS-256-NEXT: vrgather.vv v13, v18, v24
+; RV32-BITS-256-NEXT: vrgather.vv v12, v19, v24
+; RV32-BITS-256-NEXT: vrgather.vv v11, v20, v24
+; RV32-BITS-256-NEXT: vrgather.vv v10, v21, v24
+; RV32-BITS-256-NEXT: vrgather.vv v9, v22, v24
+; RV32-BITS-256-NEXT: vrgather.vv v8, v23, v24
+; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; RV32-BITS-256-NEXT: vand.vi v8, v8, 1
 ; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0
 ; RV32-BITS-256-NEXT: ret
 ;
 ; RV32-BITS-512-LABEL: reverse_nxv64i1:
 ; RV32-BITS-512: # %bb.0:
 ; RV32-BITS-512-NEXT: csrr a0, vlenb
-; RV32-BITS-512-NEXT: slli a0, a0, 2
 ; RV32-BITS-512-NEXT: addi a0, a0, -1
-; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT: vid.v v8
-; RV32-BITS-512-NEXT: vrsub.vx v8, v8, a0
+; RV32-BITS-512-NEXT: vrsub.vx v24, v8, a0
 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-512-NEXT: vmv.v.i v16, 0
-; RV32-BITS-512-NEXT: vmerge.vim v16, v16, 1, v0
-; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; RV32-BITS-512-NEXT: vrgather.vv v28, v16, v8
-; RV32-BITS-512-NEXT: vrgather.vv v24, v20, v8
+; RV32-BITS-512-NEXT: vmv.v.i v8, 0
+; RV32-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0
+; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV32-BITS-512-NEXT: vrgather.vv v15, v16, v24
+; RV32-BITS-512-NEXT: vrgather.vv v14, v17, v24
+; RV32-BITS-512-NEXT: vrgather.vv v13, v18, v24
+; RV32-BITS-512-NEXT: vrgather.vv v12, v19, v24
+; RV32-BITS-512-NEXT: vrgather.vv v11, v20, v24
+; RV32-BITS-512-NEXT: vrgather.vv v10, v21, v24
+; RV32-BITS-512-NEXT: vrgather.vv v9, v22, v24
+; RV32-BITS-512-NEXT: vrgather.vv v8, v23, v24
 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-512-NEXT: vand.vi v8, v24, 1
+; RV32-BITS-512-NEXT: vand.vi v8, v8, 1
 ; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0
 ; RV32-BITS-512-NEXT: ret
 ;
 ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv64i1:
 ; RV64-BITS-UNKNOWN: # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb
-; RV64-BITS-UNKNOWN-NEXT: slli a0, a0, 2
 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1
-; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT: vid.v v8
-; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v8, v8, a0
+; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v8, a0
 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v16, 0
-; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v16, v16, 1, v0
-; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v28, v16, v8
-; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v24, v20, v8
+; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0
+; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v24, v8, 1, v0
+; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v24, v16
+; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v25, v16
+; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v26, v16
+; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v27, v16
+; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v28, v16
+; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v29, v16
+; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v30, v16
+; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v31, v16
 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v24, 1
+; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1
 ; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0
 ; RV64-BITS-UNKNOWN-NEXT: ret
 ;
 ; RV64-BITS-256-LABEL: reverse_nxv64i1:
 ; RV64-BITS-256: # %bb.0:
+; RV64-BITS-256-NEXT: csrr a0, vlenb
+; RV64-BITS-256-NEXT: addi a0, a0, -1
+; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; RV64-BITS-256-NEXT: vid.v v8
+; RV64-BITS-256-NEXT: vrsub.vx v24, v8, a0
 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma
 ; RV64-BITS-256-NEXT: vmv.v.i v8, 0
-; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64-BITS-256-NEXT: csrr a0, vlenb
-; RV64-BITS-256-NEXT: slli a0, a0, 3
-; RV64-BITS-256-NEXT: addi a0, a0, -1
-; RV64-BITS-256-NEXT: vid.v v16
-; RV64-BITS-256-NEXT: vrsub.vx v16, v16, a0
-; RV64-BITS-256-NEXT: vrgather.vv v24, v8, v16
-; RV64-BITS-256-NEXT: vand.vi v8, v24, 1
+; RV64-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0
+; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV64-BITS-256-NEXT: vrgather.vv v15, v16, v24
+; RV64-BITS-256-NEXT: vrgather.vv v14, v17, v24
+; RV64-BITS-256-NEXT: vrgather.vv v13, v18, v24
+; RV64-BITS-256-NEXT: vrgather.vv v12, v19, v24
+; RV64-BITS-256-NEXT: vrgather.vv v11, v20, v24
+; RV64-BITS-256-NEXT: vrgather.vv v10, v21, v24
+; RV64-BITS-256-NEXT: vrgather.vv v9, v22, v24
+; RV64-BITS-256-NEXT: vrgather.vv v8, v23, v24
+; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; RV64-BITS-256-NEXT: vand.vi v8, v8, 1
 ; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0
 ; RV64-BITS-256-NEXT: ret
 ;
 ; RV64-BITS-512-LABEL: reverse_nxv64i1:
 ; RV64-BITS-512: # %bb.0:
 ; RV64-BITS-512-NEXT: csrr a0, vlenb
-; RV64-BITS-512-NEXT: slli a0, a0, 2
 ; RV64-BITS-512-NEXT: addi a0, a0, -1
-; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT: vid.v v8
-; RV64-BITS-512-NEXT: vrsub.vx v8, v8, a0
+; RV64-BITS-512-NEXT: vrsub.vx v24, v8, a0
 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-512-NEXT: vmv.v.i v16, 0
-; RV64-BITS-512-NEXT: vmerge.vim v16, v16, 1, v0
-; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; RV64-BITS-512-NEXT: vrgather.vv v28, v16, v8
-; RV64-BITS-512-NEXT: vrgather.vv v24, v20, v8
+; RV64-BITS-512-NEXT: vmv.v.i v8, 0
+; RV64-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0
+; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV64-BITS-512-NEXT: vrgather.vv v15, v16, v24
+; RV64-BITS-512-NEXT: vrgather.vv v14, v17, v24
+; RV64-BITS-512-NEXT: vrgather.vv v13, v18, v24
+; RV64-BITS-512-NEXT: vrgather.vv v12, v19, v24
+; RV64-BITS-512-NEXT: vrgather.vv v11, v20, v24
+; RV64-BITS-512-NEXT: vrgather.vv v10, v21, v24
+; RV64-BITS-512-NEXT: vrgather.vv v9, v22, v24
+; RV64-BITS-512-NEXT: vrgather.vv v8, v23, v24
 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-512-NEXT: vand.vi v8, v24, 1
+; RV64-BITS-512-NEXT: vand.vi v8, v8, 1
 ; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT: ret
 %res = call <vscale x 64 x i1> @llvm.vector.reverse.nxv64i1(<vscale x 64 x i1> %a)
@@ -918,75 +996,75 @@ define <vscale x 16 x i8> @reverse_nxv16i8(<vscale x 16 x i8> %a) {
 ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv16i8:
 ; RV32-BITS-UNKNOWN: # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb
-; RV32-BITS-UNKNOWN-NEXT: slli a0, a0, 1
 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1
-; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; RV32-BITS-UNKNOWN-NEXT: vid.v v12
-; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v12, v12, a0
-; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v8, v12
-; RV32-BITS-UNKNOWN-NEXT: vmv.v.v v8, v10
+; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; RV32-BITS-UNKNOWN-NEXT: vid.v v10
+; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v12, v10, a0
+; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v8, v12
+; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v9, v12
+; RV32-BITS-UNKNOWN-NEXT: vmv2r.v v8, v10
 ; RV32-BITS-UNKNOWN-NEXT: ret
 ;
 ; RV32-BITS-256-LABEL: reverse_nxv16i8:
 ; RV32-BITS-256: # %bb.0:
 ; RV32-BITS-256-NEXT: csrr a0, vlenb
-; RV32-BITS-256-NEXT: slli a0, a0, 1
 ; RV32-BITS-256-NEXT: addi a0, a0, -1
-; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT: vid.v v10
 ; RV32-BITS-256-NEXT: vrsub.vx v12, v10, a0
-; RV32-BITS-256-NEXT: vrgather.vv v10, v8,
v12 -; RV32-BITS-256-NEXT: vmv.v.v v8, v10 +; RV32-BITS-256-NEXT: vrgather.vv v11, v8, v12 +; RV32-BITS-256-NEXT: vrgather.vv v10, v9, v12 +; RV32-BITS-256-NEXT: vmv2r.v v8, v10 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv16i8: ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: slli a0, a0, 1 ; RV32-BITS-512-NEXT: addi a0, a0, -1 -; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vid.v v10 ; RV32-BITS-512-NEXT: vrsub.vx v12, v10, a0 -; RV32-BITS-512-NEXT: vrgather.vv v10, v8, v12 -; RV32-BITS-512-NEXT: vmv.v.v v8, v10 +; RV32-BITS-512-NEXT: vrgather.vv v11, v8, v12 +; RV32-BITS-512-NEXT: vrgather.vv v10, v9, v12 +; RV32-BITS-512-NEXT: vmv2r.v v8, v10 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv16i8: ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: slli a0, a0, 1 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vid.v v12 -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v12, v12, a0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v8, v12 -; RV64-BITS-UNKNOWN-NEXT: vmv.v.v v8, v10 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vid.v v10 +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v12, v10, a0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v8, v12 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v9, v12 +; RV64-BITS-UNKNOWN-NEXT: vmv2r.v v8, v10 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv16i8: ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: slli a0, a0, 1 ; RV64-BITS-256-NEXT: addi a0, a0, -1 -; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vid.v v10 ; RV64-BITS-256-NEXT: vrsub.vx v12, v10, a0 -; RV64-BITS-256-NEXT: vrgather.vv v10, v8, v12 -; RV64-BITS-256-NEXT: vmv.v.v v8, v10 +; RV64-BITS-256-NEXT: vrgather.vv v11, v8, v12 +; RV64-BITS-256-NEXT: vrgather.vv v10, v9, v12 +; RV64-BITS-256-NEXT: vmv2r.v v8, v10 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv16i8: ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: slli a0, a0, 1 ; RV64-BITS-512-NEXT: addi a0, a0, -1 -; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vid.v v10 ; RV64-BITS-512-NEXT: vrsub.vx v12, v10, a0 -; RV64-BITS-512-NEXT: vrgather.vv v10, v8, v12 -; RV64-BITS-512-NEXT: vmv.v.v v8, v10 +; RV64-BITS-512-NEXT: vrgather.vv v11, v8, v12 +; RV64-BITS-512-NEXT: vrgather.vv v10, v9, v12 +; RV64-BITS-512-NEXT: vmv2r.v v8, v10 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv16i8( %a) ret %res @@ -996,75 +1074,87 @@ define @reverse_nxv32i8( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv32i8: ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: slli a0, a0, 2 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vid.v v16 -; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v16, a0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v8, v16 -; 
RV32-BITS-UNKNOWN-NEXT: vmv.v.v v8, v12 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vid.v v12 +; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v12, a0 +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v8, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v9, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v16 +; RV32-BITS-UNKNOWN-NEXT: vmv4r.v v8, v12 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv32i8: ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: slli a0, a0, 2 ; RV32-BITS-256-NEXT: addi a0, a0, -1 -; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vid.v v12 ; RV32-BITS-256-NEXT: vrsub.vx v16, v12, a0 -; RV32-BITS-256-NEXT: vrgather.vv v12, v8, v16 -; RV32-BITS-256-NEXT: vmv.v.v v8, v12 +; RV32-BITS-256-NEXT: vrgather.vv v15, v8, v16 +; RV32-BITS-256-NEXT: vrgather.vv v14, v9, v16 +; RV32-BITS-256-NEXT: vrgather.vv v13, v10, v16 +; RV32-BITS-256-NEXT: vrgather.vv v12, v11, v16 +; RV32-BITS-256-NEXT: vmv4r.v v8, v12 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv32i8: ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: slli a0, a0, 2 ; RV32-BITS-512-NEXT: addi a0, a0, -1 -; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vid.v v12 ; RV32-BITS-512-NEXT: vrsub.vx v16, v12, a0 -; RV32-BITS-512-NEXT: vrgather.vv v12, v8, v16 -; RV32-BITS-512-NEXT: vmv.v.v v8, v12 +; RV32-BITS-512-NEXT: vrgather.vv v15, v8, v16 +; RV32-BITS-512-NEXT: vrgather.vv v14, v9, v16 +; RV32-BITS-512-NEXT: vrgather.vv v13, v10, v16 +; RV32-BITS-512-NEXT: vrgather.vv v12, v11, v16 +; RV32-BITS-512-NEXT: vmv4r.v v8, v12 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv32i8: ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: slli a0, a0, 2 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vid.v v16 -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v16, a0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v8, v16 -; RV64-BITS-UNKNOWN-NEXT: vmv.v.v v8, v12 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vid.v v12 +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v12, a0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v8, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v9, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v16 +; RV64-BITS-UNKNOWN-NEXT: vmv4r.v v8, v12 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv32i8: ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: slli a0, a0, 2 ; RV64-BITS-256-NEXT: addi a0, a0, -1 -; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vid.v v12 ; RV64-BITS-256-NEXT: vrsub.vx v16, v12, a0 -; RV64-BITS-256-NEXT: vrgather.vv v12, v8, v16 -; RV64-BITS-256-NEXT: vmv.v.v v8, v12 +; RV64-BITS-256-NEXT: vrgather.vv v15, v8, v16 +; RV64-BITS-256-NEXT: vrgather.vv v14, v9, v16 +; 
RV64-BITS-256-NEXT: vrgather.vv v13, v10, v16 +; RV64-BITS-256-NEXT: vrgather.vv v12, v11, v16 +; RV64-BITS-256-NEXT: vmv4r.v v8, v12 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv32i8: ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: slli a0, a0, 2 ; RV64-BITS-512-NEXT: addi a0, a0, -1 -; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vid.v v12 ; RV64-BITS-512-NEXT: vrsub.vx v16, v12, a0 -; RV64-BITS-512-NEXT: vrgather.vv v12, v8, v16 -; RV64-BITS-512-NEXT: vmv.v.v v8, v12 +; RV64-BITS-512-NEXT: vrgather.vv v15, v8, v16 +; RV64-BITS-512-NEXT: vrgather.vv v14, v9, v16 +; RV64-BITS-512-NEXT: vrgather.vv v13, v10, v16 +; RV64-BITS-512-NEXT: vrgather.vv v12, v11, v16 +; RV64-BITS-512-NEXT: vmv4r.v v8, v12 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv32i8( %a) ret %res @@ -1073,80 +1163,112 @@ define @reverse_nxv32i8( %a) { define @reverse_nxv64i8( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv64i8: ; RV32-BITS-UNKNOWN: # %bb.0: +; RV32-BITS-UNKNOWN-NEXT: vmv8r.v v16, v8 ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: slli a0, a0, 2 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vid.v v16 -; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v24, v16, a0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v20, v8, v24 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v12, v24 -; RV32-BITS-UNKNOWN-NEXT: vmv8r.v v8, v16 +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vid.v v8 +; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v24, v8, a0 +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v16, v24 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v17, v24 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v18, v24 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v19, v24 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v20, v24 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v21, v24 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v22, v24 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v23, v24 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv64i8: ; RV32-BITS-256: # %bb.0: +; RV32-BITS-256-NEXT: vmv8r.v v16, v8 ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: slli a0, a0, 3 ; RV32-BITS-256-NEXT: addi a0, a0, -1 -; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; RV32-BITS-256-NEXT: vid.v v16 -; RV32-BITS-256-NEXT: vrsub.vx v24, v16, a0 -; RV32-BITS-256-NEXT: vrgather.vv v16, v8, v24 -; RV32-BITS-256-NEXT: vmv.v.v v8, v16 +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vid.v v8 +; RV32-BITS-256-NEXT: vrsub.vx v24, v8, a0 +; RV32-BITS-256-NEXT: vrgather.vv v15, v16, v24 +; RV32-BITS-256-NEXT: vrgather.vv v14, v17, v24 +; RV32-BITS-256-NEXT: vrgather.vv v13, v18, v24 +; RV32-BITS-256-NEXT: vrgather.vv v12, v19, v24 +; RV32-BITS-256-NEXT: vrgather.vv v11, v20, v24 +; RV32-BITS-256-NEXT: vrgather.vv v10, v21, v24 +; RV32-BITS-256-NEXT: vrgather.vv v9, v22, v24 +; RV32-BITS-256-NEXT: vrgather.vv v8, v23, v24 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv64i8: ; RV32-BITS-512: # %bb.0: +; RV32-BITS-512-NEXT: vmv8r.v v16, v8 ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: slli a0, a0, 2 ; RV32-BITS-512-NEXT: addi a0, a0, -1 -; RV32-BITS-512-NEXT: vsetvli 
a1, zero, e8, m4, ta, ma -; RV32-BITS-512-NEXT: vid.v v16 -; RV32-BITS-512-NEXT: vrsub.vx v24, v16, a0 -; RV32-BITS-512-NEXT: vrgather.vv v20, v8, v24 -; RV32-BITS-512-NEXT: vrgather.vv v16, v12, v24 -; RV32-BITS-512-NEXT: vmv8r.v v8, v16 +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vid.v v8 +; RV32-BITS-512-NEXT: vrsub.vx v24, v8, a0 +; RV32-BITS-512-NEXT: vrgather.vv v15, v16, v24 +; RV32-BITS-512-NEXT: vrgather.vv v14, v17, v24 +; RV32-BITS-512-NEXT: vrgather.vv v13, v18, v24 +; RV32-BITS-512-NEXT: vrgather.vv v12, v19, v24 +; RV32-BITS-512-NEXT: vrgather.vv v11, v20, v24 +; RV32-BITS-512-NEXT: vrgather.vv v10, v21, v24 +; RV32-BITS-512-NEXT: vrgather.vv v9, v22, v24 +; RV32-BITS-512-NEXT: vrgather.vv v8, v23, v24 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv64i8: ; RV64-BITS-UNKNOWN: # %bb.0: +; RV64-BITS-UNKNOWN-NEXT: vmv8r.v v16, v8 ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: slli a0, a0, 2 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vid.v v16 -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v24, v16, a0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v20, v8, v24 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v16, v12, v24 -; RV64-BITS-UNKNOWN-NEXT: vmv8r.v v8, v16 +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vid.v v8 +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v24, v8, a0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v16, v24 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v17, v24 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v18, v24 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v19, v24 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v20, v24 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v21, v24 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v22, v24 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v23, v24 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv64i8: ; RV64-BITS-256: # %bb.0: +; RV64-BITS-256-NEXT: vmv8r.v v16, v8 ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: slli a0, a0, 3 ; RV64-BITS-256-NEXT: addi a0, a0, -1 -; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; RV64-BITS-256-NEXT: vid.v v16 -; RV64-BITS-256-NEXT: vrsub.vx v24, v16, a0 -; RV64-BITS-256-NEXT: vrgather.vv v16, v8, v24 -; RV64-BITS-256-NEXT: vmv.v.v v8, v16 +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vid.v v8 +; RV64-BITS-256-NEXT: vrsub.vx v24, v8, a0 +; RV64-BITS-256-NEXT: vrgather.vv v15, v16, v24 +; RV64-BITS-256-NEXT: vrgather.vv v14, v17, v24 +; RV64-BITS-256-NEXT: vrgather.vv v13, v18, v24 +; RV64-BITS-256-NEXT: vrgather.vv v12, v19, v24 +; RV64-BITS-256-NEXT: vrgather.vv v11, v20, v24 +; RV64-BITS-256-NEXT: vrgather.vv v10, v21, v24 +; RV64-BITS-256-NEXT: vrgather.vv v9, v22, v24 +; RV64-BITS-256-NEXT: vrgather.vv v8, v23, v24 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv64i8: ; RV64-BITS-512: # %bb.0: +; RV64-BITS-512-NEXT: vmv8r.v v16, v8 ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: slli a0, a0, 2 ; RV64-BITS-512-NEXT: addi a0, a0, -1 -; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; RV64-BITS-512-NEXT: vid.v v16 -; RV64-BITS-512-NEXT: vrsub.vx v24, v16, a0 -; RV64-BITS-512-NEXT: vrgather.vv v20, v8, v24 -; RV64-BITS-512-NEXT: vrgather.vv v16, v12, v24 -; RV64-BITS-512-NEXT: 
vmv8r.v v8, v16 +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vid.v v8 +; RV64-BITS-512-NEXT: vrsub.vx v24, v8, a0 +; RV64-BITS-512-NEXT: vrgather.vv v15, v16, v24 +; RV64-BITS-512-NEXT: vrgather.vv v14, v17, v24 +; RV64-BITS-512-NEXT: vrgather.vv v13, v18, v24 +; RV64-BITS-512-NEXT: vrgather.vv v12, v19, v24 +; RV64-BITS-512-NEXT: vrgather.vv v11, v20, v24 +; RV64-BITS-512-NEXT: vrgather.vv v10, v21, v24 +; RV64-BITS-512-NEXT: vrgather.vv v9, v22, v24 +; RV64-BITS-512-NEXT: vrgather.vv v8, v23, v24 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv64i8( %a) ret %res @@ -1204,12 +1326,14 @@ define @reverse_nxv8i16( %a) { ; CHECK-LABEL: reverse_nxv8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v12, v10, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv8i16( %a) ret %res @@ -1219,13 +1343,16 @@ define @reverse_nxv16i16( %a) { ; CHECK-LABEL: reverse_nxv16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vrgather.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv16i16( %a) ret %res @@ -1234,14 +1361,21 @@ define @reverse_nxv16i16( %a) { define @reverse_nxv32i16( %a) { ; CHECK-LABEL: reverse_nxv32i16: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v24, v16, a0 -; CHECK-NEXT: vrgather.vv v16, v8, v24 -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vx v24, v8, a0 +; CHECK-NEXT: vrgather.vv v15, v16, v24 +; CHECK-NEXT: vrgather.vv v14, v17, v24 +; CHECK-NEXT: vrgather.vv v13, v18, v24 +; CHECK-NEXT: vrgather.vv v12, v19, v24 +; CHECK-NEXT: vrgather.vv v11, v20, v24 +; CHECK-NEXT: vrgather.vv v10, v21, v24 +; CHECK-NEXT: vrgather.vv v9, v22, v24 +; CHECK-NEXT: vrgather.vv v8, v23, v24 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv32i16( %a) ret %res @@ -1283,14 +1417,14 @@ define @reverse_nxv4i32( %a) { ; CHECK-LABEL: reverse_nxv4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v12, v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vmv2r.v v8, v10 ; 
CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv4i32( %a) ret %res @@ -1300,13 +1434,16 @@ define @reverse_nxv8i32( %a) { ; CHECK-LABEL: reverse_nxv8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv8i32( %a) ret %res @@ -1315,15 +1452,21 @@ define @reverse_nxv8i32( %a) { define @reverse_nxv16i32( %a) { ; CHECK-LABEL: reverse_nxv16i32: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v24, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v16, v8, v24 -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vx v24, v8, a0 +; CHECK-NEXT: vrgather.vv v15, v16, v24 +; CHECK-NEXT: vrgather.vv v14, v17, v24 +; CHECK-NEXT: vrgather.vv v13, v18, v24 +; CHECK-NEXT: vrgather.vv v12, v19, v24 +; CHECK-NEXT: vrgather.vv v11, v20, v24 +; CHECK-NEXT: vrgather.vv v10, v21, v24 +; CHECK-NEXT: vrgather.vv v9, v22, v24 +; CHECK-NEXT: vrgather.vv v8, v23, v24 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv16i32( %a) ret %res @@ -1349,14 +1492,14 @@ define @reverse_nxv2i64( %a) { ; CHECK-LABEL: reverse_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v12, v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv2i64( %a) ret %res @@ -1366,14 +1509,16 @@ define @reverse_nxv4i64( %a) { ; CHECK-LABEL: reverse_nxv4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv4i64( %a) ret %res @@ -1382,14 +1527,21 @@ define @reverse_nxv4i64( %a) { define @reverse_nxv8i64( %a) { ; CHECK-LABEL: reverse_nxv8i64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 ; 
CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v24, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v16, v8, v24 -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vx v24, v8, a0 +; CHECK-NEXT: vrgather.vv v15, v16, v24 +; CHECK-NEXT: vrgather.vv v14, v17, v24 +; CHECK-NEXT: vrgather.vv v13, v18, v24 +; CHECK-NEXT: vrgather.vv v12, v19, v24 +; CHECK-NEXT: vrgather.vv v11, v20, v24 +; CHECK-NEXT: vrgather.vv v10, v21, v24 +; CHECK-NEXT: vrgather.vv v9, v22, v24 +; CHECK-NEXT: vrgather.vv v8, v23, v24 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv8i64( %a) ret %res @@ -1451,12 +1603,14 @@ define @reverse_nxv8f16( %a) { ; CHECK-LABEL: reverse_nxv8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v12, v10, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv8f16( %a) ret %res @@ -1466,13 +1620,16 @@ define @reverse_nxv16f16( %a) { ; CHECK-LABEL: reverse_nxv16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vrgather.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv16f16( %a) ret %res @@ -1481,14 +1638,21 @@ define @reverse_nxv16f16( %a) { define @reverse_nxv32f16( %a) { ; CHECK-LABEL: reverse_nxv32f16: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v24, v16, a0 -; CHECK-NEXT: vrgather.vv v16, v8, v24 -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vx v24, v8, a0 +; CHECK-NEXT: vrgather.vv v15, v16, v24 +; CHECK-NEXT: vrgather.vv v14, v17, v24 +; CHECK-NEXT: vrgather.vv v13, v18, v24 +; CHECK-NEXT: vrgather.vv v12, v19, v24 +; CHECK-NEXT: vrgather.vv v11, v20, v24 +; CHECK-NEXT: vrgather.vv v10, v21, v24 +; CHECK-NEXT: vrgather.vv v9, v22, v24 +; CHECK-NEXT: vrgather.vv v8, v23, v24 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv32f16( %a) ret %res @@ -1530,14 +1694,14 @@ define @reverse_nxv4f32( %a) { ; CHECK-LABEL: reverse_nxv4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v12, v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, 
v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv4f32( %a) ret %res @@ -1547,13 +1711,16 @@ define @reverse_nxv8f32( %a) { ; CHECK-LABEL: reverse_nxv8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv8f32( %a) ret %res @@ -1562,15 +1729,21 @@ define @reverse_nxv8f32( %a) { define @reverse_nxv16f32( %a) { ; CHECK-LABEL: reverse_nxv16f32: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v24, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v16, v8, v24 -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vx v24, v8, a0 +; CHECK-NEXT: vrgather.vv v15, v16, v24 +; CHECK-NEXT: vrgather.vv v14, v17, v24 +; CHECK-NEXT: vrgather.vv v13, v18, v24 +; CHECK-NEXT: vrgather.vv v12, v19, v24 +; CHECK-NEXT: vrgather.vv v11, v20, v24 +; CHECK-NEXT: vrgather.vv v10, v21, v24 +; CHECK-NEXT: vrgather.vv v9, v22, v24 +; CHECK-NEXT: vrgather.vv v8, v23, v24 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv16f32( %a) ret %res @@ -1596,14 +1769,14 @@ define @reverse_nxv2f64( %a) { ; CHECK-LABEL: reverse_nxv2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vrsub.vx v12, v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v11, v8, v12 +; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv2f64( %a) ret %res @@ -1613,14 +1786,16 @@ define @reverse_nxv4f64( %a) { ; CHECK-LABEL: reverse_nxv4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: vrsub.vx v16, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgather.vv v15, v8, v16 +; CHECK-NEXT: vrgather.vv v14, v9, v16 +; CHECK-NEXT: vrgather.vv v13, v10, v16 +; CHECK-NEXT: vrgather.vv v12, v11, v16 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv4f64( %a) ret %res @@ -1629,14 +1804,21 @@ define @reverse_nxv4f64( %a) { define @reverse_nxv8f64( %a) { ; 
CHECK-LABEL: reverse_nxv8f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v24, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v16, v8, v24 -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vx v24, v8, a0 +; CHECK-NEXT: vrgather.vv v15, v16, v24 +; CHECK-NEXT: vrgather.vv v14, v17, v24 +; CHECK-NEXT: vrgather.vv v13, v18, v24 +; CHECK-NEXT: vrgather.vv v12, v19, v24 +; CHECK-NEXT: vrgather.vv v11, v20, v24 +; CHECK-NEXT: vrgather.vv v10, v21, v24 +; CHECK-NEXT: vrgather.vv v9, v22, v24 +; CHECK-NEXT: vrgather.vv v8, v23, v24 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv8f64( %a) ret %res @@ -1648,16 +1830,19 @@ define @reverse_nxv3i64( %a) { ; CHECK-LABEL: reverse_nxv3i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v12, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v16, v8, v12 -; CHECK-NEXT: vmv1r.v v8, v17 -; CHECK-NEXT: vmv1r.v v9, v18 -; CHECK-NEXT: vmv1r.v v10, v19 +; CHECK-NEXT: vrsub.vx v14, v12, a0 +; CHECK-NEXT: vrgather.vv v13, v10, v14 +; CHECK-NEXT: vrgather.vv v10, v9, v14 +; CHECK-NEXT: vmv.v.v v12, v13 +; CHECK-NEXT: vrgather.vv v15, v8, v14 +; CHECK-NEXT: vmv.v.v v13, v10 +; CHECK-NEXT: vrgather.vv v8, v11, v14 +; CHECK-NEXT: vmv.v.v v14, v15 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv3i64( %a) ret %res @@ -1667,15 +1852,23 @@ define @reverse_nxv6i64( %a) { ; CHECK-LABEL: reverse_nxv6i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v16, v16, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v24, v8, v16 -; CHECK-NEXT: vmv2r.v v8, v26 -; CHECK-NEXT: vmv2r.v v10, v28 -; CHECK-NEXT: vmv2r.v v12, v30 +; CHECK-NEXT: vrsub.vx v22, v16, a0 +; CHECK-NEXT: vrgather.vv v21, v10, v22 +; CHECK-NEXT: vrgather.vv v19, v12, v22 +; CHECK-NEXT: vrgather.vv v18, v13, v22 +; CHECK-NEXT: vrgather.vv v20, v11, v22 +; CHECK-NEXT: vmv2r.v v16, v18 +; CHECK-NEXT: vmv2r.v v18, v20 +; CHECK-NEXT: vrgather.vv v31, v8, v22 +; CHECK-NEXT: vrgather.vv v30, v9, v22 +; CHECK-NEXT: vrgather.vv v9, v14, v22 +; CHECK-NEXT: vrgather.vv v8, v15, v22 +; CHECK-NEXT: vmv2r.v v20, v30 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv6i64( %a) ret %res @@ -1697,20 +1890,28 @@ define @reverse_nxv12i64( %a) { ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: addi a1, a0, -1 -; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; RV32-NEXT: vid.v v24 -; RV32-NEXT: vrsub.vx v24, v24, a1 -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v0, v16, v24 -; RV32-NEXT: vmv4r.v v16, v4 -; RV32-NEXT: vrgatherei16.vv v0, v8, v24 -; RV32-NEXT: vmv4r.v v20, v0 +; RV32-NEXT: srli a1, a0, 3 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: vsetvli a2, zero, 
e64, m1, ta, ma +; RV32-NEXT: vid.v v20 +; RV32-NEXT: vrsub.vx v20, v20, a1 +; RV32-NEXT: vrgather.vv v31, v12, v20 +; RV32-NEXT: vrgather.vv v30, v13, v20 +; RV32-NEXT: vrgather.vv v29, v14, v20 +; RV32-NEXT: vrgather.vv v28, v15, v20 +; RV32-NEXT: vrgather.vv v27, v16, v20 +; RV32-NEXT: vrgather.vv v26, v17, v20 +; RV32-NEXT: vrgather.vv v25, v18, v20 +; RV32-NEXT: vrgather.vv v24, v19, v20 +; RV32-NEXT: vrgather.vv v15, v8, v20 +; RV32-NEXT: vrgather.vv v14, v9, v20 +; RV32-NEXT: vrgather.vv v13, v10, v20 +; RV32-NEXT: vrgather.vv v12, v11, v20 ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: addi a1, sp, 64 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: vs4r.v v4, (a0) -; RV32-NEXT: vs8r.v v16, (a1) +; RV32-NEXT: vs4r.v v12, (a0) +; RV32-NEXT: vs8r.v v24, (a1) ; RV32-NEXT: vl8re64.v v16, (a0) ; RV32-NEXT: vl8re64.v v8, (a1) ; RV32-NEXT: addi sp, s0, -80 @@ -1734,20 +1935,28 @@ define @reverse_nxv12i64( %a) { ; RV64-NEXT: sub sp, sp, a0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: addi a1, a0, -1 -; RV64-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; RV64-NEXT: vid.v v24 -; RV64-NEXT: vrsub.vx v24, v24, a1 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v0, v16, v24 -; RV64-NEXT: vmv4r.v v16, v4 -; RV64-NEXT: vrgatherei16.vv v0, v8, v24 -; RV64-NEXT: vmv4r.v v20, v0 +; RV64-NEXT: srli a1, a0, 3 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vid.v v20 +; RV64-NEXT: vrsub.vx v20, v20, a1 +; RV64-NEXT: vrgather.vv v31, v12, v20 +; RV64-NEXT: vrgather.vv v30, v13, v20 +; RV64-NEXT: vrgather.vv v29, v14, v20 +; RV64-NEXT: vrgather.vv v28, v15, v20 +; RV64-NEXT: vrgather.vv v27, v16, v20 +; RV64-NEXT: vrgather.vv v26, v17, v20 +; RV64-NEXT: vrgather.vv v25, v18, v20 +; RV64-NEXT: vrgather.vv v24, v19, v20 +; RV64-NEXT: vrgather.vv v15, v8, v20 +; RV64-NEXT: vrgather.vv v14, v9, v20 +; RV64-NEXT: vrgather.vv v13, v10, v20 +; RV64-NEXT: vrgather.vv v12, v11, v20 ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: addi a1, sp, 64 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: vs4r.v v4, (a0) -; RV64-NEXT: vs8r.v v16, (a1) +; RV64-NEXT: vs4r.v v12, (a0) +; RV64-NEXT: vs8r.v v24, (a1) ; RV64-NEXT: vl8re64.v v16, (a0) ; RV64-NEXT: vl8re64.v v8, (a1) ; RV64-NEXT: addi sp, s0, -80 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll index d0f2ce1ca800..717b3a551d21 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll @@ -446,18 +446,23 @@ define @test_vp_reverse_nxv64i8_masked( %sr ; CHECK-LABEL: test_vp_reverse_nxv64i8_masked: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 2 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v16, v16, a2 -; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v28, v8, v16 -; CHECK-NEXT: vrgatherei16.vv v24, v12, v16 +; CHECK-NEXT: vrsub.vx v24, v16, a2 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v23, v8, v24 +; CHECK-NEXT: vrgatherei16.vv v22, v9, v24 +; CHECK-NEXT: vrgatherei16.vv v21, v10, v24 +; CHECK-NEXT: vrgatherei16.vv v20, v11, v24 +; CHECK-NEXT: vrgatherei16.vv v19, v12, v24 +; CHECK-NEXT: vrgatherei16.vv v18, v13, v24 +; CHECK-NEXT: vrgatherei16.vv v17, v14, v24 +; CHECK-NEXT: vrgatherei16.vv v16, v15, v24 ; CHECK-NEXT: slli a1, a1, 3 
; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v24, a1, v0.t +; CHECK-NEXT: vslidedown.vx v8, v16, a1, v0.t ; CHECK-NEXT: ret %dst = call @llvm.experimental.vp.reverse.nxv64i8( %src, %mask, i32 %evl) ret %dst @@ -467,18 +472,23 @@ define @test_vp_reverse_nxv64i8( %src, i32 ; CHECK-LABEL: test_vp_reverse_nxv64i8: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 2 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma ; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v16, v16, a2 -; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v28, v8, v16 -; CHECK-NEXT: vrgatherei16.vv v24, v12, v16 +; CHECK-NEXT: vrsub.vx v24, v16, a2 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v23, v8, v24 +; CHECK-NEXT: vrgatherei16.vv v22, v9, v24 +; CHECK-NEXT: vrgatherei16.vv v21, v10, v24 +; CHECK-NEXT: vrgatherei16.vv v20, v11, v24 +; CHECK-NEXT: vrgatherei16.vv v19, v12, v24 +; CHECK-NEXT: vrgatherei16.vv v18, v13, v24 +; CHECK-NEXT: vrgatherei16.vv v17, v14, v24 +; CHECK-NEXT: vrgatherei16.vv v16, v15, v24 ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v24, a1 +; CHECK-NEXT: vslidedown.vx v8, v16, a1 ; CHECK-NEXT: ret %dst = call @llvm.experimental.vp.reverse.nxv64i8( %src, splat (i1 1), i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll index acf7d16bda98..8c1be2c1e979 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll @@ -232,14 +232,19 @@ define @test_vp_reverse_nxv64i1_masked( %sr ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vmerge.vim v24, v16, 1, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 2 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v0, v16, a2 -; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v20, v24, v0 -; CHECK-NEXT: vrgatherei16.vv v16, v28, v0 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrsub.vx v10, v10, a2 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v23, v24, v10 +; CHECK-NEXT: vrgatherei16.vv v22, v25, v10 +; CHECK-NEXT: vrgatherei16.vv v21, v26, v10 +; CHECK-NEXT: vrgatherei16.vv v20, v27, v10 +; CHECK-NEXT: vrgatherei16.vv v19, v28, v10 +; CHECK-NEXT: vrgatherei16.vv v18, v29, v10 +; CHECK-NEXT: vrgatherei16.vv v17, v30, v10 +; CHECK-NEXT: vrgatherei16.vv v16, v31, v10 ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: vmv1r.v v0, v8 @@ -257,20 +262,25 @@ define @test_vp_reverse_nxv64i1( %src, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 2 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vrsub.vx v16, v16, a2 -; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v28, v8, v16 -; CHECK-NEXT: vrgatherei16.vv v24, v12, v16 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vsetvli a3, zero, 
e16, m2, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vx v24, v8, a2 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v15, v16, v24 +; CHECK-NEXT: vrgatherei16.vv v14, v17, v24 +; CHECK-NEXT: vrgatherei16.vv v13, v18, v24 +; CHECK-NEXT: vrgatherei16.vv v12, v19, v24 +; CHECK-NEXT: vrgatherei16.vv v11, v20, v24 +; CHECK-NEXT: vrgatherei16.vv v10, v21, v24 +; CHECK-NEXT: vrgatherei16.vv v9, v22, v24 +; CHECK-NEXT: vrgatherei16.vv v8, v23, v24 ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v24, a1 +; CHECK-NEXT: vslidedown.vx v8, v8, a1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret