diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c0100389d73..707b991cda94 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2236,36 +2236,6 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
   }
 }
 
-/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
-/// converted to a narrower recipe. \p V is used by a wide recipe \p WideMember
-/// that feeds a store interleave group at index \p Idx, \p WideMember0 is the
-/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
-/// be narrowed to an index-independent load if it feeds all wide ops at all
-/// indices (checked by via the operands of the wide recipe at lane0, \p
-/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
-/// is defined at \p Idx of a load interleave group.
-static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
-                          VPValue *V, unsigned Idx) {
-  auto *DefR = V->getDefiningRecipe();
-  if (!DefR)
-    return false;
-  if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
-    return !W->getMask() &&
-           all_of(zip(WideMember0->operands(), WideMember->operands()),
-                  [V](const auto P) {
-                    // V must be as at the same places in both WideMember0 and
-                    // WideMember.
-                    const auto &[WideMember0Op, WideMemberOp] = P;
-                    return (WideMember0Op == V) == (WideMemberOp == V);
-                  });
-
-  if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
-    return IR->getInterleaveGroup()->getFactor() ==
-               IR->getInterleaveGroup()->getNumMembers() &&
-           IR->getVPValue(Idx) == V;
-  return false;
-}
-
 /// Returns true if \p IR is a full interleave group with factor and number of
 /// members both equal to \p VF. The interleave group must also access the full
 /// vector width \p VectorRegWidth.
@@ -2328,8 +2298,6 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     if (R.mayWriteToMemory() && !InterleaveR)
       return;
 
-    // All other ops are allowed, but we reject uses that cannot be converted
-    // when checking all allowed consumers (store interleave groups) below.
     if (!InterleaveR)
       continue;
 
@@ -2344,7 +2312,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
 
     // For now, we only support full interleave groups storing load interleave
    // groups.
-    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
+    if (!all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
           VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
           if (!DefR)
             return false;
@@ -2354,25 +2322,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                      IR->getInterleaveGroup()->getNumMembers() &&
                  IR->getVPValue(Op.index()) == Op.value();
         })) {
-      StoreGroups.push_back(InterleaveR);
-      continue;
-    }
-
-    // Check if all values feeding InterleaveR are matching wide recipes, which
-    // operands that can be narrowed.
-    auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
-        InterleaveR->getStoredValues()[0]->getDefiningRecipe());
-    if (!WideMember0)
       return;
-    for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
-      auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
-      if (!R || R->getOpcode() != WideMember0->getOpcode() ||
-          R->getNumOperands() > 2)
-        return;
-      if (any_of(R->operands(), [WideMember0, Idx = I, R](VPValue *V) {
-            return !canNarrowLoad(WideMember0, R, V, Idx);
-          }))
-        return;
-    }
     StoreGroups.push_back(InterleaveR);
   }
@@ -2380,41 +2330,23 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   if (StoreGroups.empty())
     return;
 
-  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
+  // Convert InterleaveGroup R to a single VPWidenLoadRecipe.
   auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
-    if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
-      // Narrow interleave group to wide load, as transformed VPlan will only
-      // process one original iteration.
-      auto *L = new VPWidenLoadRecipe(
-          *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
-          LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
-          /*Reverse=*/false, LoadGroup->getDebugLoc());
-      L->insertBefore(LoadGroup);
-      return L;
-    }
-
-    auto *WideLoad = cast<VPWidenLoadRecipe>(R);
-
-    // Narrow wide load to uniform scalar load, as transformed VPlan will only
+    auto *LoadGroup = cast<VPInterleaveRecipe>(R);
+    // Narrow interleave group to wide load, as transformed VPlan will only
     // process one original iteration.
-    auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
-                                    WideLoad->operands(), /*IsUniform*/ true);
-    N->insertBefore(WideLoad);
-    return N;
+    auto *L = new VPWidenLoadRecipe(
+        *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+        LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+        /*Reverse=*/false, LoadGroup->getDebugLoc());
+    L->insertBefore(LoadGroup);
+    return L;
   };
 
   // Narrow operation tree rooted at store groups.
   for (auto *StoreGroup : StoreGroups) {
-    VPValue *Res = nullptr;
-    if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(
-            StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
-      for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
-        WideMember0->setOperand(
-            Idx, NarrowOp(WideMember0->getOperand(Idx)->getDefiningRecipe()));
-      Res = WideMember0;
-    } else {
-      Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
-    }
+    VPValue *Res =
+        NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
 
     auto *S = new VPWidenStoreRecipe(
         *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index a859600f2ecf..674a0fc5644c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -99,17 +99,31 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = load <2 x double>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index b250edb457da..b8e75eff0367 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -19,14 +19,24 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
 ; VF2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP12:%.*]] = shl nsw i64 [[TMP10]], 1
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP12]]
-; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[TMP9:%.*]] = fneg <2 x double> [[WIDE_LOAD]]
-; VF2-NEXT:    [[TMP11:%.*]] = fneg <2 x double> [[WIDE_LOAD1]]
-; VF2-NEXT:    store <2 x double> [[TMP9]], ptr [[TMP2]], align 8
-; VF2-NEXT:    store <2 x double> [[TMP11]], ptr [[TMP5]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP12]]
+; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP13]], align 8
+; VF2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[TMP3:%.*]] = fneg <2 x double> [[STRIDED_VEC]]
+; VF2-NEXT:    [[TMP14:%.*]] = fneg <2 x double> [[STRIDED_VEC3]]
+; VF2-NEXT:    [[TMP4:%.*]] = fneg <2 x double> [[STRIDED_VEC1]]
+; VF2-NEXT:    [[TMP9:%.*]] = fneg <2 x double> [[STRIDED_VEC4]]
+; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP14]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP13]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -190,15 +200,18 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
-; VF2-NEXT:    [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
 ; VF2-NEXT:    [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
-; VF2-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; VF2-NEXT:    store <2 x i64> [[TMP8]], ptr [[TMP7]], align 8
-; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; VF2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; VF2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
+; VF2-NEXT:    [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
+; VF2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP24]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
@@ -1001,30 +1014,28 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; VF2-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
-; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i64 0
-; VF2-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT:    [[BROADCAST_SPLAT4:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
 ; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 2
-; VF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
-; VF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP11]], i64 0
-; VF2-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT5]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
+; VF2-NEXT:    [[BROADCAST_SPLAT6:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[TMP12:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
 ; VF2-NEXT:    [[TMP13:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT4]], [[BROADCAST_SPLAT6]]
 ; VF2-NEXT:    [[TMP19:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT:    [[TMP20:%.*]] = shl nsw i64 [[TMP1]], 1
 ; VF2-NEXT:    [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]]
 ; VF2-NEXT:    [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
-; VF2-NEXT:    store <2 x i64> [[TMP12]], ptr [[DATA_0]], align 8
-; VF2-NEXT:    store <2 x i64> [[TMP13]], ptr [[DATA_1]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP14:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
+; VF2-NEXT:    [[TMP15:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT4]], [[BROADCAST_SPLAT6]]
+; VF2-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP16]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA_0]], align 8
+; VF2-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC4:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC4]], ptr [[DATA_1]], align 8
+; VF2-NEXT:    [[IV_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
 ; VF2-NEXT:    br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; VF2:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index 11994ff6398f..e32f1a0859a3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -20,14 +20,23 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
-; CHECK-NEXT:    store <4 x i64> [[TMP5]], ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP10]], <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]: