[VPlan] Implement VPBlendRecipe::computeCost.

Implement VPBlendRecipe::computeCost. VPBlendRecipe is currently also
used if only the first lane is used.

This also requires pre-computing costs for forced scalars and
instructions considered profitable to scalarize. For those, the cost
will be computed separately in the legacy cost model. This will also be
needed when implementing VPReplicateRecipe::computeCost.
This commit is contained in:
Florian Hahn 2024-10-08 21:33:41 +01:00
parent a199fb1229
commit 36fc291b6e
No known key found for this signature in database
GPG Key ID: DE52A1D3AE255351
5 changed files with 528 additions and 15 deletions

View File

@ -961,6 +961,8 @@ using InstructionVFPair = std::pair<Instruction *, ElementCount>;
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
friend class LoopVectorizationPlanner;
public:
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
PredicatedScalarEvolution &PSE, LoopInfo *LI,
@ -7263,6 +7265,32 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
Cost += BranchCost;
}
// Pre-compute costs for instructions that are forced-scalar or profitable to
// scalarize. Their costs will be computed separately in the legacy cost
// model.
for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
continue;
CostCtx.SkipCostComputation.insert(ForcedScalar);
InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
LLVM_DEBUG({
dbgs() << "Cost of " << ForcedCost << " for VF " << VF
<< ": forced scalar " << *ForcedScalar << "\n";
});
Cost += ForcedCost;
}
for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
continue;
CostCtx.SkipCostComputation.insert(Scalarized);
LLVM_DEBUG({
dbgs() << "Cost of " << ScalarCost << " for VF " << VF
<< ": profitable to scalarize " << *Scalarized << "\n";
});
Cost += ScalarCost;
}
return Cost;
}

View File

@ -2241,6 +2241,10 @@ public:
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
/// Return the cost of this VPBlendRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,

View File

@ -1896,6 +1896,22 @@ void VPBlendRecipe::execute(VPTransformState &State) {
State.set(this, Result, OnlyFirstLaneUsed);
}
/// Compute the cost of this blend for vectorization factor \p VF, using the
/// TTI and type information carried by \p Ctx. Costs are queried with the
/// reciprocal-throughput cost kind.
///
/// When only the first lane of the blend is used, the cost of a PHI is
/// returned. Otherwise the cost is that of one vector select, multiplied by
/// the number of incoming values minus one.
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  const TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;

  if (!vputils::onlyFirstLaneUsed(this)) {
    // Widened blend: the selected value has the recipe's scalar type widened
    // to VF, and the condition is an i1 widened to VF.
    Type *SelectTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
    Type *MaskTy = ToVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
    InstructionCost PerSelectCost = Ctx.TTI.getCmpSelInstrCost(
        Instruction::Select, SelectTy, MaskTy, CmpInst::BAD_ICMP_PREDICATE,
        Kind);
    return (getNumIncomingValues() - 1) * PerSelectCost;
  }

  // Only the first lane is used: handle it the same way as the legacy cost
  // model, i.e. as a PHI.
  return Ctx.TTI.getCFInstrCost(Instruction::PHI, Kind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {

View File

@ -0,0 +1,465 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-vectorize -S %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx14.0.0"
define void @test_blend_feeding_replicated_store_1(i64 %N, ptr noalias %src, ptr %dst) {
; CHECK-LABEL: define void @test_blend_feeding_replicated_store_1(
; CHECK-SAME: i64 [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[N]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP43]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP43]], 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP43]], [[TMP2]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x ptr> poison, ptr [[DST]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i1> zeroinitializer, <16 x i1> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP7]], [[TMP8]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP7]], <16 x ptr> [[BROADCAST_SPLAT]], <16 x ptr> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP9]], i32 0
; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 0
; CHECK-NEXT: store i8 0, ptr [[TMP11]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP9]], i32 1
; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
; CHECK: [[PRED_STORE_IF1]]:
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 1
; CHECK-NEXT: store i8 0, ptr [[TMP13]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP9]], i32 2
; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
; CHECK: [[PRED_STORE_IF3]]:
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 2
; CHECK-NEXT: store i8 0, ptr [[TMP15]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; CHECK: [[PRED_STORE_CONTINUE4]]:
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP9]], i32 3
; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
; CHECK: [[PRED_STORE_IF5]]:
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 3
; CHECK-NEXT: store i8 0, ptr [[TMP17]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; CHECK: [[PRED_STORE_CONTINUE6]]:
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP9]], i32 4
; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
; CHECK: [[PRED_STORE_IF7]]:
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 4
; CHECK-NEXT: store i8 0, ptr [[TMP19]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
; CHECK: [[PRED_STORE_CONTINUE8]]:
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP9]], i32 5
; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
; CHECK: [[PRED_STORE_IF9]]:
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 5
; CHECK-NEXT: store i8 0, ptr [[TMP21]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
; CHECK: [[PRED_STORE_CONTINUE10]]:
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP9]], i32 6
; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
; CHECK: [[PRED_STORE_IF11]]:
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 6
; CHECK-NEXT: store i8 0, ptr [[TMP23]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; CHECK: [[PRED_STORE_CONTINUE12]]:
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP9]], i32 7
; CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
; CHECK: [[PRED_STORE_IF13]]:
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 7
; CHECK-NEXT: store i8 0, ptr [[TMP25]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP9]], i32 8
; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
; CHECK: [[PRED_STORE_IF15]]:
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 8
; CHECK-NEXT: store i8 0, ptr [[TMP27]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE16]]
; CHECK: [[PRED_STORE_CONTINUE16]]:
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP9]], i32 9
; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
; CHECK: [[PRED_STORE_IF17]]:
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 9
; CHECK-NEXT: store i8 0, ptr [[TMP29]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
; CHECK: [[PRED_STORE_CONTINUE18]]:
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP9]], i32 10
; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
; CHECK: [[PRED_STORE_IF19]]:
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 10
; CHECK-NEXT: store i8 0, ptr [[TMP31]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]]
; CHECK: [[PRED_STORE_CONTINUE20]]:
; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP9]], i32 11
; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
; CHECK: [[PRED_STORE_IF21]]:
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 11
; CHECK-NEXT: store i8 0, ptr [[TMP33]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]]
; CHECK: [[PRED_STORE_CONTINUE22]]:
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP9]], i32 12
; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
; CHECK: [[PRED_STORE_IF23]]:
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 12
; CHECK-NEXT: store i8 0, ptr [[TMP35]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]]
; CHECK: [[PRED_STORE_CONTINUE24]]:
; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP9]], i32 13
; CHECK-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
; CHECK: [[PRED_STORE_IF25]]:
; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 13
; CHECK-NEXT: store i8 0, ptr [[TMP37]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]]
; CHECK: [[PRED_STORE_CONTINUE26]]:
; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP9]], i32 14
; CHECK-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
; CHECK: [[PRED_STORE_IF27]]:
; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 14
; CHECK-NEXT: store i8 0, ptr [[TMP39]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]]
; CHECK: [[PRED_STORE_CONTINUE28]]:
; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP9]], i32 15
; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30]]
; CHECK: [[PRED_STORE_IF29]]:
; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 15
; CHECK-NEXT: store i8 0, ptr [[TMP41]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]]
; CHECK: [[PRED_STORE_CONTINUE30]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP42]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[CONTINUE:.*]]
; CHECK: [[CONTINUE]]:
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp slt i32 [[TMP0]], 0
; CHECK-NEXT: br i1 [[CMP2_NOT]], label %[[THEN:.*]], label %[[THEN_2:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: br i1 false, label %[[THEN_2]], label %[[LOOP_LATCH]]
; CHECK: [[THEN_2]]:
; CHECK-NEXT: [[P:%.*]] = phi ptr [ null, %[[CONTINUE]] ], [ [[DST]], %[[THEN]] ]
; CHECK-NEXT: store i8 0, ptr [[P]], align 1
; CHECK-NEXT: br label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: br label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop.header
loop.header: ; preds = %loop.latch, %entry
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
%ec = icmp eq i64 %iv, %N
br i1 %ec, label %exit, label %continue
continue:
%gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
%0 = load i32, ptr %gep.src, align 4
%cmp2.not = icmp slt i32 %0, 0
br i1 %cmp2.not, label %then, label %then.2
then:
br i1 false, label %then.2, label %loop.latch
then.2:
%p = phi ptr [ null, %continue ], [ %dst, %then ]
store i8 0, ptr %p, align 1
br label %loop.latch
loop.latch:
%iv.next = add i64 %iv, 1
br label %loop.header
exit:
ret void
}
define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i1 %c.0) {
; CHECK-LABEL: define void @test_blend_feeding_replicated_store_2(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i1 [[C_0:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[C_0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
; CHECK-NEXT: [[IV:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[IV]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[GEP_SRC]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i1> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 0
; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]]
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 0
; CHECK-NEXT: store i8 [[TMP10]], ptr [[TMP9]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP7]], i32 1
; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
; CHECK: [[PRED_STORE_IF1]]:
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 1
; CHECK-NEXT: store i8 [[TMP14]], ptr [[TMP13]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP7]], i32 2
; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
; CHECK: [[PRED_STORE_IF3]]:
; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 2
; CHECK-NEXT: store i8 [[TMP18]], ptr [[TMP17]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; CHECK: [[PRED_STORE_CONTINUE4]]:
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP7]], i32 3
; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
; CHECK: [[PRED_STORE_IF5]]:
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[INDEX]], 3
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 3
; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP21]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; CHECK: [[PRED_STORE_CONTINUE6]]:
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP7]], i32 4
; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
; CHECK: [[PRED_STORE_IF7]]:
; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 4
; CHECK-NEXT: store i8 [[TMP26]], ptr [[TMP25]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
; CHECK: [[PRED_STORE_CONTINUE8]]:
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP7]], i32 5
; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
; CHECK: [[PRED_STORE_IF9]]:
; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[INDEX]], 5
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 5
; CHECK-NEXT: store i8 [[TMP30]], ptr [[TMP29]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
; CHECK: [[PRED_STORE_CONTINUE10]]:
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP7]], i32 6
; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
; CHECK: [[PRED_STORE_IF11]]:
; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[INDEX]], 6
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP32]]
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 6
; CHECK-NEXT: store i8 [[TMP34]], ptr [[TMP33]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; CHECK: [[PRED_STORE_CONTINUE12]]:
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i1> [[TMP7]], i32 7
; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
; CHECK: [[PRED_STORE_IF13]]:
; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[INDEX]], 7
; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP36]]
; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 7
; CHECK-NEXT: store i8 [[TMP38]], ptr [[TMP37]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i1> [[TMP7]], i32 8
; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
; CHECK: [[PRED_STORE_IF15]]:
; CHECK-NEXT: [[TMP40:%.*]] = add i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP40]]
; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 8
; CHECK-NEXT: store i8 [[TMP42]], ptr [[TMP41]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE16]]
; CHECK: [[PRED_STORE_CONTINUE16]]:
; CHECK-NEXT: [[TMP43:%.*]] = extractelement <16 x i1> [[TMP7]], i32 9
; CHECK-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
; CHECK: [[PRED_STORE_IF17]]:
; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[INDEX]], 9
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP44]]
; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 9
; CHECK-NEXT: store i8 [[TMP46]], ptr [[TMP45]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
; CHECK: [[PRED_STORE_CONTINUE18]]:
; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP7]], i32 10
; CHECK-NEXT: br i1 [[TMP47]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
; CHECK: [[PRED_STORE_IF19]]:
; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[INDEX]], 10
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP48]]
; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 10
; CHECK-NEXT: store i8 [[TMP50]], ptr [[TMP49]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]]
; CHECK: [[PRED_STORE_CONTINUE20]]:
; CHECK-NEXT: [[TMP51:%.*]] = extractelement <16 x i1> [[TMP7]], i32 11
; CHECK-NEXT: br i1 [[TMP51]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
; CHECK: [[PRED_STORE_IF21]]:
; CHECK-NEXT: [[TMP52:%.*]] = add i32 [[INDEX]], 11
; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP52]]
; CHECK-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 11
; CHECK-NEXT: store i8 [[TMP54]], ptr [[TMP53]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]]
; CHECK: [[PRED_STORE_CONTINUE22]]:
; CHECK-NEXT: [[TMP55:%.*]] = extractelement <16 x i1> [[TMP7]], i32 12
; CHECK-NEXT: br i1 [[TMP55]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
; CHECK: [[PRED_STORE_IF23]]:
; CHECK-NEXT: [[TMP56:%.*]] = add i32 [[INDEX]], 12
; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP56]]
; CHECK-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 12
; CHECK-NEXT: store i8 [[TMP58]], ptr [[TMP57]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]]
; CHECK: [[PRED_STORE_CONTINUE24]]:
; CHECK-NEXT: [[TMP59:%.*]] = extractelement <16 x i1> [[TMP7]], i32 13
; CHECK-NEXT: br i1 [[TMP59]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
; CHECK: [[PRED_STORE_IF25]]:
; CHECK-NEXT: [[TMP60:%.*]] = add i32 [[INDEX]], 13
; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP60]]
; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 13
; CHECK-NEXT: store i8 [[TMP62]], ptr [[TMP61]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]]
; CHECK: [[PRED_STORE_CONTINUE26]]:
; CHECK-NEXT: [[TMP63:%.*]] = extractelement <16 x i1> [[TMP7]], i32 14
; CHECK-NEXT: br i1 [[TMP63]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
; CHECK: [[PRED_STORE_IF27]]:
; CHECK-NEXT: [[TMP64:%.*]] = add i32 [[INDEX]], 14
; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP64]]
; CHECK-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 14
; CHECK-NEXT: store i8 [[TMP66]], ptr [[TMP65]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]]
; CHECK: [[PRED_STORE_CONTINUE28]]:
; CHECK-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP7]], i32 15
; CHECK-NEXT: br i1 [[TMP67]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30]]
; CHECK: [[PRED_STORE_IF29]]:
; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[INDEX]], 15
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP68]]
; CHECK-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 15
; CHECK-NEXT: store i8 [[TMP70]], ptr [[TMP69]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]]
; CHECK: [[PRED_STORE_CONTINUE30]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP71]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC1]], align 1
; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], 0
; CHECK-NEXT: br i1 [[C_1]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: br i1 [[C_0]], label %[[LOOP_LATCH]], label %[[THEN]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[P:%.*]] = phi i8 [ 1, %[[LOOP_HEADER]] ], [ 0, %[[ELSE]] ]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV1]]
; CHECK-NEXT: store i8 [[P]], ptr [[GEP_DST]], align 1
; CHECK-NEXT: br label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 100
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop.header
loop.header:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
%gep.src = getelementptr inbounds i8, ptr %src, i32 %iv
%l = load i8, ptr %gep.src, align 1
%c.1 = icmp eq i8 %l, 0
br i1 %c.1, label %then, label %else
else:
br i1 %c.0, label %loop.latch, label %then
then:
%p = phi i8 [ 1, %loop.header ], [ 0, %else ]
%gep.dst = getelementptr inbounds i8, ptr %dst, i32 %iv
store i8 %p, ptr %gep.dst, align 1
br label %loop.latch
loop.latch:
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
br i1 %ec, label %exit, label %loop.header
exit:
ret void
}
; Loop with a conditional store whose stored value comes from a phi (%p) in
; %then. The phi merges the constant 0 (edge from %loop.header) with a
; truncated division result (edge from %else.2). Both loads and the store use
; the pointer arguments directly, without indexing by %iv.
; NOTE(review): judging by the name, this is meant to cover a blend feeding a
; replicated store; no generated assertions accompany this function in this
; view -- confirm whether they need to be regenerated.
define void @test_blend_feeding_replicated_store_3(ptr noalias %src.1, ptr noalias %src.2, ptr noalias %dst, i32 %x, i64 %N, i1 %c.2) {
entry:
br label %loop.header
loop.header:
%iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ]
; %div is computed unconditionally here; its only use is the trunc in %else.2.
%l.1 = load i8, ptr %src.1, align 1
%ext = zext i8 %l.1 to i32
%mul = mul i32 %x, %ext
%div = sdiv i32 %mul, 255
; First condition: a zero byte loaded from %src.2 goes straight to the store.
%l.2 = load i8, ptr %src.2, align 1
%c.1 = icmp eq i8 %l.2, 0
br i1 %c.1, label %then, label %else.1
else.1:
; Second condition: %c.2 (a function argument) skips the store entirely.
br i1 %c.2, label %loop.latch, label %else.2
else.2:
%trunc.div = trunc i32 %div to i8
br label %then
then:
; Stored value depends on which predecessor reached this block.
%p = phi i8 [ 0, %loop.header ], [ %trunc.div, %else.2 ]
store i8 %p, ptr %dst, align 1
br label %loop.latch
loop.latch:
%iv.next = add i64 %iv, 1
; The exit test compares the pre-increment %iv (not %iv.next) with %N.
%ec = icmp eq i64 %iv, %N
br i1 %ec, label %exit, label %loop.header
exit:
ret void
}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.

View File

@ -23,10 +23,10 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
; CHECK: [[PRED_UREM_IF]]:
; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[MUL]], [[X]]
; CHECK-NEXT: [[REM:%.*]] = urem i64 [[MUL]], [[X]]
; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE]]
; CHECK: [[PRED_UREM_CONTINUE]]:
; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP3]], %[[PRED_UREM_IF]] ]
; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[REM]], %[[PRED_UREM_IF]] ]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
; CHECK: [[PRED_UREM_IF1]]:
@ -49,14 +49,14 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
; CHECK-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP4]], i64 0)
; CHECK-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP9]], i64 0)
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP14]], i64 [[TMP12]], i64 1
; CHECK-NEXT: [[P:%.*]] = select i1 [[TMP14]], i64 [[TMP12]], i64 1
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[TMP15]], i64 [[TMP13]], i64 1
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[PREDPHI]], 1
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[P]], 1
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[PREDPHI7]], 1
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP16]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[ADD]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]]
; CHECK-NEXT: store i64 0, ptr [[TMP18]], align 8
; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8
; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
@ -67,19 +67,19 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[REM:%.*]] = urem i64 [[MUL]], [[X]]
; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[REM]], i64 0)
; CHECK-NEXT: [[REM1:%.*]] = urem i64 [[MUL]], [[X]]
; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[REM1]], i64 0)
; CHECK-NEXT: br label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, %[[LOOP_HEADER]] ], [ [[SMAX]], %[[ELSE]] ]
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[P]], 1
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[ADD]]
; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[LOOP_HEADER]] ], [ [[SMAX]], %[[ELSE]] ]
; CHECK-NEXT: [[IV_NEXT:%.*]] = add i64 [[IV]], 1
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]]
; CHECK-NEXT: store i64 0, ptr [[GEP1]], align 8
; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 0
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void