mirror of https://github.com/llvm/llvm-project.git
[LoopVectorize] Remove runtime check and scalar tail loop when tail-folding.
When using tail-folding and using the predicate for both data and
control-flow (the next vector iteration's predicate is generated with the
llvm.active.lane.mask intrinsic and then tested for the backedge), the
LoopVectorizer still inserts a runtime check to see if the 'i + VF' may at
any point overflow for the given trip-count. When it does, it falls back
to a scalar epilogue loop.

We can get rid of that runtime check in the pre-header and therefore also
remove the scalar epilogue loop. This reduces code-size and avoids a
runtime check.

Consider the following loop:

    void foo(char * __restrict__ dst, char *src, unsigned long N) {
      for (unsigned long i = 0; i < N; ++i)
        dst[i] = src[i] + 42;
    }

If 'N' is e.g. ULONG_MAX, and the VF > 1, then the loop iteration counter
will overflow when calculating the predicate for the next vector iteration
at some point, because LLVM does:

    vector.ph:
      %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %N)

    vector.body:
      %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
      %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %vector.ph ], [ %active.lane.mask.next, %vector.body ]
      ...
      %index.next = add i64 %index, 16
      ; The add above may overflow, which would affect the lane mask and control flow. Hence a runtime check is needed.
      %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %N)
      %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
      br i1 %8, label %vector.body, label %for.cond.cleanup, !llvm.loop !7

The solution:

What we can do instead is calculate the predicate before incrementing
the loop iteration counter, such that the llvm.active.lane.mask is
calculated from 'i' to 'tripcount > VF ? tripcount - VF : 0', i.e.

    vector.ph:
      %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %N)
      %N_minus_VF = select %N > 16 ? %N - 16 : 0

    vector.body:
      %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
      %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %vector.ph ], [ %active.lane.mask.next, %vector.body ]
      ...
      %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index, i64 %N_minus_VF)
      %index.next = add i64 %index, %4
      ; The add above may still overflow, but this time the active.lane.mask is not affected
      %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
      br i1 %8, label %vector.body, label %for.cond.cleanup, !llvm.loop !7

For N = 20, we'd then get:

    vector.ph:
      %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %N)
      ; %active.lane.mask.entry = <1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>
      %N_minus_VF = select 20 > 16 ? 20 - 16 : 0
      ; %N_minus_VF = 4

    vector.body: (1st iteration)
      ... ; using <1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1> as predicate in the loop
      ...
      %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 4)
      ; %active.lane.mask.next = <1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>
      %index.next = add i64 0, 16
      ; %index.next = 16
      %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
      ; %8 = 1
      br i1 %8, label %vector.body, label %for.cond.cleanup, !llvm.loop !7
      ; branch to %vector.body

    vector.body: (2nd iteration)
      ... ; using <1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> as predicate in the loop
      ...
      %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 16, i64 4)
      ; %active.lane.mask.next = <0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>
      %index.next = add i64 16, 16
      ; %index.next = 32
      %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
      ; %8 = 0
      br i1 %8, label %vector.body, label %for.cond.cleanup, !llvm.loop !7
      ; branch to %for.cond.cleanup

Reviewed By: fhahn, david-arm

Differential Revision: https://reviews.llvm.org/D142109
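As a sanity check on the arithmetic above, here is a small standalone C++ model of the two schemes (an illustration added for this write-up, not part of the patch; lane0Active stands in for lane 0 of llvm.get.active.lane.mask, and VF is fixed at 16 to match the example):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static const uint64_t VF = 16;

    // Lane 0 of llvm.get.active.lane.mask(base, limit): active iff base < limit.
    static bool lane0Active(uint64_t Base, uint64_t Limit) { return Base < Limit; }

    int main() {
      // Old scheme, N = ULONG_MAX: at the last vector index the increment wraps,
      // so the next-iteration mask is computed from 0 and is wrongly all-true.
      // This is the overflow the runtime check used to guard against.
      uint64_t N = UINT64_MAX;
      uint64_t LastIndex = N & ~(VF - 1); // 0xfffffffffffffff0
      uint64_t IndexNext = LastIndex + VF; // wraps to 0
      assert(IndexNext == 0 && lane0Active(IndexNext, N)); // stale mask, loop never exits

      // New scheme: compute the mask from the *current* index against
      // N_minus_VF = N > VF ? N - VF : 0, before the increment.
      uint64_t NMinusVF = N > VF ? N - VF : 0;
      assert(!lane0Active(LastIndex, NMinusVF)); // mask is empty, loop exits correctly

      // New scheme, N = 20: the loop runs exactly ceil(20/16) = 2 iterations,
      // matching the walkthrough in the commit message.
      N = 20;
      NMinusVF = N > VF ? N - VF : 0; // 4
      uint64_t Iters = 0, Index = 0;
      bool Continue = true;
      while (Continue) {
        ++Iters;                                 // body uses mask(Index, N) lanes
        Continue = lane0Active(Index, NMinusVF); // next mask, before the increment
        Index += VF;
      }
      printf("iterations for N=20: %llu\n", (unsigned long long)Iters);
      assert(Iters == 2);
      return 0;
    }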
This commit is contained in:
parent
de111ae70a
commit
fe1b51ffee
@@ -186,6 +186,10 @@ enum class TailFoldingStyle {
   /// active.lane.mask to calculate the mask for the next iteration. If the
   /// increment overflows, the mask is no longer correct.
   DataAndControlFlow,
+  /// Use predicate to control both data and control flow, but modify
+  /// the trip count so that a runtime overflow check can be avoided
+  /// and such that the scalar epilogue loop can always be removed.
+  DataAndControlFlowWithoutRuntimeCheck
 };
 
 class TargetTransformInfo;
@@ -349,7 +349,7 @@ public:
 
   TailFoldingStyle getPreferredTailFoldingStyle() const {
     if (ST->hasSVE())
-      return TailFoldingStyle::DataAndControlFlow;
+      return TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
     return TailFoldingStyle::DataWithoutLaneMask;
   }
 
@@ -232,6 +232,25 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
                   "prefers tail-folding, don't attempt vectorization if "
                   "tail-folding fails.")));
 
+static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
+    "force-tail-folding-style", cl::desc("Force the tail folding style"),
+    cl::init(TailFoldingStyle::None),
+    cl::values(
+        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
+        clEnumValN(
+            TailFoldingStyle::Data, "data",
+            "Create lane mask for data only, using active.lane.mask intrinsic"),
+        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
+                   "data-without-lane-mask",
+                   "Create lane mask with compare/stepvector"),
+        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
+                   "Create lane mask using active.lane.mask intrinsic, and use "
+                   "it for both data and control flow"),
+        clEnumValN(
+            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+            "data-and-control-without-rt-check",
+            "Similar to data-and-control, but remove the runtime check")));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
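The new style can be forced from the command line for experimentation. A hypothetical invocation, assuming an opt build that contains this patch (depending on the target's cost-model decision, -prefer-predicate-over-epilogue=predicate-dont-vectorize may also be needed to enable tail-folding at all):

    opt -passes=loop-vectorize -mtriple=aarch64 -mattr=+sve \
        -force-tail-folding-style=data-and-control-without-rt-check -S input.ll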
@@ -1554,6 +1573,9 @@ public:
     if (!CanFoldTailByMasking)
       return TailFoldingStyle::None;
 
+    if (ForceTailFoldingStyle.getNumOccurrences())
+      return ForceTailFoldingStyle;
+
     return TTI.getPreferredTailFoldingStyle();
   }
 
@@ -1562,12 +1584,6 @@ public:
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
-  /// Returns true if were tail-folding and want to use the active lane mask
-  /// for vector loop control flow.
-  bool useActiveLaneMaskForControlFlow() const {
-    return getTailFoldingStyle() == TailFoldingStyle::DataAndControlFlow;
-  }
-
   /// Returns true if the instructions in this block requires predication
   /// for any reason, e.g. because tail folding now requires a predicate
   /// or because the block in the original loop was predicated.
@@ -2155,6 +2171,17 @@ public:
 };
 } // namespace
 
+static bool useActiveLaneMask(TailFoldingStyle Style) {
+  return Style == TailFoldingStyle::Data ||
+         Style == TailFoldingStyle::DataAndControlFlow ||
+         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+}
+
+static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
+  return Style == TailFoldingStyle::DataAndControlFlow ||
+         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+}
+
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -3020,10 +3047,12 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
   };
 
-  if (!Cost->foldTailByMasking())
+  TailFoldingStyle Style = Cost->getTailFoldingStyle();
+  if (Style == TailFoldingStyle::None)
     CheckMinIters =
         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
-  else if (VF.isScalable()) {
+  else if (VF.isScalable() &&
+           Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
     // vscale is not necessarily a power-of-2, which means we cannot guarantee
     // an overflow to zero when updating induction variables and so an
     // additional overflow check is required before entering the vector loop.
@@ -8154,8 +8183,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
 
   // If we're using the active lane mask for control flow, then we get the
   // mask from the active lane mask PHI that is cached in the VPlan.
-  TailFoldingStyle Style = CM.getTailFoldingStyle();
-  if (Style == TailFoldingStyle::DataAndControlFlow)
+  TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
+  if (useActiveLaneMaskForControlFlow(TFStyle))
     return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi();
 
   // Introduce the early-exit compare IV <= BTC to form header block mask.
@@ -8170,8 +8199,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
 
   VPBuilder::InsertPointGuard Guard(Builder);
   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-  if (Style != TailFoldingStyle::None &&
-      Style != TailFoldingStyle::DataWithoutLaneMask) {
+  if (useActiveLaneMask(TFStyle)) {
     VPValue *TC = Plan.getOrCreateTripCount();
     BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
                                      nullptr, "active.lane.mask");
@@ -8786,9 +8814,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
 
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
-  EB->appendRecipe(CanonicalIVIncrement);
-
-  if (Style == TailFoldingStyle::DataAndControlFlow) {
+  if (useActiveLaneMaskForControlFlow(Style)) {
     // Create the active lane mask instruction in the vplan preheader.
     VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
 
@@ -8803,6 +8829,26 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
 
     // Create the ActiveLaneMask instruction using the correct start values.
     VPValue *TC = Plan.getOrCreateTripCount();
+
+    VPValue *TripCount, *IncrementValue;
+    if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
+      // When avoiding a runtime check, the active.lane.mask inside the loop
+      // uses a modified trip count and the induction variable increment is
+      // done after the active.lane.mask intrinsic is called.
+      auto *TCMinusVF =
+          new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL);
+      Preheader->appendRecipe(TCMinusVF);
+      IncrementValue = CanonicalIVPHI;
+      TripCount = TCMinusVF;
+    } else {
+      // When the loop is guarded by a runtime overflow check for the loop
+      // induction variable increment by VF, we can increment the value before
+      // the get.active.lane mask and use the unmodified tripcount.
+      EB->appendRecipe(CanonicalIVIncrement);
+      IncrementValue = CanonicalIVIncrement;
+      TripCount = TC;
+    }
+
     auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
                                        {CanonicalIVIncrementParts, TC}, DL,
                                        "active.lane.mask.entry");
@@ -8817,15 +8863,21 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
     CanonicalIVIncrementParts =
         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
                                  : VPInstruction::CanonicalIVIncrementForPart,
-                          {CanonicalIVIncrement}, DL);
+                          {IncrementValue}, DL);
     EB->appendRecipe(CanonicalIVIncrementParts);
 
     auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
-                                  {CanonicalIVIncrementParts, TC}, DL,
+                                  {CanonicalIVIncrementParts, TripCount}, DL,
                                   "active.lane.mask.next");
     EB->appendRecipe(ALM);
     LaneMaskPhi->addOperand(ALM);
 
+    if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
+      // Do the increment of the canonical IV after the active.lane.mask, because
+      // that value is still based off %CanonicalIVPHI
+      EB->appendRecipe(CanonicalIVIncrement);
+    }
+
     // We have to invert the mask here because a true condition means jumping
     // to the exit block.
     auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
@@ -8835,6 +8887,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
         new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
     EB->appendRecipe(BranchBack);
   } else {
+    EB->appendRecipe(CanonicalIVIncrement);
+
     // Add the BranchOnCount VPInstruction to the latch.
     VPInstruction *BranchBack = new VPInstruction(
         VPInstruction::BranchOnCount,
@@ -790,6 +790,7 @@ public:
     SLPLoad,
     SLPStore,
    ActiveLaneMask,
+    CalculateTripCountMinusVF,
    CanonicalIVIncrement,
    CanonicalIVIncrementNUW,
    // The next two are similar to the above, but instead increment the
@@ -892,6 +893,7 @@ public:
     default:
       return false;
     case VPInstruction::ActiveLaneMask:
+    case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementNUW:
     case VPInstruction::CanonicalIVIncrementForPart:
@@ -275,6 +275,17 @@ void VPInstruction::generateInstruction(VPTransformState &State,
     }
     break;
   }
+  case VPInstruction::CalculateTripCountMinusVF: {
+    Value *ScalarTC = State.get(getOperand(0), Part);
+    Value *Step =
+        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
+    Value *Sub = Builder.CreateSub(ScalarTC, Step);
+    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
+    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
+    Value *Sel = Builder.CreateSelect(Cmp, Sub, Zero);
+    State.set(this, Sel, Part);
+    break;
+  }
   case VPInstruction::CanonicalIVIncrement:
   case VPInstruction::CanonicalIVIncrementNUW: {
     Value *Next = nullptr;
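For readers skimming the recipe above: the emitted sub/icmp/select sequence is just an unsigned saturating subtraction of the step (VF * UF, scaled by vscale for scalable vectors) from the trip count. A minimal C++ equivalent, as an illustrative sketch only, not code from the patch:

    #include <cstdint>

    // Scalar equivalent of CalculateTripCountMinusVF's expansion:
    // TC > Step ? TC - Step : 0, i.e. it never wraps below zero.
    uint64_t calculateTripCountMinusVF(uint64_t ScalarTC, uint64_t Step) {
      return ScalarTC > Step ? ScalarTC - Step : 0;
    }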
@@ -411,6 +422,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BranchOnCond:
     O << "branch-on-cond";
     break;
+  case VPInstruction::CalculateTripCountMinusVF:
+    O << "TC > VF ? TC - VF : 0";
+    break;
   case VPInstruction::CanonicalIVIncrementForPart:
     O << "VF * Part + ";
     break;
@@ -63,8 +63,8 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP27]]
 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP28]], i32 0
 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP26]], ptr [[TMP29]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 998)
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT]], i64 1002)
 ; CHECK-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP30]], i32 0
@@ -4,6 +4,9 @@
 define void @invariant_store_red_exit_is_phi(ptr %dst, ptr readonly %src, i64 %n) {
 ; CHECK-LABEL: @invariant_store_red_exit_is_phi(
 ; CHECK: vector.ph:
+; CHECK: %[[N_MINUS_VF:.*]] = sub i64 %n, %[[VSCALE_X_4:.*]]
+; CHECK: %[[CMP:.*]] = icmp ugt i64 %n, %[[VSCALE_X_4]]
+; CHECK: %[[N2:.*]] = select i1 %[[CMP]], i64 %[[N_MINUS_VF]], i64 0
 ; CHECK: %[[ACTIVE_LANE_MASK_ENTRY:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
 ; CHECK: vector.body:
 ; CHECK: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> [ %[[ACTIVE_LANE_MASK_ENTRY]], %vector.ph ], [ %[[ACTIVE_LANE_MASK_NEXT:.*]], %vector.body ]
@@ -11,7 +14,7 @@ define void @invariant_store_red_exit_is_phi(ptr %dst, ptr readonly %src, i64 %n
 ; CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0
 ; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 4 x i32> %[[VEC_PHI]], %[[LOAD]]
 ; CHECK-NEXT: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32> %[[VEC_PHI]]
-; CHECK: %[[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %{{.*}}, i64 %n)
+; CHECK: %[[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %{{.*}}, i64 %[[N2]])
 ; CHECK: middle.block:
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[SELECT]])
 ; CHECK-NEXT: store i32 %[[SUM]], ptr %dst, align 4
@@ -4,16 +4,20 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
 ; CHECK-LABEL: @trip7_i64(
+; CHECK: vector.ph:
+; CHECK: [[N_MINUS_VF:%.*]] = sub i64 7, [[VSCALE_X_VF:%.*]]
+; CHECK: [[CMP:%.*]] = icmp ugt i64 7, [[VSCALE_X_VF]]
+; CHECK: [[TRIP_COUNT:%.*]] = select i1 [[CMP]], i64 [[N_MINUS_VF]], i64 0
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ]
 ; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; CHECK: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> {{%.*}}, ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TRIP_COUNT]])
 ; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 7)
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NOT]], i32 0
 ; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body
@@ -11,22 +11,23 @@ target triple = "aarch64-unknown-linux-gnu"
 ; VPLANS-LABEL: Checking a loop in 'simple_memset'
 ; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
 ; VPLANS-NEXT: vector.ph:
-; VPLANS-NEXT:   EMIT vp<%2> = VF * Part + ir<0>
-; VPLANS-NEXT:   EMIT vp<%3> = active lane mask vp<%2> <badref>
+; VPLANS-NEXT:   EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0>
+; VPLANS-NEXT:   EMIT vp<[[NEWTC:%[0-9]+]]> = TC > VF ? TC - VF : 0 <badref>
+; VPLANS-NEXT:   EMIT vp<[[LANEMASK_ENTRY:%[0-9]+]]> = active lane mask vp<[[VF]]> <badref>
 ; VPLANS-NEXT: Successor(s): vector loop
 ; VPLANS-EMPTY:
 ; VPLANS-NEXT: <x1> vector loop: {
 ; VPLANS-NEXT:   vector.body:
-; VPLANS-NEXT:     EMIT vp<%4> = CANONICAL-INDUCTION
-; VPLANS-NEXT:     ACTIVE-LANE-MASK-PHI vp<%5> = phi vp<%3>, vp<%10>
-; VPLANS-NEXT:     vp<%6> = SCALAR-STEPS vp<%4>, ir<1>
-; VPLANS-NEXT:     CLONE ir<%gep> = getelementptr ir<%ptr>, vp<%6>
-; VPLANS-NEXT:     WIDEN store ir<%gep>, ir<%val>, vp<%5>
-; VPLANS-NEXT:     EMIT vp<%8> = VF * UF + vp<%4>
-; VPLANS-NEXT:     EMIT vp<%9> = VF * Part + vp<%8>
-; VPLANS-NEXT:     EMIT vp<%10> = active lane mask vp<%9> <badref>
-; VPLANS-NEXT:     EMIT vp<%11> = not vp<%10>
-; VPLANS-NEXT:     EMIT branch-on-cond vp<%11>
+; VPLANS-NEXT:     EMIT vp<[[INDV:%[0-9]+]]> = CANONICAL-INDUCTION
+; VPLANS-NEXT:     ACTIVE-LANE-MASK-PHI vp<[[LANEMASK_PHI:%[0-9]+]]> = phi vp<[[LANEMASK_ENTRY]]>, vp<[[LANEMASK_LOOP:%[0-9]+]]>
+; VPLANS-NEXT:     vp<[[STEP:%[0-9]+]]> = SCALAR-STEPS vp<[[INDV]]>, ir<1>
+; VPLANS-NEXT:     CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEP]]>
+; VPLANS-NEXT:     WIDEN store ir<%gep>, ir<%val>, vp<[[LANEMASK_PHI]]>
+; VPLANS-NEXT:     EMIT vp<[[INC:%[0-9]+]]> = VF * Part + vp<[[INDV]]>
+; VPLANS-NEXT:     EMIT vp<[[LANEMASK_LOOP]]> = active lane mask vp<[[INC]]> vp<[[NEWTC]]>
+; VPLANS-NEXT:     EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = VF * UF + vp<[[INDV]]>
+; VPLANS-NEXT:     EMIT vp<[[NOT:%[0-9]+]]> = not vp<[[LANEMASK_LOOP]]>
+; VPLANS-NEXT:     EMIT branch-on-cond vp<[[NOT]]>
 ; VPLANS-NEXT:   No successors
 ; VPLANS-NEXT: }
 
@@ -34,20 +35,21 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -55,17 +57,17 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -77,7 +79,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: while.end.loopexit:
 ; CHECK-NEXT: ret void
 ;
@@ -6,44 +6,46 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @trip1024_i64(i64* noalias nocapture noundef %dst, i64* noalias nocapture noundef readonly %src) #0 {
 ; CHECK-LABEL: @trip1024_i64(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 -1025, [[TMP1]]
-; CHECK-NEXT: br i1 [[TMP2]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP7]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 1024, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 1024, [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1024)
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <vscale x 2 x i64>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; CHECK-NEXT: [[TMP12:%.*]] = shl nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <vscale x 2 x i64>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD1]], [[TMP12]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP14]] to <vscale x 2 x i64>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64>* [[TMP17]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP19]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1024)
-; CHECK-NEXT: [[TMP20:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP14:%.*]] = shl nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* [[TMP17]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP18:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD1]], [[TMP14]]
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> [[TMP18]], <vscale x 2 x i64>* [[TMP19]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i1> [[TMP22]], i32 0
+; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -52,11 +54,11 @@ define void @trip1024_i64(i64* noalias nocapture noundef %dst, i64* noalias noca
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_06:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[I_06]]
-; CHECK-NEXT: [[TMP22:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[TMP22]], 1
+; CHECK-NEXT: [[TMP24:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[TMP24]], 1
 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[I_06]]
-; CHECK-NEXT: [[TMP23:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP23]], [[MUL]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP25]], [[MUL]]
 ; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX1]], align 8
 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_06]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 1024
@@ -10,46 +10,47 @@ define i32 @add_reduction_i32(i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @add_reduction_i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP14]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP15]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -61,52 +62,53 @@ define i32 @add_reduction_i32(i32* %ptr, i64 %n) #0 {
 ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]]
 ;
 ; CHECK-IN-LOOP-LABEL: @add_reduction_i32(
 ; CHECK-IN-LOOP-NEXT: entry:
 ; CHECK-IN-LOOP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-IN-LOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; CHECK-IN-LOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-IN-LOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; CHECK-IN-LOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-IN-LOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-IN-LOOP: vector.ph:
-; CHECK-IN-LOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
-; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-IN-LOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IN-LOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-IN-LOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IN-LOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-IN-LOOP-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
+; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
+; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-IN-LOOP: vector.body:
 ; CHECK-IN-LOOP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP9]]
-; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
-; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]]
-; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
-; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
-; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP10]]
+; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
+; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
+; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
+; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-IN-LOOP-NEXT: [[TMP16]] = add i32 [[TMP15]], [[VEC_PHI]]
+; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
+; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP18]]
+; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[TMP19]], i32 0
+; CHECK-IN-LOOP-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-IN-LOOP: middle.block:
 ; CHECK-IN-LOOP-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-IN-LOOP: scalar.ph:
 ; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
 ; CHECK-IN-LOOP-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK-IN-LOOP: while.body:
 ; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -118,7 +120,7 @@ define i32 @add_reduction_i32(i32* %ptr, i64 %n) #0 {
 ; CHECK-IN-LOOP-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
 ; CHECK-IN-LOOP-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-IN-LOOP: while.end.loopexit:
-; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
 ; CHECK-IN-LOOP-NEXT: ret i32 [[RED_NEXT_LCSSA]]
 ;
 entry:
@@ -142,45 +144,46 @@ define float @add_reduction_f32(float* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @add_reduction_f32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP14]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -192,51 +195,52 @@ define float @add_reduction_f32(float* %ptr, i64 %n) #0 {
; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]]
;
; CHECK-IN-LOOP-LABEL: @add_reduction_f32(
; CHECK-IN-LOOP-NEXT: entry:
; CHECK-IN-LOOP-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-IN-LOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; CHECK-IN-LOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-IN-LOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-IN-LOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-IN-LOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-IN-LOOP: vector.ph:
; CHECK-IN-LOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-IN-LOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-IN-LOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-IN-LOOP-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-IN-LOOP: vector.body:
; CHECK-IN-LOOP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP9]]
; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <vscale x 4 x float>*
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP10]]
; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP14]])
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-IN-LOOP: middle.block:
; CHECK-IN-LOOP-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-IN-LOOP: scalar.ph:
; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-IN-LOOP-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK-IN-LOOP: while.body:
; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -248,7 +252,7 @@ define float @add_reduction_f32(float* %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-IN-LOOP-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-IN-LOOP: while.end.loopexit:
; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-IN-LOOP-NEXT: ret float [[RED_NEXT_LCSSA]]
;
entry:
@@ -271,67 +275,68 @@ while.end.loopexit: ; preds = %while.body
define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
; CHECK-LABEL: @cond_xor_reduction(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 7, i32 0), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 5, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP14]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP21]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
; CHECK-NEXT: [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 7, i32 0), [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 5, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[TMP15]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <vscale x 4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[TMP14]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP21:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP22]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP24]]
; CHECK-NEXT: [[TMP25:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[TMP25]], i32 0
; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP21]])
; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP22]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP27]], 5
; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP28]], 5
; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK: if.then:
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP28]]
; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP29]]
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[RES]] = phi i32 [ [[RDX]], [[FOR_BODY]] ], [ [[XOR]], [[IF_THEN]] ]
@@ -339,69 +344,70 @@ define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RES_LCSSA]]
;
; CHECK-IN-LOOP-LABEL: @cond_xor_reduction(
; CHECK-IN-LOOP-NEXT: entry:
; CHECK-IN-LOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
; CHECK-IN-LOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-IN-LOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-IN-LOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-IN-LOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-IN-LOOP: vector.ph:
; CHECK-IN-LOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-IN-LOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-IN-LOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-IN-LOOP-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-IN-LOOP: vector.body:
; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP9]]
; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 5, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP9]]
; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP14]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 4 x i32>*
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP18]])
; CHECK-IN-LOOP-NEXT: [[TMP20]] = xor i32 [[TMP19]], [[VEC_PHI]]
; CHECK-IN-LOOP-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP24:%.*]] = extractelement <vscale x 4 x i1> [[TMP23]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP10]]
; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 5, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP10]]
; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[TMP15]], i32 0
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <vscale x 4 x i32>*
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> poison)
; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-IN-LOOP-NEXT: [[TMP21]] = xor i32 [[TMP20]], [[VEC_PHI]]
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
; CHECK-IN-LOOP-NEXT: [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-IN-LOOP: middle.block:
; CHECK-IN-LOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-IN-LOOP: scalar.ph:
; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; CHECK-IN-LOOP-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-IN-LOOP: for.body:
; CHECK-IN-LOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; CHECK-IN-LOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
; CHECK-IN-LOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
; CHECK-IN-LOOP-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-IN-LOOP-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP25]], 5
; CHECK-IN-LOOP-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-IN-LOOP-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP26]], 5
; CHECK-IN-LOOP-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK-IN-LOOP: if.then:
; CHECK-IN-LOOP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
; CHECK-IN-LOOP-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-IN-LOOP-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP26]]
; CHECK-IN-LOOP-NEXT: [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-IN-LOOP-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP27]]
; CHECK-IN-LOOP-NEXT: br label [[FOR_INC]]
; CHECK-IN-LOOP: for.inc:
; CHECK-IN-LOOP-NEXT: [[RES]] = phi i32 [ [[RDX]], [[FOR_BODY]] ], [ [[XOR]], [[IF_THEN]] ]
@@ -409,7 +415,7 @@ define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
; CHECK-IN-LOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-IN-LOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-IN-LOOP: for.end:
; CHECK-IN-LOOP-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; CHECK-IN-LOOP-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; CHECK-IN-LOOP-NEXT: ret i32 [[RES_LCSSA]]
;
entry:
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -opaque-pointers=0 -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-interleave=4 -force-vector-width=4 < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"
@@ -7,110 +8,138 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
; CHECK-LABEL: @simple_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP10]]
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12
; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 12
; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP14]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[UMAX]], [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[UMAX]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[UMAX]], [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[UMAX]], [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 16
; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[UMAX]], [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[UMAX]], [[TMP27]]
; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT11]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT13]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT15]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT10]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT12]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT14]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK22:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK23:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK24:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK25:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX6]], 0
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX6]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8
; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0
; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1
; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX6]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 12
; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0
; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1
; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX6]], [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP25]]
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP30]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP31]], i32 0
; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP36]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0
; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4
; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0
; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1
; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 4
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP31]], i64 [[TMP38]]
; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT12]], <vscale x 4 x i32>* [[TMP40]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]])
; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP31]], i64 [[TMP42]]
; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT14]], <vscale x 4 x i32>* [[TMP44]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
; CHECK-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 12
; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[TMP31]], i64 [[TMP46]]
; CHECK-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT16]], <vscale x 4 x i32>* [[TMP48]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK10]])
; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 16
; CHECK-NEXT: [[INDEX_NEXT17]] = add i64 [[INDEX6]], [[TMP50]]
; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4
; CHECK-NEXT: [[INDEX_PART_NEXT19:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP52]]
; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 8
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0
; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1
; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]]
; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 12
; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0
; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1
; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]]
; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP31]]
; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP36]]
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP41]]
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP46]]
; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, i32* [[TMP47]], i32 0
; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP52]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 8
; CHECK-NEXT: [[INDEX_PART_NEXT20:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP54]]
; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 12
; CHECK-NEXT: [[INDEX_PART_NEXT21:%.*]] = add i64 [[INDEX_NEXT17]], [[TMP56]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK22]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT17]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK23]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT19]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK24]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT20]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK25]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT21]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP57:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK22]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP58:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK23]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP59:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK24]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP60:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK25]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP61:%.*]] = extractelement <vscale x 4 x i1> [[TMP57]], i32 0
; CHECK-NEXT: br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 4
; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP47]], i64 [[TMP54]]
; CHECK-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT11]], <vscale x 4 x i32>* [[TMP56]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
; CHECK-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 8
; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP47]], i64 [[TMP58]]
; CHECK-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT13]], <vscale x 4 x i32>* [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]])
; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 12
; CHECK-NEXT: [[TMP63:%.*]] = getelementptr i32, i32* [[TMP47]], i64 [[TMP62]]
; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT15]], <vscale x 4 x i32>* [[TMP64]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
; CHECK-NEXT: [[TMP65:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP65]], 4
; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[INDEX6]], [[TMP66]]
; CHECK-NEXT: [[TMP68:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP69:%.*]] = mul i64 [[TMP68]], 8
; CHECK-NEXT: [[TMP70:%.*]] = add i64 [[INDEX6]], [[TMP69]]
; CHECK-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 12
; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[INDEX6]], [[TMP72]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP15]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP67]], i64 [[TMP20]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP70]], i64 [[TMP25]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP73]], i64 [[TMP30]])
; CHECK-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 16
; CHECK-NEXT: [[INDEX_NEXT19]] = add i64 [[INDEX6]], [[TMP75]]
; CHECK-NEXT: [[TMP76:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP77:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP78:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP79:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT18]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP80:%.*]] = extractelement <vscale x 4 x i1> [[TMP76]], i32 0
; CHECK-NEXT: br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: ret void
;
entry:
br label %while.body
@@ -131,140 +160,175 @@ define void @cond_memset(i32 %val, i32* noalias readonly %cond_ptr, i32* noalias
; CHECK-LABEL: @cond_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP10]]
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12
; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 12
; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP14]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[UMAX]], [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[UMAX]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[UMAX]], [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[UMAX]], [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 16
; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[UMAX]], [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[UMAX]], [[TMP27]]
; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT14]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT16]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT18]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT13]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT15]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT17]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT20:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK25:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK26:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK27:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK10:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK28:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX6]], 0
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0
|
||||
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1
|
||||
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX6]], [[TMP19]]
|
||||
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8
|
||||
; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 0
|
||||
; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1
|
||||
; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX6]], [[TMP24]]
|
||||
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 12
|
||||
; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP27]], 0
|
||||
; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1
|
||||
; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX6]], [[TMP29]]
|
||||
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP15]]
|
||||
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP20]]
|
||||
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP25]]
|
||||
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP30]]
|
||||
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, i32* [[TMP31]], i32 0
|
||||
; CHECK-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP35]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP36]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison)
|
||||
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0
|
||||
; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4
|
||||
; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0
|
||||
; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1
|
||||
; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]]
|
||||
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 4
|
||||
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[TMP31]], i64 [[TMP38]]
|
||||
; CHECK-NEXT: [[TMP40:%.*]] = bitcast i32* [[TMP39]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP40]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8
|
||||
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[TMP31]], i64 [[TMP42]]
|
||||
; CHECK-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP44]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 12
|
||||
; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[TMP31]], i64 [[TMP46]]
|
||||
; CHECK-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP48]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 4 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP49:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP50:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD11]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP51:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD12]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD13]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP15]]
|
||||
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP20]]
|
||||
; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP25]]
|
||||
; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP30]]
|
||||
; CHECK-NEXT: [[TMP57:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i1> [[TMP49]], <vscale x 4 x i1> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP58:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i1> [[TMP50]], <vscale x 4 x i1> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP59:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i1> [[TMP51]], <vscale x 4 x i1> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP60:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK10]], <vscale x 4 x i1> [[TMP52]], <vscale x 4 x i1> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, i32* [[TMP53]], i32 0
|
||||
; CHECK-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP62]], i32 4, <vscale x 4 x i1> [[TMP57]])
|
||||
; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 4
|
||||
; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[TMP53]], i64 [[TMP64]]
|
||||
; CHECK-NEXT: [[TMP66:%.*]] = bitcast i32* [[TMP65]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT15]], <vscale x 4 x i32>* [[TMP66]], i32 4, <vscale x 4 x i1> [[TMP58]])
|
||||
; CHECK-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 8
|
||||
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP53]], i64 [[TMP68]]
|
||||
; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT17]], <vscale x 4 x i32>* [[TMP70]], i32 4, <vscale x 4 x i1> [[TMP59]])
|
||||
; CHECK-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 12
|
||||
; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP53]], i64 [[TMP72]]
|
||||
; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT19]], <vscale x 4 x i32>* [[TMP74]], i32 4, <vscale x 4 x i1> [[TMP60]])
|
||||
; CHECK-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 16
|
||||
; CHECK-NEXT: [[INDEX_NEXT20]] = add i64 [[INDEX6]], [[TMP76]]
|
||||
; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 4
|
||||
; CHECK-NEXT: [[INDEX_PART_NEXT22:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP78]]
|
||||
; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 8
|
||||
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0
|
||||
; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1
|
||||
; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]]
|
||||
; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 12
|
||||
; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0
|
||||
; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1
|
||||
; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]]
|
||||
; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, i32* [[COND_PTR:%.*]], i64 [[TMP31]]
|
||||
; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP36]]
|
||||
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP41]]
|
||||
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[TMP46]]
|
||||
; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, i32* [[TMP47]], i32 0
|
||||
; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP52]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP54:%.*]] = mul i64 [[TMP53]], 4
|
||||
; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP47]], i64 [[TMP54]]
|
||||
; CHECK-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP56]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 8
|
||||
; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP47]], i64 [[TMP58]]
|
||||
; CHECK-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 12
|
||||
; CHECK-NEXT: [[TMP63:%.*]] = getelementptr i32, i32* [[TMP47]], i64 [[TMP62]]
|
||||
; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP64]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP65:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP66:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD10]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP67:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD11]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP68:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD12]], zeroinitializer
|
||||
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP31]]
|
||||
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP36]]
|
||||
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP41]]
|
||||
; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[TMP46]]
|
||||
; CHECK-NEXT: [[TMP73:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP65]], <vscale x 4 x i1> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP74:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i1> [[TMP66]], <vscale x 4 x i1> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP75:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i1> [[TMP67]], <vscale x 4 x i1> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP76:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i1> [[TMP68]], <vscale x 4 x i1> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP77:%.*]] = getelementptr i32, i32* [[TMP69]], i32 0
|
||||
; CHECK-NEXT: [[TMP78:%.*]] = bitcast i32* [[TMP77]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP78]], i32 4, <vscale x 4 x i1> [[TMP73]])
|
||||
; CHECK-NEXT: [[TMP79:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 8
|
||||
; CHECK-NEXT: [[INDEX_PART_NEXT23:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP80]]
|
||||
; CHECK-NEXT: [[TMP81:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP81]], 12
|
||||
; CHECK-NEXT: [[INDEX_PART_NEXT24:%.*]] = add i64 [[INDEX_NEXT20]], [[TMP82]]
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK25]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT20]], i64 [[UMAX]])
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK26]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT22]], i64 [[UMAX]])
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK27]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT23]], i64 [[UMAX]])
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK28]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT24]], i64 [[UMAX]])
|
||||
; CHECK-NEXT: [[TMP83:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK25]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
|
||||
; CHECK-NEXT: [[TMP84:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK26]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
|
||||
; CHECK-NEXT: [[TMP85:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK27]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
|
||||
; CHECK-NEXT: [[TMP86:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK28]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
|
||||
; CHECK-NEXT: [[TMP87:%.*]] = extractelement <vscale x 4 x i1> [[TMP83]], i32 0
|
||||
; CHECK-NEXT: br i1 [[TMP87]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
|
||||
; CHECK-NEXT: [[TMP80:%.*]] = mul i64 [[TMP79]], 4
|
||||
; CHECK-NEXT: [[TMP81:%.*]] = getelementptr i32, i32* [[TMP69]], i64 [[TMP80]]
|
||||
; CHECK-NEXT: [[TMP82:%.*]] = bitcast i32* [[TMP81]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT14]], <vscale x 4 x i32>* [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP74]])
|
||||
; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 8
|
||||
; CHECK-NEXT: [[TMP85:%.*]] = getelementptr i32, i32* [[TMP69]], i64 [[TMP84]]
|
||||
; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[TMP85]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT16]], <vscale x 4 x i32>* [[TMP86]], i32 4, <vscale x 4 x i1> [[TMP75]])
|
||||
; CHECK-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 12
|
||||
; CHECK-NEXT: [[TMP89:%.*]] = getelementptr i32, i32* [[TMP69]], i64 [[TMP88]]
|
||||
; CHECK-NEXT: [[TMP90:%.*]] = bitcast i32* [[TMP89]] to <vscale x 4 x i32>*
|
||||
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT18]], <vscale x 4 x i32>* [[TMP90]], i32 4, <vscale x 4 x i1> [[TMP76]])
|
||||
; CHECK-NEXT: [[TMP91:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP92:%.*]] = mul i64 [[TMP91]], 4
|
||||
; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]]
|
||||
; CHECK-NEXT: [[TMP94:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP95:%.*]] = mul i64 [[TMP94]], 8
|
||||
; CHECK-NEXT: [[TMP96:%.*]] = add i64 [[INDEX6]], [[TMP95]]
|
||||
; CHECK-NEXT: [[TMP97:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP98:%.*]] = mul i64 [[TMP97]], 12
|
||||
; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX6]], [[TMP98]]
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP15]])
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP20]])
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP96]], i64 [[TMP25]])
|
||||
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT21]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP99]], i64 [[TMP30]])
|
||||
; CHECK-NEXT: [[TMP100:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP100]], 16
|
||||
; CHECK-NEXT: [[INDEX_NEXT22]] = add i64 [[INDEX6]], [[TMP101]]
|
||||
; CHECK-NEXT: [[TMP102:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
|
||||
; CHECK-NEXT: [[TMP103:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT19]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
|
||||
; CHECK-NEXT: [[TMP104:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT20]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
|
||||
; CHECK-NEXT: [[TMP105:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT21]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
|
||||
; CHECK-NEXT: [[TMP106:%.*]] = extractelement <vscale x 4 x i1> [[TMP102]], i32 0
|
||||
; CHECK-NEXT: br i1 [[TMP106]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
|
||||
; CHECK: middle.block:
|
||||
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
|
||||
; CHECK: scalar.ph:
|
||||
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
|
||||
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
|
||||
; CHECK: while.body:
|
||||
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
||||
; CHECK-NEXT: [[COND_GEP:%.*]] = getelementptr i32, i32* [[COND_PTR]], i64 [[INDEX]]
|
||||
; CHECK-NEXT: [[COND_I32:%.*]] = load i32, i32* [[COND_GEP]], align 4
|
||||
; CHECK-NEXT: [[COND_I1:%.*]] = icmp ne i32 [[COND_I32]], 0
|
||||
; CHECK-NEXT: br i1 [[COND_I1]], label [[DO_STORE:%.*]], label [[WHILE_END]]
|
||||
; CHECK: do.store:
|
||||
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
|
||||
; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP]], align 4
|
||||
; CHECK-NEXT: br label [[WHILE_END]]
|
||||
; CHECK: while.end:
|
||||
; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
|
||||
; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
|
||||
; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
|
||||
; CHECK: while.end.loopexit:
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
br label %while.body
|
||||
|
@@ -8,20 +8,21 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-LABEL: @simple_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -29,17 +30,17 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -51,7 +52,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: ret void
;
@@ -80,6 +81,9 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], 3
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[UMAX]], 4
; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[UMAX]], 4
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
@@ -87,15 +91,15 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -131,39 +135,40 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_memcpy(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP15]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -206,48 +211,49 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP3:%.*]] = sub i64 -1, [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
; CHECK-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP11]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP7]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]])
; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i64> [[TMP12]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i64> [[TMP13]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 4, [[TMP16]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP17]], i64 0
; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i64> [[TMP13]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = mul <vscale x 4 x i64> [[TMP14]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 4, [[TMP17]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP18]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP18]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP22:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]])
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]]
; CHECK-NEXT: [[TMP23:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 4 x i1> [[TMP22]], i32 0
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 4 x i1> [[TMP23]], i32 0
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -287,40 +293,41 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali
; CHECK-LABEL: @simple_gather_scatter(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP15]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@ -365,39 +372,40 @@ while.end.loopexit: ; preds = %while.body
|
||||
define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
|
||||
; CHECK-LABEL: @uniform_load(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
|
||||
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
|
||||
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
|
||||
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
|
||||
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
|
||||
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[SRC:%.*]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i64 0
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SRC:%.*]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -439,20 +447,21 @@ for.end: ; preds = %for.body, %entry
define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr noalias readonly %cond, i64 %n) #0 {
; CHECK-LABEL: @cond_uniform_load(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[SRC:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
@@ -460,27 +469,27 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i1> [[TMP12]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP17:%.*]] = or <vscale x 4 x i1> [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP17]])
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP20]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[N]])
; CHECK-NEXT: [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[TMP18]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]]
; CHECK-NEXT: [[TMP22:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 4 x i1> [[TMP22]], i32 0
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -489,14 +498,14 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
; CHECK: for.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP23]], 0
; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP24]], 0
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[SRC]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SRC]], align 4
; CHECK-NEXT: br label [[IF_END]]
; CHECK: if.end:
; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP24]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP25]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
@@ -538,20 +547,21 @@ for.end: ; preds = %for.inc, %entry
define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
; CHECK-LABEL: @uniform_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
@@ -559,18 +569,18 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -609,41 +619,42 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_fdiv(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP14:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP14]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP15:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]]
; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -688,42 +699,43 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_idiv(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP14]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]])
; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP15]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP18]]
; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[TMP19]], i32 0
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:

@@ -10,6 +9,9 @@ target triple = "aarch64-linux-gnu"
define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
; CHECK-LABEL: @uniform_load(
; CHECK: vector.ph:
; CHECK: [[N_MINUS_VF:%.*]] = sub i64 %n, [[VSCALE_X_VF:.*]]
; CHECK: [[CMP:%.*]] = icmp ugt i64 %n, [[VSCALE_X_VF]]
; CHECK: [[N2:%.*]] = select i1 [[CMP]], i64 [[N_MINUS_VF]], i64 0
; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n)
; CHECK: vector.body:
; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
@@ -23,8 +26,8 @@ define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n)
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
; CHECK-NEXT: [[STORE_PTR:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]])
; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4
; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX_NEXT]], i64 %n)
; CHECK-NEXT: [[NOT_ACTIVE_LANE_MASK:%.*]] = xor <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NOT_ACTIVE_LANE_MASK]], i32 0
; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %middle.block, label %vector.body

@@ -0,0 +1,277 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=none < %s | FileCheck %s --check-prefix=NONE
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data < %s | FileCheck %s --check-prefix=DATA
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-without-lane-mask < %s | FileCheck %s --check-prefix=DATA_NO_LANEMASK
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control-without-rt-check < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL_NO_RT_CHECK

target triple = "aarch64-unknown-linux-gnu"

; Test the different tail folding styles.

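; For orientation (editorial sketch, not part of the original test file):
; the IR input at the bottom of this file is roughly the following C, with
; the function and parameter names taken from the IR below:
;
;   void simple_memset_tailfold(int val, int *ptr, unsigned long n) {
;     unsigned long i = 0;
;     do {
;       ptr[i] = val;      // becomes the (masked) vector store checked below
;     } while (++i < n);
;   }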
define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features"="+sve" {
; NONE-LABEL: @simple_memset_tailfold(
; NONE-NEXT: entry:
; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; NONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; NONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; NONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP1]]
; NONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NONE: vector.ph:
; NONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; NONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP3]]
; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; NONE-NEXT: br label [[VECTOR_BODY:%.*]]
; NONE: vector.body:
; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; NONE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0
; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]]
; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
; NONE-NEXT: store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4
; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]]
; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NONE: middle.block:
; NONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; NONE-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NONE: scalar.ph:
; NONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; NONE-NEXT: br label [[WHILE_BODY:%.*]]
; NONE: while.body:
; NONE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; NONE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; NONE-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; NONE-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; NONE-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; NONE-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; NONE: while.end.loopexit:
; NONE-NEXT: ret void
;
; DATA-LABEL: @simple_memset_tailfold(
; DATA-NEXT: entry:
; DATA-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; DATA-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DATA-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; DATA-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DATA-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DATA: vector.ph:
; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; DATA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; DATA-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; DATA-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; DATA-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA: vector.body:
; DATA-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]])
; DATA-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
; DATA-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; DATA-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
; DATA-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA: middle.block:
; DATA-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA: scalar.ph:
; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; DATA-NEXT: br label [[WHILE_BODY:%.*]]
; DATA: while.body:
; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; DATA-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; DATA-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; DATA-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; DATA-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; DATA-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; DATA: while.end.loopexit:
; DATA-NEXT: ret void
;
; DATA_NO_LANEMASK-LABEL: @simple_memset_tailfold(
; DATA_NO_LANEMASK-NEXT: entry:
; DATA_NO_LANEMASK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; DATA_NO_LANEMASK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; DATA_NO_LANEMASK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; DATA_NO_LANEMASK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DATA_NO_LANEMASK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DATA_NO_LANEMASK: vector.ph:
; DATA_NO_LANEMASK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; DATA_NO_LANEMASK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; DATA_NO_LANEMASK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; DATA_NO_LANEMASK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA_NO_LANEMASK: vector.body:
; DATA_NO_LANEMASK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY]] ]
; DATA_NO_LANEMASK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i64 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
; DATA_NO_LANEMASK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP11]]
; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP12]])
; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP16]]
; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA_NO_LANEMASK: middle.block:
; DATA_NO_LANEMASK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA_NO_LANEMASK: scalar.ph:
; DATA_NO_LANEMASK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; DATA_NO_LANEMASK-NEXT: br label [[WHILE_BODY:%.*]]
; DATA_NO_LANEMASK: while.body:
; DATA_NO_LANEMASK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; DATA_NO_LANEMASK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; DATA_NO_LANEMASK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; DATA_NO_LANEMASK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; DATA_NO_LANEMASK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; DATA_NO_LANEMASK: while.end.loopexit:
; DATA_NO_LANEMASK-NEXT: ret void
;
; DATA_AND_CONTROL-LABEL: @simple_memset_tailfold(
; DATA_AND_CONTROL-NEXT: entry:
; DATA_AND_CONTROL-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; DATA_AND_CONTROL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; DATA_AND_CONTROL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; DATA_AND_CONTROL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DATA_AND_CONTROL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DATA_AND_CONTROL: vector.ph:
; DATA_AND_CONTROL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; DATA_AND_CONTROL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; DATA_AND_CONTROL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; DATA_AND_CONTROL-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; DATA_AND_CONTROL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; DATA_AND_CONTROL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; DATA_AND_CONTROL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA_AND_CONTROL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA_AND_CONTROL-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA_AND_CONTROL: vector.body:
; DATA_AND_CONTROL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; DATA_AND_CONTROL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
; DATA_AND_CONTROL-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA_AND_CONTROL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
; DATA_AND_CONTROL-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; DATA_AND_CONTROL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
; DATA_AND_CONTROL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA_AND_CONTROL: middle.block:
; DATA_AND_CONTROL-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA_AND_CONTROL: scalar.ph:
; DATA_AND_CONTROL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; DATA_AND_CONTROL-NEXT: br label [[WHILE_BODY:%.*]]
; DATA_AND_CONTROL: while.body:
; DATA_AND_CONTROL-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; DATA_AND_CONTROL-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; DATA_AND_CONTROL-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; DATA_AND_CONTROL-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; DATA_AND_CONTROL-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; DATA_AND_CONTROL: while.end.loopexit:
; DATA_AND_CONTROL-NEXT: ret void
;
; DATA_AND_CONTROL_NO_RT_CHECK-LABEL: @simple_memset_tailfold(
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: entry:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DATA_AND_CONTROL_NO_RT_CHECK: vector.ph:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA_AND_CONTROL_NO_RT_CHECK: vector.body:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA_AND_CONTROL_NO_RT_CHECK: middle.block:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA_AND_CONTROL_NO_RT_CHECK: scalar.ph:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; DATA_AND_CONTROL_NO_RT_CHECK: while.body:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; DATA_AND_CONTROL_NO_RT_CHECK: while.end.loopexit:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: ret void
;
entry:
br label %while.body

while.body: ; preds = %while.body, %entry
%index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
%gep = getelementptr i32, ptr %ptr, i64 %index
store i32 %val, ptr %gep
%index.next = add nsw i64 %index, 1
%cmp10 = icmp ult i64 %index.next, %n
br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit: ; preds = %while.body
ret void
}
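; Editorial note (not from the original file): in the loop above the store
; executes before the exit test, so the loop runs at least once even when
; %n is 0. That is why every vectorized version in the CHECK lines begins
; by clamping the trip count with @llvm.umax.i64(i64 %n, i64 1).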

!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = !{!"llvm.loop.interleave.count", i32 1}
!4 = !{!"llvm.loop.vectorize.width", i32 4}
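; Editorial note (not from the original file): this loop metadata drives
; the test. !1 requests predicated (tail-folded) vectorization, !2 enables
; scalable vectors, !3 pins the interleave count to 1, and !4 pins the
; vectorization width to 4, matching the <vscale x 4 x ...> and <4 x ...>
; types in the CHECK lines above.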