//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

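// Memory-effect queries for recipes. Each known recipe kind is classified
// explicitly; anything not listed falls through to the conservative default
// of "may access memory / may have side effects".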
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenMemoryInstructionSC: {
    return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
  }
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
    return false;
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPWidenMemoryInstructionSC:
    assert(cast<VPWidenMemoryInstructionRecipe>(this)
                   ->getIngredient()
                   .mayHaveSideEffects() == mayWriteToMemory() &&
           "mayHaveSideEffects result for ingredient differs from this "
           "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

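// Update the exit-block phi fed by this live-out: the incoming value is taken
// from the last lane of the final unrolled part, or from lane 0 if the exit
// value is uniform after vectorization.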
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   State.Builder.GetInsertBlock());
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insert(InsertPos->getIterator(), this);
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  Parent = &BB;
  BB.getRecipeList().insert(I, this);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  Parent = InsertPos->getParent();
  Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

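// Emit the IR for one unrolled part of this VPInstruction. Opcodes that only
// materialize something for part 0 (branches, the canonical IV increment)
// return the part-0 value or nullptr for the remaining parts.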
Value *VPInstruction::generateInstruction(VPTransformState &State,
                                          unsigned Part) {
  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(DL);

  if (Instruction::isBinaryOp(getOpcode())) {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case VPInstruction::ICmpULE: {
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    return Builder.CreateICmpULE(IV, TC, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
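  // With tail-folding, the active lane mask for the next iteration is computed
  // from the current IV and "TC > VF * UF ? TC - VF * UF : 0", so the
  // subtraction below must not wrap (see D142109).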
  case VPInstruction::CalculateTripCountMinusVF: {
    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::CanonicalIVIncrement: {
    if (Part == 0) {
      auto *Phi = State.get(getOperand(0), 0);
      // The loop step is equal to the vectorization factor (num of SIMD
      // elements) times the unroll factor (num of SIMD instructions).
      Value *Step =
          createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
      return Builder.CreateAdd(Phi, Step, Name, hasNoUnsignedWrap(),
                               hasNoSignedWrap());
    }
    return State.get(this, 0);
  }

  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), false);
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part);
    Value *TC = State.get(getOperand(1), Part);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(FMF);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *GeneratedValue = generateInstruction(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generateInstruction must produce a value");
    State.set(this, GeneratedValue, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::ICmpULE:
    O << "icmp ule";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::CanonicalIVIncrement:
    O << "VF * UF +";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  O << FMF;
  printFlags(O);
  printOperands(O, SlotTracker);

  if (DL) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
  // Make sure the VPInstruction is a floating-point operation.
  assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
          Opcode == Instruction::FCmp) &&
         "this op can't take fast-math flags");
  FMF = FMFNew;
}

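// Widen a call: per unroll part, either call a vector overload of the
// intrinsic (VectorIntrinsicID) or the pre-selected vector library function
// (Variant), passing scalar arguments through unwidened where required.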
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFromInst(&CI);

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) {
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    }
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (VectorIntrinsicID == Intrinsic::not_intrinsic ||
          !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), Part);
      else
        Arg = State.get(I.value(), VPIteration(0, 0));
      if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (VectorIntrinsicID != Intrinsic::not_intrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  auto &I = *cast<SelectInst>(getUnderlyingInstr());
  State.setDebugLocFromInst(&I);

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, &I);
  }
}

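// Debug printing of the IR flags (exact, nuw/nsw, fast-math, inbounds)
// carried by a VPRecipeWithIRFlags.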
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

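// Widen a single scalar instruction: emit one vector instruction per unroll
// part. Opcodes that have a dedicated recipe (calls, branches, phis, GEPs,
// selects) must never reach this point.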
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    State.setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, &I);
    }

    break;
  }
  case Instruction::Freeze: {
    State.setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, &I);
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  const Instruction *UI = getUnderlyingInstr();
  O << " = " << UI->getOpcodeName();
  printFlags(O);
  if (auto *Cmp = dyn_cast<CmpInst>(UI))
    O << Cmp->getPredicate() << " ";
  printOperands(O, SlotTracker);
}
#endif

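// Widen a cast: cast each unroll part's vector operand to the vector form of
// the recipe's result type.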
void VPWidenCastRecipe::execute(VPTransformState &State) {
  auto *I = cast_or_null<Instruction>(getUnderlyingValue());
  if (I)
    State.setDebugLocFromInst(I);
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *A = State.get(getOperand(0), Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, I);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}

void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\"  ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getCanonicalIV()->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);

  if (IndDesc.getStep()->getType() != ResultTy)
    O << " (truncated to " << *ResultTy << ")";
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFromInst(Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // and are essentially undef are taken from In0.
  VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  Phi->printAsOperand(O, false);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}

void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc->IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

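// Generate the branch guarding a predicated replicate region: branch on the
// mask bit of the current lane, or on an all-true condition if no block-in
// mask is attached to this recipe.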
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

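// Create the phi that merges the value produced in the predicated block (the
// scalar value, or the vector it was inserted into) with the value flowing
// around the predicated block from the predicating block.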
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";

  if (!isStore()) {
    getVPSingleValue()->printAsOperand(O, SlotTracker);
    O << " = ";
  }
  O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";

  printOperands(O, SlotTracker);
}
#endif

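// Create the scalar phi for the canonical induction, starting at the start
// value; all unroll parts share the same phi.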
void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(
      Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(DL);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION";
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
    Type *Ty) const {
  // The types must match and it must be an integer induction.
  if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
  return IsScalarAfterVectorization &&
         (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

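// Expand the wrapped SCEV expression to IR at the current insert point,
// record the expansion, and make the single expanded value available to all
// unroll parts.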
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

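// Widen the canonical induction: splat the scalar canonical IV and add a
// per-part step vector, so each lane holds its own index.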
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

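// Create the phi for a first-order recurrence. For vector VFs the scalar start
// value is inserted into the last lane of the vector coming from the
// preheader.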
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(
      VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

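// Create the reduction phi(s). The incoming value from the preheader is the
// start value for the first unroll part and the recurrence identity for the
// remaining parts.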
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  PHINode *PN = cast<PHINode>(getUnderlyingValue());
  auto &Builder = State.Builder;

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart =
        PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

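// Widen a phi in the VPlan-native path by creating an empty vector phi; its
// incoming values are filled in once all vector code has been generated.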
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  // Currently we enter here in the VPlan-native path for non-induction
  // PHIs where all control flow is uniform. We simply widen these PHIs.
  // Create a vector phi with no operands - the vector phi operands will be
  // set at the end of vector code generation.
  VPBasicBlock *Parent = getParent();
  VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
  unsigned StartIdx = 0;
  // For phis in header blocks of loop regions, use the index of the value
  // coming from the preheader.
  if (LoopRegion->getEntryBasicBlock() == Parent) {
    for (unsigned I = 0; I < getNumOperands(); ++I) {
      if (getIncomingBlock(I) ==
          LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
        StartIdx = I;
    }
  }
  Value *Op0 = State.get(getOperand(StartIdx), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
  // values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(DL);
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif