//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

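// Note: the three memory/side-effect queries below are intentionally
// conservative. Any recipe kind that is not explicitly listed falls through to
// the `default` case and is reported as possibly writing/reading memory or
// having side effects.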
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenEVLSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

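// Update the live-out phi: set or add the incoming value for the predecessor
// block, which is the middle block unless the exit value is defined outside
// the vector loop region (in which case the defining block itself is used).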
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  Value *V = State.get(ExitValue, VPIteration(0, 0));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  // Currently the legacy cost model only computes costs for underlying
  // instructions. Removing the VPWidenMemoryRecipe case here would prevent
  // force-target-instruction-cost from overriding the cost of recipes with an
  // underlying instruction, which would be inconsistent with the legacy model.
  // TODO: Remove the memory-recipe case from this function when we no longer
  // need to compare to the legacy model.
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

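// cost() is the shared driver: it skips recipes whose underlying instruction
// the legacy cost model already accounted for, computes the VPlan-based cost
// via computeCost(), and lets -force-target-instruction-cost override valid
// costs of recipes that have an underlying instruction.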
InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  auto *UI = getInstructionForCost(this);
  if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost of the recipe by falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // return 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

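// Rebuild an IR-level FastMathFlags object from the flags stored in the
// recipe's FMFs field.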
FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

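// generatePerPart emits the IR for one unroll part. Binary opcodes are handled
// generically up front (including IR flag propagation via setFlags); all other
// opcodes are handled by the switch below.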
Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP directly
    // to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
                               Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
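  // CalculateTripCountMinusVF computes the bound used by the tail-folded
  // active-lane-mask check, i.e. TC > VF * UF ? TC - VF * UF : 0, emitted
  // below as a sub/icmp-ugt/select sequence.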
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to the
    // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as first argument, so we need
    // to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
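  // ComputeReductionResult folds the per-part values of a reduction into the
  // final scalar result after the vector loop: the parts are combined (or the
  // last part is taken for ordered reductions), narrowed reductions are
  // truncated/extended as needed, and an intermediate store is emitted if the
  // source loop stored the reduction value to a uniform address.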
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call and PHI opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

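// execute() drives code generation for a VPInstruction: it sets up fast-math
// flags and the debug location, then, per unroll part, either generates a
// value for every lane (generatePerLane), reuses part 0 when only the first
// part is used, or generates a single per-part value (generatePerPart).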
void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPIRInstruction::execute(VPTransformState &State) {
|
|
|
|
assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
|
|
|
|
"Only PHINodes can have extra operands");
|
|
|
|
if (getNumOperands() == 1) {
|
|
|
|
VPValue *ExitValue = getOperand(0);
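// A live-out that is uniform after vectorization takes lane 0; otherwise the
// last lane of the final unrolled part provides the incoming value for the phi.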
|
|
|
|
auto Lane = vputils::isUniformAfterVectorization(ExitValue)
|
|
|
|
? VPLane::getFirstLane()
|
|
|
|
: VPLane::getLastLaneForVF(State.VF);
|
|
|
|
auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
|
|
|
|
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
|
|
|
|
// Set insertion point in PredBB in case an extract needs to be generated.
|
|
|
|
// TODO: Model extracts explicitly.
|
|
|
|
State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
|
|
|
|
Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
|
|
|
|
auto *Phi = cast<PHINode>(&I);
|
|
|
|
Phi->addIncoming(V, PredBB);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Advance the insert point after the wrapped IR instruction. This allows
|
|
|
|
// interleaving VPIRInstructions and other recipes.
|
|
|
|
State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "IR " << I;
|
|
|
|
|
|
|
|
if (getNumOperands() != 0) {
|
|
|
|
assert(getNumOperands() == 1 && "can have at most 1 operand");
|
|
|
|
O << " (extra operand: ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
O << ")";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-09-05 10:48:29 +01:00
|
|
|
void VPWidenCallRecipe::execute(VPTransformState &State) {
|
2023-05-05 18:21:56 +01:00
|
|
|
assert(State.VF.isVector() && "not widening");
|
2024-05-01 20:48:22 +01:00
|
|
|
Function *CalledScalarFn = getCalledScalarFunction();
|
|
|
|
assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
|
2022-09-05 10:48:29 +01:00
|
|
|
"DbgInfoIntrinsic should have been dropped during VPlan construction");
|
2024-01-19 13:33:03 +00:00
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
2022-09-05 10:48:29 +01:00
|
|
|
|
2024-01-02 19:14:16 +01:00
|
|
|
bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
|
2023-11-20 13:30:03 +00:00
|
|
|
FunctionType *VFTy = nullptr;
|
|
|
|
if (Variant)
|
|
|
|
VFTy = Variant->getFunctionType();
|
2022-09-05 10:48:29 +01:00
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
2023-04-21 14:11:31 +01:00
|
|
|
SmallVector<Type *, 2> TysForDecl;
|
|
|
|
// Add return type if intrinsic is overloaded on it.
|
2024-01-02 19:14:16 +01:00
|
|
|
if (UseIntrinsic &&
|
|
|
|
isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
|
2024-05-01 20:48:22 +01:00
|
|
|
TysForDecl.push_back(VectorType::get(
|
|
|
|
CalledScalarFn->getReturnType()->getScalarType(), State.VF));
|
2022-09-05 10:48:29 +01:00
|
|
|
SmallVector<Value *, 4> Args;
|
2024-05-01 20:48:22 +01:00
|
|
|
for (const auto &I : enumerate(arg_operands())) {
|
2022-09-05 10:48:29 +01:00
|
|
|
// Some intrinsics have a scalar argument - don't replace it with a
|
|
|
|
// vector.
|
|
|
|
Value *Arg;
|
2024-01-26 11:30:35 +00:00
|
|
|
if (UseIntrinsic &&
|
|
|
|
isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
|
2022-09-05 10:48:29 +01:00
|
|
|
Arg = State.get(I.value(), VPIteration(0, 0));
|
2024-01-26 11:30:35 +00:00
|
|
|
// Some vectorized function variants may also take a scalar argument,
|
|
|
|
// e.g. linear parameters for pointers. This needs to be the scalar value
|
|
|
|
// from the start of the respective part when interleaving.
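// E.g. a variant with a linear pointer parameter receives the scalar pointer
// for lane 0 of the current part rather than a vector of pointers.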
|
|
|
|
else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
|
|
|
|
Arg = State.get(I.value(), VPIteration(Part, 0));
|
2023-11-20 13:30:03 +00:00
|
|
|
else
|
|
|
|
Arg = State.get(I.value(), Part);
|
2024-01-02 19:14:16 +01:00
|
|
|
if (UseIntrinsic &&
|
|
|
|
isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
|
2022-09-05 10:48:29 +01:00
|
|
|
TysForDecl.push_back(Arg->getType());
|
|
|
|
Args.push_back(Arg);
|
|
|
|
}
|
|
|
|
|
|
|
|
Function *VectorF;
|
2024-01-02 19:14:16 +01:00
|
|
|
if (UseIntrinsic) {
|
2022-09-05 10:48:29 +01:00
|
|
|
// Use vector version of the intrinsic.
|
|
|
|
Module *M = State.Builder.GetInsertBlock()->getModule();
|
|
|
|
VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
|
|
|
|
assert(VectorF && "Can't retrieve vector intrinsic.");
|
|
|
|
} else {
|
|
|
|
#ifndef NDEBUG
|
2022-09-09 09:41:02 +01:00
|
|
|
assert(Variant != nullptr && "Can't create vector function.");
|
2022-09-05 10:48:29 +01:00
|
|
|
#endif
|
2022-09-09 09:41:02 +01:00
|
|
|
VectorF = Variant;
|
2022-09-05 10:48:29 +01:00
|
|
|
}
|
2022-09-09 09:41:02 +01:00
|
|
|
|
2024-05-01 20:48:22 +01:00
|
|
|
auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
|
2022-09-05 10:48:29 +01:00
|
|
|
SmallVector<OperandBundleDef, 1> OpBundles;
|
2024-05-01 20:48:22 +01:00
|
|
|
if (CI)
|
|
|
|
CI->getOperandBundlesAsDefs(OpBundles);
|
|
|
|
|
2022-09-05 10:48:29 +01:00
|
|
|
CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
|
|
|
|
|
|
|
|
if (isa<FPMathOperator>(V))
|
2024-05-01 20:48:22 +01:00
|
|
|
V->copyFastMathFlags(CI);
|
2022-09-05 10:48:29 +01:00
|
|
|
|
2024-02-21 20:36:16 +00:00
|
|
|
if (!V->getType()->isVoidTy())
|
|
|
|
State.set(this, V, Part);
|
2024-05-01 20:48:22 +01:00
|
|
|
State.addMetadata(V, CI);
|
2022-09-05 10:48:29 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-01 16:26:08 +01:00
|
|
|
InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
|
|
|
|
VPCostContext &Ctx) const {
|
|
|
|
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
|
|
|
|
if (Variant) {
|
|
|
|
return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
|
|
|
|
Variant->getFunctionType()->params(),
|
|
|
|
CostKind);
|
|
|
|
}
|
|
|
|
|
|
|
|
FastMathFlags FMF;
|
|
|
|
// TODO: Manage flags via VPRecipeWithIRFlags.
|
|
|
|
if (auto *FPMO = dyn_cast_or_null<FPMathOperator>(getUnderlyingValue()))
|
|
|
|
FMF = FPMO->getFastMathFlags();
|
|
|
|
|
2024-09-02 14:00:09 +01:00
|
|
|
// Some backends analyze intrinsic arguments to determine cost. Use the
|
|
|
|
// underlying value for the operand if it has one. Otherwise try to use the
|
|
|
|
// operand of the underlying call instruction, if there is one. Otherwise
|
|
|
|
// clear Arguments.
|
|
|
|
// TODO: Rework TTI interface to be independent of concrete IR values.
|
2024-09-01 16:26:08 +01:00
|
|
|
SmallVector<const Value *> Arguments;
|
2024-09-02 14:00:09 +01:00
|
|
|
for (const auto &[Idx, Op] : enumerate(operands())) {
|
2024-09-01 16:26:08 +01:00
|
|
|
auto *V = Op->getUnderlyingValue();
|
|
|
|
if (!V) {
|
2024-09-02 14:00:09 +01:00
|
|
|
if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
|
|
|
|
Arguments.push_back(UI->getArgOperand(Idx));
|
|
|
|
continue;
|
|
|
|
}
|
2024-09-01 16:26:08 +01:00
|
|
|
Arguments.clear();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
Arguments.push_back(V);
|
|
|
|
}
|
|
|
|
|
|
|
|
Type *RetTy =
|
|
|
|
ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
|
|
|
|
SmallVector<Type *> ParamTys;
|
|
|
|
for (unsigned I = 0; I != getNumOperands(); ++I)
|
|
|
|
ParamTys.push_back(
|
|
|
|
ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
|
|
|
|
|
2024-09-02 20:47:36 +01:00
|
|
|
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
|
|
|
|
IntrinsicCostAttributes CostAttrs(
|
|
|
|
VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
|
|
|
|
dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
|
2024-09-01 16:26:08 +01:00
|
|
|
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
|
|
|
|
}
|
|
|
|
|
2022-06-28 10:34:30 +01:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN-CALL ";
|
|
|
|
|
2024-05-01 20:48:22 +01:00
|
|
|
Function *CalledFn = getCalledScalarFunction();
|
|
|
|
if (CalledFn->getReturnType()->isVoidTy())
|
2022-06-28 10:34:30 +01:00
|
|
|
O << "void ";
|
|
|
|
else {
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = ";
|
|
|
|
}
|
|
|
|
|
2024-05-01 20:48:22 +01:00
|
|
|
O << "call @" << CalledFn->getName() << "(";
|
|
|
|
interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
|
|
|
|
Op->printAsOperand(O, SlotTracker);
|
|
|
|
});
|
2022-06-28 10:34:30 +01:00
|
|
|
O << ")";
|
2022-09-01 13:14:40 +01:00
|
|
|
|
|
|
|
if (VectorIntrinsicID)
|
|
|
|
O << " (using vector intrinsic)";
|
2022-09-09 09:41:02 +01:00
|
|
|
else {
|
|
|
|
O << " (using library function";
|
|
|
|
if (Variant->hasName())
|
|
|
|
O << ": " << Variant->getName();
|
|
|
|
O << ")";
|
|
|
|
}
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN-SELECT ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = select ";
|
|
|
|
getOperand(0)->printAsOperand(O, SlotTracker);
|
|
|
|
O << ", ";
|
|
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
|
|
|
O << ", ";
|
|
|
|
getOperand(2)->printAsOperand(O, SlotTracker);
|
2023-03-10 15:28:43 +01:00
|
|
|
O << (isInvariantCond() ? " (condition is loop invariant)" : "");
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
2022-07-08 09:33:17 -07:00
|
|
|
#endif
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2022-07-08 09:33:17 -07:00
|
|
|
void VPWidenSelectRecipe::execute(VPTransformState &State) {
|
2023-09-05 15:45:14 +01:00
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
2022-07-08 09:33:17 -07:00
|
|
|
|
|
|
|
// The condition can be loop invariant but still defined inside the
|
|
|
|
// loop. This means that we can't just use the original 'cond' value.
|
|
|
|
// We have to take the 'vectorized' value and pick the first lane.
|
|
|
|
// Instcombine will make this a no-op.
|
|
|
|
auto *InvarCond =
|
2023-03-10 17:49:23 +01:00
|
|
|
isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
|
2022-07-08 09:33:17 -07:00
|
|
|
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
2023-03-10 17:49:23 +01:00
|
|
|
Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
|
2022-07-08 09:33:17 -07:00
|
|
|
Value *Op0 = State.get(getOperand(1), Part);
|
|
|
|
Value *Op1 = State.get(getOperand(2), Part);
|
|
|
|
Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
|
|
|
|
State.set(this, Sel, Part);
|
2023-09-05 15:45:14 +01:00
|
|
|
State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
|
2022-07-08 09:33:17 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-08 20:11:47 +01:00
|
|
|
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
|
|
|
|
const FastMathFlags &FMF) {
|
|
|
|
AllowReassoc = FMF.allowReassoc();
|
|
|
|
NoNaNs = FMF.noNaNs();
|
|
|
|
NoInfs = FMF.noInfs();
|
|
|
|
NoSignedZeros = FMF.noSignedZeros();
|
|
|
|
AllowReciprocal = FMF.allowReciprocal();
|
|
|
|
AllowContract = FMF.allowContract();
|
|
|
|
ApproxFunc = FMF.approxFunc();
|
|
|
|
}
|
|
|
|
|
2023-05-23 20:36:15 +01:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
|
|
|
|
switch (OpType) {
|
2023-09-02 21:45:24 +01:00
|
|
|
case OperationType::Cmp:
|
|
|
|
O << " " << CmpInst::getPredicateName(getPredicate());
|
|
|
|
break;
|
2023-12-05 15:21:59 +00:00
|
|
|
case OperationType::DisjointOp:
|
|
|
|
if (DisjointFlags.IsDisjoint)
|
|
|
|
O << " disjoint";
|
|
|
|
break;
|
2023-05-23 20:36:15 +01:00
|
|
|
case OperationType::PossiblyExactOp:
|
|
|
|
if (ExactFlags.IsExact)
|
|
|
|
O << " exact";
|
|
|
|
break;
|
|
|
|
case OperationType::OverflowingBinOp:
|
|
|
|
if (WrapFlags.HasNUW)
|
|
|
|
O << " nuw";
|
|
|
|
if (WrapFlags.HasNSW)
|
|
|
|
O << " nsw";
|
|
|
|
break;
|
|
|
|
case OperationType::FPMathOp:
|
|
|
|
getFastMathFlags().print(O);
|
|
|
|
break;
|
|
|
|
case OperationType::GEPOp:
|
|
|
|
if (GEPFlags.IsInBounds)
|
|
|
|
O << " inbounds";
|
|
|
|
break;
|
2023-12-05 09:17:23 -05:00
|
|
|
case OperationType::NonNegOp:
|
|
|
|
if (NonNegFlags.NonNeg)
|
|
|
|
O << " nneg";
|
|
|
|
break;
|
2023-05-23 20:36:15 +01:00
|
|
|
case OperationType::Other:
|
|
|
|
break;
|
|
|
|
}
|
2023-08-08 12:12:29 +01:00
|
|
|
if (getNumOperands() > 0)
|
|
|
|
O << " ";
|
2023-05-23 20:36:15 +01:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-07-09 18:46:56 -07:00
|
|
|
void VPWidenRecipe::execute(VPTransformState &State) {
|
2023-09-05 15:45:14 +01:00
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
2022-07-09 18:46:56 -07:00
|
|
|
auto &Builder = State.Builder;
|
2023-09-06 16:27:09 +01:00
|
|
|
switch (Opcode) {
|
2022-07-09 18:46:56 -07:00
|
|
|
case Instruction::Call:
|
|
|
|
case Instruction::Br:
|
|
|
|
case Instruction::PHI:
|
|
|
|
case Instruction::GetElementPtr:
|
|
|
|
case Instruction::Select:
|
|
|
|
llvm_unreachable("This instruction is handled by a different recipe.");
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::FAdd:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::FSub:
|
|
|
|
case Instruction::FNeg:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::FMul:
|
|
|
|
case Instruction::FDiv:
|
|
|
|
case Instruction::FRem:
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor: {
|
|
|
|
// Just widen unops and binops.
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
SmallVector<Value *, 2> Ops;
|
|
|
|
for (VPValue *VPOp : operands())
|
|
|
|
Ops.push_back(State.get(VPOp, Part));
|
|
|
|
|
2023-09-06 16:27:09 +01:00
|
|
|
Value *V = Builder.CreateNAryOp(Opcode, Ops);
|
2022-07-09 18:46:56 -07:00
|
|
|
|
2023-05-08 17:28:50 +01:00
|
|
|
if (auto *VecOp = dyn_cast<Instruction>(V))
|
|
|
|
setFlags(VecOp);
|
2022-07-09 18:46:56 -07:00
|
|
|
|
|
|
|
// Use this vector value for all users of the original instruction.
|
|
|
|
State.set(this, V, Part);
|
2023-09-06 16:27:09 +01:00
|
|
|
State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
|
2022-07-09 18:46:56 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case Instruction::Freeze: {
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
Value *Op = State.get(getOperand(0), Part);
|
|
|
|
|
|
|
|
Value *Freeze = Builder.CreateFreeze(Op);
|
|
|
|
State.set(this, Freeze, Part);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case Instruction::ICmp:
|
|
|
|
case Instruction::FCmp: {
|
|
|
|
// Widen compares. Generate vector compares.
|
2023-09-06 16:27:09 +01:00
|
|
|
bool FCmp = Opcode == Instruction::FCmp;
|
2022-07-09 18:46:56 -07:00
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
Value *A = State.get(getOperand(0), Part);
|
|
|
|
Value *B = State.get(getOperand(1), Part);
|
|
|
|
Value *C = nullptr;
|
|
|
|
if (FCmp) {
|
|
|
|
// Propagate fast math flags.
|
|
|
|
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
|
2023-09-06 16:27:09 +01:00
|
|
|
if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
|
|
|
|
Builder.setFastMathFlags(I->getFastMathFlags());
|
|
|
|
C = Builder.CreateFCmp(getPredicate(), A, B);
|
2022-07-09 18:46:56 -07:00
|
|
|
} else {
|
2023-09-06 16:27:09 +01:00
|
|
|
C = Builder.CreateICmp(getPredicate(), A, B);
|
2022-07-09 18:46:56 -07:00
|
|
|
}
|
|
|
|
State.set(this, C, Part);
|
2023-09-06 16:27:09 +01:00
|
|
|
State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
|
2022-07-09 18:46:56 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
// This instruction is not vectorized by simple widening.
|
2023-09-06 16:27:09 +01:00
|
|
|
LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
|
|
|
|
<< Instruction::getOpcodeName(Opcode));
|
2022-07-09 18:46:56 -07:00
|
|
|
llvm_unreachable("Unhandled instruction!");
|
|
|
|
} // end of switch.
|
2023-10-27 14:38:28 +01:00
|
|
|
|
|
|
|
#if !defined(NDEBUG)
|
|
|
|
// Verify that VPlan type inference results agree with the type of the
|
|
|
|
// generated values.
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
|
|
|
|
State.VF) == State.get(this, Part)->getType() &&
|
|
|
|
"inferred type and type from generated instructions do not match");
|
|
|
|
}
|
|
|
|
#endif
|
2022-07-09 18:46:56 -07:00
|
|
|
}
|
2023-10-27 14:38:28 +01:00
|
|
|
|
2024-08-16 21:20:23 +02:00
|
|
|
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
|
|
|
|
VPCostContext &Ctx) const {
|
|
|
|
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
|
|
|
|
switch (Opcode) {
|
|
|
|
case Instruction::FNeg: {
|
|
|
|
Type *VectorTy =
|
|
|
|
ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
|
|
|
|
return Ctx.TTI.getArithmeticInstrCost(
|
|
|
|
Opcode, VectorTy, CostKind,
|
|
|
|
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
|
|
|
|
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
|
|
|
|
}
|
|
|
|
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::URem:
|
|
|
|
// More complex computation, let the legacy cost-model handle this for now.
|
|
|
|
return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::FAdd:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::FSub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::FMul:
|
|
|
|
case Instruction::FDiv:
|
|
|
|
case Instruction::FRem:
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor: {
|
|
|
|
VPValue *RHS = getOperand(1);
|
|
|
|
// Certain instructions can be cheaper to vectorize if they have a constant
|
|
|
|
// second vector operand. One example of this is shifts on x86.
|
|
|
|
TargetTransformInfo::OperandValueInfo RHSInfo = {
|
|
|
|
TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
|
|
|
|
if (RHS->isLiveIn())
|
|
|
|
RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
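// Operands defined outside the vector loop region have the same value for all
// lanes and parts, so cost them as uniform.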
|
|
|
|
|
|
|
|
if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
|
|
|
|
getOperand(1)->isDefinedOutsideVectorRegions())
|
|
|
|
RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
|
|
|
|
Type *VectorTy =
|
|
|
|
ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
|
|
|
|
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
|
|
|
|
|
|
|
|
SmallVector<const Value *, 4> Operands;
|
|
|
|
if (CtxI)
|
|
|
|
Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
|
|
|
|
return Ctx.TTI.getArithmeticInstrCost(
|
|
|
|
Opcode, VectorTy, CostKind,
|
|
|
|
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
|
|
|
|
RHSInfo, Operands, CtxI, &Ctx.TLI);
|
|
|
|
}
|
|
|
|
case Instruction::Freeze: {
|
|
|
|
// Freeze's cost is not modeled directly; conservatively assume it costs the
// same as a 'mul'.
|
|
|
|
Type *VectorTy =
|
|
|
|
ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
|
|
|
|
return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
|
|
|
|
}
|
|
|
|
case Instruction::ICmp:
|
|
|
|
case Instruction::FCmp: {
|
|
|
|
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
|
|
|
|
Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
|
|
|
|
return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
|
|
|
|
CostKind, CtxI);
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Unsupported opcode for instruction");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-06 08:41:36 -07:00
|
|
|
void VPWidenEVLRecipe::execute(VPTransformState &State) {
|
|
|
|
unsigned Opcode = getOpcode();
|
|
|
|
// TODO: Support other opcodes
|
|
|
|
if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
|
|
|
|
llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
|
|
|
|
|
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
|
|
|
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
|
|
|
|
"explicit vector length.");
|
|
|
|
|
2024-09-06 09:12:06 -07:00
|
|
|
assert(State.get(getOperand(0), 0)->getType()->isVectorTy() &&
|
2024-09-06 08:41:36 -07:00
|
|
|
"VPWidenEVLRecipe should not be used for scalars");
|
|
|
|
|
|
|
|
VPValue *EVL = getEVL();
|
|
|
|
Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
|
|
|
|
IRBuilderBase &BuilderIR = State.Builder;
|
|
|
|
VectorBuilder Builder(BuilderIR);
|
|
|
|
Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
|
|
|
|
|
|
|
|
SmallVector<Value *, 4> Ops;
|
|
|
|
for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
|
|
|
|
VPValue *VPOp = getOperand(I);
|
|
|
|
Ops.push_back(State.get(VPOp, 0));
|
|
|
|
}
|
|
|
|
|
|
|
|
Builder.setMask(Mask).setEVL(EVLArg);
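// E.g. for Opcode == Instruction::Add this creates a call to the llvm.vp.add
// intrinsic with the all-true mask and the EVL operand set above.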
|
|
|
|
Value *VPInst =
|
|
|
|
Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
|
|
|
|
// Currently VP intrinsics only accept fast-math flags.
|
|
|
|
// TODO: Enable other flags when support is added.
|
|
|
|
if (isa<FPMathOperator>(VPInst))
|
|
|
|
setFlags(cast<Instruction>(VPInst));
|
|
|
|
|
|
|
|
State.set(this, VPInst, 0);
|
|
|
|
State.addMetadata(VPInst,
|
|
|
|
dyn_cast_or_null<Instruction>(getUnderlyingValue()));
|
|
|
|
}
|
|
|
|
|
2022-07-08 09:33:17 -07:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2022-06-28 10:34:30 +01:00
|
|
|
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
2023-09-06 16:27:09 +01:00
|
|
|
O << " = " << Instruction::getOpcodeName(Opcode);
|
2023-05-23 20:36:15 +01:00
|
|
|
printFlags(O);
|
2022-06-28 10:34:30 +01:00
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2024-09-06 08:41:36 -07:00
|
|
|
|
|
|
|
void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
2024-09-18 15:03:37 +08:00
|
|
|
O << Indent << "WIDEN ";
|
2024-09-06 08:41:36 -07:00
|
|
|
printAsOperand(O, SlotTracker);
|
2024-09-18 15:03:37 +08:00
|
|
|
O << " = vp." << Instruction::getOpcodeName(getOpcode());
|
2024-09-06 08:41:36 -07:00
|
|
|
printFlags(O);
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2023-05-05 13:20:16 +01:00
|
|
|
#endif
|
|
|
|
|
|
|
|
void VPWidenCastRecipe::execute(VPTransformState &State) {
|
2023-09-05 15:45:14 +01:00
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
2023-05-05 13:20:16 +01:00
|
|
|
auto &Builder = State.Builder;
|
|
|
|
// Vectorize casts.
|
|
|
|
assert(State.VF.isVector() && "Not vectorizing?");
|
|
|
|
Type *DestTy = VectorType::get(getResultType(), State.VF);
|
2023-12-02 16:12:38 +00:00
|
|
|
VPValue *Op = getOperand(0);
|
2023-05-05 13:20:16 +01:00
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
2023-12-02 16:12:38 +00:00
|
|
|
if (Part > 0 && Op->isLiveIn()) {
|
|
|
|
// FIXME: Remove once explicit unrolling is implemented using VPlan.
|
|
|
|
State.set(this, State.get(this, 0), Part);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
Value *A = State.get(Op, Part);
|
2023-05-05 13:20:16 +01:00
|
|
|
Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
|
|
|
|
State.set(this, Cast, Part);
|
2023-09-05 15:45:14 +01:00
|
|
|
State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
|
2023-05-05 13:20:16 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN-CAST ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = " << Instruction::getOpcodeName(Opcode) << " ";
|
2023-12-08 10:48:54 +00:00
|
|
|
printFlags(O);
|
2023-05-05 13:20:16 +01:00
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
O << " to " << *getResultType();
|
|
|
|
}
|
2023-08-20 20:59:34 +01:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/// This function adds
|
|
|
|
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
|
|
|
|
/// to each vector element of Val. The sequence starts at StartIdx.
|
|
|
|
/// \p BinOp is only relevant for FP induction variables.
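/// E.g. for Val = <0, 0, 0, 0>, StartIdx = 0 and Step = 3 the result is
/// <0, 3, 6, 9>.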
|
|
|
|
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
|
|
|
|
Instruction::BinaryOps BinOp, ElementCount VF,
|
|
|
|
IRBuilderBase &Builder) {
|
|
|
|
assert(VF.isVector() && "only vector VFs are supported");
|
|
|
|
|
|
|
|
// Create and check the types.
|
|
|
|
auto *ValVTy = cast<VectorType>(Val->getType());
|
|
|
|
ElementCount VLen = ValVTy->getElementCount();
|
|
|
|
|
|
|
|
Type *STy = Val->getType()->getScalarType();
|
|
|
|
assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
|
|
|
|
"Induction Step must be an integer or FP");
|
|
|
|
assert(Step->getType() == STy && "Step has wrong type");
|
|
|
|
|
|
|
|
SmallVector<Constant *, 8> Indices;
|
|
|
|
|
|
|
|
// Create a vector of consecutive numbers from zero to VF.
|
|
|
|
VectorType *InitVecValVTy = ValVTy;
|
|
|
|
if (STy->isFloatingPointTy()) {
|
|
|
|
Type *InitVecValSTy =
|
|
|
|
IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
|
|
|
|
InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
|
|
|
|
}
|
|
|
|
Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
|
|
|
|
|
|
|
|
// Splat the StartIdx
|
|
|
|
Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
|
|
|
|
|
|
|
|
if (STy->isIntegerTy()) {
|
|
|
|
InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
|
|
|
|
Step = Builder.CreateVectorSplat(VLen, Step);
|
|
|
|
assert(Step->getType() == Val->getType() && "Invalid step vec");
|
|
|
|
// FIXME: The newly created binary instructions should contain nsw/nuw
|
|
|
|
// flags, which can be found from the original scalar operations.
|
|
|
|
Step = Builder.CreateMul(InitVec, Step);
|
|
|
|
return Builder.CreateAdd(Val, Step, "induction");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Floating point induction.
|
|
|
|
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
|
|
|
|
"Binary Opcode should be specified for FP induction");
|
|
|
|
InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
|
|
|
|
InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
|
|
|
|
|
|
|
|
Step = Builder.CreateVectorSplat(VLen, Step);
|
|
|
|
Value *MulOp = Builder.CreateFMul(InitVec, Step);
|
|
|
|
return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
|
|
|
|
}
|
|
|
|
|
|
|
|
/// A helper function that returns an integer or floating-point constant with
|
|
|
|
/// value C.
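/// E.g. for an i32 type and C = -1 this returns i32 -1; for a float type it
/// returns the floating-point constant -1.0.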
|
|
|
|
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
|
|
|
|
return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
|
|
|
|
: ConstantFP::get(Ty, C);
|
|
|
|
}
|
|
|
|
|
|
|
|
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(!State.Instance && "Int or FP induction being replicated.");
|
|
|
|
|
|
|
|
Value *Start = getStartValue()->getLiveInIRValue();
|
|
|
|
const InductionDescriptor &ID = getInductionDescriptor();
|
|
|
|
TruncInst *Trunc = getTruncInst();
|
|
|
|
IRBuilderBase &Builder = State.Builder;
|
|
|
|
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
|
|
|
|
assert(State.VF.isVector() && "must have vector VF");
|
|
|
|
|
|
|
|
// The value from the original loop to which we are mapping the new induction
|
|
|
|
// variable.
|
|
|
|
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
|
|
|
|
|
|
|
|
// Fast-math-flags propagate from the original induction instruction.
|
|
|
|
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
|
|
|
|
if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
|
|
|
|
Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
|
|
|
|
|
|
|
|
// Now do the actual transformations, and start with fetching the step value.
|
|
|
|
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
|
|
|
|
|
|
|
|
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
|
|
|
|
"Expected either an induction phi-node or a truncate of it!");
|
|
|
|
|
|
|
|
// Construct the initial value of the vector IV in the vector loop preheader
|
|
|
|
auto CurrIP = Builder.saveIP();
|
|
|
|
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
|
|
|
|
Builder.SetInsertPoint(VectorPH->getTerminator());
|
|
|
|
if (isa<TruncInst>(EntryVal)) {
|
|
|
|
assert(Start->getType()->isIntegerTy() &&
|
|
|
|
"Truncation requires an integer type");
|
|
|
|
auto *TruncType = cast<IntegerType>(EntryVal->getType());
|
|
|
|
Step = Builder.CreateTrunc(Step, TruncType);
|
|
|
|
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
|
|
|
|
}
|
|
|
|
|
|
|
|
Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
|
|
|
|
Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
|
|
|
|
Value *SteppedStart = getStepVector(
|
|
|
|
SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
|
|
|
|
|
|
|
|
// We create vector phi nodes for both integer and floating-point induction
|
|
|
|
// variables. Here, we determine the kind of arithmetic we will perform.
|
|
|
|
Instruction::BinaryOps AddOp;
|
|
|
|
Instruction::BinaryOps MulOp;
|
|
|
|
if (Step->getType()->isIntegerTy()) {
|
|
|
|
AddOp = Instruction::Add;
|
|
|
|
MulOp = Instruction::Mul;
|
|
|
|
} else {
|
|
|
|
AddOp = ID.getInductionOpcode();
|
|
|
|
MulOp = Instruction::FMul;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Multiply the vectorization factor by the step using integer or
|
|
|
|
// floating-point arithmetic as appropriate.
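// E.g. with VF = 4 and Step = 2, Mul is 8 and SplatVF is <8, 8, 8, 8>; each
// step.add below advances the vector IV by VF * Step.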
|
|
|
|
Type *StepType = Step->getType();
|
2024-09-10 10:41:35 +01:00
|
|
|
Value *RuntimeVF = State.get(getVFValue(), {0, 0});
|
2023-08-20 20:59:34 +01:00
|
|
|
if (Step->getType()->isFloatingPointTy())
|
2024-09-10 10:41:35 +01:00
|
|
|
RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
|
2023-08-20 20:59:34 +01:00
|
|
|
else
|
2024-09-10 10:41:35 +01:00
|
|
|
RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
|
2023-08-20 20:59:34 +01:00
|
|
|
Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
|
|
|
|
|
|
|
|
// Create a vector splat to use in the induction update.
|
|
|
|
//
|
|
|
|
// FIXME: If the step is non-constant, we create the vector splat with
|
|
|
|
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
|
|
|
|
// handle a constant vector splat.
|
|
|
|
Value *SplatVF = isa<Constant>(Mul)
|
|
|
|
? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
|
|
|
|
: Builder.CreateVectorSplat(State.VF, Mul);
|
|
|
|
Builder.restoreIP(CurrIP);
|
|
|
|
|
|
|
|
// We may need to add the step a number of times, depending on the unroll
|
|
|
|
// factor. The last of those goes into the PHI.
|
2023-09-11 11:32:51 +01:00
|
|
|
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
|
|
|
|
VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
|
2023-08-20 20:59:34 +01:00
|
|
|
VecInd->setDebugLoc(EntryVal->getDebugLoc());
|
|
|
|
Instruction *LastInduction = VecInd;
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
State.set(this, LastInduction, Part);
|
|
|
|
|
|
|
|
if (isa<TruncInst>(EntryVal))
|
|
|
|
State.addMetadata(LastInduction, EntryVal);
|
|
|
|
|
|
|
|
LastInduction = cast<Instruction>(
|
|
|
|
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
|
|
|
|
LastInduction->setDebugLoc(EntryVal->getDebugLoc());
|
|
|
|
}
|
|
|
|
|
|
|
|
LastInduction->setName("vec.ind.next");
|
|
|
|
VecInd->addIncoming(SteppedStart, VectorPH);
|
|
|
|
// Add induction update using an incorrect block temporarily. The phi node
|
|
|
|
// will be fixed after VPlan execution. Note that at this point the latch
|
|
|
|
// block cannot be used, as it does not exist yet.
|
|
|
|
// TODO: Model increment value in VPlan, by turning the recipe into a
|
|
|
|
// multi-def and a subclass of VPHeaderPHIRecipe.
|
|
|
|
VecInd->addIncoming(LastInduction, VectorPH);
|
|
|
|
}
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2023-08-20 20:59:34 +01:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2022-06-28 10:34:30 +01:00
|
|
|
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN-INDUCTION";
|
|
|
|
if (getTruncInst()) {
|
|
|
|
O << "\\l\"";
|
|
|
|
O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
|
|
|
|
O << " +\n" << Indent << "\" ";
|
|
|
|
getVPValue(0)->printAsOperand(O, SlotTracker);
|
|
|
|
} else
|
|
|
|
O << " " << VPlanIngredient(IV);
|
|
|
|
|
|
|
|
O << ", ";
|
|
|
|
getStepValue()->printAsOperand(O, SlotTracker);
|
2024-09-10 10:41:35 +01:00
|
|
|
|
|
|
|
O << ", ";
|
|
|
|
getVFValue()->printAsOperand(O, SlotTracker);
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
|
2023-04-16 14:48:02 +01:00
|
|
|
// The step may be defined by a recipe in the preheader (e.g. if it requires
|
|
|
|
// SCEV expansion), but for the canonical induction the step is required to be
|
|
|
|
// 1, which is represented as live-in.
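// E.g. an induction that starts at 0, steps by a constant 1 and has the same
// type as the canonical IV is canonical.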
|
|
|
|
if (getStepValue()->getDefiningRecipe())
|
|
|
|
return false;
|
|
|
|
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
|
2022-06-28 10:34:30 +01:00
|
|
|
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
|
2024-05-03 13:09:42 +01:00
|
|
|
auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
|
|
|
|
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
|
|
|
|
getScalarType() == CanIV->getScalarType();
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
|
|
|
|
2022-11-30 17:04:19 +00:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent;
|
|
|
|
printAsOperand(O, SlotTracker);
|
2024-08-26 21:23:05 +08:00
|
|
|
O << " = DERIVED-IV ";
|
2022-11-30 17:04:19 +00:00
|
|
|
getStartValue()->printAsOperand(O, SlotTracker);
|
|
|
|
O << " + ";
|
2024-04-04 18:30:17 -04:00
|
|
|
getOperand(1)->printAsOperand(O, SlotTracker);
|
2022-11-30 17:04:19 +00:00
|
|
|
O << " * ";
|
|
|
|
getStepValue()->printAsOperand(O, SlotTracker);
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
2022-11-30 17:04:19 +00:00
|
|
|
#endif
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2023-08-20 20:59:34 +01:00
|
|
|
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
|
|
|
|
// Fast-math-flags propagate from the original induction instruction.
|
|
|
|
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
|
2023-09-08 15:46:12 +01:00
|
|
|
if (hasFastMathFlags())
|
|
|
|
State.Builder.setFastMathFlags(getFastMathFlags());
|
2023-08-20 20:59:34 +01:00
|
|
|
|
|
|
|
// Compute scalar induction steps. BaseIV (the first operand) is the scalar
|
|
|
|
// induction variable on which to base the steps; Step is the size of each step.
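// E.g. for an integer IV, the scalar value generated for lane L of part P is
// BaseIV + (P * VF + L) * Step.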
|
|
|
|
|
|
|
|
Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
|
|
|
|
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
|
|
|
|
IRBuilderBase &Builder = State.Builder;
|
|
|
|
|
|
|
|
// Ensure step has the same type as that of scalar IV.
|
|
|
|
Type *BaseIVTy = BaseIV->getType()->getScalarType();
|
2024-01-26 11:13:05 +00:00
|
|
|
assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
|
2023-08-20 20:59:34 +01:00
|
|
|
|
|
|
|
// We build scalar steps for both integer and floating-point induction
|
|
|
|
// variables. Here, we determine the kind of arithmetic we will perform.
|
|
|
|
Instruction::BinaryOps AddOp;
|
|
|
|
Instruction::BinaryOps MulOp;
|
|
|
|
if (BaseIVTy->isIntegerTy()) {
|
|
|
|
AddOp = Instruction::Add;
|
|
|
|
MulOp = Instruction::Mul;
|
|
|
|
} else {
|
2023-09-08 15:46:12 +01:00
|
|
|
AddOp = InductionOpcode;
|
2023-08-20 20:59:34 +01:00
|
|
|
MulOp = Instruction::FMul;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Determine the number of scalars we need to generate for each unroll
|
|
|
|
// iteration.
|
|
|
|
bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
|
|
|
|
// Compute the scalar steps and save the results in State.
|
|
|
|
Type *IntStepTy =
|
|
|
|
IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
|
|
|
|
Type *VecIVTy = nullptr;
|
|
|
|
Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
|
|
|
|
if (!FirstLaneOnly && State.VF.isScalable()) {
|
|
|
|
VecIVTy = VectorType::get(BaseIVTy, State.VF);
|
|
|
|
UnitStepVec =
|
|
|
|
Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
|
|
|
|
SplatStep = Builder.CreateVectorSplat(State.VF, Step);
|
|
|
|
SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned StartPart = 0;
|
|
|
|
unsigned EndPart = State.UF;
|
|
|
|
unsigned StartLane = 0;
|
|
|
|
unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
|
|
|
|
if (State.Instance) {
|
|
|
|
StartPart = State.Instance->Part;
|
|
|
|
EndPart = StartPart + 1;
|
|
|
|
StartLane = State.Instance->Lane.getKnownLane();
|
|
|
|
EndLane = StartLane + 1;
|
|
|
|
}
|
|
|
|
for (unsigned Part = StartPart; Part < EndPart; ++Part) {
|
|
|
|
Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
|
|
|
|
|
|
|
|
if (!FirstLaneOnly && State.VF.isScalable()) {
|
|
|
|
auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
|
|
|
|
auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
|
|
|
|
if (BaseIVTy->isFloatingPointTy())
|
|
|
|
InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
|
|
|
|
auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
|
|
|
|
auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
|
|
|
|
State.set(this, Add, Part);
|
|
|
|
// It's useful to record the lane values too for the known minimum number
|
|
|
|
// of elements so we do those below. This improves the code quality when
|
|
|
|
// trying to extract the first element, for example.
|
|
|
|
}
|
|
|
|
|
|
|
|
if (BaseIVTy->isFloatingPointTy())
|
|
|
|
StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
|
|
|
|
|
|
|
|
for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
|
|
|
|
Value *StartIdx = Builder.CreateBinOp(
|
|
|
|
AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
|
|
|
|
// The step returned by `createStepForVF` is a runtime-evaluated value
|
|
|
|
// when VF is scalable. Otherwise, it should be folded into a Constant.
|
|
|
|
assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
|
|
|
|
"Expected StartIdx to be folded to a constant when VF is not "
|
|
|
|
"scalable");
|
|
|
|
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
|
|
|
|
auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
|
|
|
|
State.set(this, Add, VPIteration(Part, Lane));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-28 10:34:30 +01:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent;
|
|
|
|
printAsOperand(O, SlotTracker);
|
2023-09-17 10:15:51 +01:00
|
|
|
O << " = SCALAR-STEPS ";
|
2022-06-28 10:34:30 +01:00
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2022-07-10 17:10:17 -07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
void VPWidenGEPRecipe::execute(VPTransformState &State) {
|
2023-05-06 09:25:46 +01:00
|
|
|
assert(State.VF.isVector() && "not widening");
|
2022-07-10 17:10:17 -07:00
|
|
|
auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
|
|
|
|
// Construct a vector GEP by widening the operands of the scalar GEP as
|
|
|
|
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
|
|
|
|
// results in a vector of pointers when at least one operand of the GEP
|
|
|
|
// is vector-typed. Thus, to keep the representation compact, we only use
|
|
|
|
// vector-typed operands for loop-varying values.
|
|
|
|
|
2023-05-06 09:25:46 +01:00
|
|
|
if (areAllOperandsInvariant()) {
|
2022-07-10 17:10:17 -07:00
|
|
|
// If we are vectorizing, but the GEP has only loop-invariant operands,
|
|
|
|
// the GEP we build (by only using vector-typed operands for
|
|
|
|
// loop-varying values) would be a scalar pointer. Thus, to ensure we
|
|
|
|
// produce a vector of pointers, we need to either arbitrarily pick an
|
|
|
|
// operand to broadcast, or broadcast a clone of the original GEP.
|
|
|
|
// Here, we broadcast a clone of the original.
|
|
|
|
//
|
|
|
|
// TODO: If at some point we decide to scalarize instructions having
|
|
|
|
// loop-invariant operands, this special case will no longer be
|
|
|
|
// required. We would add the scalarization decision to
|
|
|
|
// collectLoopScalars() and teach getVectorValue() to broadcast
|
|
|
|
// the lane-zero scalar value.
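// E.g. a GEP whose base pointer and indices are all loop-invariant is emitted
// once as a scalar GEP, and the result is splatted into a vector of pointers
// for each part.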
|
2023-06-16 16:14:01 +01:00
|
|
|
SmallVector<Value *> Ops;
|
|
|
|
for (unsigned I = 0, E = getNumOperands(); I != E; I++)
|
|
|
|
Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
|
|
|
|
|
|
|
|
auto *NewGEP =
|
|
|
|
State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
|
|
|
|
ArrayRef(Ops).drop_front(), "", isInBounds());
|
2022-07-10 17:10:17 -07:00
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
2023-06-16 16:14:01 +01:00
|
|
|
Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
|
2022-07-10 17:10:17 -07:00
|
|
|
State.set(this, EntryPart, Part);
|
|
|
|
State.addMetadata(EntryPart, GEP);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// If the GEP has at least one loop-varying operand, we are sure to
|
|
|
|
// produce a vector of pointers. But if we are only unrolling, we want
|
|
|
|
// to produce a scalar GEP for each unroll part. Thus, the GEP we
|
|
|
|
// produce with the code below will be scalar (if VF == 1) or vector
|
|
|
|
// (otherwise). Note that for the unroll-only case, we still maintain
|
|
|
|
// values in the vector mapping with initVector, as we do for other
|
|
|
|
// instructions.
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
// The pointer operand of the new GEP. If it's loop-invariant, we
|
|
|
|
// won't broadcast it.
|
2023-03-09 17:52:22 +01:00
|
|
|
auto *Ptr = isPointerLoopInvariant()
|
2022-07-10 17:10:17 -07:00
|
|
|
? State.get(getOperand(0), VPIteration(0, 0))
|
|
|
|
: State.get(getOperand(0), Part);
|
|
|
|
|
|
|
|
// Collect all the indices for the new GEP. If any index is
|
|
|
|
// loop-invariant, we won't broadcast it.
|
|
|
|
SmallVector<Value *, 4> Indices;
|
|
|
|
for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
|
|
|
|
VPValue *Operand = getOperand(I);
|
2023-03-09 17:52:22 +01:00
|
|
|
if (isIndexLoopInvariant(I - 1))
|
2022-07-10 17:10:17 -07:00
|
|
|
Indices.push_back(State.get(Operand, VPIteration(0, 0)));
|
|
|
|
else
|
|
|
|
Indices.push_back(State.get(Operand, Part));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
|
|
|
|
// but it should be a vector, otherwise.
|
|
|
|
auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
|
2023-05-09 15:17:20 +01:00
|
|
|
Indices, "", isInBounds());
|
2022-07-10 17:10:17 -07:00
|
|
|
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
|
|
|
|
"NewGEP is not a pointer vector");
|
|
|
|
State.set(this, NewGEP, Part);
|
|
|
|
State.addMetadata(NewGEP, GEP);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2022-07-10 17:10:17 -07:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2022-06-28 10:34:30 +01:00
|
|
|
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN-GEP ";
|
2023-03-09 17:52:22 +01:00
|
|
|
O << (isPointerLoopInvariant() ? "Inv" : "Var");
|
|
|
|
for (size_t I = 0; I < getNumOperands() - 1; ++I)
|
|
|
|
O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
|
2022-06-28 10:34:30 +01:00
|
|
|
|
|
|
|
O << " ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
2023-05-23 20:36:15 +01:00
|
|
|
O << " = getelementptr";
|
|
|
|
printFlags(O);
|
2022-06-28 10:34:30 +01:00
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2022-07-11 16:01:04 -07:00
|
|
|
#endif
|
|
|
|
|
2024-01-01 19:51:15 +00:00
|
|
|
void VPVectorPointerRecipe ::execute(VPTransformState &State) {
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
// Calculate the pointer for the specific unroll-part.
|
|
|
|
Value *PartPtr = nullptr;
|
|
|
|
// Use i32 for the gep index type when the value is constant,
|
|
|
|
// or query DataLayout for a more suitable index type otherwise.
|
|
|
|
const DataLayout &DL =
|
2024-06-27 16:38:15 +02:00
|
|
|
Builder.GetInsertBlock()->getDataLayout();
|
2024-01-01 19:51:15 +00:00
|
|
|
Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
|
|
|
|
? DL.getIndexType(IndexedTy->getPointerTo())
|
|
|
|
: Builder.getInt32Ty();
|
|
|
|
Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
|
2024-01-06 12:30:42 +00:00
|
|
|
bool InBounds = isInBounds();
|
2024-01-01 19:51:15 +00:00
|
|
|
if (IsReverse) {
|
|
|
|
// If the address is consecutive but reversed, then the
|
|
|
|
// wide store needs to start at the last vector element.
|
|
|
|
// RunTimeVF = VScale * VF.getKnownMinValue()
|
|
|
|
// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
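// E.g. for fixed-width VF = 4 and Part = 1: NumElt = -4 and LastLane = -3, so
// the part pointer is Ptr - 7, the lowest address accessed by that reversed
// part.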
|
|
|
|
Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
|
|
|
|
// NumElt = -Part * RunTimeVF
|
|
|
|
Value *NumElt = Builder.CreateMul(
|
|
|
|
ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
|
|
|
|
// LastLane = 1 - RunTimeVF
|
|
|
|
Value *LastLane =
|
|
|
|
Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
|
|
|
|
PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
|
|
|
|
PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
|
|
|
|
} else {
|
|
|
|
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
|
|
|
|
PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
|
|
|
|
}
|
|
|
|
|
2024-02-26 19:06:43 +00:00
|
|
|
State.set(this, PartPtr, Part, /*IsScalar*/ true);
|
2024-01-01 19:51:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent;
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = vector-pointer ";
|
|
|
|
if (IsReverse)
|
|
|
|
O << "(reverse) ";
|
|
|
|
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-07-11 16:01:04 -07:00
|
|
|
void VPBlendRecipe::execute(VPTransformState &State) {
|
2024-08-21 12:51:40 +01:00
|
|
|
assert(isNormalized() && "Expected blend to be normalized!");
|
2023-09-05 15:45:14 +01:00
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
2022-07-11 16:01:04 -07:00
|
|
|
// We know that all PHIs in non-header blocks are converted into
|
|
|
|
// selects, so we don't have to worry about the insertion order and we
|
|
|
|
// can just use the builder.
|
|
|
|
// At this point we generate the predication tree. There may be
|
|
|
|
// duplications since this is a simple recursive scan, but future
|
|
|
|
// optimizations will clean it up.
|
|
|
|
|
|
|
|
unsigned NumIncoming = getNumIncomingValues();
|
|
|
|
|
|
|
|
// Generate a sequence of selects of the form:
|
|
|
|
// SELECT(Mask3, In3,
|
|
|
|
// SELECT(Mask2, In2,
|
|
|
|
// SELECT(Mask1, In1,
|
|
|
|
// In0)))
|
|
|
|
// Note that Mask0 is never used: lanes for which no path reaches this phi,
|
|
|
|
// and which are therefore essentially undef, take their value from In0.
|
2024-08-13 15:08:04 +00:00
|
|
|
VectorParts Entry(State.UF);
|
|
|
|
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
|
|
|
|
for (unsigned In = 0; In < NumIncoming; ++In) {
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
// We might have single edge PHIs (blocks) - use an identity
|
|
|
|
// 'select' for the first PHI operand.
|
|
|
|
Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
|
|
|
|
if (In == 0)
|
|
|
|
Entry[Part] = In0; // Initialize with the first incoming value.
|
|
|
|
else {
|
|
|
|
// Select between the current value and the previous incoming edge
|
|
|
|
// based on the incoming mask.
|
|
|
|
Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
|
|
|
|
Entry[Part] =
|
|
|
|
State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-11 16:01:04 -07:00
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part)
|
2024-05-15 11:00:14 +01:00
|
|
|
State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
|
2022-07-11 16:01:04 -07:00
|
|
|
}
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2022-07-11 16:01:04 -07:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2022-06-28 10:34:30 +01:00
|
|
|
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "BLEND ";
|
2023-09-04 12:35:57 +01:00
|
|
|
printAsOperand(O, SlotTracker);
|
2022-06-28 10:34:30 +01:00
|
|
|
O << " =";
|
|
|
|
if (getNumIncomingValues() == 1) {
|
|
|
|
// Not a User of any mask: not really blending, this is a
|
|
|
|
// single-predecessor phi.
|
|
|
|
O << " ";
|
|
|
|
getIncomingValue(0)->printAsOperand(O, SlotTracker);
|
|
|
|
} else {
|
|
|
|
for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
|
|
|
|
O << " ";
|
|
|
|
getIncomingValue(I)->printAsOperand(O, SlotTracker);
|
2024-04-09 11:14:05 +01:00
|
|
|
if (I == 0)
|
|
|
|
continue;
|
2022-06-28 10:34:30 +01:00
|
|
|
O << "/";
|
|
|
|
getMask(I)->printAsOperand(O, SlotTracker);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-04-07 20:32:53 +01:00
|
|
|
#endif
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2024-04-07 20:32:53 +01:00
|
|
|
void VPReductionRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(!State.Instance && "Reduction being replicated.");
|
|
|
|
Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
|
|
|
|
RecurKind Kind = RdxDesc.getRecurrenceKind();
|
|
|
|
// Propagate the fast-math flags carried by the underlying instruction.
|
|
|
|
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
|
|
|
|
State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
Value *NewVecOp = State.get(getVecOp(), Part);
|
|
|
|
if (VPValue *Cond = getCondOp()) {
|
|
|
|
Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
|
|
|
|
VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
|
|
|
|
Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
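// For predicated reductions, inactive lanes are replaced below with the
// reduction's start value (for any-of reductions) or its identity element,
// so they do not affect the reduced result.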
|
|
|
|
|
2024-08-30 11:05:07 -07:00
|
|
|
Value *Start;
|
|
|
|
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
|
|
|
|
Start = RdxDesc.getRecurrenceStartValue();
|
|
|
|
else
|
2024-08-30 17:13:51 -07:00
|
|
|
Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
|
|
|
|
RdxDesc.getFastMathFlags());
|
2024-08-30 11:05:07 -07:00
|
|
|
if (State.VF.isVector())
|
|
|
|
Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(),
|
|
|
|
Start);
|
|
|
|
|
|
|
|
Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
|
2024-04-07 20:32:53 +01:00
|
|
|
NewVecOp = Select;
|
|
|
|
}
|
|
|
|
Value *NewRed;
|
|
|
|
Value *NextInChain;
|
|
|
|
if (IsOrdered) {
|
|
|
|
if (State.VF.isVector())
|
|
|
|
NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
|
|
|
|
PrevInChain);
|
|
|
|
else
|
|
|
|
NewRed = State.Builder.CreateBinOp(
|
|
|
|
(Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
|
|
|
|
NewVecOp);
|
|
|
|
PrevInChain = NewRed;
|
2024-08-30 12:44:02 -07:00
|
|
|
NextInChain = NewRed;
|
2024-04-07 20:32:53 +01:00
|
|
|
} else {
|
|
|
|
PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
|
2024-09-03 16:49:42 -07:00
|
|
|
NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
|
2024-08-30 12:44:02 -07:00
|
|
|
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
|
|
|
|
NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
|
|
|
|
NewRed, PrevInChain);
|
|
|
|
else
|
|
|
|
NextInChain = State.Builder.CreateBinOp(
|
|
|
|
(Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed,
|
|
|
|
PrevInChain);
|
2024-04-07 20:32:53 +01:00
|
|
|
}
|
|
|
|
State.set(this, NextInChain, Part, /*IsScalar*/ true);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-07-16 16:15:24 +08:00
|
|
|
void VPReductionEVLRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(!State.Instance && "Reduction being replicated.");
|
|
|
|
assert(State.UF == 1 &&
|
|
|
|
"Expected only UF == 1 when vectorizing with explicit vector length.");
|
|
|
|
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
// Propagate the fast-math flags carried by the underlying instruction.
|
|
|
|
IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
|
|
|
|
const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
|
|
|
|
Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
|
|
|
|
|
|
|
|
RecurKind Kind = RdxDesc.getRecurrenceKind();
|
|
|
|
Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
|
|
|
|
Value *VecOp = State.get(getVecOp(), 0);
|
|
|
|
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
|
|
|
|
|
|
|
|
VectorBuilder VBuilder(Builder);
|
|
|
|
VBuilder.setEVL(EVL);
|
|
|
|
Value *Mask;
|
|
|
|
// TODO: move the all-true mask generation into VectorBuilder.
|
|
|
|
if (VPValue *CondOp = getCondOp())
|
|
|
|
Mask = State.get(CondOp, 0);
|
|
|
|
else
|
|
|
|
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
|
|
|
|
VBuilder.setMask(Mask);
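// E.g. an unordered fadd reduction is emitted as a call to
// llvm.vp.reduce.fadd using the mask and EVL configured above.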
|
|
|
|
|
|
|
|
Value *NewRed;
|
|
|
|
if (isOrdered()) {
|
|
|
|
NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
|
|
|
|
} else {
|
2024-09-03 16:49:42 -07:00
|
|
|
NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
|
2024-07-16 16:15:24 +08:00
|
|
|
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
|
|
|
|
NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
|
|
|
|
else
|
|
|
|
NewRed = Builder.CreateBinOp(
|
|
|
|
(Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
|
|
|
|
}
|
|
|
|
State.set(this, NewRed, 0, /*IsScalar*/ true);
|
|
|
|
}
|
|
|
|
|
2024-04-07 20:32:53 +01:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2022-06-28 10:34:30 +01:00
|
|
|
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "REDUCE ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = ";
|
|
|
|
getChainOp()->printAsOperand(O, SlotTracker);
|
|
|
|
O << " +";
|
|
|
|
if (isa<FPMathOperator>(getUnderlyingInstr()))
|
|
|
|
O << getUnderlyingInstr()->getFastMathFlags();
|
2023-08-16 01:01:01 -07:00
|
|
|
O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
|
2022-06-28 10:34:30 +01:00
|
|
|
getVecOp()->printAsOperand(O, SlotTracker);
|
2024-07-16 16:15:24 +08:00
|
|
|
if (isConditional()) {
|
|
|
|
O << ", ";
|
|
|
|
getCondOp()->printAsOperand(O, SlotTracker);
|
|
|
|
}
|
|
|
|
O << ")";
|
|
|
|
if (RdxDesc.IntermediateStore)
|
|
|
|
O << " (with final reduction value stored in invariant address sank "
|
|
|
|
"outside of loop)";
|
|
|
|
}
|
|
|
|
|
|
|
|
void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
|
|
|
|
O << Indent << "REDUCE ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = ";
|
|
|
|
getChainOp()->printAsOperand(O, SlotTracker);
|
|
|
|
O << " +";
|
|
|
|
if (isa<FPMathOperator>(getUnderlyingInstr()))
|
|
|
|
O << getUnderlyingInstr()->getFastMathFlags();
|
|
|
|
O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
|
|
|
|
getVecOp()->printAsOperand(O, SlotTracker);
|
|
|
|
O << ", ";
|
|
|
|
getEVL()->printAsOperand(O, SlotTracker);
|
|
|
|
if (isConditional()) {
|
2022-06-28 10:34:30 +01:00
|
|
|
O << ", ";
|
|
|
|
getCondOp()->printAsOperand(O, SlotTracker);
|
|
|
|
}
|
|
|
|
O << ")";
|
2023-08-16 01:01:01 -07:00
|
|
|
if (RdxDesc.IntermediateStore)
|
2022-06-28 10:34:30 +01:00
|
|
|
O << " (with final reduction value stored in invariant address sank "
|
|
|
|
"outside of loop)";
|
|
|
|
}
|
2023-02-20 10:53:45 +00:00
|
|
|
#endif
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2023-02-20 10:28:24 +00:00
|
|
|
bool VPReplicateRecipe::shouldPack() const {
|
|
|
|
// Find if the recipe is used by a widened recipe via an intervening
|
|
|
|
// VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
|
|
|
|
return any_of(users(), [](const VPUser *U) {
|
|
|
|
if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
|
2023-02-20 14:11:18 +00:00
|
|
|
return any_of(PredR->users(), [PredR](const VPUser *U) {
|
|
|
|
return !U->usesScalars(PredR);
|
|
|
|
});
|
2023-02-20 10:28:24 +00:00
|
|
|
return false;
|
|
|
|
});
|
|
|
|
}
|
2023-02-20 10:53:45 +00:00
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2022-06-28 10:34:30 +01:00
|
|
|
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
|
|
|
|
|
|
|
|
if (!getUnderlyingInstr()->getType()->isVoidTy()) {
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = ";
|
|
|
|
}
|
|
|
|
if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
|
2023-05-23 20:36:15 +01:00
|
|
|
O << "call";
|
|
|
|
printFlags(O);
|
|
|
|
O << "@" << CB->getCalledFunction()->getName() << "(";
|
2022-06-28 10:34:30 +01:00
|
|
|
interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
|
|
|
|
O, [&O, &SlotTracker](VPValue *Op) {
|
|
|
|
Op->printAsOperand(O, SlotTracker);
|
|
|
|
});
|
|
|
|
O << ")";
|
|
|
|
} else {
|
2023-05-23 20:36:15 +01:00
|
|
|
O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
|
|
|
|
printFlags(O);
|
2022-06-28 10:34:30 +01:00
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
|
|
|
|
2023-02-20 10:28:24 +00:00
|
|
|
if (shouldPack())
|
2022-06-28 10:34:30 +01:00
|
|
|
O << " (S->V)";
|
|
|
|
}
|
2022-07-13 14:39:59 -07:00
|
|
|
#endif
|
|
|
|
|
2024-01-26 11:13:05 +00:00
|
|
|
/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
|
|
|
|
/// if it is either defined outside the vector region or its operand is known to
|
|
|
|
/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
|
|
|
|
/// TODO: Uniformity should be associated with a VPValue and there should be a
|
|
|
|
/// generic way to check.
|
|
|
|
static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
|
|
|
|
return C->isDefinedOutsideVectorRegions() ||
|
|
|
|
isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
|
|
|
|
isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
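// Generate the scalar cast for the first lane only. A typical use is
// truncating the canonical induction to a narrower IV type, roughly:
//   %cast = trunc i64 %index to i32
// (illustrative only; SExt/ZExt are handled below but currently unused).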
Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
|
|
|
|
assert(vputils::onlyFirstLaneUsed(this) &&
|
|
|
|
"Codegen only implemented for first lane.");
|
|
|
|
switch (Opcode) {
|
|
|
|
case Instruction::SExt:
|
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::Trunc: {
|
|
|
|
// Note: SExt/ZExt not used yet.
|
|
|
|
Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
|
|
|
|
return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
llvm_unreachable("opcode not implemented yet");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void VPScalarCastRecipe::execute(VPTransformState &State) {
|
|
|
|
bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
|
|
|
|
for (unsigned Part = 0; Part != State.UF; ++Part) {
|
|
|
|
Value *Res;
|
|
|
|
// Only generate a single instance if the recipe is uniform across UFs and
|
|
|
|
// VFs.
|
|
|
|
if (Part > 0 && IsUniformAcrossVFsAndUFs)
|
|
|
|
Res = State.get(this, VPIteration(0, 0));
|
|
|
|
else
|
|
|
|
Res = generate(State, Part);
|
|
|
|
State.set(this, Res, VPIteration(Part, 0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "SCALAR-CAST ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = " << Instruction::getOpcodeName(Opcode) << " ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
O << " to " << *ResultTy;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-07-13 14:39:59 -07:00
|
|
|
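// Emit the branch guarding a predicated region: extract the mask bit for the
// current lane (or use 'true' when there is no mask) and branch on it. The
// branch successors are filled in later, once the predicated blocks exist.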
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(State.Instance && "Branch on Mask works only on single instance.");
|
|
|
|
|
|
|
|
unsigned Part = State.Instance->Part;
|
|
|
|
unsigned Lane = State.Instance->Lane.getKnownLane();
|
|
|
|
|
|
|
|
Value *ConditionBit = nullptr;
|
|
|
|
VPValue *BlockInMask = getMask();
|
|
|
|
if (BlockInMask) {
|
|
|
|
ConditionBit = State.get(BlockInMask, Part);
|
|
|
|
if (ConditionBit->getType()->isVectorTy())
|
|
|
|
ConditionBit = State.Builder.CreateExtractElement(
|
|
|
|
ConditionBit, State.Builder.getInt32(Lane));
|
|
|
|
} else // Block in mask is all-one.
|
|
|
|
ConditionBit = State.Builder.getTrue();
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2022-07-13 14:39:59 -07:00
|
|
|
// Replace the temporary unreachable terminator with a new conditional branch,
|
|
|
|
// whose two destinations will be set later when they are created.
|
|
|
|
auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
|
|
|
|
assert(isa<UnreachableInst>(CurrentTerminator) &&
|
|
|
|
"Expected to replace unreachable terminator with conditional branch.");
|
|
|
|
auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
|
|
|
|
CondBr->setSuccessor(0, nullptr);
|
|
|
|
ReplaceInstWithInst(CurrentTerminator, CondBr);
|
|
|
|
}
|
|
|
|
|
2022-07-17 11:34:23 +01:00
|
|
|
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(State.Instance && "Predicated instruction PHI works per instance.");
|
|
|
|
Instruction *ScalarPredInst =
|
|
|
|
cast<Instruction>(State.get(getOperand(0), *State.Instance));
|
|
|
|
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
|
|
|
|
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
|
|
|
|
assert(PredicatingBB && "Predicated block has no single predecessor.");
|
|
|
|
assert(isa<VPReplicateRecipe>(getOperand(0)) &&
|
|
|
|
"operand must be VPReplicateRecipe");
|
|
|
|
|
|
|
|
// By current pack/unpack logic we need to generate only a single phi node: if
|
|
|
|
// a vector value for the predicated instruction exists at this point it means
|
|
|
|
// the instruction has vector users only, and a phi for the vector value is
|
|
|
|
// needed. In this case the recipe of the predicated instruction is marked to
|
|
|
|
// also do that packing, thereby "hoisting" the insert-element sequence.
|
|
|
|
// Otherwise, a phi node for the scalar value is needed.
|
|
|
|
unsigned Part = State.Instance->Part;
|
|
|
|
if (State.hasVectorValue(getOperand(0), Part)) {
|
|
|
|
Value *VectorValue = State.get(getOperand(0), Part);
|
|
|
|
InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
|
|
|
|
PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
|
|
|
|
VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
|
|
|
|
VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
|
|
|
|
if (State.hasVectorValue(this, Part))
|
|
|
|
State.reset(this, VPhi, Part);
|
|
|
|
else
|
|
|
|
State.set(this, VPhi, Part);
|
|
|
|
// NOTE: Currently we need to update the value of the operand, so the next
|
|
|
|
// predicated iteration inserts its generated value in the correct vector.
|
|
|
|
State.reset(getOperand(0), VPhi, Part);
|
|
|
|
} else {
|
|
|
|
Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
|
|
|
|
PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
|
|
|
|
Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
|
|
|
|
PredicatingBB);
|
|
|
|
Phi->addIncoming(ScalarPredInst, PredicatedBB);
|
|
|
|
if (State.hasScalarValue(this, *State.Instance))
|
|
|
|
State.reset(this, Phi, *State.Instance);
|
|
|
|
else
|
|
|
|
State.set(this, Phi, *State.Instance);
|
|
|
|
// NOTE: Currently we need to update the value of the operand, so the next
|
|
|
|
// predicated iteration inserts its generated value in the correct vector.
|
|
|
|
State.reset(getOperand(0), Phi, *State.Instance);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-13 14:39:59 -07:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2022-06-28 10:34:30 +01:00
|
|
|
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "PHI-PREDICATED-INSTRUCTION ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2024-08-11 12:01:17 +01:00
|
|
|
#endif
|
|
|
|
|
2024-09-04 09:46:02 +08:00
|
|
|
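// Cost model for widened memory recipes: non-consecutive accesses are costed
// as gather/scatter, consecutive accesses as (masked) wide loads or stores,
// plus a reverse shuffle for reversed accesses.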
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
|
|
|
|
VPCostContext &Ctx) const {
|
|
|
|
Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
|
|
|
|
const Align Alignment =
|
|
|
|
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
|
|
|
|
unsigned AS =
|
|
|
|
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
|
|
|
|
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
|
|
|
|
|
|
|
|
if (!Consecutive) {
|
|
|
|
// TODO: Using the original IR may not be accurate.
|
|
|
|
// Currently, ARM will use the underlying IR to calculate gather/scatter
|
|
|
|
// instruction cost.
|
|
|
|
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
|
|
|
|
assert(!Reverse &&
|
|
|
|
"Inconsecutive memory access should not have the order.");
|
|
|
|
return Ctx.TTI.getAddressComputationCost(Ty) +
|
|
|
|
Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
|
|
|
|
IsMasked, Alignment, CostKind,
|
|
|
|
&Ingredient);
|
|
|
|
}
|
|
|
|
|
|
|
|
InstructionCost Cost = 0;
|
|
|
|
if (IsMasked) {
|
|
|
|
Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
|
|
|
|
AS, CostKind);
|
|
|
|
} else {
|
|
|
|
TTI::OperandValueInfo OpInfo =
|
|
|
|
Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
|
|
|
|
Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
|
|
|
|
CostKind, OpInfo, &Ingredient);
|
|
|
|
}
|
|
|
|
if (!Reverse)
|
|
|
|
return Cost;
|
|
|
|
|
|
|
|
return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
|
|
|
|
cast<VectorType>(Ty), std::nullopt,
|
|
|
|
CostKind, 0);
|
|
|
|
}
|
|
|
|
|
2024-08-11 12:01:17 +01:00
|
|
|
void VPWidenLoadRecipe::execute(VPTransformState &State) {
|
|
|
|
auto *LI = cast<LoadInst>(&Ingredient);
|
|
|
|
|
|
|
|
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
|
|
|
|
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
|
|
|
|
const Align Alignment = getLoadStoreAlignment(&Ingredient);
|
|
|
|
bool CreateGather = !isConsecutive();
|
|
|
|
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
Value *NewLI;
|
|
|
|
Value *Mask = nullptr;
|
|
|
|
if (auto *VPMask = getMask()) {
|
|
|
|
// Mask reversal is only needed when a mask is present; a null mask denotes
|
|
|
|
// an all-one mask, and the reverse of an all-one mask is still all-one.
|
|
|
|
Mask = State.get(VPMask, Part);
|
|
|
|
if (isReverse())
|
|
|
|
Mask = Builder.CreateVectorReverse(Mask, "reverse");
|
|
|
|
}
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2024-08-11 12:01:17 +01:00
|
|
|
Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
|
|
|
|
if (CreateGather) {
|
|
|
|
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
|
|
|
|
"wide.masked.gather");
|
|
|
|
} else if (Mask) {
|
|
|
|
NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
|
|
|
|
PoisonValue::get(DataTy),
|
|
|
|
"wide.masked.load");
|
|
|
|
} else {
|
|
|
|
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
|
|
|
|
}
|
|
|
|
// Add metadata to the load, but set the recipe's value to the reverse shuffle.
|
|
|
|
State.addMetadata(NewLI, LI);
|
|
|
|
if (Reverse)
|
|
|
|
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
|
|
|
|
State.set(this, NewLI, Part);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2024-04-17 11:00:58 +01:00
|
|
|
void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
2022-06-28 10:34:30 +01:00
|
|
|
O << Indent << "WIDEN ";
|
2024-04-17 11:00:58 +01:00
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = load ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2024-08-22 18:30:48 +01:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/// Use an all-true mask for the reverse rather than the actual mask, as it avoids a
|
|
|
|
/// dependence without affecting the result.
|
|
|
|
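// Illustrative sketch of the emitted call (types are examples only):
//   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
//              <vscale x 4 x i32> %v, <vscale x 4 x i1> %alltrue, i32 %evl)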
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
|
|
|
|
Value *EVL, const Twine &Name) {
|
|
|
|
VectorType *ValTy = cast<VectorType>(Operand->getType());
|
|
|
|
Value *AllTrueMask =
|
|
|
|
Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
|
|
|
|
return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
|
|
|
|
{Operand, AllTrueMask, EVL}, nullptr, Name);
|
|
|
|
}
|
|
|
|
|
|
|
|
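// Widen a load under an explicit vector length: consecutive accesses become
// a VP load, non-consecutive ones a VP gather. Roughly (illustrative types):
//   %v = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//            ptr %addr, <vscale x 4 x i1> %mask, i32 %evl)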
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
|
|
|
|
"explicit vector length.");
|
|
|
|
auto *LI = cast<LoadInst>(&Ingredient);
|
|
|
|
|
|
|
|
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
|
|
|
|
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
|
|
|
|
const Align Alignment = getLoadStoreAlignment(&Ingredient);
|
|
|
|
bool CreateGather = !isConsecutive();
|
|
|
|
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
|
|
|
CallInst *NewLI;
|
|
|
|
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
|
|
|
|
Value *Addr = State.get(getAddr(), 0, !CreateGather);
|
|
|
|
Value *Mask = nullptr;
|
|
|
|
if (VPValue *VPMask = getMask()) {
|
|
|
|
Mask = State.get(VPMask, 0);
|
|
|
|
if (isReverse())
|
|
|
|
Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
|
|
|
|
} else {
|
|
|
|
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (CreateGather) {
|
|
|
|
NewLI =
|
|
|
|
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
|
|
|
|
nullptr, "wide.masked.gather");
|
|
|
|
} else {
|
|
|
|
VectorBuilder VBuilder(Builder);
|
|
|
|
VBuilder.setEVL(EVL).setMask(Mask);
|
|
|
|
NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
|
|
|
|
Instruction::Load, DataTy, Addr, "vp.op.load"));
|
|
|
|
}
|
|
|
|
NewLI->addParamAttr(
|
|
|
|
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
|
|
|
|
State.addMetadata(NewLI, LI);
|
|
|
|
Instruction *Res = NewLI;
|
|
|
|
if (isReverse())
|
|
|
|
Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
|
|
|
|
State.set(this, Res, 0);
|
|
|
|
}
|
2022-06-28 10:34:30 +01:00
|
|
|
|
2024-08-22 18:30:48 +01:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2024-04-19 09:44:23 +01:00
|
|
|
void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = vp.load ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2024-08-15 08:04:22 +01:00
|
|
|
#endif
|
|
|
|
|
|
|
|
void VPWidenStoreRecipe::execute(VPTransformState &State) {
|
|
|
|
auto *SI = cast<StoreInst>(&Ingredient);
|
|
|
|
|
|
|
|
VPValue *StoredVPValue = getStoredValue();
|
|
|
|
bool CreateScatter = !isConsecutive();
|
|
|
|
const Align Alignment = getLoadStoreAlignment(&Ingredient);
|
|
|
|
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
2024-04-19 09:44:23 +01:00
|
|
|
|
2024-08-15 08:04:22 +01:00
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
Instruction *NewSI = nullptr;
|
|
|
|
Value *Mask = nullptr;
|
|
|
|
if (auto *VPMask = getMask()) {
|
|
|
|
// Mask reversal is only needed when a mask is present; a null mask denotes
|
|
|
|
// an all-one mask, and the reverse of an all-one mask is still all-one.
|
|
|
|
Mask = State.get(VPMask, Part);
|
|
|
|
if (isReverse())
|
|
|
|
Mask = Builder.CreateVectorReverse(Mask, "reverse");
|
|
|
|
}
|
|
|
|
|
|
|
|
Value *StoredVal = State.get(StoredVPValue, Part);
|
|
|
|
if (isReverse()) {
|
|
|
|
// If we store to reverse consecutive memory locations, then we need
|
|
|
|
// to reverse the order of elements in the stored value.
|
|
|
|
StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
|
|
|
|
// We don't want to update the value in the map as it might be used in
|
|
|
|
// another expression. So don't call resetVectorValue(StoredVal).
|
|
|
|
}
|
|
|
|
Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
|
|
|
|
if (CreateScatter)
|
|
|
|
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
|
|
|
|
else if (Mask)
|
|
|
|
NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
|
|
|
|
else
|
|
|
|
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
|
|
|
|
State.addMetadata(NewSI, SI);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2024-04-17 11:00:58 +01:00
|
|
|
void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN store ";
|
2022-06-28 10:34:30 +01:00
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2024-08-22 18:30:48 +01:00
|
|
|
#endif
|
|
|
|
|
|
|
|
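// Widen a store under an explicit vector length: consecutive accesses become
// a VP store, non-consecutive ones a VP scatter. Roughly (illustrative types):
//   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val,
//            ptr %addr, <vscale x 4 x i1> %mask, i32 %evl)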
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
|
|
|
|
"explicit vector length.");
|
|
|
|
auto *SI = cast<StoreInst>(&Ingredient);
|
|
|
|
|
|
|
|
VPValue *StoredValue = getStoredValue();
|
|
|
|
bool CreateScatter = !isConsecutive();
|
|
|
|
const Align Alignment = getLoadStoreAlignment(&Ingredient);
|
|
|
|
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
State.setDebugLocFrom(getDebugLoc());
|
|
|
|
|
|
|
|
CallInst *NewSI = nullptr;
|
|
|
|
Value *StoredVal = State.get(StoredValue, 0);
|
|
|
|
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
|
|
|
|
if (isReverse())
|
|
|
|
StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
|
|
|
|
Value *Mask = nullptr;
|
|
|
|
if (VPValue *VPMask = getMask()) {
|
|
|
|
Mask = State.get(VPMask, 0);
|
|
|
|
if (isReverse())
|
|
|
|
Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
|
|
|
|
} else {
|
|
|
|
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
|
|
|
|
}
|
|
|
|
Value *Addr = State.get(getAddr(), 0, !CreateScatter);
|
|
|
|
if (CreateScatter) {
|
|
|
|
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
|
|
|
|
Intrinsic::vp_scatter,
|
|
|
|
{StoredVal, Addr, Mask, EVL});
|
|
|
|
} else {
|
|
|
|
VectorBuilder VBuilder(Builder);
|
|
|
|
VBuilder.setEVL(EVL).setMask(Mask);
|
|
|
|
NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
|
|
|
|
Instruction::Store, Type::getVoidTy(EVL->getContext()),
|
|
|
|
{StoredVal, Addr}));
|
|
|
|
}
|
|
|
|
NewSI->addParamAttr(
|
|
|
|
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
|
|
|
|
State.addMetadata(NewSI, SI);
|
|
|
|
}
|
2024-04-19 09:44:23 +01:00
|
|
|
|
2024-08-22 18:30:48 +01:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2024-04-19 09:44:23 +01:00
|
|
|
void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN vp.store ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
2022-06-28 10:34:30 +01:00
|
|
|
#endif
|
|
|
|
|
2024-07-20 22:23:01 +01:00
|
|
|
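// Cast a vector value to another vector type with equally sized elements,
// going through an integer vector when a direct bitcast is illegal. For
// example (sketch, assuming 64-bit pointers): <4 x ptr> -> <4 x i64> ->
// <4 x double>.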
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
|
|
|
|
VectorType *DstVTy, const DataLayout &DL) {
|
|
|
|
// Verify that V is a vector type with the same number of elements as DstVTy.
|
|
|
|
auto VF = DstVTy->getElementCount();
|
|
|
|
auto *SrcVecTy = cast<VectorType>(V->getType());
|
|
|
|
assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
|
|
|
|
Type *SrcElemTy = SrcVecTy->getElementType();
|
|
|
|
Type *DstElemTy = DstVTy->getElementType();
|
|
|
|
assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
|
|
|
|
"Vector elements must have same size");
|
|
|
|
|
|
|
|
// Do a direct cast if element types are castable.
|
|
|
|
if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
|
|
|
|
return Builder.CreateBitOrPointerCast(V, DstVTy);
|
|
|
|
}
|
|
|
|
// V cannot be directly cast to the desired vector type.
|
|
|
|
// This may happen when V is a floating-point vector but DstVTy is a vector of
|
|
|
|
// pointers or vice-versa. Handle this using a two-step bitcast using an
|
|
|
|
// intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
|
|
|
|
assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
|
|
|
|
"Only one type should be a pointer type");
|
|
|
|
assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
|
|
|
|
"Only one type should be a floating point type");
|
|
|
|
Type *IntTy =
|
|
|
|
IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
|
|
|
|
auto *VecIntTy = VectorType::get(IntTy, VF);
|
|
|
|
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
|
|
|
|
return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return a vector containing interleaved elements from multiple
|
|
|
|
/// smaller input vectors.
|
|
|
|
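// For example (factor 2, fixed VF 4), interleaving <a0,a1,a2,a3> with
// <b0,b1,b2,b3> yields <a0,b0,a1,b1,a2,b2,a3,b3>.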
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
|
|
|
|
const Twine &Name) {
|
|
|
|
unsigned Factor = Vals.size();
|
|
|
|
assert(Factor > 1 && "Tried to interleave an invalid number of vectors");
|
|
|
|
|
|
|
|
VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
|
|
|
|
#ifndef NDEBUG
|
|
|
|
for (Value *Val : Vals)
|
|
|
|
assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
|
|
|
|
// we must use intrinsics to interleave.
|
|
|
|
if (VecTy->isScalableTy()) {
|
|
|
|
VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
|
|
|
|
return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
|
|
|
|
Vals,
|
|
|
|
/*FMFSource=*/nullptr, Name);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fixed length. Start by concatenating all vectors into a wide vector.
|
|
|
|
Value *WideVec = concatenateVectors(Builder, Vals);
|
|
|
|
|
|
|
|
// Interleave the elements into the wide vector.
|
|
|
|
const unsigned NumElts = VecTy->getElementCount().getFixedValue();
|
|
|
|
return Builder.CreateShuffleVector(
|
|
|
|
WideVec, createInterleaveMask(NumElts, Factor), Name);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Try to vectorize the interleave group that \p Instr belongs to.
|
|
|
|
//
|
|
|
|
// E.g. Translate following interleaved load group (factor = 3):
|
|
|
|
// for (i = 0; i < N; i+=3) {
|
|
|
|
// R = Pic[i]; // Member of index 0
|
|
|
|
// G = Pic[i+1]; // Member of index 1
|
|
|
|
// B = Pic[i+2]; // Member of index 2
|
|
|
|
// ... // do something to R, G, B
|
|
|
|
// }
|
|
|
|
// To:
|
|
|
|
// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
|
|
|
|
// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
|
|
|
|
// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
|
|
|
|
// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
|
|
|
|
//
|
|
|
|
// Or translate following interleaved store group (factor = 3):
|
|
|
|
// for (i = 0; i < N; i+=3) {
|
|
|
|
// ... do something to R, G, B
|
|
|
|
// Pic[i] = R; // Member of index 0
|
|
|
|
// Pic[i+1] = G; // Member of index 1
|
|
|
|
// Pic[i+2] = B; // Member of index 2
|
|
|
|
// }
|
|
|
|
// To:
|
|
|
|
// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
|
|
|
|
// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
|
|
|
|
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
|
|
|
|
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
|
|
|
|
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
|
|
|
|
void VPInterleaveRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(!State.Instance && "Interleave group being replicated.");
|
|
|
|
const InterleaveGroup<Instruction> *Group = IG;
|
|
|
|
Instruction *Instr = Group->getInsertPos();
|
|
|
|
|
|
|
|
// Prepare for the vector type of the interleaved load/store.
|
|
|
|
Type *ScalarTy = getLoadStoreType(Instr);
|
|
|
|
unsigned InterleaveFactor = Group->getFactor();
|
|
|
|
auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
|
|
|
|
|
|
|
|
// Prepare for the new pointers.
|
|
|
|
SmallVector<Value *, 2> AddrParts;
|
|
|
|
unsigned Index = Group->getIndex(Instr);
|
|
|
|
|
|
|
|
// TODO: extend the masked interleaved-group support to reversed access.
|
|
|
|
VPValue *BlockInMask = getMask();
|
|
|
|
assert((!BlockInMask || !Group->isReverse()) &&
|
|
|
|
"Reversed masked interleave-group not supported.");
|
|
|
|
|
|
|
|
Value *Idx;
|
|
|
|
// If the group is reverse, adjust the index to refer to the last vector lane
|
|
|
|
// instead of the first. We adjust the index from the first vector lane,
|
|
|
|
// rather than directly getting the pointer for lane VF - 1, because the
|
|
|
|
// pointer operand of the interleaved access is supposed to be uniform. For
|
|
|
|
// uniform instructions, we're only required to generate a value for the
|
|
|
|
// first vector lane in each unroll iteration.
|
|
|
|
if (Group->isReverse()) {
|
|
|
|
Value *RuntimeVF =
|
|
|
|
getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
|
|
|
|
Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
|
|
|
|
Idx = State.Builder.CreateMul(Idx,
|
|
|
|
State.Builder.getInt32(Group->getFactor()));
|
|
|
|
Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
|
|
|
|
Idx = State.Builder.CreateNeg(Idx);
|
|
|
|
} else
|
|
|
|
Idx = State.Builder.getInt32(-Index);
|
|
|
|
|
|
|
|
VPValue *Addr = getAddr();
|
|
|
|
for (unsigned Part = 0; Part < State.UF; Part++) {
|
|
|
|
Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
|
|
|
|
if (auto *I = dyn_cast<Instruction>(AddrPart))
|
|
|
|
State.setDebugLocFrom(I->getDebugLoc());
|
|
|
|
|
|
|
|
// Note that the current instruction could be at any member index. Adjust the address
|
|
|
|
// to the member of index 0.
|
|
|
|
//
|
|
|
|
// E.g. a = A[i+1]; // Member of index 1 (Current instruction)
|
|
|
|
// b = A[i]; // Member of index 0
|
|
|
|
// The current pointer points to A[i+1]; adjust it to A[i].
|
|
|
|
//
|
|
|
|
// E.g. A[i+1] = a; // Member of index 1
|
|
|
|
// A[i] = b; // Member of index 0
|
|
|
|
// A[i+2] = c; // Member of index 2 (Current instruction)
|
|
|
|
// The current pointer points to A[i+2]; adjust it to A[i].
|
|
|
|
|
|
|
|
bool InBounds = false;
|
|
|
|
if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
|
|
|
|
InBounds = gep->isInBounds();
|
|
|
|
AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
|
|
|
|
AddrParts.push_back(AddrPart);
|
|
|
|
}
|
|
|
|
|
|
|
|
State.setDebugLocFrom(Instr->getDebugLoc());
|
|
|
|
Value *PoisonVec = PoisonValue::get(VecTy);
|
|
|
|
|
|
|
|
auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
|
|
|
|
unsigned Part, Value *MaskForGaps) -> Value * {
|
|
|
|
if (State.VF.isScalable()) {
|
|
|
|
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
|
|
|
|
assert(InterleaveFactor == 2 &&
|
|
|
|
"Unsupported deinterleave factor for scalable vectors");
|
|
|
|
auto *BlockInMaskPart = State.get(BlockInMask, Part);
|
|
|
|
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
|
|
|
|
auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
|
|
|
|
State.VF.getKnownMinValue() * 2, true);
|
|
|
|
return State.Builder.CreateIntrinsic(
|
|
|
|
MaskTy, Intrinsic::vector_interleave2, Ops,
|
|
|
|
/*FMFSource=*/nullptr, "interleaved.mask");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!BlockInMask)
|
|
|
|
return MaskForGaps;
|
|
|
|
|
|
|
|
Value *BlockInMaskPart = State.get(BlockInMask, Part);
|
|
|
|
Value *ShuffledMask = State.Builder.CreateShuffleVector(
|
|
|
|
BlockInMaskPart,
|
|
|
|
createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
|
|
|
|
"interleaved.mask");
|
|
|
|
return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
|
|
|
|
ShuffledMask, MaskForGaps)
|
|
|
|
: ShuffledMask;
|
|
|
|
};
|
|
|
|
|
|
|
|
const DataLayout &DL = Instr->getDataLayout();
|
|
|
|
// Vectorize the interleaved load group.
|
|
|
|
if (isa<LoadInst>(Instr)) {
|
|
|
|
Value *MaskForGaps = nullptr;
|
|
|
|
if (NeedsMaskForGaps) {
|
|
|
|
MaskForGaps = createBitMaskForGaps(State.Builder,
|
|
|
|
State.VF.getKnownMinValue(), *Group);
|
|
|
|
assert(MaskForGaps && "Mask for Gaps is required but it is null");
|
|
|
|
}
|
|
|
|
|
|
|
|
// For each unroll part, create a wide load for the group.
|
|
|
|
SmallVector<Value *, 2> NewLoads;
|
|
|
|
for (unsigned Part = 0; Part < State.UF; Part++) {
|
|
|
|
Instruction *NewLoad;
|
|
|
|
if (BlockInMask || MaskForGaps) {
|
|
|
|
Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
|
|
|
|
NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
|
|
|
|
Group->getAlign(), GroupMask,
|
|
|
|
PoisonVec, "wide.masked.vec");
|
|
|
|
} else
|
|
|
|
NewLoad = State.Builder.CreateAlignedLoad(
|
|
|
|
VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
|
|
|
|
Group->addMetadata(NewLoad);
|
|
|
|
NewLoads.push_back(NewLoad);
|
|
|
|
}
|
|
|
|
|
|
|
|
ArrayRef<VPValue *> VPDefs = definedValues();
|
|
|
|
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
|
|
|
|
if (VecTy->isScalableTy()) {
|
|
|
|
assert(InterleaveFactor == 2 &&
|
|
|
|
"Unsupported deinterleave factor for scalable vectors");
|
|
|
|
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
|
|
|
|
// so we must use intrinsics to deinterleave.
|
|
|
|
Value *DI = State.Builder.CreateIntrinsic(
|
|
|
|
Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
|
|
|
|
/*FMFSource=*/nullptr, "strided.vec");
|
|
|
|
unsigned J = 0;
|
|
|
|
for (unsigned I = 0; I < InterleaveFactor; ++I) {
|
|
|
|
Instruction *Member = Group->getMember(I);
|
|
|
|
|
|
|
|
if (!Member)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
|
|
|
|
// If this member has a different type, cast the result to that type.
|
|
|
|
if (Member->getType() != ScalarTy) {
|
|
|
|
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
|
|
|
|
StridedVec =
|
|
|
|
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Group->isReverse())
|
|
|
|
StridedVec =
|
|
|
|
State.Builder.CreateVectorReverse(StridedVec, "reverse");
|
|
|
|
|
|
|
|
State.set(VPDefs[J], StridedVec, Part);
|
|
|
|
++J;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// For each member in the group, shuffle out the appropriate data from the
|
|
|
|
// wide loads.
|
|
|
|
unsigned J = 0;
|
|
|
|
for (unsigned I = 0; I < InterleaveFactor; ++I) {
|
|
|
|
Instruction *Member = Group->getMember(I);
|
|
|
|
|
|
|
|
// Skip the gaps in the group.
|
|
|
|
if (!Member)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
auto StrideMask =
|
|
|
|
createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
|
|
|
|
for (unsigned Part = 0; Part < State.UF; Part++) {
|
|
|
|
Value *StridedVec = State.Builder.CreateShuffleVector(
|
|
|
|
NewLoads[Part], StrideMask, "strided.vec");
|
|
|
|
|
|
|
|
// If this member has a different type, cast the result to that type.
|
|
|
|
if (Member->getType() != ScalarTy) {
|
|
|
|
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
|
|
|
|
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
|
|
|
|
StridedVec =
|
|
|
|
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Group->isReverse())
|
|
|
|
StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
|
|
|
|
|
|
|
|
State.set(VPDefs[J], StridedVec, Part);
|
|
|
|
}
|
|
|
|
++J;
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The sub-vector type for the current instruction.
|
|
|
|
auto *SubVT = VectorType::get(ScalarTy, State.VF);
|
|
|
|
|
|
|
|
// Vectorize the interleaved store group.
|
|
|
|
Value *MaskForGaps =
|
|
|
|
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
|
|
|
|
assert((!MaskForGaps || !State.VF.isScalable()) &&
|
|
|
|
"masking gaps for scalable vectors is not yet supported.");
|
|
|
|
ArrayRef<VPValue *> StoredValues = getStoredValues();
|
|
|
|
for (unsigned Part = 0; Part < State.UF; Part++) {
|
|
|
|
// Collect the stored vector from each member.
|
|
|
|
SmallVector<Value *, 4> StoredVecs;
|
|
|
|
unsigned StoredIdx = 0;
|
|
|
|
for (unsigned i = 0; i < InterleaveFactor; i++) {
|
|
|
|
assert((Group->getMember(i) || MaskForGaps) &&
|
|
|
|
"Fail to get a member from an interleaved store group");
|
|
|
|
Instruction *Member = Group->getMember(i);
|
|
|
|
|
|
|
|
// Skip the gaps in the group.
|
|
|
|
if (!Member) {
|
|
|
|
Value *Poison = PoisonValue::get(SubVT);
|
|
|
|
StoredVecs.push_back(Poison);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
|
|
|
|
++StoredIdx;
|
|
|
|
|
|
|
|
if (Group->isReverse())
|
|
|
|
StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
|
|
|
|
|
|
|
|
// If this member has a different type, cast it to the unified sub-vector type.
|
|
|
|
|
|
|
|
if (StoredVec->getType() != SubVT)
|
|
|
|
StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
|
|
|
|
|
|
|
|
StoredVecs.push_back(StoredVec);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Interleave all the smaller vectors into one wider vector.
|
|
|
|
Value *IVec =
|
|
|
|
interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
|
|
|
|
Instruction *NewStoreInstr;
|
|
|
|
if (BlockInMask || MaskForGaps) {
|
|
|
|
Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
|
|
|
|
NewStoreInstr = State.Builder.CreateMaskedStore(
|
|
|
|
IVec, AddrParts[Part], Group->getAlign(), GroupMask);
|
|
|
|
} else
|
|
|
|
NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
|
|
|
|
Group->getAlign());
|
|
|
|
|
|
|
|
Group->addMetadata(NewStoreInstr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
|
|
|
|
IG->getInsertPos()->printAsOperand(O, false);
|
|
|
|
O << ", ";
|
|
|
|
getAddr()->printAsOperand(O, SlotTracker);
|
|
|
|
VPValue *Mask = getMask();
|
|
|
|
if (Mask) {
|
|
|
|
O << ", ";
|
|
|
|
Mask->printAsOperand(O, SlotTracker);
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned OpIdx = 0;
|
|
|
|
for (unsigned i = 0; i < IG->getFactor(); ++i) {
|
|
|
|
if (!IG->getMember(i))
|
|
|
|
continue;
|
|
|
|
if (getNumStoreOperands() > 0) {
|
|
|
|
O << "\n" << Indent << " store ";
|
|
|
|
getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
|
|
|
|
O << " to index " << i;
|
|
|
|
} else {
|
|
|
|
O << "\n" << Indent << " ";
|
|
|
|
getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
|
|
|
|
O << " = load from index " << i;
|
|
|
|
}
|
|
|
|
++OpIdx;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-06-28 10:34:30 +01:00
|
|
|
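// Create the scalar phi for the canonical induction, roughly (sketch):
//   %index = phi i64 [ %start, %vector.ph ], ...
// The backedge value is filled in later, once the loop latch exists.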
void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
|
|
|
|
Value *Start = getStartValue()->getLiveInIRValue();
|
2023-09-11 11:32:51 +01:00
|
|
|
PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
|
|
|
|
EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
|
2022-06-28 10:34:30 +01:00
|
|
|
|
|
|
|
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
|
|
|
|
EntryPart->addIncoming(Start, VectorPH);
|
2023-09-05 15:45:14 +01:00
|
|
|
EntryPart->setDebugLoc(getDebugLoc());
|
2022-06-28 10:34:30 +01:00
|
|
|
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
|
2024-02-26 19:06:43 +00:00
|
|
|
State.set(this, EntryPart, Part, /*IsScalar*/ true);
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "EMIT ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
2023-10-16 20:28:22 +01:00
|
|
|
O << " = CANONICAL-INDUCTION ";
|
|
|
|
printOperands(O, SlotTracker);
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2023-04-16 14:48:02 +01:00
|
|
|
bool VPCanonicalIVPHIRecipe::isCanonical(
|
2024-01-26 11:13:05 +00:00
|
|
|
InductionDescriptor::InductionKind Kind, VPValue *Start,
|
|
|
|
VPValue *Step) const {
|
|
|
|
// Must be an integer induction.
|
|
|
|
if (Kind != InductionDescriptor::IK_IntInduction)
|
2023-04-16 14:48:02 +01:00
|
|
|
return false;
|
|
|
|
// Start must match the start value of this canonical induction.
|
|
|
|
if (Start != getStartValue())
|
2022-11-30 17:04:19 +00:00
|
|
|
return false;
|
2023-04-16 14:48:02 +01:00
|
|
|
|
|
|
|
// If the step is defined by a recipe, it is not a ConstantInt.
|
|
|
|
if (Step->getDefiningRecipe())
|
2022-11-30 17:04:19 +00:00
|
|
|
return false;
|
|
|
|
|
2023-04-16 14:48:02 +01:00
|
|
|
ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
|
|
|
|
return StepC && StepC->isOne();
|
2022-11-30 17:04:19 +00:00
|
|
|
}
|
|
|
|
|
2024-02-03 14:51:12 +00:00
|
|
|
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
|
2022-09-23 18:23:01 +01:00
|
|
|
return IsScalarAfterVectorization &&
|
2024-02-03 14:51:12 +00:00
|
|
|
(!IsScalable || vputils::onlyFirstLaneUsed(this));
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
|
|
|
|
2024-08-05 20:42:10 +01:00
|
|
|
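// Expand a widened pointer induction as a scalar pointer phi that is advanced
// by step * VF * UF per vector iteration, plus per-part vector GEPs roughly of
// the form (sketch): %vector.gep = getelementptr i8, ptr %pointer.phi, <offsets>.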
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
|
|
|
|
"Not a pointer induction according to InductionDescriptor!");
|
|
|
|
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
|
|
|
|
"Unexpected type.");
|
|
|
|
assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
|
|
|
|
"Recipe should have been replaced");
|
|
|
|
|
|
|
|
auto *IVR = getParent()->getPlan()->getCanonicalIV();
|
|
|
|
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
|
|
|
|
Type *PhiType = IndDesc.getStep()->getType();
|
|
|
|
|
|
|
|
// Build a pointer phi
|
|
|
|
Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
|
|
|
|
Type *ScStValueType = ScalarStartValue->getType();
|
|
|
|
PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
|
|
|
|
CanonicalIV->getIterator());
|
|
|
|
|
|
|
|
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
|
|
|
|
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
|
|
|
|
|
|
|
|
// A pointer induction, performed by using a GEP.
|
|
|
|
BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
|
|
|
|
|
|
|
|
Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
|
|
|
|
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
|
|
|
|
Value *NumUnrolledElems =
|
|
|
|
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
|
|
|
|
Value *InductionGEP = GetElementPtrInst::Create(
|
|
|
|
State.Builder.getInt8Ty(), NewPointerPhi,
|
|
|
|
State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
|
|
|
|
InductionLoc);
|
|
|
|
// Add induction update using an incorrect block temporarily. The phi node
|
|
|
|
// will be fixed after VPlan execution. Note that at this point the latch
|
|
|
|
// block cannot be used, as it does not exist yet.
|
|
|
|
// TODO: Model increment value in VPlan, by turning the recipe into a
|
|
|
|
// multi-def and a subclass of VPHeaderPHIRecipe.
|
|
|
|
NewPointerPhi->addIncoming(InductionGEP, VectorPH);
|
|
|
|
|
|
|
|
// Create UF many actual address geps that use the pointer
|
|
|
|
// phi as base and a vectorized version of the step value
|
|
|
|
// (<step*0, ..., step*N>) as offset.
|
|
|
|
for (unsigned Part = 0; Part < State.UF; ++Part) {
|
|
|
|
Type *VecPhiType = VectorType::get(PhiType, State.VF);
|
|
|
|
Value *StartOffsetScalar =
|
|
|
|
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
|
|
|
|
Value *StartOffset =
|
|
|
|
State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
|
|
|
|
// Create a vector of consecutive numbers from zero to VF.
|
|
|
|
StartOffset = State.Builder.CreateAdd(
|
|
|
|
StartOffset, State.Builder.CreateStepVector(VecPhiType));
|
|
|
|
|
|
|
|
assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
|
|
|
|
"scalar step must be the same across all parts");
|
|
|
|
Value *GEP = State.Builder.CreateGEP(
|
|
|
|
State.Builder.getInt8Ty(), NewPointerPhi,
|
2024-09-18 19:22:36 +08:00
|
|
|
State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
|
|
|
|
State.VF, ScalarStepValue)),
|
|
|
|
"vector.gep");
|
2024-08-05 20:42:10 +01:00
|
|
|
State.set(this, GEP, Part);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-28 10:34:30 +01:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "EMIT ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = WIDEN-POINTER-INDUCTION ";
|
|
|
|
getStartValue()->printAsOperand(O, SlotTracker);
|
|
|
|
O << ", " << *IndDesc.getStep();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
void VPExpandSCEVRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(!State.Instance && "cannot be used in per-lane");
|
2024-06-27 16:38:15 +02:00
|
|
|
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
|
2022-06-28 10:34:30 +01:00
|
|
|
SCEVExpander Exp(SE, DL, "induction");
|
|
|
|
|
|
|
|
Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
|
|
|
|
&*State.Builder.GetInsertPoint());
|
2023-05-11 16:49:18 +01:00
|
|
|
assert(!State.ExpandedSCEVs.contains(Expr) &&
|
|
|
|
"Same SCEV expanded multiple times");
|
|
|
|
State.ExpandedSCEVs[Expr] = Res;
|
2022-06-28 10:34:30 +01:00
|
|
|
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
|
2023-05-04 14:00:13 +01:00
|
|
|
State.set(this, Res, {Part, 0});
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "EMIT ";
|
|
|
|
getVPSingleValue()->printAsOperand(O, SlotTracker);
|
|
|
|
O << " = EXPAND SCEV " << *Expr;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
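// Materialize a vector form of the canonical IV, e.g. for VF 4 (sketch):
//   %vec.iv = add <4 x i64> %broadcast, <i64 0, i64 1, i64 2, i64 3>
// with an additional per-part offset when unrolling.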
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
|
2024-02-26 19:06:43 +00:00
|
|
|
Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
|
2022-06-28 10:34:30 +01:00
|
|
|
Type *STy = CanonicalIV->getType();
|
|
|
|
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
|
|
|
|
ElementCount VF = State.VF;
|
|
|
|
Value *VStart = VF.isScalar()
|
|
|
|
? CanonicalIV
|
|
|
|
: Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
|
|
|
|
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
|
|
|
|
Value *VStep = createStepForVF(Builder, STy, VF, Part);
|
|
|
|
if (VF.isVector()) {
|
|
|
|
VStep = Builder.CreateVectorSplat(VF, VStep);
|
|
|
|
VStep =
|
|
|
|
Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
|
|
|
|
}
|
|
|
|
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
|
|
|
|
State.set(this, CanonicalVectorIV, Part);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "EMIT ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = WIDEN-CANONICAL-INDUCTION ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
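// Seed the recurrence phi with the initial scalar inserted into the last
// lane of a poison vector, e.g. for VF 4 (sketch):
//   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3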
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
// Create a vector from the initial value.
|
|
|
|
auto *VectorInit = getStartValue()->getLiveInIRValue();
|
|
|
|
|
|
|
|
Type *VecTy = State.VF.isScalar()
|
|
|
|
? VectorInit->getType()
|
|
|
|
: VectorType::get(VectorInit->getType(), State.VF);
|
|
|
|
|
|
|
|
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
|
|
|
|
if (State.VF.isVector()) {
|
|
|
|
auto *IdxTy = Builder.getInt32Ty();
|
|
|
|
auto *One = ConstantInt::get(IdxTy, 1);
|
|
|
|
IRBuilder<>::InsertPointGuard Guard(Builder);
|
|
|
|
Builder.SetInsertPoint(VectorPH->getTerminator());
|
|
|
|
auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
|
|
|
|
auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
|
|
|
|
VectorInit = Builder.CreateInsertElement(
|
|
|
|
PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create a phi node for the new recurrence.
|
2023-09-11 11:32:51 +01:00
|
|
|
PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
|
|
|
|
EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
|
2022-06-28 10:34:30 +01:00
|
|
|
EntryPart->addIncoming(VectorInit, VectorPH);
|
|
|
|
State.set(this, EntryPart, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = phi ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
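// For reductions other than min/max and any-of, the vector phi starts from a
// splat of the neutral value with the scalar start inserted into lane 0,
// e.g. for an add reduction with VF 4 (sketch): <i32 %start, i32 0, i32 0, i32 0>.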
void VPReductionPHIRecipe::execute(VPTransformState &State) {
|
|
|
|
auto &Builder = State.Builder;
|
|
|
|
|
2024-01-16 14:39:51 +00:00
|
|
|
// Reductions do not have to start at zero. They can start with
|
|
|
|
// any loop invariant values.
|
|
|
|
VPValue *StartVPV = getStartValue();
|
|
|
|
Value *StartV = StartVPV->getLiveInIRValue();
|
|
|
|
|
2022-06-28 10:34:30 +01:00
|
|
|
// In order to support recurrences we need to be able to vectorize Phi nodes.
|
|
|
|
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
|
|
|
|
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
|
|
|
|
// this value when we vectorize all of the instructions that use the PHI.
|
|
|
|
bool ScalarPHI = State.VF.isScalar() || IsInLoop;
|
2024-01-16 14:39:51 +00:00
|
|
|
Type *VecTy = ScalarPHI ? StartV->getType()
|
|
|
|
: VectorType::get(StartV->getType(), State.VF);
|
2022-06-28 10:34:30 +01:00
|
|
|
|
|
|
|
BasicBlock *HeaderBB = State.CFG.PrevBB;
|
|
|
|
assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
|
|
|
|
"recipe must be in the vector loop header");
|
|
|
|
unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
|
|
|
|
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
|
2023-09-11 11:32:51 +01:00
|
|
|
Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
|
|
|
|
EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
|
2024-02-26 19:06:43 +00:00
|
|
|
State.set(this, EntryPart, Part, IsInLoop);
|
2022-06-28 10:34:30 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
|
|
|
|
|
|
|
|
Value *Iden = nullptr;
|
|
|
|
RecurKind RK = RdxDesc.getRecurrenceKind();
|
|
|
|
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
|
2023-07-19 02:51:15 -07:00
|
|
|
RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
|
|
|
|
// MinMax and AnyOf reductions have the start value as their identity.
|
2022-06-28 10:34:30 +01:00
|
|
|
if (ScalarPHI) {
|
|
|
|
Iden = StartV;
|
|
|
|
} else {
|
|
|
|
IRBuilderBase::InsertPointGuard IPBuilder(Builder);
|
|
|
|
Builder.SetInsertPoint(VectorPH->getTerminator());
|
|
|
|
StartV = Iden =
|
|
|
|
Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
|
|
|
|
}
|
|
|
|
} else {
|
2024-08-30 17:13:51 -07:00
|
|
|
Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
|
|
|
|
RdxDesc.getFastMathFlags());
|
2022-06-28 10:34:30 +01:00
|
|
|
|
|
|
|
if (!ScalarPHI) {
|
|
|
|
Iden = Builder.CreateVectorSplat(State.VF, Iden);
|
|
|
|
IRBuilderBase::InsertPointGuard IPBuilder(Builder);
|
|
|
|
Builder.SetInsertPoint(VectorPH->getTerminator());
|
|
|
|
Constant *Zero = Builder.getInt32(0);
|
|
|
|
StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
|
2024-02-26 19:06:43 +00:00
|
|
|
Value *EntryPart = State.get(this, Part, IsInLoop);
|
2022-06-28 10:34:30 +01:00
|
|
|
// Make sure to add the reduction start value only to the
|
|
|
|
// first unroll part.
|
|
|
|
Value *StartVal = (Part == 0) ? StartV : Iden;
|
|
|
|
cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN-REDUCTION-PHI ";
|
|
|
|
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = phi ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
void VPWidenPHIRecipe::execute(VPTransformState &State) {
|
|
|
|
assert(EnableVPlanNativePath &&
|
|
|
|
"Non-native vplans are not expected to have VPWidenPHIRecipes.");
|
|
|
|
|
2023-09-22 21:24:14 +01:00
|
|
|
Value *Op0 = State.get(getOperand(0), 0);
|
2022-06-28 10:34:30 +01:00
|
|
|
Type *VecTy = Op0->getType();
|
|
|
|
Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
|
|
|
|
State.set(this, VecPhi, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "WIDEN-PHI ";
|
|
|
|
|
|
|
|
auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
|
|
|
|
// Unless all incoming values are modeled in VPlan print the original PHI
|
|
|
|
// directly.
|
|
|
|
// TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
|
|
|
|
// values as VPValues.
|
|
|
|
if (getNumOperands() != OriginalPhi->getNumOperands()) {
|
|
|
|
O << VPlanIngredient(OriginalPhi);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = phi ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
|
|
|
#endif
|
2022-05-10 10:49:43 +01:00
|
|
|
|
|
|
|
// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
|
|
|
|
// remove VPActiveLaneMaskPHIRecipe.
|
|
|
|
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
|
|
|
|
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
|
|
|
|
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
|
|
|
|
Value *StartMask = State.get(getOperand(0), Part);
|
|
|
|
PHINode *EntryPart =
|
|
|
|
State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
|
|
|
|
EntryPart->addIncoming(StartMask, VectorPH);
|
2023-09-05 15:45:14 +01:00
|
|
|
EntryPart->setDebugLoc(getDebugLoc());
|
2022-05-10 10:49:43 +01:00
|
|
|
State.set(this, EntryPart, Part);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "ACTIVE-LANE-MASK-PHI ";
|
|
|
|
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = phi ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
|
|
|
#endif
|
2024-04-04 18:30:17 -04:00
|
|
|
|
|
|
|
void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
|
|
|
|
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
|
|
|
|
assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
|
|
|
|
Value *Start = State.get(getOperand(0), VPIteration(0, 0));
|
|
|
|
PHINode *EntryPart =
|
|
|
|
State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
|
|
|
|
EntryPart->addIncoming(Start, VectorPH);
|
|
|
|
EntryPart->setDebugLoc(getDebugLoc());
|
|
|
|
State.set(this, EntryPart, 0, /*IsScalar=*/true);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
|
|
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
|
|
|
|
VPSlotTracker &SlotTracker) const {
|
|
|
|
O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
|
|
|
|
|
|
|
|
printAsOperand(O, SlotTracker);
|
|
|
|
O << " = phi ";
|
|
|
|
printOperands(O, SlotTracker);
|
|
|
|
}
|
|
|
|
#endif
|