//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

// Declared in LoopVectorize.cpp; referenced by VPRecipeBase::cost below.
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}
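// Note: mayReadFromMemory mirrors the structure of mayWriteToMemory above;
// both conservatively return true for recipe kinds they do not know about.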
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPInstructionSC:
    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPVectorEndPointerSC:
    return false;
  case VPInstructionSC:
    return mayWriteToMemory();
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}
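// Return the cost of the recipe for VF, either from the VPlan-based cost
// model or, when the ForceTargetInstructionCost option is active, the forced
// per-instruction cost.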
InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is used
  // to
  //   * decide if cost computation should be skipped for this recipe,
  //   * apply forced target instruction cost.
  Instruction *UI = nullptr;
  if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
    UI = IG->getInsertPos();
  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
    UI = &WidenMem->getIngredient();

  InstructionCost RecipeCost;
  if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
    RecipeCost = 0;
  } else {
    RecipeCost = computeCost(VF, Ctx);
    if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid())
      RecipeCost = InstructionCost(ForceTargetInstructionCost);
  }

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}

InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}

bool VPRecipeBase::isPhi() const {
  return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
         (isa<VPInstruction>(this) &&
          cast<VPInstruction>(this)->getOpcode() == Instruction::PHI);
}

bool VPRecipeBase::isScalarCast() const {
  auto *VPI = dyn_cast<VPInstruction>(this);
  return VPI && Instruction::isCast(VPI->getOpcode());
}

InstructionCost
VPPartialReductionRecipe::computeCost(ElementCount VF,
                                      VPCostContext &Ctx) const {
  std::optional<unsigned> Opcode = std::nullopt;
  VPValue *BinOp = getOperand(0);

  // If the partial reduction is predicated, a select will be operand 0 rather
  // than the binary op.
  using namespace llvm::VPlanPatternMatch;
  if (match(getOperand(0), m_Select(m_VPValue(), m_VPValue(), m_VPValue())))
    BinOp = BinOp->getDefiningRecipe()->getOperand(1);

  // If BinOp is a negation, use the side effect of match to assign the actual
  // binary operation to BinOp.
  match(BinOp, m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(BinOp)));
  VPRecipeBase *BinOpR = BinOp->getDefiningRecipe();
  if (auto *WidenR = dyn_cast<VPWidenRecipe>(BinOpR))
    Opcode = std::make_optional(WidenR->getOpcode());

  VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe();
  VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe();

  auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
  auto *InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
                                                     : BinOpR->getOperand(0));
  auto *InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
                                                     : BinOpR->getOperand(1));

  auto GetExtendKind = [](VPRecipeBase *R) {
    // The extend could come from outside the plan.
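    // Recipes that are not widen-casts are treated as not extended at all
    // (PR_None below).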
    if (!R)
      return TargetTransformInfo::PR_None;
    auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
    if (!WidenCastR)
      return TargetTransformInfo::PR_None;
    if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
      return TargetTransformInfo::PR_ZeroExtend;
    if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
      return TargetTransformInfo::PR_SignExtend;
    return TargetTransformInfo::PR_None;
  };

  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
                                         PhiType, VF, GetExtendKind(ExtAR),
                                         GetExtendKind(ExtBR), Opcode);
}

void VPPartialReductionRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  assert(getOpcode() == Instruction::Add &&
         "Unhandled partial reduction opcode");

  Value *BinOpVal = State.get(getOperand(0));
  Value *PhiVal = State.get(getOperand(1));
  assert(PhiVal && BinOpVal && "Phi and Mul must be set");

  Type *RetTy = PhiVal->getType();

  CallInst *V = Builder.CreateIntrinsic(
      RetTy, Intrinsic::experimental_vector_partial_reduce_add,
      {PhiVal, BinOpVal}, nullptr, "partial.reduce");

  State.set(this, V);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "PARTIAL-REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
  printOperands(O, SlotTracker);
}
#endif

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPDef::dump(); }
#endif

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
  if (U.getNumOperands() == PartOpIdx + 1)
    return U.getOperand(PartOpIdx);
  return nullptr;
}

template <unsigned PartOpIdx>
unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
  if (auto *UnrollPartOp = getUnrollPartOperand(U))
    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
  return 0;
}

namespace llvm {
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
} // namespace llvm

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPLane &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generate(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ExtractElement: {
    assert(State.VF.isVector() && "Only extract elements from vectors");
    Value *Vec = State.get(getOperand(0));
    Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
    return Builder.CreateExtractElement(Vec, Idx, Name);
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
    return Builder.CreateFreeze(Op, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::PHI: {
    assert(getParent() ==
               getParent()->getPlan()->getVectorLoopRegion()->getEntry() &&
           "VPInstructions with PHI opcodes must be used for header phis only "
           "at the moment");
    BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
    Value *Start = State.get(getOperand(0), VPLane(0));
    PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name);
    Phi->addIncoming(Start, VectorPH);
    return Phi;
  }
  case Instruction::Select: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
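  // ActiveLaneMask computes a predicate that is true for lanes where
  // (first scalar IV lane + lane index) < trip count; for vector VFs this
  // maps to the llvm.get.active.lane.mask intrinsic.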
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPLane(0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0,
                               ScalarTC, Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    auto *V1 = State.get(getOperand(0));
    if (!V1->getType()->isVectorTy())
      return V1;
    Value *V2 = State.get(getOperand(1));
    return Builder.CreateVectorSplice(V1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    unsigned UF = getParent()->getPlan()->getUF();
    Value *ScalarTC = State.get(getOperand(0), VPLane(0));
    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
    // Compute EVL.
    assert(AVL->getType()->isIntegerTy() &&
           "Requested vector length should be an integer.");

    assert(State.VF.isScalable() && "Expected scalable vector factor.");
    Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

    Value *EVL = State.Builder.CreateIntrinsic(
        State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
        {AVL, VFArg, State.Builder.getTrue()});
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    unsigned Part = getUnrollPart(*this);
    auto *IV = State.get(getOperand(0), VPLane(0));
    assert(Part != 0 && "Must have a positive part");
    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    Value *Cond = State.get(getOperand(0), VPLane(0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    // First create the compare.
    Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::Broadcast: {
    return Builder.CreateVectorSplat(
        State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
  }
  case VPInstruction::ComputeFindLastIVResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    [[maybe_unused]] RecurKind RK = RdxDesc.getRecurrenceKind();
    assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
           "Unexpected reduction kind");
    assert(!PhiR->isInLoop() &&
           "In-loop FindLastIV reduction is not supported yet");

    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 2;
    Value *ReducedPartRdx = State.get(getOperand(2));
    for (unsigned Part = 1; Part < UF; ++Part) {
      ReducedPartRdx = createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx,
                                      State.get(getOperand(2 + Part)));
    }

    return createFindLastIVReduction(Builder, ReducedPartRdx,
                                     State.get(getOperand(1), true), RdxDesc);
  }
  case VPInstruction::ComputeReductionResult: {
    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();
    assert(!RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
           "should be handled by ComputeFindLastIVResult");

    Type *PhiTy = OrigPhi->getType();
    // The recipe's operands are the reduction phi, followed by one operand for
    // each part of the reduction.
    unsigned UF = getNumOperands() - 1;
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RdxDesc.getOpcode();
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
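      // Combine the per-part values pairwise: min/max recurrences use
      // createMinMaxOp, all other kinds a plain binary operator.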
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx =
              createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      // TODO: Support in-order reductions based on the recurrence descriptor.
      // All ops in the reduction inherit fast-math-flags from the recurrence
      // descriptor.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());

      if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
        ReducedPartRdx =
            createAnyOfReduction(Builder, ReducedPartRdx, RdxDesc, OrigPhi);
      else
        ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);

      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
    } else {
      assert(Offset <= 1 && "invalid offset to extract from");
      Res = State.get(getOperand(0));
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), VPLane(0));
    Value *Addend = State.get(getOperand(1), VPLane(0));
    return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
  }
  case VPInstruction::ResumePhi: {
    auto *NewPhi =
        Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
    for (const auto &[IncVPV, PredVPBB] :
         zip(operands(), getParent()->getPredecessors())) {
      Value *IncV = State.get(IncVPV, /* IsScalar */ true);
      BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(cast<VPBasicBlock>(PredVPBB));
      NewPhi->addIncoming(IncV, PredBB);
    }
    return NewPhi;
  }
  case VPInstruction::AnyOf: {
    Value *A = State.get(getOperand(0));
    return Builder.CreateOrReduce(A);
  }
  case VPInstruction::FirstActiveLane: {
    Value *Mask = State.get(getOperand(0));
    return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
                                                true, Name);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
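// Costs for the opcodes below are computed via TTI where a VPlan-based
// implementation exists; opcodes still covered by the legacy cost model
// return 0 here (see the TODOs below).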
InstructionCost VPInstruction::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  if (Instruction::isBinaryOp(getOpcode())) {
    if (!getUnderlyingValue()) {
      // TODO: Compute cost for VPInstructions without underlying values once
      // the legacy cost model has been retired.
      return 0;
    }

    assert(!doesGeneratePerAllLanes() &&
           "Should only generate a vector value or single scalar, not scalars "
           "for all lanes.");
    Type *ResTy = Ctx.Types.inferScalarType(this);
    if (!vputils::onlyFirstLaneUsed(this))
      ResTy = toVectorTy(ResTy, VF);
    return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
  }

  switch (getOpcode()) {
  case Instruction::ExtractElement: {
    // Add on the cost of extracting the element.
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                      Ctx.CostKind);
  }
  case VPInstruction::AnyOf: {
    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticReductionCost(
        Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
  }
  case VPInstruction::FirstActiveLane: {
    // Calculate the cost of determining the lane index.
    auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
                                  Type::getInt64Ty(Ctx.LLVMCtx),
                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
    SmallVector<int> Mask(VF.getKnownMinValue());
    std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

    return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
                                  cast<VectorType>(VectorTy), Mask,
                                  Ctx.CostKind, VF.getKnownMinValue() - 1);
  }
  case VPInstruction::ActiveLaneMask: {
    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                  {ArgTy, ArgTy});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  case VPInstruction::ExplicitVectorLength: {
    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
                                  I32Ty, {Arg0Ty, I32Ty, I1Ty});
    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
  }
  default:
    // TODO: Compute cost for other VPInstructions once the legacy cost model
    // has been retired.
    assert(!getUnderlyingValue() &&
           "unexpected VPInstruction with underlying value");
    return 0;
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == Instruction::ExtractElement ||
         getOpcode() == VPInstruction::FirstActiveLane ||
         getOpcode() == VPInstruction::ComputeFindLastIVResult ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi ||
         getOpcode() == Instruction::PHI;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  if (GeneratesPerAllLanes) {
    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
         Lane != NumLanes; ++Lane) {
      Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
      assert(GeneratedValue && "generatePerLane must produce a value");
      State.set(this, GeneratedValue, VPLane(Lane));
    }
    return;
  }

  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  assert(
      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
       State.VF.isScalar()) &&
      "scalar value but not only first lane defined");
  State.set(this, GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}

bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::Select:
  case VPInstruction::AnyOf:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractFromEnd:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
    return false;
  default:
    return true;
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ExtractElement:
    return Op == getOperand(1);
  case Instruction::PHI:
    return true;
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case Instruction::Freeze:
    // TODO: Cover additional opcodes.
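    // For these opcodes an operand's first lane is demanded exactly when only
    // the first lane of the result is demanded.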
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  case VPInstruction::PtrAdd:
    return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ComputeFindLastIVResult:
    return Op == getOperand(1);
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeFindLastIVResult:
    O << "compute-find-last-iv-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  assert(vputils::onlyFirstLaneUsed(this) &&
         "Codegen only implemented for first lane.");
  switch (getOpcode()) {
  case Instruction::ZExt:
  case Instruction::Trunc: {
    Value *Op = State.get(getOperand(0), VPLane(0));
    Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                           Op, ResultTy);
    State.set(this, Cast, VPLane(0));
    break;
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
  printOperands(O, SlotTracker);
  O << " to " << *ResultTy;
}
#endif

VPIRInstruction *VPIRInstruction::create(Instruction &I) {
  if (auto *Phi = dyn_cast<PHINode>(&I))
    return new VPIRPhi(*Phi);
  return new VPIRInstruction(I);
}

void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
}

InstructionCost VPIRInstruction::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  // The recipe wraps an existing IR instruction on the border of VPlan's
  // scope, hence it does not contribute to the cost-modeling for the VPlan.
  return 0;
}

void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) {
  assert(isa<PHINode>(getInstruction()) &&
         "can only add exiting operands to phi nodes");
  assert(getNumOperands() == 1 && "must have a single operand");
  VPValue *Exiting = getOperand(0);
  if (!Exiting->isLiveIn()) {
    LLVMContext &Ctx = getInstruction().getContext();
    auto &Plan = *getParent()->getPlan();
    Exiting = Builder.createNaryOp(
        VPInstruction::ExtractFromEnd,
        {Exiting,
         Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::get(Ctx, 32), 1))});
  }
  setOperand(0, Exiting);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                            VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;
}
#endif
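// Hook up the incoming values of the wrapped IR phi: for each VPlan
// predecessor, the last lane of the exiting value is routed in (or the first
// lane, if the value is uniform after vectorization).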
void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(operands())) {
    VPValue *ExitValue = Op;
    auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
    Value *V = State.get(ExitValue, VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(PredBB) == -1)
      Phi->addIncoming(V, PredBB);
    else
      Phi->setIncomingValueForBlock(PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
                    VPSlotTracker &SlotTracker) const {
  VPIRInstruction::print(O, Indent, SlotTracker);

  if (getNumOperands() != 0) {
    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
    interleaveComma(enumerate(operands()), O,
                    [this, &O, &SlotTracker](auto Op) {
                      Op.value()->printAsOperand(O, SlotTracker);
                      O << " from ";
                      getParent()->getPredecessors()[Op.index()]->printAsOperand(
                          O);
                    });
    O << ")";
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  FunctionType *VFTy = Variant->getFunctionType();
  // Add return type if intrinsic is overloaded on it.
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(arg_operands())) {
    Value *Arg;
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    if (!VFTy->getParamType(I.index())->isVectorTy())
      Arg = State.get(I.value(), VPLane(0));
    else
      Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
    Args.push_back(Arg);
  }

  assert(Variant != nullptr && "Can't create vector function.");

  auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
  setFlags(V);

  if (!V->getType()->isVoidTy())
    State.set(this, V);
  State.addMetadata(V, CI);
}

InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                  Variant->getFunctionType()->params(),
                                  Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << " @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  O << " (using library function";
  if (Variant->hasName())
    O << ": " << Variant->getName();
  O << ")";
}
#endif
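// Widen a call to an intrinsic: collect the (possibly scalar) arguments and
// any overloaded types, then emit a call to the vector version of the
// intrinsic.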
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
    TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
                                           State.TTI))
      Arg = State.get(I.value(), VPLane(0));
    else
      Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
    if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
                                               State.TTI))
      TysForDecl.push_back(Arg->getType());
    Args.push_back(Arg);
  }

  // Use vector version of the intrinsic.
  Module *M = State.Builder.GetInsertBlock()->getModule();
  Function *VectorF =
      Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsics.");

  auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(OpBundles);

  CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

  setFlags(V);

  if (!V->getType()->isVoidTy())
    State.set(this, V);
  State.addMetadata(V, CI);
}

InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(operands())) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      // Push all the VP intrinsic's ops into Arguments even if an op is
      // nullptr; some VP intrinsics' cost computations assert on the number
      // of parameters. This mainly appears in two scenarios:
      //   1. The EVL op is nullptr.
      //   2. The argument of the VP intrinsic is itself a VP intrinsic.
      if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
        Arguments.push_back(V);
        continue;
      }
      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
        Arguments.push_back(UI->getArgOperand(Idx));
        continue;
      }
      Arguments.clear();
      break;
    }
    Arguments.push_back(V);
  }

  Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
  SmallVector<Type *> ParamTys;
  for (unsigned I = 0; I != getNumOperands(); ++I)
    ParamTys.push_back(
        toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
  IntrinsicCostAttributes CostAttrs(
      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
      InstructionCost::getInvalid(), &Ctx.TLI);
  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}

StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
  return Intrinsic::getBaseName(VectorIntrinsicID);
}

bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
  // Vector predication intrinsics only demand the first lane of the last
  // operand (the EVL operand).
  return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
         Op == getOperand(getNumOperands() - 1);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INTRINSIC ";
  if (ResultTy->isVoidTy()) {
    O << "void ";
  } else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << getIntrinsicName() << "(";

  interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";
}
#endif

void VPHistogramRecipe::execute(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(getOperand(0));
  Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");

  State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
                                {VTy, IncAmt->getType()},
                                {Address, IncAmt, Mask});
}

InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
  VPValue *IncAmt = getOperand(1);
  Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
  VectorType *VTy = VectorType::get(IncTy, VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
  if (IncAmt->isLiveIn()) {
    ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());

    if (CI && CI->getZExtValue() == 1)
      MulCost = TTI::TCC_Free;
  }

  // Find the cost of the histogram operation itself.
  Type *PtrTy = VectorType::get(AddressTy, VF);
  Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
         Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-HISTOGRAM buckets: ";
  getOperand(0)->printAsOperand(O, SlotTracker);

  if (Opcode == Instruction::Sub)
    O << ", dec: ";
  else {
    assert(Opcode == Instruction::Add);
    O << ", inc: ";
  }
  getOperand(1)->printAsOperand(O, SlotTracker);

  if (VPValue *Mask = getMask()) {
    O << ", mask: ";
    Mask->printAsOperand(O, SlotTracker);
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  printFlags(O);
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPLane(0)) : nullptr;

  Value *Cond = InvarCond ? InvarCond : State.get(getCond());
  Value *Op0 = State.get(getOperand(1));
  Value *Op1 = State.get(getOperand(2));
  Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
  State.set(this, Sel);
  if (isa<FPMathOperator>(Sel))
    setFlags(cast<Instruction>(Sel));
  State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}

InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
  bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
  Type *ScalarTy = Ctx.Types.inferScalarType(this);
  Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

  VPValue *Op0, *Op1;
  using namespace llvm::VPlanPatternMatch;
  if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
      (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
       match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
    // select x, y, false --> x & y
    // select x, true, y --> x | y
    const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
    const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);

    SmallVector<const Value *> Operands;
    if (all_of(operands(),
               [](VPValue *Op) { return Op->getUnderlyingValue(); }))
      Operands.append(SI->op_begin(), SI->op_end());
    bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
    return Ctx.TTI.getArithmeticInstrCost(
        IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
        Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
  }
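  // Otherwise cost the select generically, using a vector condition type when
  // the condition is not scalar and the predicate of the underlying compare,
  // if there is one.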
  Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
  if (!ScalarCond)
    CondTy = VectorType::get(CondTy, VF);

  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
    Pred = Cmp->getPredicate();
  return Ctx.TTI.getCmpSelInstrCost(
      Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
      {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.isInBounds())
      O << " inbounds";
    else if (GEPFlags.hasNoUnsignedSignedWrap())
      O << " nusw";
    if (GEPFlags.hasNoUnsignedWrap())
      O << " nuw";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(State.get(VPOp));

    Value *V = Builder.CreateNAryOp(Opcode, Ops);

    if (auto *VecOp = dyn_cast<Instruction>(V))
      setFlags(VecOp);

    // Use this vector value for all users of the original instruction.
    State.set(this, V);
    State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  case Instruction::ExtractValue: {
    assert(getNumOperands() == 2 && "expected single level extractvalue");
    Value *Op = State.get(getOperand(0));
    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
    State.set(this, Extract);
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(getOperand(0));
    Value *Freeze = Builder.CreateFreeze(Op);
    State.set(this, Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(getOperand(0));
    Value *B = State.get(getOperand(1));
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      C = Builder.CreateFCmpFMF(
          getPredicate(), A, B,
          dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    } else {
      C = Builder.CreateICmp(getPredicate(), A, B);
    }
    State.set(this, C);
    State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}

InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  switch (Opcode) {
  case Instruction::FNeg: {
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
  }

  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // More complex computation, let the legacy cost-model handle this for now.
    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    VPValue *RHS = getOperand(1);
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, VectorTy, Ctx.CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        RHSInfo, Operands, CtxI, &Ctx.TLI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
                                          Ctx.CostKind);
  }
  case Instruction::ExtractValue: {
    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
                                             Ctx.CostKind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, VectorTy, nullptr, getPredicate(), Ctx.CostKind,
        {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        CtxI);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  Value *A = State.get(Op);
  Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
  State.set(this, Cast);
  State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  if (auto *CastOp = dyn_cast<Instruction>(Cast))
    setFlags(CastOp);
}

InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // TODO: In some cases, VPWidenCastRecipes are created but not considered in
  // the legacy cost model, including truncates/extends when evaluating a
  // reduction in a smaller type.
  if (!getUnderlyingValue())
    return 0;
  // Computes the CastContextHint from a recipe that may access memory.
  auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
    if (VF.isScalar())
      return TTI::CastContextHint::Normal;
    if (isa<VPInterleaveRecipe>(R))
      return TTI::CastContextHint::Interleave;
    if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
      return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                             : TTI::CastContextHint::Normal;
    const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
    if (WidenMemoryRecipe == nullptr)
      return TTI::CastContextHint::None;
    if (!WidenMemoryRecipe->isConsecutive())
      return TTI::CastContextHint::GatherScatter;
    if (WidenMemoryRecipe->isReverse())
      return TTI::CastContextHint::Reversed;
    if (WidenMemoryRecipe->isMasked())
      return TTI::CastContextHint::Masked;
    return TTI::CastContextHint::Normal;
  };

  VPValue *Operand = getOperand(0);
  TTI::CastContextHint CCH = TTI::CastContextHint::None;
  // For Trunc/FPTrunc, get the context from the only user.
  if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
      !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
    if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
      CCH = ComputeCCH(StoreRecipe);
  }
  // For Z/Sext, get the context from the operand.
  else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
           Opcode == Instruction::FPExt) {
    if (Operand->isLiveIn())
      CCH = TTI::CastContextHint::Normal;
    else if (Operand->getDefiningRecipe())
      CCH = ComputeCCH(Operand->getDefiningRecipe());
  }

  auto *SrcTy =
      cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
  auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
  // Arm TTI will use the underlying instruction to determine the cost.
  return Ctx.TTI.getCastInstrCost(
      Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
      dyn_cast_if_present<Instruction>(getUnderlyingValue()));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}

/// This function adds
///   (0 * Step, 1 * Step, 2 * Step, ...)
/// to each vector element of Val.
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  if (STy->isIntegerTy()) {
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
         "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
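  // As a rough illustration (assuming VF = 4, an integer IV with start %st
  // and step %s), the expansion below produces IR shaped like:
  //   %splat         = splat of %st
  //   %steps         = mul <4 x iN> <0, 1, 2, 3>, (splat of %s)
  //   %induction     = add %splat, %steps          ; initial vector IV
  //   %vec.ind.next  = add %vec.ind, splat of (VF * %s)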
  Value *Step = State.get(getStepValue(), VPLane(0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader.
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(SplatStart, Step,
                                      ID.getInductionOpcode(), State.VF,
                                      State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  Value *SplatVF;
  if (VPValue *SplatVFOperand = getSplatVFValue()) {
    // The recipe has been unrolled. In that case, fetch the splat value for
    // the induction increment.
    SplatVF = State.get(SplatVFOperand);
  } else {
    // Multiply the vectorization factor by the step using integer or
    // floating-point arithmetic as appropriate.
    Type *StepType = Step->getType();
    Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
    if (Step->getType()->isFloatingPointTy())
      RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
    else
      RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
    Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

    // Create a vector splat to use in the induction update.
    SplatVF = Builder.CreateVectorSplat(State.VF, Mul);
  }

  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(getDebugLoc());
  State.set(this, VecInd);

  Instruction *LastInduction = cast<Instruction>(
      Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
  LastInduction->setDebugLoc(getDebugLoc());

  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-INDUCTION ";
  printOperands(O, SlotTracker);

  if (auto *TI = getTruncInst())
    O << " (truncated to " << *TI->getType() << ")";
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPLane(0));
  Value *Step = State.get(getStepValue(), VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);

  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  Value *StartIdx0;
  if (getUnrollPart(*this) == 0)
    StartIdx0 = ConstantInt::get(IntStepTy, 0);
  else {
    StartIdx0 = State.get(getOperand(2), true);
    if (getUnrollPart(*this) != 1) {
      StartIdx0 =
          Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
                                                        getUnrollPart(*this)));
    }
    StartIdx0 = Builder.CreateTrunc(StartIdx0, IntStepTy);
  }

  if (!FirstLaneOnly && State.VF.isScalable()) {
    auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
    auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
    if (BaseIVTy->isFloatingPointTy())
      InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
    auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
    auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
    State.set(this, Add);
    // It's useful to record the lane values too for the known minimum number
    // of elements so we do those below.
    // This improves the code quality when trying to extract the first
    // element, for example.
  }

  if (BaseIVTy->isFloatingPointTy())
    StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    Value *StartIdx = Builder.CreateBinOp(
        AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
    // The step returned by `createStepForVF` is a runtime-evaluated value
    // when VF is scalable. Otherwise, it should be folded into a Constant.
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable");
    auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
    auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
    State.set(this, Add, VPLane(Lane));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPLane(0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "",
                                getGEPNoWrapFlags());
    Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
    State.set(this, Splat);
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers unless VF is scalar.
    // The pointer operand of the new GEP. If it's loop-invariant, we
    // won't broadcast it.
    auto *Ptr = isPointerLoopInvariant() ? State.get(getOperand(0), VPLane(0))
                                         : State.get(getOperand(0));

    // Collect all the indices for the new GEP. If any index is
    // loop-invariant, we won't broadcast it.
    SmallVector<Value *, 4> Indices;
    for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
      VPValue *Operand = getOperand(I);
      if (isIndexLoopInvariant(I - 1))
        Indices.push_back(State.get(Operand, VPLane(0)));
      else
        Indices.push_back(State.get(Operand));
    }

    // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
    // but it should be a vector, otherwise.
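    // As an illustration (assuming VF = 4 and a loop-varying index vector
    // %iv.vec), the result is a vector-of-pointers GEP such as:
    //   %vgep = getelementptr i32, ptr %base, <4 x i64> %iv.vec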
    auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                           Indices, "", getGEPNoWrapFlags());
    assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
           "NewGEP is not a pointer vector");
    State.set(this, NewGEP);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
                           unsigned CurrentPart, IRBuilderBase &Builder) {
  // Use i32 for the gep index type when the value is constant,
  // or query DataLayout for a more suitable index type otherwise.
  const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
  return IsScalable && (IsReverse || CurrentPart > 0)
             ? DL.getIndexType(Builder.getPtrTy(0))
             : Builder.getInt32Ty();
}

void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  unsigned CurrentPart = getUnrollPart(*this);
  Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
                                CurrentPart, Builder);

  // The wide store needs to start at the last vector element.
  Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
  if (IndexTy != RunTimeVF->getType())
    RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
  // NumElt = -CurrentPart * RunTimeVF
  Value *NumElt = Builder.CreateMul(
      ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
  // LastLane = 1 - RunTimeVF
  Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
  Value *Ptr = State.get(getOperand(0), VPLane(0));
  Value *ResultPtr =
      Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
  ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
                                getGEPNoWrapFlags());

  State.set(this, ResultPtr, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-end-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  unsigned CurrentPart = getUnrollPart(*this);
  Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
                                CurrentPart, Builder);
  Value *Ptr = State.get(getOperand(0), VPLane(0));

  Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
  Value *ResultPtr =
      Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());

  State.set(this, ResultPtr, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer ";
  printOperands(O, SlotTracker);
}
#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  assert(isNormalized() && "Expected blend to be normalized!");
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree.
  // There may be duplications since this is a simple recursive scan, but
  // future optimizations will clean it up.
  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // and are essentially undef are taken from In0.
  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
  Value *Result = nullptr;
  for (unsigned In = 0; In < NumIncoming; ++In) {
    // We might have single edge PHIs (blocks) - use an identity
    // 'select' for the first PHI operand.
    Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
    if (In == 0)
      Result = In0; // Initialize with the first incoming value.
    else {
      // Select between the current value and the previous incoming edge
      // based on the incoming mask.
      Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
      Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
    }
  }
  State.set(this, Result, OnlyFirstLaneUsed);
}

InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // Handle cases where only the first lane is used the same way as the legacy
  // cost model.
  if (vputils::onlyFirstLaneUsed(this))
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

  Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
  Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
  return (getNumIncomingValues() - 1) *
         Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
                                    CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
  RecurKind Kind = getRecurrenceKind();
  assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
         "In-loop AnyOf reductions aren't currently supported");
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());
  Value *NewVecOp = State.get(getVecOp());
  if (VPValue *Cond = getCondOp()) {
    Value *NewCond = State.get(Cond, State.VF.isScalar());
    VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
    Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
    Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
    if (State.VF.isVector())
      Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);

    Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
    NewVecOp = Select;
  }
  Value *NewRed;
  Value *NextInChain;
  if (IsOrdered) {
    if (State.VF.isVector())
      NewRed =
          createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
    else
      NewRed = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
          PrevInChain, NewVecOp);
    PrevInChain = NewRed;
    NextInChain = NewRed;
  } else {
    PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
    NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
          NewRed, PrevInChain);
  }
  State.set(this, NextInChain, /*IsScalar*/ true);
}

void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(getVecOp());
  Value *EVL = State.get(getEVL(), VPLane(0));

  VectorBuilder VBuilder(Builder);
  VBuilder.setEVL(EVL);
  Value *Mask;
  // TODO: move the all-true mask generation into VectorBuilder.
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(CondOp);
  else
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  VBuilder.setMask(Mask);

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(VBuilder, Kind, VecOp, Prev);
  } else {
    NewRed = createSimpleReduction(VBuilder, VecOp, Kind);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
    else
      NewRed = Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
          NewRed, Prev);
  }
  State.set(this, NewRed, /*IsScalar*/ true);
}

InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  RecurKind RdxKind = getRecurrenceKind();
  Type *ElementTy = Ctx.Types.inferScalarType(this);
  auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
  FastMathFlags FMFs = getFastMathFlags();

  // TODO: Support any-of and in-loop reductions.
  assert(
      (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "Any-of reduction not implemented in VPlan-based cost model currently.");
  assert(
      (!cast<VPReductionPHIRecipe>(getOperand(0))->isInLoop() ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "In-loop reduction not implemented in VPlan-based cost model currently.");

  // Cost = Reduction cost + BinOp cost
  InstructionCost Cost =
      Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
    return Cost +
           Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
  }

  return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
                                                   Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}

void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " vp.reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a
  // vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  Instruction *UI = cast<Instruction>(getUnderlyingValue());
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform, avoid computing their cost multiple times for now.
  Ctx.SkipCostComputation.insert(UI);
  return Ctx.getLegacyCost(UI, VF);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Branch on Mask works only on single instance.");

  VPValue *BlockInMask = getOperand(0);
  Value *ConditionBit = State.get(BlockInMask, *State.Lane);

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional "
         "branch.");
  auto *CondBr =
      State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
  CondBr->setSuccessor(0, nullptr);
  CurrentTerminator->eraseFromParent();
}

InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  // The legacy cost model doesn't assign costs to branches for individual
  // replicate regions. Match the current behavior in the VPlan cost model for
  // now.
  return 0;
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Lane));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
  if (State.hasVectorValue(getOperand(0))) {
    Value *VectorValue = State.get(getOperand(0));
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this))
      State.reset(this, VPhi);
    else
      State.set(this, VPhi);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi);
  } else {
    if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
      return;

    Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Lane))
      State.reset(this, Phi, *State.Lane);
    else
      State.set(this, Phi, *State.Lane);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Lane);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
#endif

InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  const Align Alignment =
      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
  unsigned AS =
      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));

  if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, ARM will use the underlying IR to calculate gather/scatter
    // instruction cost.
    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
    assert(!Reverse &&
           "Inconsecutive memory access should not have the order.");
    return Ctx.TTI.getAddressComputationCost(Ty) +
           Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
                                          IsMasked, Alignment, Ctx.CostKind,
                                          &Ingredient);
  }

  InstructionCost Cost = 0;
  if (IsMasked) {
    Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty,
                                          Alignment, AS, Ctx.CostKind);
  } else {
    TTI::OperandValueInfo OpInfo =
        Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
    Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
                                    Ctx.CostKind, OpInfo, &Ingredient);
  }
  if (!Reverse)
    return Cost;

  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
                                       0);
}

void VPWidenLoadRecipe::execute(VPTransformState &State) {
  auto *LI = cast<LoadInst>(&Ingredient);

  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(Mask, "reverse");
  }

  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
  Value *NewLI;
  if (CreateGather) {
    NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                       "wide.masked.gather");
  } else if (Mask) {
    NewLI =
        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
                                 PoisonValue::get(DataTy), "wide.masked.load");
  } else {
    NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
  }
  // Add metadata to the load, but setVectorValue to the reverse shuffle.
  State.addMetadata(NewLI, LI);
  if (Reverse)
    NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
  State.set(this, NewLI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
#endif

/// Use all-true mask for reverse rather than actual mask, as it avoids a
/// dependence w/o affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
                                     Value *EVL, const Twine &Name) {
  VectorType *ValTy = cast<VectorType>(Operand->getType());
  Value *AllTrueMask =
      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
}

void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
  auto *LI = cast<LoadInst>(&Ingredient);
  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  CallInst *NewLI;
  Value *EVL = State.get(getEVL(), VPLane(0));
  Value *Addr = State.get(getAddr(), !CreateGather);
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  }

  if (CreateGather) {
    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
                                    {Addr, Mask, EVL}, nullptr,
                                    "wide.masked.gather");
  } else {
    VectorBuilder VBuilder(Builder);
    VBuilder.setEVL(EVL).setMask(Mask);
    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
        Instruction::Load, DataTy, Addr, "vp.op.load"));
  }
  NewLI->addParamAttr(
      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
  State.addMetadata(NewLI, LI);
  Instruction *Res = NewLI;
  if (isReverse())
    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
  State.set(this, Res);
}

InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, while the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  const Align Alignment =
      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
  unsigned AS =
      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
      Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
  if (!Reverse)
    return Cost;

  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
                                       0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenStoreRecipe::execute(VPTransformState &State) {
  auto *SI = cast<StoreInst>(&Ingredient);

  VPValue *StoredVPValue = getStoredValue();
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(&Ingredient);

  auto &Builder = State.Builder;

  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(Mask, "reverse");
  }

  Value *StoredVal = State.get(StoredVPValue);
  if (isReverse()) {
    // If we store to reverse consecutive memory locations, then we need
    // to reverse the order of elements in the stored value.
    StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
    // We don't want to update the value in the map as it might be used in
    // another expression. So don't call resetVectorValue(StoredVal).
  }
  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
  State.addMetadata(NewSI, SI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  auto *SI = cast<StoreInst>(&Ingredient);

  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(&Ingredient);

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(StoredValue);
  Value *EVL = State.get(getEVL(), VPLane(0));
  if (isReverse())
    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  }
  Value *Addr = State.get(getAddr(), !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_scatter,
                                    {StoredVal, Addr, Mask, EVL});
  } else {
    VectorBuilder VBuilder(Builder);
    VBuilder.setEVL(EVL).setMask(Mask);
    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
        Instruction::Store, Type::getVoidTy(EVL->getContext()),
        {StoredVal, Addr}));
  }
  NewSI->addParamAttr(
      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
  State.addMetadata(NewSI, SI);
}

InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, while the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
  const Align Alignment =
      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
  unsigned AS =
      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
      Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
  if (!Reverse)
    return Cost;

  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
                                       0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    assert(isPowerOf2_32(Factor) &&
           "Unsupported interleave factor for scalable vectors, must be "
           "power of 2");
    SmallVector<Value *> InterleavingValues(Vals);
    // When interleaving, the number of values will be shrunk until we have
    // the single final interleaved value.
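    // Illustrative example (assuming Factor == 4 with inputs A, B, C, D):
    //   round 1: A' = interleave2(A, C), B' = interleave2(B, D)
    //   round 2: result = interleave2(A', B')
    // which yields elements in the order a0, b0, c0, d0, a1, b1, ...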
    auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
    for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
      InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
      for (unsigned I = 0; I < Midpoint; ++I)
        InterleavingValues[I] = Builder.CreateIntrinsic(
            InterleaveTy, Intrinsic::vector_interleave2,
            {InterleavingValues[I], InterleavingValues[Midpoint + I]},
            /*FMFSource=*/nullptr, Name);
    }
    return InterleavingValues[0];
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//                      <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
//                      ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec                 ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  const InterleaveGroup<Instruction> *Group = IG;
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);

  // TODO: extend the masked interleaved-group support to reversed access.
  VPValue *BlockInMask = getMask();
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Addr, VPLane(0));
  if (auto *I = dyn_cast<Instruction>(ResAddr))
    State.setDebugLocFrom(I->getDebugLoc());

  // If the group is reverse, adjust the index to refer to the last vector
  // lane instead of the first. We adjust the index from the first vector
  // lane, rather than directly getting the pointer for lane VF - 1, because
  // the pointer operand of the interleaved access is supposed to be uniform.
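  // For instance (illustrative), with VF = 4 and Factor = 3 the adjustment
  // below computes Index = -((4 - 1) * 3) = -9, i.e. the base address is
  // moved back by nine elements so the wide access covers the last lane.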
  if (Group->isReverse()) {
    Value *RuntimeVF =
        getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
    Value *Index =
        State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
    Index = State.Builder.CreateMul(
        Index, State.Builder.getInt32(Group->getFactor()));
    Index = State.Builder.CreateNeg(Index);

    bool InBounds = false;
    if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
      InBounds = Gep->isInBounds();
    ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
  }

  State.setDebugLocFrom(getDebugLoc());
  Value *PoisonVec = PoisonValue::get(VecTy);

  auto CreateGroupMask =
      [&BlockInMask, &State, &InterleaveFactor](Value *MaskForGaps) -> Value * {
    if (State.VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(isPowerOf2_32(InterleaveFactor) &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *ResBlockInMask = State.get(BlockInMask);
      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
      return interleaveVectors(State.Builder, Ops, "interleaved.mask");
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *ResBlockInMask = State.get(BlockInMask);
    Value *ShuffledMask = State.Builder.CreateShuffleVector(
        ResBlockInMask,
        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
        "interleaved.mask");
    return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                   ShuffledMask, MaskForGaps)
                       : ShuffledMask;
  };

  const DataLayout &DL = Instr->getDataLayout();
  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    Value *MaskForGaps = nullptr;
    if (NeedsMaskForGaps) {
      MaskForGaps = createBitMaskForGaps(State.Builder,
                                         State.VF.getKnownMinValue(), *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    Instruction *NewLoad;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(MaskForGaps);
      NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
                                               Group->getAlign(), GroupMask,
                                               PoisonVec, "wide.masked.vec");
    } else
      NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
                                                Group->getAlign(), "wide.vec");
    Group->addMetadata(NewLoad);

    ArrayRef<VPValue *> VPDefs = definedValues();
    if (VecTy->isScalableTy()) {
      assert(isPowerOf2_32(InterleaveFactor) &&
             "Unsupported deinterleave factor for scalable vectors");

      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
      // so must use intrinsics to deinterleave.
      SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
      DeinterleavedValues[0] = NewLoad;
      // For the case of InterleaveFactor > 2, we will have to do recursive
      // deinterleaving, because the currently available deinterleave
      // intrinsic supports only a Factor of 2; otherwise it will bail out
      // after the first iteration.
      // When deinterleaving, the number of values will double until we
      // have "InterleaveFactor".
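      // Sketch (illustrative, InterleaveFactor == 4): starting from the wide
      // load W, round one produces {evens, odds} = deinterleave2(W); round
      // two produces {A, C} = deinterleave2(evens) and {B, D} =
      // deinterleave2(odds), i.e. the members in index order A, B, C, D.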
      for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
           NumVectors *= 2) {
        // Deinterleave the elements within the vector.
        SmallVector<Value *> TempDeinterleavedValues(NumVectors);
        for (unsigned I = 0; I < NumVectors; ++I) {
          auto *DiTy = DeinterleavedValues[I]->getType();
          TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
              Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
              /*FMFSource=*/nullptr, "strided.vec");
        }
        // Extract the deinterleaved values:
        for (unsigned I = 0; I < 2; ++I)
          for (unsigned J = 0; J < NumVectors; ++J)
            DeinterleavedValues[NumVectors * I + J] =
                State.Builder.CreateExtractValue(TempDeinterleavedValues[J],
                                                 I);
      }

#ifndef NDEBUG
      for (Value *Val : DeinterleavedValues)
        assert(Val && "NULL Deinterleaved Value");
#endif
      for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
        Instruction *Member = Group->getMember(I);
        Value *StridedVec = DeinterleavedValues[I];
        if (!Member) {
          // This value is not needed as it's not used.
          cast<Instruction>(StridedVec)->eraseFromParent();
          continue;
        }
        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
          StridedVec =
              createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec =
              State.Builder.CreateVectorReverse(StridedVec, "reverse");

        State.set(VPDefs[J], StridedVec);
        ++J;
      }
      return;
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
      Value *StridedVec = State.Builder.CreateShuffleVector(
          NewLoad, StrideMask, "strided.vec");

      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
        StridedVec =
            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");

      State.set(VPDefs[J], StridedVec);
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || !State.VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Fail to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(i);

    // Skip the gaps in the group.
    if (!Member) {
      Value *Undef = PoisonValue::get(SubVT);
      StoredVecs.push_back(Undef);
      continue;
    }

    Value *StoredVec = State.get(StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

    // If this member has different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec =
      interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        IVec, ResAddr, Group->getAlign(), GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());

  Group->addMetadata(NewStoreInstr);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << "  store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << "  ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Instruction *InsertPos = getInsertPos();
  // Find the VPValue index of the interleave group. We need to skip gaps.
  unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
    if (auto *Member = IG->getMember(Idx)) {
      if (Member == InsertPos)
        break;
      InsertPosIdx++;
    }
  Type *ValTy = Ctx.Types.inferScalarType(
      getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
                                : getStoredValues()[InsertPosIdx]);
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(InsertPos);

  unsigned InterleaveFactor = IG->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (IG->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
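  // Illustrative example (an assumption about shape, not a target-measured
  // cost): a reversed factor-3 load group with <4 x i32> members is costed as
  // one interleaved memory op on <12 x i32> plus one reverse shuffle per
  // member.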
  InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
      InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
      IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);

  if (!IG->isReverse())
    return Cost;

  return Cost + IG->getNumMembers() *
                    Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                           VectorTy, std::nullopt,
                                           Ctx.CostKind, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
  assert(getInductionDescriptor().getKind() ==
             InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() &&
         "Unexpected type.");
  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
         "Recipe should have been replaced");

  unsigned CurrentPart = getUnrollPart(*this);

  // Build a pointer phi
  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
  Type *ScStValueType = ScalarStartValue->getType();

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  PHINode *NewPointerPhi = nullptr;
  if (CurrentPart == 0) {
    IRBuilder<>::InsertPointGuard Guard(State.Builder);
    if (State.Builder.GetInsertPoint() !=
        State.Builder.GetInsertBlock()->getFirstNonPHIIt())
      State.Builder.SetInsertPoint(
          State.Builder.GetInsertBlock()->getFirstNonPHIIt());
    NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi");
    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
    NewPointerPhi->setDebugLoc(getDebugLoc());
  } else {
    // The recipe has been unrolled. In that case, fetch the single pointer
    // phi shared among all unrolled parts of the recipe.
    auto *GEP =
        cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
  }

  // A pointer induction, performed by using a gep
  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
  Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
  Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  if (CurrentPart == 0) {
    // The recipe represents the first part of the pointer induction. Create
    // the GEP to increment the phi across all unrolled parts.
    Value *NumUnrolledElems =
        State.get(&getParent()->getPlan()->getVFxUF(), true);
    Value *InductionGEP = GetElementPtrInst::Create(
        State.Builder.getInt8Ty(), NewPointerPhi,
        State.Builder.CreateMul(
            ScalarStepValue,
            State.Builder.CreateTrunc(NumUnrolledElems, PhiType)),
        "ptr.ind", InductionLoc);

    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
  }

  // Create actual address geps that use the pointer phi as base and a
  // vectorized version of the step value (<step*0, step*1, ...>) as offset.
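  // Shaped roughly like the following (illustrative, assuming VF = 4 and an
  // i64 step %s):
  //   %offsets    = mul <4 x i64> (splat(Part * VF) + <0, 1, 2, 3>), splat(%s)
  //   %vector.gep = getelementptr i8, ptr %pointer.phi, <4 x i64> %offsets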
  Type *VecPhiType = VectorType::get(PhiType, State.VF);
  Value *StartOffsetScalar = State.Builder.CreateMul(
      RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
  Value *StartOffset =
      State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
  // Create a vector of consecutive numbers from zero to VF.
  StartOffset = State.Builder.CreateAdd(
      StartOffset, State.Builder.CreateStepVector(VecPhiType));

  assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
         "scalar step must be the same across all parts");
  Value *GEP = State.Builder.CreateGEP(
      State.Builder.getInt8Ty(), NewPointerPhi,
      State.Builder.CreateMul(StartOffset,
                              State.Builder.CreateVectorSplat(
                                  State.VF, ScalarStepValue)),
      "vector.gep");
  State.set(this, GEP);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  assert((getNumOperands() == 2 || getNumOperands() == 4) &&
         "unexpected number of operands");
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
  if (getNumOperands() == 4) {
    O << ", ";
    getOperand(2)->printAsOperand(O, SlotTracker);
    O << ", ";
    getOperand(3)->printAsOperand(O, SlotTracker);
  }
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "cannot be used in per-lane");
  const DataLayout &DL = SE.getDataLayout();
  SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true);

  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  State.set(this, Res, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV,
                                                  "broadcast");
  Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
  if (VF.isVector()) {
    VStep = Builder.CreateVectorSplat(VF, VStep);
    VStep =
        Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
  }
  Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
  State.set(this, CanonicalVectorIV);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(VectorInit, VectorPH);
  State.set(this, Phi);
}

InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  if (VF.isScalar())
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

  if (VF.isScalable() && VF.getKnownMinValue() == 1)
    return InstructionCost::getInvalid();

  return 0;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
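// Sketch of the start values created by the recipe below for a widened
// integer add reduction with a fixed VF of 4 (illustrative): unroll part 0
// starts from
//   insertelement <4 x i32> zeroinitializer, i32 %rdx.start, i32 0
// i.e. the identity splat with the scalar start value in lane 0, while any
// further unrolled parts start from the plain identity splat.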
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // If this phi is fed by a scaled reduction then it should output a vector
  // with fewer elements than the VF.
  ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor);

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy =
      ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
  State.set(this, Phi, IsInLoop);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  unsigned CurrentPart = getUnrollPart(*this);

  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden = State.get(StartVPV);
    }
  } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
    // [I|F]FindLastIV will use a sentinel value to initialize the reduction
    // phi or the resume value from the main vector loop when vectorizing the
    // epilogue loop. In the exit block, ComputeReductionResult will generate
    // checks to verify if the reduction result is the sentinel value. If the
    // result is the sentinel value, it will be corrected back to the start
    // value.
    // TODO: The sentinel value is not always necessary. When the start value
    // is a constant, and smaller than the start value of the induction
    // variable, the start value can be directly used to initialize the
    // reduction phi.
    Iden = StartV;
    if (!ScalarPHI) {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden = Builder.CreateVectorSplat(State.VF, Iden);
    }
  } else {
    Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                       RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      if (CurrentPart == 0) {
        // Create start and identity vector values for the reduction in the
        // preheader.
        // TODO: Introduce recipes in VPlan preheader to create initial values.
        Iden = Builder.CreateVectorSplat(VF, Iden);
        IRBuilderBase::InsertPointGuard IPBuilder(Builder);
        Builder.SetInsertPoint(VectorPH->getTerminator());
        Constant *Zero = Builder.getInt32(0);
        StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
      } else {
        Iden = Builder.CreateVectorSplat(VF, Iden);
      }
    }
  }

  Phi = cast<PHINode>(State.get(this, IsInLoop));
  Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
  Phi->addIncoming(StartVal, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
  if (VFScaleFactor != 1)
    O << " (VF scaled by 1/" << VFScaleFactor << ")";
}
#endif

VPBasicBlock *VPWidenPHIRecipe::getIncomingBlock(unsigned I) {
  VPBasicBlock *Parent = getParent();
  VPBlockBase *Pred = nullptr;
  if (Parent->getNumPredecessors() > 0) {
    Pred = Parent->getPredecessors()[I];
  } else {
    auto *Region = Parent->getParent();
    assert(Region && !Region->isReplicator() && Region->getEntry() == Parent &&
           "must be in the entry block of a non-replicate region");
    assert(
        I < 2 && getNumOperands() == 2 &&
        "when placed in an entry block, only 2 incoming blocks are available");

    // I == 0 selects the predecessor of the region, I == 1 selects the region
    // itself whose exiting block feeds the phi across the backedge.
    Pred = I == 0 ? Region->getSinglePredecessor() : Region;
  }
  return Pred->getExitingBasicBlock();
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");
  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
  State.set(this, VecPhi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
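// Note: as with the other header phis, only the preheader incoming value is
// added here; the backedge value is hooked up once the latch block has been
// created. For a fixed VF of 4 the phi below is roughly (illustrative;
// %entry.mask is a placeholder name):
//   %active.lane.mask = phi <4 x i1> [ %entry.mask, %vector.ph ], ...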
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif