//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> SplitAlternateInstructions(
    "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
    cl::desc("Improve the code quality by splitting alternate instructions"));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register "
                                    "size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000),
                             cl::Hidden,
                             cl::desc("Limit the size of the SLP scheduling "
                                      "region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand reordering this one is used less frequently,
// so the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit on the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for Cmp - the type of the compare
/// operands; and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}

/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
  return FixedVectorType::get(ScalarTy->getScalarType(),
                              VF * getNumElements(ScalarTy));
}
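
// Illustrative note (not from the original source): for a scalar element
// type, getWidenedType(i32, 4) yields <4 x i32>. Under REVEC, ScalarTy may
// itself be a fixed vector, e.g. getWidenedType(<2 x i16>, 4) yields
// <8 x i16>, i.e. 4 copies of the 2-element payload laid out flat.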
/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  // Find the number of elements, which forms full vectors.
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
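
// Worked example (illustrative, assuming getNumberOfParts returns 2 for
// <6 x i32>, e.g. on a target with 128-bit registers): for Ty == i32 and
// Sz == 6, getFullVectorNumberOfElements returns bit_ceil(ceil(6/2)) * 2 == 8,
// while getFloorFullVectorNumberOfElements returns (6 / 4) * 4 == 4.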
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But an element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector can
  // use directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
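
// Example (illustrative): with VecTyNumElements == 2, the scalar mask
// <1, poison> expands to <2, 3, poison, poison>: each scalar index I becomes
// the run [I * 2, I * 2 + 1] and poison lanes stay poison.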
/// \returns the number of groups of shufflevector.
/// A group has the following features:
/// 1. All of the values in a group are shufflevectors.
/// 2. The mask of each shufflevector is an extract-subvector mask
///    (isExtractSubvectorMask).
/// 3. Together, the masks of the shufflevectors in a group use all of the
///    elements of the source.
/// e.g., this is 1 group (%0):
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// these are 2 groups (%3 and %4):
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// these are 0 groups (the source elements are not all used):
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of the vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// Returns the power-of-2 number of elements in a single register (part),
/// given the total number of elements \p Size and the number of registers
/// (parts) \p NumParts.
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
  return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
}

/// Returns the correct remaining number of elements, considering the total
/// amount \p Size, the (power-of-2) number of elements in a single register
/// \p PartNumElems and the current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
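
// Example (illustrative): for Size == 10 and NumParts == 4, getPartNumElems
// returns bit_ceil(ceil(10/4)) == 4, and getNumElems then yields 4, 4 and 2
// elements for parts 0, 1 and 2 respectively.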
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  Instruction *I0 = cast<Instruction>(*It);
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (Value *V : iterator_range(It, VL.end())) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
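
// Note (illustrative): 'sub' is treated as commutative here only when every
// user is an 'icmp eq/ne ..., 0' or a matching 'abs' call, because for those
// users swapping the operands merely negates the result, which they cannot
// observe.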
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns the inserting or extracting index of an InsertElement,
/// ExtractElement or InsertValue instruction, using Offset as the base offset
/// for the index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;

  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
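
// Example (illustrative): for 'insertvalue {[2 x i32], [2 x i32]} %agg,
// i32 %v, 1, 0' the aggregate is flattened row-major, so the returned index
// is 1 * 2 + 0 == 2.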
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused
               ///< since they're already marked as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask, either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
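
// Example (illustrative): for VF == 4 and Mask == <0, 5, poison, 2>,
// UseMask::FirstArg resets bits 0 and 2 (the lanes consumed from the first
// vector), leaving bits 1 and 3 set; UseMask::SecondArg resets only bit 1
// (mask element 5 maps to lane 5 - VF == 1 of the second vector).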
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3,
///                                                         i32 5, i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector,
  // otherwise we have a permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
namespace {

/// Main data required for vectorization of instructions.
class InstructionsState {
  /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
  /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
  /// (i.e., AltOp is not equal to MainOp; this can be checked using
  /// isAltShuffle).
  /// A rare exception is TrySplitNode, where the InstructionsState is derived
  /// from getMainAltOpsNoStateVL.
  /// For those InstructionsState that use alternate instructions, the
  /// resulting vectorized output ultimately comes from a shufflevector. For
  /// example, given a vector list (VL):
  /// VL[0] = add i32 a, e
  /// VL[1] = sub i32 b, f
  /// VL[2] = add i32 c, g
  /// VL[3] = sub i32 d, h
  /// The vectorized result would be:
  /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// result = shufflevector <4 x i32> intermediated_0,
  ///                        <4 x i32> intermediated_1,
  ///                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  /// Since shufflevector is used in the final result, when calculating the
  /// cost (getEntryCost), we must account for the usage of shufflevector in
  /// GetVectorCost.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem
  /// operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(MulDiv, getOpcode()) &&
           is_contained(MulDiv, getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(AddSub, getOpcode()) &&
           is_contained(AddSub, getAltOpcode());
  }

  /// Checks if main/alt instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has a non-null MainOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};

} // end anonymous namespace

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// An example of an unsupported opcode is SDIV, which can potentially cause UB
/// if the "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI) ||
         getSameOpcode({BaseOp1, Op1}, TLI);
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns an analysis of the instructions in \p VL described in
/// InstructionsState: the opcode with which we suppose the whole list could be
/// vectorized, even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Instruction *MainOp = cast<Instruction>(*It);
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;

  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // The total number of predicates is > 2, but only 2 remain when swapped
    // predicates are treated as compatible; in that case consider the
    // swappable predicates as compatible opcodes, not as alternate ones.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  // Check MainOp too to be sure that it matches the requirements for the
  // instructions.
  for (Value *V : iterator_range(It, VL.end())) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    // TODO: do some smart analysis of the CallInsts to exclude divide-like
    // intrinsics/functions only.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltOp = I;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = MainOp->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltOp = I;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltOp = I;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  return InstructionsState(MainOp, AltOp);
}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL.front()->getType();
  return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
}

/// \returns True if an in-tree use also needs extraction. This refers to a
/// possible scalar operand in a vectorized instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI,
                                        const TargetTransformInfo *TTI) {
  if (!UserInst)
    return false;
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
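
// Example (illustrative): composing Mask == <3, 2, 1, 0> with SubMask ==
// <1, 0, 3, 2> yields <2, 3, 0, 1>: each SubMask entry selects a slot of the
// existing Mask (NewMask[I] = Mask[SubMask[I]]).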
/// Order may have elements assigned a special value (size) which is out of
/// bounds. Such indices only appear in places which correspond to undef values
/// (see canReuseExtract for details) and are used to prevent undef values from
/// having an effect on operand ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices to the undef value positions.
/// As an example, below Order has two undef positions and they have assigned
/// values 3 and 7 respectively:
/// before:  6 9 5 4 9 2 1 0
/// after:   6 3 5 4 7 2 1 0
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
/// Opcode1.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
                                      unsigned Opcode0, unsigned Opcode1) {
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  return OpcodeMask;
}
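
// Example (illustrative): for VL == {add, sub, add, sub} with a plain scalar
// type (one element per lane), Opcode0 == Add and Opcode1 == Sub produce the
// bitset with bits 1 and 3 set, i.e. only the 'sub' lanes are marked.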
/// Replicates the given \p Val \p VF times.
static SmallVector<Constant *> replicateMask(ArrayRef<Constant *> Val,
                                             unsigned VF) {
  assert(none_of(Val,
                 [](Constant *C) { return C->getType()->isVectorTy(); }) &&
         "Expected scalar constants.");
  SmallVector<Constant *> NewVal(Val.size() * VF);
  for (auto [I, V] : enumerate(Val))
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  return NewVal;
}
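
// Example (illustrative): replicateMask({C0, C1}, 3) returns
// {C0, C0, C0, C1, C1, C1}, i.e. each constant is repeated VF times in place.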
namespace llvm {

static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
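
// Example (illustrative): Indices == {2, 0, 1} produces Mask == <1, 2, 0>,
// since Mask[Indices[I]] = I inverts the permutation.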
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all of its operands are either not
/// instructions, or are phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all of its users are phi nodes or
/// instructions from different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be
/// scheduled in the current basic block.
static bool doesNotNeedToBeScheduled(Value *V) {
  return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
}

/// Checks if the specified array of instructions does not require scheduling.
/// This is the case if either all instructions have operands that do not
/// require scheduling, or all their users do not require scheduling since
/// they are phis or live in other basic blocks.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

/// Returns true if the widened type of \p Ty elements with size \p Sz
/// represents a full vector type, i.e. adding an extra element results in
/// extra parts upon type legalization.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {
  if (Sz <= 1)
    return false;
  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
    return false;
  if (has_single_bit(Sz))
    return true;
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}
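
// E.g. (illustrative, assuming i32 elements with 4 elements per register
// part): Sz == 8 is a power of two -> true; Sz == 12 splits into 3 whole
// parts of 4 -> true; Sz == 6 splits into 2 parts of 3, and 3 is not a power
// of two -> false.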
/// Returns the number of parts the type \p VecTy will be split into at the
/// codegen phase. If the type is going to be scalarized or does not use whole
/// registers, returns 1.
static unsigned
getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
                 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
    return 1;
  unsigned Sz = getNumElements(VecTy);
  if (NumParts >= Sz || Sz % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
    return 1;
  return NumParts;
}
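
// E.g. (illustrative, same 4-elements-per-part assumption): <12 x i32> is
// split into 3 whole parts, so 3 is returned; <6 x i32> fails the
// whole-register check and 1 is returned instead.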
|
|
|
|
namespace slpvectorizer {
|
|
|
|
/// Bottom Up SLP Vectorizer.
|
|
class BoUpSLP {
|
|
struct TreeEntry;
|
|
class ScheduleEntity;
|
|
class ScheduleData;
|
|
class ScheduleBundle;
|
|
class ShuffleCostEstimator;
|
|
class ShuffleInstructionBuilder;
|
|
|
|
public:
|
|
/// Tracks the state we can represent the loads in the given sequence.
|
|
enum class LoadsState {
|
|
Gather,
|
|
Vectorize,
|
|
ScatterVectorize,
|
|
StridedVectorize,
|
|
CompressVectorize
|
|
};
|
|
|
|
using ValueList = SmallVector<Value *, 8>;
|
|
using InstrList = SmallVector<Instruction *, 16>;
|
|
using ValueSet = SmallPtrSet<Value *, 16>;
|
|
using StoreList = SmallVector<StoreInst *, 8>;
|
|
using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
|
|
using OrdersType = SmallVector<unsigned, 4>;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    // data type rather than just register size. For example, x86 AVX has
    // 256-bit registers, but it does not support integer operations
    // at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this set can be replaced by the
  /// generated extractvalue instructions.
  Value *vectorizeTree(
      const ExtraValueToDebugLocsMap &ExternallyUsedValues,
      Instruction *ReductionRoot = nullptr,
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  InstructionCost getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
                              InstructionCost ReductionCost = TTI::TCC_Free);

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 const SmallDenseSet<Value *> &UserIgnoreLst);

  /// Construct a vectorizable tree that starts at \p Roots.
  void buildTree(ArrayRef<Value *> Roots);

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Returns the type/is-signed info for the root node in the graph without
  /// casting.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
                                             It->second.first),
                            It->second.second);
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }

  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns its signedness, if so.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }

  /// Returns the reduction type after min-bitwidth analysis.
  FixedVectorType *getReductionType() const {
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      return getWidenedType(
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    return getWidenedType(
        IntegerType::get(
            VectorizableTree.front()->Scalars.front()->getContext(),
            ReductionBitWidth),
        VectorizableTree.front()->getVectorFactor());
  }

  /// Builds external uses of the vectorized scalars, i.e. the list of
  /// vectorized scalars to be extracted, their lanes and their scalar users.
  /// \p ExternallyUsedValues contains an additional list of external uses to
  /// handle vectorization of reductions.
  void
  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});

  /// Transforms graph nodes to target specific representations, if profitable.
  void transformNodes();

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    ScalarsInSplitNodes.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    BaseGraphSize = 1;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }

  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Returns the base graph size, before any transformations.
  unsigned getCanonicalGraphSize() const { return BaseGraphSize; }

  /// Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// Does this non-empty order represent an identity order? Identity
  /// should be represented as an empty order, so this is used to
  /// decide if we can canonicalize a computed order. Undef elements
  /// (represented as size) are ignored.
  static bool isIdentityOrder(ArrayRef<unsigned> Order) {
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    return all_of(enumerate(Order), [&](const auto &P) {
      return P.value() == P.index() || P.value() == Sz;
    });
  }
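  // Example: for Sz == 4, {0, 1, 4, 3} is treated as an identity order (the
  // value 4 == Sz marks an undef element and is ignored), while {1, 0, 2, 3}
  // is not.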

  /// Checks if the specified gather tree entry \p TE can be represented as a
  /// shuffled vector entry + (possibly) permutation with other gathers. It
  /// implements the checks only for possibly ordered scalars (Loads,
  /// ExtractElement, ExtractValue), which can be part of the graph.
  /// \param TopToBottom If true, used for the whole-tree rotation, otherwise
  /// for sub-tree rotations.
  /// \param IgnoreReorder true, if the order of the root node might be
  /// ignored.
  std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
                                                     bool TopToBottom,
                                                     bool IgnoreReorder);

  /// Sort loads into increasing pointer offsets to allow greater clustering.
  std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);

  /// Gets reordering data for the given tree entry. If the entry is vectorized
  /// - just return ReorderIndices, otherwise check if the scalars can be
  /// reordered and return the most optimal order.
  /// \return std::nullopt if ordering is not important, empty order, if
  /// identity order is important, or the actual order.
  /// \param TopToBottom If true, include the order of vectorized stores and
  /// insertelement nodes, otherwise skip them.
  /// \param IgnoreReorder true, if the root node order can be ignored.
  std::optional<OrdersType>
  getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);

  /// Checks if it is profitable to reorder the current tree.
  /// If the tree does not contain many profitable reorderable nodes, it is
  /// better to skip it to save compile time.
  bool isProfitableToReorder() const;

  /// Reorders the current graph to the most profitable order starting from the
  /// root node to the leaf nodes. The best order is chosen only from the nodes
  /// of the same size (vectorization factor). Smaller nodes are considered
  /// parts of subgraph with smaller VF and they are reordered independently.
  /// We can do this because we still need to extend smaller nodes to the wider
  /// VF and we can merge reordering shuffles with the widening shuffles.
  void reorderTopToBottom();

  /// Reorders the current graph to the most profitable order starting from
  /// leaves to the root. It allows to rotate small subgraphs and reduce the
  /// number of reshuffles if the leaf nodes use the same order. In this case
  /// we can merge the orders and just shuffle the user node instead of
  /// shuffling its operands. Plus, even if the leaf nodes have different
  /// orders, it allows to sink reordering in the graph closer to the root
  /// node and merge it later during analysis.
  void reorderBottomToTop(bool IgnoreReorder = false);

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  // \returns maximum vector register size as set by TTI or overridden by
  // cl::opt.
  unsigned getMaxVecRegSize() const { return MaxVecRegSize; }

  // \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const { return MinVecRegSize; }

  unsigned getMinVF(unsigned Sz) const {
    return std::max(2U, getMinVecRegSize() / Sz);
  }
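  // Example: with getMinVecRegSize() == 128 and Sz == 32 (an i32 element),
  // getMinVF returns max(2, 128 / 32) == 4 lanes.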

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  /// Check if homogeneous aggregate is isomorphic to some VectorType.
  /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
  ///
  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  unsigned canMapToVector(Type *T) const;

  /// \returns True if the VectorizableTree is both tiny and not fully
  /// vectorizable. We do not vectorize such trees.
  bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;

  /// Checks if the graph and all its subgraphs cannot be better vectorized.
  /// It may happen, if all gather nodes are loads and they cannot be
  /// "clusterized". In this case even subgraphs cannot be vectorized more
  /// effectively than the base graph.
  bool isTreeNotExtendable() const;

  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  /// may not be necessary.
  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;

  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  /// may not be necessary.
  bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;

  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
  /// \param VL list of loads.
  /// \param VL0 main load value.
  /// \param Order returned order of load instructions.
  /// \param PointerOps returned list of pointer operands.
  /// \param BestVF return best vector factor, if recursive check found better
  /// vectorization sequences rather than masked gather.
  /// \param TryRecursiveCheck used to check if long masked gather can be
  /// represented as a series of loads/insert subvector, if profitable.
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;

  /// Registers a non-vectorizable sequence of loads.
  template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  }

  /// Checks if the given loads sequence is known as not vectorizable.
  template <typename T>
  bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
  }

  OptimizationRemarkEmitter *getORE() { return ORE; }

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
    bool operator==(const EdgeInfo &Other) const {
      return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
    }

    operator bool() const { return UserTE != nullptr; }
  };
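  // Example: an edge built for the operands at index 1 of the instructions
  // in user tree entry #0 is EdgeInfo{/*UserTE=*/<entry 0>, /*EdgeIdx=*/1}
  // and dumps as "{User:0 EdgeIdx:1}".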

  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though they
    // should be higher for better matches to improve the resulting cost. When
    // computing the scores of matching one sub-tree with another, we are
    // basically counting the number of values that are matching. So even if
    // all scores are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference. Also,
    // this is important if the scalar is externally used or used in another
    // tree entry node in a different lane.

    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    static const int ScoreConsecutiveLoads = 4;
    /// The same load multiple times. This should have a better score than
    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
    /// with `movddup (%reg), xmm0`, which has a throughput of 0.5 versus 0.5
    /// for a vector load and 1.0 for a broadcast.
    static const int ScoreSplatLoads = 3;
    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
    static const int ScoreReversedLoads = 3;
    /// A load candidate for masked gather.
    static const int ScoreMaskedGatherCandidate = 1;
    /// ExtractElementInst from same vector and consecutive indexes.
    static const int ScoreConsecutiveExtracts = 4;
    /// ExtractElementInst from same vector and reversed indices.
    static const int ScoreReversedExtracts = 3;
    /// Constants.
    static const int ScoreConstants = 2;
    /// Instructions with the same opcode.
    static const int ScoreSameOpcode = 2;
    /// Instructions with alt opcodes (e.g., add + sub).
    static const int ScoreAltOpcodes = 1;
    /// Identical instructions (a.k.a. splat or broadcast).
    static const int ScoreSplat = 1;
    /// Matching with an undef is preferable to failing.
    static const int ScoreUndef = 1;
    /// Score for failing to find a decent match.
    static const int ScoreFail = 0;
    /// Score if all users are vectorized.
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    /// \p U1 and \p U2 are the users of \p V1 and \p V2.
    /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
    /// MainAltOps.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(V1->getType()) ||
          !isValidElementType(V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.isVectorized(U);
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                          ElementCount::getFixed(NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      auto CheckSameEntryOrFail = [&]() {
        if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
          SmallPtrSet<TreeEntry *, 4> Set(llvm::from_range, TEs1);
          if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
              !TEs2.empty() &&
              any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // This still will detect consecutive loads, but we might have "holes"
        // in some cases. It is ok for non-power-2 vectorization and may
        // produce better results. It should not affect current vectorization.
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector score better, as
      // the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        // Compiler can easily combine poison and extractelement <non-poison>
        // or undef and extractelement <poison>. But combining undef +
        // extractelement <non-poison-but-may-produce-poison> requires some
        // extra operations.
        if (isa<UndefValue>(V2))
          return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
                     ? LookAheadHeuristics::ScoreConsecutiveExtracts
                     : LookAheadHeuristics::ScoreSameOpcode;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2,
                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
                                                         m_Undef())))) {
          // Undefs are always profitable for extractelements.
          if (!Ex2Idx)
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(Dist) == 0)
              return LookAheadHeuristics::ScoreSplat;
            if (std::abs(Dist) > NumLanes / 2)
              return LookAheadHeuristics::ScoreSameOpcode;
            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                              : LookAheadHeuristics::ScoreReversedExtracts;
          }
          return LookAheadHeuristics::ScoreAltOpcodes;
        }
        return CheckSameEntryOrFail();
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S &&
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return isa<PoisonValue>(V) ||
                     cast<Instruction>(V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (I1 && isa<PoisonValue>(V2))
        return LookAheadHeuristics::ScoreSameOpcode;

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return CheckSameEntryOrFail();
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
    /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
    /// of \p U1 and \p U2), except at the beginning of the recursion where
    /// these are set to nullptr.
    ///
    /// For example:
    /// \verbatim
    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
    ///     \ /         \ /         \ /        \ /
    ///      +           +           +          +
    ///      G1          G2          G3         G4
    /// \endverbatim
    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
    /// each level recursively, accumulating the score. It starts from matching
    /// the additions at level 0, then moves on to the loads (level 1). The
    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
    /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads,
    /// while {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
    /// Please note that the order of the operands does not matter, as we
    /// evaluate the score of all profitable combinations of operands. In
    /// other words the score of G1 and G4 is the same as G1 and G2. This
    /// heuristic is based on ideas described in:
    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
    ///   Luís F. W. Góes
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {

      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If reached MaxLevel,
      //  or if V1 and V2 are not instructions,
      //  or if they are SPLAT,
      //  or if they are not consecutive,
      //  or if profitable to vectorize loads or extractelements, early return
      //  the current cost.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };

  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    /// This is required to avoid illegal operand reordering.
    /// For example:
    /// \verbatim
    ///                         0 Op1
    ///                         |/
    /// Op1 Op2   Linearized    + Op2
    ///   \ /     ---------->   |/
    ///    -                    -
    ///
    /// Op1 - Op2            (0 + Op1) - Op2
    /// \endverbatim
    ///
    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
    ///
    /// Another way to think of this is to track all the operations across the
    /// path from the operand all the way to the root of the tree and to
    /// calculate the operation that corresponds to this path. For example, the
    /// path from Op2 to the root crosses the RHS of the '-', therefore the
    /// corresponding operation is a '-' (which matches the one in the
    /// linearized tree, as shown above).
    ///
    /// For lack of a better term, we refer to this operation as Accumulated
    /// Path Operation (APO).
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g., +, -). Therefore, we can safely use a boolean value for
      /// the APO. It is set to 'true' if 'V' is attached to an inverse
      /// operation in the left-linearized form (e.g., Sub/Div), and 'false'
      /// otherwise (e.g., Add/Mul).
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering, we are trying to select the operand at the
    /// lane that matches best with the operand at the neighboring lane. Our
    /// selection is based on the type of value we are looking for. For
    /// example, if the neighboring lane has a load, we need to look for a load
    /// that is accessing a consecutive address. These strategies are
    /// summarized in the 'ReorderingMode' enumerator.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors.
    SmallVector<OperandDataVec, 4> OpsVec;
    /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
    /// is not IntrinsicInst, ArgSize is User::getNumOperands.
    unsigned ArgSize = 0;

    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    const Loop *L = nullptr;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }

    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in lane \p Lane for which we're looking for
    /// the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have a power-of-2 number
    /// of unique elements in the lane, since such a lane will be vectorized
    /// with higher probability after removing duplicates. Currently the SLP
    /// vectorizer supports only vectorization of a power-of-2 number of unique
    /// scalars.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        return 0;
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.try_emplace(OpIdxLnV, Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
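    // Worked example (hypothetical values): with 4 lanes, suppose the other
    // lanes at OpIdx hold {X, X, Y} (2 uniques) and the current lane holds a
    // new value Z, so UniquesCntWithOpIdxLaneV == 3. If the candidate is X,
    // already present in a used lane, UniquesCntWithIdxLaneV == 2 and the
    // score is min(4 - 3, 3 - 2) - (2 - 2) == 1: picking the repeat keeps
    // the number of unique scalars at a power of 2.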

    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in lane \p Lane for which we're looking for
    /// the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices), they are extracts
      // themselves and already externally used. Vectorization of such
      // instructions does not add extra extractelement instruction, just may
      // remove it.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses. Allows better selection of the
    /// instructions with fewer external uses.
    static const int ScoreScaleFactor = 10;

    /// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match; the more they match, the higher the
    /// score. This helps break ties in an informed way when we cannot decide
    /// on the order of the operands by just considering the immediate
    /// predecessors.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed, const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      // Keep track of the instruction stack as we recurse into the operands
      // during the look-ahead score exploration.
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
                                       /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to see the difference between different operands
          // and similar operands but all vectorized/not all vectorized
          // uses. It does not affect actual selection of the best
          // compatible operand in general, just allows to select the
          // operand with all vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }
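    // Example of the scaling (hypothetical numbers): a ScoreConsecutiveLoads
    // match (4) with a splat score of 1 becomes (4 + 1) * 10 == 50, plus 1
    // more if all users of the candidate are vectorized, so 51 beats an
    // otherwise identical candidate with external uses at 50.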

    /// Best defined scores per lanes between the passes. Used to choose the
    /// best operand (with the highest score) between the passes.
    /// The key - {Operand Index, Lane}.
    /// The value - the best score between the passes for the lane and the
    /// operand.
    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
        BestScoresPerLanes;

    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      // Sometimes we have more than one option (e.g., Opcode and Undefs), so
      // we are using the score to differentiate between the two.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the operand must be marked as used. If the operand is set to
      // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
      // want to reestimate the operands again on the following iterations.
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }

    /// Helper for reorderOperandVecs.
    /// \returns the lane that we should start reordering from. This is the one
    /// which has the least number of operands that can freely move about or
    /// is less profitable because it already has the most optimal set of
    /// operands.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about or is less profitable because it already has the
      // most optimal set of operands. The first unsigned is a counter for
      // voting, the second unsigned is the counter of lanes with instructions
      // with same/alternate opcodes and same parent basic block.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Try to be closer to the original results, if we have multiple lanes
      // with same cost. If 2 lanes have the same cost, use the one with the
      // highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }

    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operands ordering.
      /// Used to count operands, actually their position id and opcode
      /// value. It is used in the voting mechanism to find the lane with the
      /// least number of operands that can freely move about or is less
      /// profitable because it already has the most optimal set of operands.
      /// Can be replaced with SmallVector<unsigned> instead but hash code is
      /// faster and requires less memory.
      unsigned Hash = 0;
    };
    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode). This is used as a heuristic for selecting the
    /// first lane to start operand reordering.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. We therefore need to
      // count how many of them we have for each APO, like this: Cnt[APO] = x.
      // Since we only have two APOs, namely true and false, we can avoid using
      // a map. Instead we can simply count the number of operands that
      // correspond to one of them (in this case the 'true' APO), and calculate
      // the other by subtracting it from the total number of operands.
      // Operands with the same instruction opcode and parent are more
      // profitable since we don't need to move them in many cases, with a high
      // probability such lane already can be vectorized effectively.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode and
        // the number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
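    // Example: for a lane holding `sub a, b` the APOs are {false, true}, so
    // NumOfAPOs == max(1, 1) == 1, while a lane holding `add a, b` gives
    // {false, false} and NumOfAPOs == 2. The sub lane's operands are less
    // free to move, which is why reorder() prefers to start from it.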

    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      // IntrinsicInst::isCommutative returns true if swapping the first "two"
      // arguments to the intrinsic produces the same result.
      constexpr unsigned IntrinsicNumOperands = 2;
      Instruction *MainOp = S.getMainOp();
      unsigned NumOperands = MainOp->getNumOperands();
      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (OperandDataVec &Ops : OpsVec)
        Ops.resize(NumLanes);
      for (unsigned Lane : seq<unsigned>(NumLanes)) {
        Value *V = VL[Lane];
        assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
               "Expected instruction or poison value");
        if (isa<PoisonValue>(V)) {
          for (unsigned OpIdx : seq<unsigned>(NumOperands))
            OpsVec[OpIdx][Lane] = {
                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
                false};
          if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
            OpsVec[0][Lane] = {EI->getVectorOperand(), true, false};
          } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
            OpsVec[0][Lane] = {EV->getAggregateOperand(), true, false};
          }
          continue;
        }
        // Our tree has just 3 nodes: the root and two operands.
        // It is therefore trivial to get the APO. We only need to check the
        // opcode of V and whether the operand at OpIdx is the LHS or RHS
        // operand. The LHS operand of both add and sub is never attached to an
        // inverse operation in the linearized form, therefore its APO is
        // false. The RHS is true only if V is an inverse operation.

        // Since operand reordering is performed on groups of commutative
        // operations or alternating sequences (e.g., +, -), we can safely
        // tell the inverse operations by checking commutativity.
        bool IsInverseOperation = !isCommutative(cast<Instruction>(V));
        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(V)->getOperand(OpIdx), APO,
                                 false};
        }
      }
    }
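    // Example: for VL == {add a0, b0; sub a1, b1}, lane 1 is non-commutative,
    // so IsInverseOperation is true there and b1 gets APO == true, while a0,
    // b0 and a1 get APO == false. This prevents b1 from later being swapped
    // into an operand position that would change the computed value.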

    /// \returns the number of operands.
    unsigned getNumOperands() const { return ArgSize; }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }

    /// \Returns true if there are enough operands identical to \p Op to fill
    /// the whole vector (it is mixed with constants or loop invariant values).
    /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // Small number of loads - try load matching.
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. Same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is
              // a constant operand in another lane (which can be moved to the
              // given lane). In this case we can represent it as a simple
              // permutation of constant and broadcast.
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Data.V)) ||
                // 2.1. If we have only 2 lanes, need to check that value in
                // the next lane does not build same opcode sequence.
                (Lns == 2 &&
                 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
                 isa<Constant>(Data.V)))) ||
              // 3. The operand in the current lane is loop invariant (can be
              // hoisted out) and another operand is also a loop invariant
              // (though not a constant). In this case the whole vector can be
              // hoisted out.
              // FIXME: need to teach the cost model about this case for better
              // estimation.
              (IsInvariant && !isa<Constant>(Data.V) &&
               !getSameOpcode({Op, Data.V}, TLI) &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }

    /// Checks if there is at least a single compatible operand in lanes other
    /// than \p Lane, compatible with the operand \p Op.
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                return true;
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     (getSameOpcode({Op, OpILn}, TLI) &&
                      allSameBlock({Op, OpILn}));
            }))
          return true;
      }
      return false;
    }

  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
               const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL, S);
    }

    /// \Returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }

    // Performs operand reordering for 2 or more operands.
    // The original operands are in OrigOps[OpIdx][Lane].
    // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us
      // select the instructions for each lane, so that they match best with
      // the ones we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // However, in order to increase its effectiveness, we start with the
      // lane that has operands that can move the least. For example, given
      // the following lanes:
      //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
      //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
      //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
      //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
      // we will start at Lane 1, since the operands of the subtraction cannot
      // be reordered. Then we will visit the rest of the lanes in a circular
      // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

      // Find the first lane that we will start our search from.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        } else {
          llvm_unreachable("Unexpected value kind.");
        }
      }

      // Check that we don't have same operands. No need to reorder if operands
      // are just perfect diamond or shuffled diamond match. Do not do it only
      // for possible broadcasts or non-power of 2 number of scalars (just for
      // now).
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // TODO: Check if we can remove a check for non-power-2 number of
        // scalars after full support of non-power-2 vectorization.
        return UniqueValues.size() != 2 &&
               hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
                                        UniqueValues.size());
      };

      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass. This helps avoid assigning
      // high priority to the failed strategy, and should improve reordering
      // for the non-failed operand indexes.
      for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if there is no need to reorder the operands since they are a
        // perfect or shuffled diamond match.
        // Need to do it to avoid extra external use cost counting for
        // shuffled matches, which may cause regressions.
        if (SkipReordering())
          break;
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
        clearUsed();
        // We keep the original operand order for the FirstLane, so reorder the
        // rest of the lanes. We are visiting the nodes in a circular fashion,
        // using FirstLane as the center point and increasing the radius
        // distance.
        SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
        for (unsigned I = 0; I < NumOperands; ++I)
          MainAltOps[I].push_back(getData(I, FirstLane).V);

        SmallBitVector UsedLanes(NumLanes);
        UsedLanes.set(FirstLane);
        for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
          // Visit the lane on the right and then the lane on the left.
          for (int Direction : {+1, -1}) {
            int Lane = FirstLane + Direction * Distance;
            if (Lane < 0 || Lane >= (int)NumLanes)
              continue;
            UsedLanes.set(Lane);
            int LastLane = Lane - Direction;
            assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                   "Out of bounds");
            // Look for a good match for each operand.
            for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that matches SortedOps[OpIdx][Lane-1].
              std::optional<unsigned> BestIdx =
                  getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                                 MainAltOps[OpIdx], UsedLanes);
              // By not selecting a value, we allow the operands that follow to
              // select a better matching value. We will get a non-null value
              // in the next run of getBestOperand().
              if (BestIdx) {
                // Swap the current operand with the one returned by
                // getBestOperand().
                swap(OpIdx, *BestIdx, Lane);
              } else {
                // Enable the second pass.
                StrategyFailed = true;
              }
              // Try to get the alternate opcode and follow it during analysis.
              if (MainAltOps[OpIdx].size() != 2) {
                OperandData &AltOp = getData(OpIdx, Lane);
                InstructionsState OpS =
                    getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
                if (OpS && OpS.isAltShuffle())
                  MainAltOps[OpIdx].push_back(AltOp.V);
              }
            }
          }
        }
        // Skip the second pass if the strategy did not fail.
        if (!StrategyFailed)
          break;
      }
    }
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
|
|
switch (RMode) {
|
|
case ReorderingMode::Load:
|
|
return "Load";
|
|
case ReorderingMode::Opcode:
|
|
return "Opcode";
|
|
case ReorderingMode::Constant:
|
|
return "Constant";
|
|
case ReorderingMode::Splat:
|
|
return "Splat";
|
|
case ReorderingMode::Failed:
|
|
return "Failed";
|
|
}
|
|
llvm_unreachable("Unimplemented Reordering Type");
|
|
}
|
|
|
|
LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
|
|
raw_ostream &OS) {
|
|
return OS << getModeStr(RMode);
|
|
}
|
|
|
|
/// Debug print.
|
|
LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
|
|
printMode(RMode, dbgs());
|
|
}
|
|
|
|
friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
|
|
return printMode(RMode, OS);
|
|
}
|
|
|
|
LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
|
|
const unsigned Indent = 2;
|
|
unsigned Cnt = 0;
|
|
for (const OperandDataVec &OpDataVec : OpsVec) {
|
|
OS << "Operand " << Cnt++ << "\n";
|
|
for (const OperandData &OpData : OpDataVec) {
|
|
OS.indent(Indent) << "{";
|
|
if (Value *V = OpData.V)
|
|
OS << *V;
|
|
else
|
|
OS << "null";
|
|
OS << ", APO:" << OpData.APO << "}\n";
|
|
}
|
|
OS << "\n";
|
|
}
|
|
return OS;
|
|
}
|
|
|
|
/// Debug print.
|
|
LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
|
|
#endif
|
|
};
|
|
|
|
/// Evaluate each pair in \p Candidates and return index into \p Candidates
|
|
/// for a pair which have highest score deemed to have best chance to form
|
|
/// root of profitable tree to vectorize. Return std::nullopt if no candidate
|
|
/// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
|
|
/// of the cost, considered to be good enough score.
|
|
std::optional<int>
|
|
findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
|
|
int Limit = LookAheadHeuristics::ScoreFail) const {
|
|
LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
|
|
RootLookAheadMaxDepth);
|
|
int BestScore = Limit;
|
|
std::optional<int> Index;
|
|
for (int I : seq<int>(0, Candidates.size())) {
|
|
int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
|
|
Candidates[I].second,
|
|
/*U1=*/nullptr, /*U2=*/nullptr,
|
|
/*CurrLevel=*/1, {});
|
|
if (Score > BestScore) {
|
|
BestScore = Score;
|
|
Index = I;
|
|
}
|
|
}
|
|
return Index;
|
|
}
|
|
|
|
/// Checks if the instruction is marked for deletion.
|
|
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
|
|
|
|
/// Removes an instruction from its block and eventually deletes it.
|
|
/// It's like Instruction::eraseFromParent() except that the actual deletion
|
|
/// is delayed until BoUpSLP is destructed.
|
|
void eraseInstruction(Instruction *I) {
|
|
DeletedInstructions.insert(I);
|
|
}
|
|
|
|
/// Remove instructions from the parent function and clear the operands of \p
|
|
/// DeadVals instructions, marking for deletion trivially dead operands.
|
|
template <typename T>
|
|
void removeInstructionsAndOperands(
|
|
ArrayRef<T *> DeadVals,
|
|
ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
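    // The removal proceeds in three passes over \p DeadVals: first mark all
    // dead values as deleted, then drop their operand references (queueing
    // operands that become trivially dead), and finally unlink the
    // instructions from their blocks and iteratively delete the queued
    // trivially dead operands.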
    SmallVector<WeakTrackingVH> DeadInsts;
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      eraseInstruction(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() || all_of(I->uses(),
                                       [&](Use &U) {
                                         return isDeleted(
                                             cast<Instruction>(U.getUser()));
                                       })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(*VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              (!OpI->getType()->isVectorTy() ||
               none_of(VectorValuesAndScales,
                       [&](const std::tuple<Value *, unsigned, bool> &V) {
                         return std::get<0>(V) == OpI;
                       })) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      eraseInstruction(VI);
      SE->forgetValue(VI);
    }
  }

  /// Checks if the instruction was already analyzed for being a possible
  /// reduction root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register the given instruction as already analyzed for being a possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  /// Checks if the provided list of reduced values was already checked for
  /// vectorization.
  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
    return AnalyzedReductionVals.contains(hash_value(VL));
  }
  /// Adds the list of reduced values to the list of values already checked for
  /// vectorization.
  void analyzedReductionVals(ArrayRef<Value *> VL) {
    AnalyzedReductionVals.insert(hash_value(VL));
  }
  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if any of the given values is gathered in one of the nodes.
  bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
    return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
  }
  /// Checks if the given value is gathered in one of the nodes.
  bool isGathered(const Value *V) const {
    return MustGather.contains(V);
  }
  /// Checks if the specified value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }

  /// Check if the value is vectorized in the tree.
  bool isVectorized(const Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  }

  ~BoUpSLP();

private:
  /// Determine if a node \p E can be demoted to a smaller type with a
  /// truncation. We collect the entries that will be demoted in ToDemote.
  /// \param E Node for analysis
  /// \param ToDemote indices of the nodes to be demoted.
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
      const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only one
  /// user and are reorderable).
  /// \param ReorderableGathers List of all gather nodes that require reordering
  /// (e.g., gather of extractelements or partially vectorizable loads).
  /// \param GatherOps List of gather operand nodes for \p UserTE that require
  /// reordering, subset of \p NonVectorized.
  bool
  canReorderOperands(TreeEntry *UserTE,
                     SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
                     ArrayRef<TreeEntry *> ReorderableGathers,
                     SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Checks if the given \p TE is a gather node with clustered reused scalars
  /// and reorders it per given \p Mask.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
  /// if any. If it is not vectorized (gather node), returns nullptr.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      if (!isa<Instruction>(V))
        return false;
      for (TreeEntry *E : getTreeEntries(V)) {
        if (E->UserTreeIndex == EdgeInfo(UserTE, OpIdx)) {
          TE = E;
          return true;
        }
      }
      return false;
    });
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
      return TE;
    }
    return nullptr;
  }

  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
  /// if any. If it is not vectorized (gather node), returns nullptr.
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(
      Instruction *I,
      const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;

  /// Return information about the vector formed for the specified index
  /// of a vector of (the same) instruction.
  TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);

  /// \returns the graph entry for the \p Idx operand of the \p E entry.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  }

  /// Gets the root instruction for the given node. If the node is a strided
  /// load/store node with the reverse order, the root instruction is the last
  /// one.
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  /// \returns Cast context for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  /// \returns the cost of the vectorizable entry.
  InstructionCost getEntryCost(const TreeEntry *E,
                               ArrayRef<Value *> VectorizedVals,
                               SmallPtrSetImpl<Value *> &CheckedExtracts);

  /// Checks if it is legal and profitable to build a SplitVectorize node for
  /// the given \p VL.
  /// \param Op1 first homogeneous scalars.
  /// \param Op2 second homogeneous scalars.
  /// \param ReorderIndices indices to reorder the scalars.
  /// \returns true if the node was successfully built.
  bool canBuildSplitNode(ArrayRef<Value *> VL,
                         const InstructionsState &LocalState,
                         SmallVectorImpl<Value *> &Op1,
                         SmallVectorImpl<Value *> &Op2,
                         OrdersType &ReorderIndices) const;

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                     const EdgeInfo &EI, unsigned InterleaveFactor = 0);

  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
  /// be vectorized to use the original vector (or aggregate "bitcast" to a
  /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
  /// returns false, setting \p CurrentOrder to either an empty vector or a
  /// non-identity permutation that allows to reuse extract instructions.
  /// \param ResizeAllowed indicates whether it is allowed to handle subvector
  /// extract order.
  bool canReuseExtract(ArrayRef<Value *> VL,
                       SmallVectorImpl<unsigned> &CurrentOrder,
                       bool ResizeAllowed = false) const;

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Returns the vectorized operand node that matches the order of the scalars
  /// of operand number \p NodeIdx in entry \p E.
  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
                                         ArrayRef<Value *> VL,
                                         const InstructionsState &S);
  const TreeEntry *
  getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
                              ArrayRef<Value *> VL,
                              const InstructionsState &S) const {
    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx,
                                                                    VL, S);
  }

  /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
  /// \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  /// Create a new vector from a list of scalar values. Produces a sequence
  /// which exploits values reused across lanes, and arranges the inserts
  /// for ease of later optimization.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  /// Create a new vector from a list of scalar values. Produces a sequence
  /// which exploits values reused across lanes, and arranges the inserts
  /// for ease of later optimization.
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  /// Returns the instruction in the bundle, which can be used as a base point
  /// for scheduling. Usually it is the last instruction in the bundle, except
  /// for the case when all operands are external (in this case, it is the first
  /// instruction in the list).
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Tries to find extractelement instructions with constant indices from fixed
  /// vector type and gather such instructions into a bunch, which highly likely
  /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
  /// was successful, the matched scalars are replaced by poison values in \p VL
  /// for future analysis.
  std::optional<TargetTransformInfo::ShuffleKind>
  tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
                                           SmallVectorImpl<int> &Mask) const;

  /// Tries to find extractelement instructions with constant indices from fixed
  /// vector type and gather such instructions into a bunch, which highly likely
  /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
  /// was successful, the matched scalars are replaced by poison values in \p VL
  /// for future analysis.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                             SmallVectorImpl<int> &Mask,
                             unsigned NumParts) const;

  /// Checks if the gathered \p VL can be represented as a single-register
  /// shuffle(s) of previous tree entries.
  /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations. Must form a single-register vector.
  /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
  /// commands to build the mask using the original vector value, without
  /// relying on the potential reordering.
  /// \returns ShuffleKind, if gathered values can be represented as shuffles of
  /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
      bool ForOrder);

  /// Checks if the gathered \p VL can be represented as multi-register
  /// shuffle(s) of previous tree entries.
  /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations.
  /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
  /// commands to build the mask using the original vector value, without
  /// relying on the potential reordering.
  /// \returns per-register series of ShuffleKind, if gathered values can be
  /// represented as shuffles of previous tree entries. \p Mask is filled with
  /// the shuffle mask (also on per-register base).
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
      unsigned NumParts, bool ForOrder = false);

  /// \returns the cost of gathering (inserting) the values in \p VL into a
  /// vector.
  /// \param ForPoisonSrc true if the initial vector is poison, false otherwise.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                Type *ScalarTy) const;

  /// Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
  /// specified, the starting vector value is poison.
  Value *
  gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
         function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Run through the list of all gathered loads in the graph and try to find
  /// vector loads/masked gathers instead of regular gathers. Later these loads
  /// are reshuffled to build final gathered nodes.
  void tryToVectorizeGatheredLoads(
      const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                           SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                           8> &GatheredLoads);

  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
  /// users of \p TE and collects the stores. It returns the map from the store
  /// pointers to the collected stores.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
  /// stores in \p StoresVec can form a vector instruction. If so it returns
  /// true and populates \p ReorderIndices with the shuffle indices of the
  /// stores when compared to the sorted vector.
  bool canFormVector(ArrayRef<StoreInst *> StoresVec,
                     OrdersType &ReorderIndices) const;

  /// Iterates through the users of \p TE, looking for scalar stores that can be
  /// potentially vectorized in a future SLP-tree. If found, it keeps track of
  /// their order and builds an order index vector for each store bundle. It
  /// returns all these order vectors found.
  /// We run this after the tree has formed, otherwise we may come across user
  /// instructions that are not yet in the tree.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  /// Tries to reorder the gathering node for better vectorization
  /// opportunities.
  void reorderGatherNode(TreeEntry &TE);

  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns Common mask for reorder indices and reused scalars.
    SmallVector<int> getCommonMask() const {
      if (State == TreeEntry::SplitVectorize)
        return {};
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      ::addMask(Mask, ReuseShuffleIndices);
      return Mask;
    }

    /// \returns The mask for split nodes.
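    /// Lanes of the first sub-node keep their positions; lanes of the second
    /// sub-node are offset to start at the common vector factor of the two
    /// sub-nodes.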
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
        Mask[I] =
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
                       : 0);
      return Mask;
    }

    /// Updates (reorders) SplitVectorize node according to the given mask \p
    /// Mask and order \p MaskOrder.
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

    /// \returns true if the scalars in VL are equal to this entry.
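    /// The entry's ReorderIndices and ReuseShuffleIndices are taken into
    /// account, i.e. \p VL may match the stored scalars modulo the entry's
    /// reordering/reuse mask.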
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered, still can
        // treat the vector as the same if the list of scalars matches VL
        // directly, without reordering.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && UserTreeIndex.EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndex.UserTE == UserEI.UserTE;
    }

    /// \returns true if the current entry has the same operands as \p TE.
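    /// The operand lists may match in any permutation: each operand list of
    /// \p TE is greedily matched against a distinct, not-yet-used operand
    /// list of this entry.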
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those used several times in the
    /// entry and counted in \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    WeakTrackingVH VectorizedValue = nullptr;

    /// Do we need to gather this sequence or vectorize it
    /// (either with vector instruction or with scatter/gather
    /// intrinsics for store/load)?
    enum EntryState {
      Vectorize,         ///< The node is regularly vectorized.
      ScatterVectorize,  ///< Masked scatter/gather node.
      StridedVectorize,  ///< Strided loads (and stores)
      CompressVectorize, ///< (Masked) load with compress.
      NeedToGather,      ///< Gather/buildvector node.
      CombinedVectorize, ///< Vectorized node, combined with its user into more
                         ///< complex node like select/cmp to minmax, mul/add to
                         ///< fma, etc. Must be used for the following nodes in
                         ///< the pattern, not the very first one.
      SplitVectorize,    ///< Splits the node into 2 subnodes, vectorizes them
                         ///< independently and then combines back.
    };
    EntryState State;

    /// List of combined opcodes supported by the vectorizer.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// Does this sequence require some shuffling?
    SmallVector<int, 4> ReuseShuffleIndices;

    /// Does this entry require reordering?
    SmallVector<unsigned, 4> ReorderIndices;

    /// Points back to the VectorizableTree.
    ///
    /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
    /// to be a pointer and needs to be able to initialize the child iterator.
    /// Thus we need a reference back to the container to translate the indices
    /// to entries.
    VecTreeTy &Container;

    /// The TreeEntry index containing the user of this entry.
    EdgeInfo UserTreeIndex;

    /// The index of this treeEntry in VectorizableTree.
    unsigned Idx = 0;

    /// For gather/buildvector/alt opcode nodes, which are combined from
    /// other nodes as a series of insertvector instructions.
    SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;

  private:
    /// The operands of each instruction in each lane Operands[op_index][lane].
    /// Note: This helps avoid the replication of the code that performs the
    /// reordering of operands during buildTree_rec() and vectorizeTree().
    SmallVector<ValueList, 2> Operands;

    /// MainOp and AltOp are recorded inside. S should be obtained from
    /// newTreeEntry.
    InstructionsState S = InstructionsState::invalid();

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

  public:
    /// Returns the interleave factor for interleaved nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets the interleaving factor for interleaved nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// Set this bundle's operands from Scalars.
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, S, R);
      if (RequireReorder)
        Ops.reorder();
      for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

    /// Reorders operands of the node to the given mask \p Mask.
    void reorderOperands(ArrayRef<int> Mask) {
      for (ValueList &Operand : Operands)
        reorderScalars(Operand, Mask);
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ValueList &getOperand(unsigned OpIdx) {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \return the single \p OpIdx operand.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return S.isAltShuffle(); }

    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main operation, the key is \p Op. Otherwise
    /// the key is the main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return S.getMainOp();
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      this->S = S;
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }

    /// When ReorderIndices/ReuseShuffleIndices are empty it just returns the
    /// position of \p V within the vector of Scalars. Otherwise, remaps the
    /// position via the reorder and reuse indices.
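    /// E.g., with ReorderIndices = {1, 0} and ReuseShuffleIndices = {0, 1, 0},
    /// the value at position 0 of Scalars is remapped to lane 1 and then to
    /// position 1, the first reuse index referencing that lane.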
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }

    /// Build a shuffle mask for a graph entry which represents a merge of main
    /// and alternate operations.
    void
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    /// Return true if this is a non-power-of-2 node.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
      return IsNonPowerOf2;
    }

    /// Return true if the number of elements in this node neither forms whole
    /// vector registers nor is a power of 2.
    bool
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, getValueType(Scalars.front()), Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }
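
    /// \returns the scalar at position \p Idx after applying the entry's
    /// reorder mask (gather/buildvector nodes only).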
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }

#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        break;
      }
      if (S) {
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      } else {
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndex: ";
      if (UserTreeIndex)
        dbgs() << UserTreeIndex;
      else
        dbgs() << "<invalid>";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };

#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
    return E;
  }

  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-two vectors.
    assert(
        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
         ReuseShuffleIndices.empty()) &&
        "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
      // Update the scheduler bundle to point to this TreeEntry.
      assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
              isVectorLikeInstWithConstOps(S.getMainOp()) ||
              doesNotNeedToSchedule(VL)) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second)
            continue;
          ++BundleMember;
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL)
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    return Last;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  /// Get the list of vector entries, associated with the value \p V.
  ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
      return {};
    return It->getSecond();
  }

  /// Get the list of split vector entries, associated with the value \p V.
  ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
      return {};
    return It->getSecond();
  }

  /// Returns the first vector node for value \p V, matching values \p VL.
  TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
                                    bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
        return TE;
    return nullptr;
  }

  /// Check that the operand node of an alternate node does not generate a
  /// buildvector sequence. If it does, then it is probably not worth building
  /// an alternate shuffle, if the number of buildvector operands plus the
  /// alternate instruction exceeds the number of buildvector instructions.
  /// \param S the instructions state of the analyzed values.
  /// \param VL list of the instructions with alternate opcodes.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// in general.
  bool isLegalToVectorizeScalars(ArrayRef<Value *> VL, unsigned Depth,
                                 const EdgeInfo &UserTreeIdx,
                                 InstructionsState &S,
                                 bool &TryToFindDuplicates,
                                 bool &TrySplitVectorize) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
  TreeEntry::EntryState
  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
                               bool IsScatterVectorizeUserTE,
                               OrdersType &CurrentOrder,
                               SmallVectorImpl<Value *> &PointerOps);

  /// Maps a specific scalar to its tree entry(ies).
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  /// Scalars, used in split vectorize nodes.
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  /// Maps a value to the proposed vectorizable size.
  SmallDenseMap<Value *, unsigned> InstrElementSize;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// A set of first non-schedulable values.
  ValueSet NonScheduledFirst;

  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions, so we cannot rely directly on the last instruction in the
  /// bundle being the last instruction in program order during the
  /// vectorization process, since the basic blocks are affected; the
  /// instructions need to be pre-gathered beforehand.
  DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;

  /// List of gather nodes, depending on other gather/vector nodes, which should
  /// be emitted after the vector instruction emission process to correctly
  /// handle order of the vector instructions and shuffles.
  SetVector<const TreeEntry *> PostponedGathers;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// A list of the load entries (node indices), which can be vectorized using
  /// strided or masked gather approach, but attempted to be represented as
  /// contiguous loads.
  SetVector<unsigned> LoadEntriesToVectorize;

  /// true if graph nodes transforming mode is on.
  bool IsGraphTransformMode = false;

  /// The index of the first gathered load entry in the VectorizeTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar = nullptr;

    /// Which user that uses the scalar.
    llvm::User *User = nullptr;

    /// Vector node, the value is part of.
    const TreeEntry &E;

    /// Which lane does the scalar belong to.
    int Lane;
  };
  using UserList = SmallVector<ExternalUser, 16>;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
    if (!isSimple(Inst2))
      return true;
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
    if (!Res.second)
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    Res.first->getSecond() = Aliased;
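    // The aliasing relation is symmetric, so cache the result for the
    // reversed pair of instructions as well.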
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  SmallDenseMap<AliasCacheKey, bool> AliasCache;

  // Cache for pointerMayBeCaptured calls inside AA. This is preserved
  // globally through SLP because we don't perform any action which
  // invalidates capture results.
  BatchAAResults BatchAA;

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed. The deferral is required to
  /// ensure that there are no incorrect collisions in the AliasCache, which
  /// can happen if a new instruction is allocated at the same address as a
  /// previously deleted instruction.
  DenseSet<Instruction *> DeletedInstructions;

  /// Set of the instructions, already analyzed as possible reduction roots.
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  /// Set of hashes for the list of reduction values already being analyzed.
  DenseSet<size_t> AnalyzedReductionVals;

  /// Values already analyzed for minimal bitwidth and found to be
  /// non-profitable.
  DenseSet<Value *> AnalyzedMinBWVals;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, it means that this Internal Scalar will be used later,
  /// after vectorization.
  UserList ExternalUses;

  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered, shuffle instructions and
  /// extractelements.
  SetVector<Instruction *> GatherShuffleExtractSeq;

  /// A list of blocks that we are going to CSE.
  DenseSet<BasicBlock *> CSEBlocks;

  /// List of hashes of vectors of loads, which are known to be non-
  /// vectorizable.
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;

  /// Represents a scheduling entity, either ScheduleData or ScheduleBundle.
  /// ScheduleData is used to gather dependencies for a single instruction,
  /// while ScheduleBundle represents a batch of instructions that are going
  /// to be grouped together.
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;

  protected:
    enum class Kind { ScheduleData, ScheduleBundle };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

  private:
    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled (or considered as
    /// scheduled in the dry-run).
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    /// Gets/sets the scheduling priority.
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    bool isReady() const {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      return cast<ScheduleBundle>(this)->isReady();
    }
    /// Gets/sets if the bundle is scheduled.
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    static bool classof(const ScheduleEntity *) { return true; }
  };

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  class ScheduleData final : public ScheduleEntity {
  public:
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
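    /// Resets the node to the not-yet-calculated state (InvalidDeps) and
    /// drops the collected memory/control dependencies.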
|
|
void clearDependencies() {
|
|
Dependencies = InvalidDeps;
|
|
resetUnscheduledDeps();
|
|
MemoryDependencies.clear();
|
|
ControlDependencies.clear();
|
|
IsScheduled = false;
|
|
}
|
|
|
|
/// Gets the number of unscheduled dependencies.
|
|
int getUnscheduledDeps() const { return UnscheduledDeps; }
|
|
/// Gets the number of dependencies.
|
|
int getDependencies() const { return Dependencies; }
|
|
/// Initializes the number of dependencies.
|
|
void initDependencies() { Dependencies = 0; }
|
|
/// Increments the number of dependencies.
|
|
void incDependencies() { Dependencies++; }
|
|
|
|
/// Gets scheduling region ID.
|
|
int getSchedulingRegionID() const { return SchedulingRegionID; }
|
|
|
|
/// Gets the instruction.
|
|
Instruction *getInst() const { return Inst; }
|
|
|
|
/// Gets the list of memory dependencies.
|
|
ArrayRef<ScheduleData *> getMemoryDependencies() const {
|
|
return MemoryDependencies;
|
|
}
|
|
/// Adds a memory dependency.
|
|
void addMemoryDependency(ScheduleData *Dep) {
|
|
MemoryDependencies.push_back(Dep);
|
|
}
|
|
/// Gets the list of control dependencies.
|
|
ArrayRef<ScheduleData *> getControlDependencies() const {
|
|
return ControlDependencies;
|
|
}
|
|
/// Adds a control dependency.
|
|
void addControlDependency(ScheduleData *Dep) {
|
|
ControlDependencies.push_back(Dep);
|
|
}
|
|
/// Gets/sets the next load/store instruction in the block.
|
|
ScheduleData *getNextLoadStore() const { return NextLoadStore; }
|
|
void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
|
|
|
|
void dump(raw_ostream &OS) const { OS << *Inst; }
|
|
|
|
LLVM_DUMP_METHOD void dump() const {
|
|
dump(dbgs());
|
|
dbgs() << '\n';
|
|
}
|
|
|
|
private:
|
|
Instruction *Inst = nullptr;
|
|
|
|
/// Single linked list of all memory instructions (e.g. load, store, call)
|
|
/// in the block - until the end of the scheduling region.
|
|
ScheduleData *NextLoadStore = nullptr;
|
|
|
|
/// The dependent memory instructions.
|
|
/// This list is derived on demand in calculateDependencies().
|
|
SmallVector<ScheduleData *> MemoryDependencies;
|
|
|
|
/// List of instructions which this instruction could be control dependent
|
|
/// on. Allowing such nodes to be scheduled below this one could introduce
|
|
/// a runtime fault which didn't exist in the original program.
|
|
/// ex: this is a load or udiv following a readonly call which inf loops
|
|
SmallVector<ScheduleData *> ControlDependencies;
|
|
|
|
/// This ScheduleData is in the current scheduling region if this matches
|
|
/// the current SchedulingRegionID of BlockScheduling.
|
|
int SchedulingRegionID = 0;
|
|
|
|
    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. As soon as this is zero, the instruction/bundle
    /// gets ready for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = InvalidDeps;
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif

  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleData *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self consistency properties.
    void verify() const {
      for (const ScheduleData *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }

        if (isScheduled()) {
          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the bundle.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleData *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleData *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    /// Returns the bundle of scheduling data, associated with the current
    /// instruction.
    ArrayRef<ScheduleData *> getBundle() { return Bundle; }
    ArrayRef<const ScheduleData *> getBundle() const { return Bundle; }
    /// Adds an instruction to the bundle.
    void add(ScheduleData *SD) { Bundle.push_back(SD); }

    /// Gets/sets the associated tree entry.
    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    static ScheduleBundle invalid() { return {false}; }

    operator bool() const { return IsValid; }

#ifndef NDEBUG
    void dump(raw_ostream &OS) const {
      if (!*this) {
        OS << "[]";
        return;
      }
      OS << '[';
      interleaveComma(Bundle, OS,
                      [&](const ScheduleData *SD) { OS << *SD->getInst(); });
      OS << ']';
    }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // NDEBUG
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleBundle &Bundle) {
    Bundle.dump(os);
    return os;
  }
#endif

  friend struct GraphTraits<BoUpSLP *>;
  friend struct DOTGraphTraits<BoUpSLP *>;

  /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions that are not memory read/write
  /// instructions and whose operands are either constants, or arguments, or
  /// phis, or instructions from other blocks, or whose users are phis or
  /// belong to other blocks. The resulting vector instructions can be placed
  /// at the beginning of the basic block without scheduling (if the operands
  /// do not need to be scheduled) or at the end of the block (if the users
  /// are outside of the block). This saves some compile time and memory used
  /// by the compiler.
  /// ScheduleData is assigned to each instruction in between the boundaries
  /// of the tree entry, even to those which are not part of the graph. It is
  /// required to correctly follow the dependencies between the instructions
  /// and to schedule them correctly. ScheduleData is not allocated for
  /// instructions which do not require scheduling, like phis, nodes with
  /// extractelements/insertelements only, or nodes whose instructions have
  /// uses/operands outside of the block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Instruction *I) {
      if (!I)
        return nullptr;
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      return getScheduleData(dyn_cast<Instruction>(V));
    }

    ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        return {};
      auto It = ScheduledBundles.find(I);
      if (It == ScheduledBundles.end())
        return {};
      return It->getSecond();
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->getSchedulingRegionID() == SchedulingRegionID;
    }

    bool isInSchedulingRegion(const ScheduleBundle &Bundle) const {
      return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) {
        return BundleMember->getSchedulingRegionID() == SchedulingRegionID;
      });
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) {
      auto ProcessBundleMember = [&](ScheduleData *BundleMember,
                                     ScheduleBundle *Bundle) {
        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after
            // decrementing, so we can put the dependent instruction
            // into the ready list.
            if (ArrayRef<ScheduleBundle *> Bundles =
                    getScheduleBundles(Data->getInst());
                !Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP: gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(Data);
            LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
          }
        };

        auto DecrUnschedForInst = [&](Instruction *I) {
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (Bundle) {
          // Need to search for the lane since the tree entry can be reordered.
          auto *In = BundleMember->getInst();
          int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
                                   find(Bundle->getTreeEntry()->Scalars, In));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is built recursively, this assertion
          // ensures that the tree entry has all operands set before reaching
          // this code. A couple of exceptions known at the moment are
          // extracts, where the second (immediate) operand is not added.
          // Since immediates do not affect scheduler behavior, this is
          // considered okay.
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() ==
                   Bundle->getTreeEntry()->getNumOperands()) &&
              "Missed TreeEntry operands?");

          for (unsigned OpIdx :
               seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(
                    Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
              LLVM_DEBUG(dbgs()
                         << "SLP: check for readiness (def): " << *I << "\n");
              DecrUnschedForInst(I);
            }
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->getInst()->operands())
            if (auto *I = dyn_cast<Instruction>(U.get())) {
              LLVM_DEBUG(dbgs()
                         << "SLP: check for readiness (def): " << *I << "\n");
              DecrUnschedForInst(I);
            }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) {
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
                            << *MemoryDep << "\n");
          DecrUnsched(MemoryDep);
        }
        // Handle the control dependencies.
        for (ScheduleData *Dep : BundleMember->getControlDependencies()) {
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs()
                     << "SLP: check for readiness (ctrl): " << *Dep << "\n");
          DecrUnsched(Dep, /*IsControl=*/true);
        }
      };
      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
        ProcessBundleMember(SD, nullptr);
      } else {
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
        auto AreAllBundlesScheduled = [&](const ScheduleData *SD) {
          ArrayRef<ScheduleBundle *> SDBundles =
              getScheduleBundles(SD->getInst());
          return !SDBundles.empty() &&
                 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                   return SDBundle->isScheduled();
                 });
        };
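        // The same instruction may be a member of several bundles. Mark it
        // scheduled (and update the instructions depending on it) only once
        // every bundle containing it has been scheduled.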
        for (ScheduleData *SD : Bundle.getBundle()) {
          if (AreAllBundlesScheduled(SD)) {
            SD->setScheduled(/*Scheduled=*/true);
            ProcessBundleMember(SD, &Bundle);
          }
        }
      }
    }

    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
        if (!Bundles.empty()) {
          for (ScheduleBundle *Bundle : Bundles) {
            assert(isInSchedulingRegion(*Bundle) &&
                   "primary schedule data not in window?");
            Bundle->verify();
          }
          continue;
        }
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        SD->verify();
      }

      assert(all_of(ReadyInsts,
                    [](const ScheduleEntity *Bundle) {
                      return Bundle->isReady();
                    }) &&
             "item in ready list not ready?");
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      SmallPtrSet<ScheduleBundle *, 16> Visited;
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->hasValidDependencies() && SD->isReady()) {
          if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
              !Bundles.empty()) {
            for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Bundle).second)
                continue;
              if (Bundle->hasValidDependencies() && Bundle->isReady()) {
                ReadyList.insert(Bundle);
                LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
                                  << *Bundle << "\n");
              }
            }
            continue;
          }
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP: initially in ready list: " << *SD << "\n");
        }
      }
    }

    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleBundle &buildBundle(ArrayRef<Value *> VL);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleBundle *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations,
    /// i.e. ScheduleData structures are recycled.
    SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches ScheduleBundle to Instruction.
    SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
        ScheduledBundles;
    /// The list of ScheduleBundles.
    SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleEntity *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the
    /// common case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration
    /// this is incremented, which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;

  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
  /// sorted SmallVectors of unsigned.
  struct OrdersTypeDenseMapInfo {
    static OrdersType getEmptyKey() {
      OrdersType V;
      V.push_back(~1U);
      return V;
    }

    static OrdersType getTombstoneKey() {
      OrdersType V;
      V.push_back(~2U);
      return V;
    }

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };

  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  LoopInfo *LI;
  DominatorTree *DT;
  AssumptionCache *AC;
  DemandedBits *DB;
  const DataLayout *DL;
  OptimizationRemarkEmitter *ORE;

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of scalar integer values to the smallest bit width with which they
  /// can legally be represented. The values map to (width, signed) pairs,
  /// where "width" indicates the minimum bit width and "signed" is true if
  /// the value must be sign-extended, rather than zero-extended, back to its
  /// original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
  /// type sizes, used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of the vectorized nodes, which are supposed to be the roots of
  /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
  DenseSet<unsigned> ExtraBitWidthNodes;
};

} // end namespace slpvectorizer

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }
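
  // Each tree entry records a single user edge (UserTreeIndex), so the child
  // range of a node is a one-element range over that edge.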
  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {&N->UserTreeIndex + 1, N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};

template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
      OS << "<splat> ";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize ||
        Entry->State == TreeEntry::CompressVectorize)
      return "color=blue";
    return "";
  }
};

} // end namespace llvm

BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back so they can be erased from their
      // parent and from memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
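  // References among the deleted instructions were dropped above, so the
  // instructions can now be erased in any order.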
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}

/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. The procedure transforms the provided
/// order in accordance with the given \p Mask. If the resulting \p Order is
/// just an identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
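    // Position I of the new order takes the element of the previous order
    // referenced by Mask[I]; positions not covered by the mask keep the
    // sentinel value Sz.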
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}

std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find subvector extract/insert patterns and reorder only such
  // patterns.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // If the node fully matches and the whole tree is being rotated
    // (top-to-bottom reordering) - no need to consider the matching order.
    if (TopToBottom)
      return std::nullopt;
    // No need to keep the order for the same user node.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // No need to keep the order for the matched root node, if it can be freely
    // reordered.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // If shuffling 2 elements only and the matching node has reverse reuses -
    // no need to count order, both work fine.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
               [](const auto &P) {
                 return P.value() % 2 != static_cast<int>(P.index()) % 2;
               }))
      return std::nullopt;

    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      bool SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}

static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
      getUnderlyingObject(Ptr2, RecursionMaxDepth))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}

/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}

/// Checks if the provided list of pointers \p PointerOps represents the
/// strided pointers for type ElemTy. If they are not, std::nullopt is
/// returned. Otherwise, if \p Inst is not specified, an engaged optional
/// value is returned to show that the pointers represent strided pointers.
/// If \p Inst is specified, the runtime stride is materialized before the
/// given \p Inst.
/// \returns std::nullopt if the pointers do not have a runtime stride;
/// otherwise, nullptr or the actual stride value.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and
  // highest addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
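  // Computes Dist / Multiplier: peels Multiplier off a SCEVMulExpr when
  // possible, otherwise falls back to an exact unsigned division.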
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return SE.getUDivExactExpr(Dist, Multiplier);
  };
  // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                  SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  if (!Inst)
    return nullptr;
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}

static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

/// Returns the cost of the shuffle instructions with the given \p Kind, vector
/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
/// insert subvector pattern.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr,
               ArrayRef<const Value *> Args = {}) {
  if (Kind != TTI::SK_PermuteTwoSrc)
    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(
          TTI::SK_InsertSubvector,
          getWidenedType(Tp->getElementType(), Mask.size()), Mask,
          TTI::TCK_RecipThroughput, Index, Tp);
  }
  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
}

/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
/// instead of a scalar.
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts, bool Insert,
                         bool Extract, TTI::TargetCostKind CostKind,
                         bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
             getNumElements(Ty) &&
         "Incorrect usage.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
    // of CreateInsertElement.
    unsigned ScalarTyNumElements = VecTy->getNumElements();
    InstructionCost Cost = 0;
    for (unsigned I : seq(DemandedElts.getBitWidth())) {
      if (!DemandedElts[I])
        continue;
      if (Insert)
        Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
                               I * ScalarTyNumElements, VecTy);
      if (Extract)
        Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
                               I * ScalarTyNumElements, VecTy);
    }
    return Cost;
  }
  APInt NewDemandedElts = DemandedElts;
  InstructionCost Cost = 0;
  if (!ForPoisonSrc && Insert) {
    // Handle insert into non-poison vector.
    // TODO: Need to teach getScalarizationOverhead about insert elements into
    // non-poison input vector to better handle such cases. Currently, it is
    // very conservative and may "pessimize" the vectorization.
    for (unsigned I : seq(DemandedElts.getBitWidth())) {
      if (!DemandedElts[I])
        continue;
      Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
                                     I, Constant::getNullValue(Ty),
                                     VL.empty() ? nullptr : VL[I]);
    }
    NewDemandedElts.clearAllBits();
  } else if (!NewDemandedElts.isZero()) {
    Cost += TTI.getScalarizationOverhead(Ty, NewDemandedElts, Insert, Extract,
                                         CostKind, VL);
  }
  return Cost;
}

/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
/// is a FixedVectorType, a vector will be extracted instead of a scalar.
static InstructionCost getVectorInstrCost(
    const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
    TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  if (Opcode == Instruction::ExtractElement) {
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "Only supported by REVEC.");
      assert(isa<VectorType>(Val) && "Val must be a vector type.");
      return getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                            cast<VectorType>(Val), {}, CostKind,
                            Index * VecTy->getNumElements(), VecTy);
    }
  }
  return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
                                ScalarUserAndIdx);
}

/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if
/// Dst is a FixedVectorType, a vector will be extracted instead of a scalar.
static InstructionCost getExtractWithExtendCost(
    const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
    VectorType *VecTy, unsigned Index,
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
  if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
    assert(SLPReVec && "Only supported by REVEC.");
    auto *SubTp =
        getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
    return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
                          Index * ScalarTy->getNumElements(), SubTp) +
           TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
                                CostKind);
  }
  return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
}

/// Correctly creates insert_subvector, checking that the index is a multiple
/// of the subvector's length. Otherwise, generates a shuffle using \p
/// Generator or using the default shuffle.
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  const unsigned SubVecVF = getNumElements(V->getType());
  if (Index % SubVecVF == 0) {
    Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
                                     Builder.getInt64(Index));
  } else {
    // Create a shuffle; insertvector requires that the index is a multiple of
    // the subvector length.
    const unsigned VecVF = getNumElements(Vec->getType());
    SmallVector<int> Mask(VecVF, PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    for (unsigned I : seq<unsigned>(SubVecVF))
      Mask[I + Index] = I + VecVF;
    if (Generator) {
      Vec = Generator(Vec, V, Mask);
    } else {
      // 1. Resize V to the size of Vec.
      SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
      V = Builder.CreateShuffleVector(V, ResizeMask);
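      // 2. Blend the widened V into Vec with a two-source shuffle.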
      Vec = Builder.CreateShuffleVector(Vec, V, Mask);
    }
  }
  return Vec;
}

/// Correctly creates extract_subvector, checking that the index is a multiple
/// of the subvector's length. Otherwise, generates a shuffle.
static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
                                  unsigned SubVecVF, unsigned Index) {
  if (Index % SubVecVF == 0) {
    VectorType *SubVecTy =
        getWidenedType(Vec->getType()->getScalarType(), SubVecVF);
    return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index));
  }
  // Create a shuffle; extract_subvector requires that the index is a multiple
  // of the subvector length.
  SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Builder.CreateShuffleVector(Vec, Mask);
}

/// Builds a compress-like mask for shuffles for the given \p PointerOps,
/// ordered with \p Order.
/// \return true if the mask represents strided access, false - otherwise.
static bool buildCompressMask(ArrayRef<Value *> PointerOps,
                              ArrayRef<unsigned> Order, Type *ScalarTy,
                              const DataLayout &DL, ScalarEvolution &SE,
                              SmallVectorImpl<int> &CompressMask) {
  const unsigned Sz = PointerOps.size();
  CompressMask.assign(Sz, PoisonMaskElem);
  // The first element is always set.
  CompressMask[0] = 0;
  // Check if the mask represents strided access.
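  // Stride starts engaged with value 0 ("not seen yet") and is disengaged as
  // soon as some position breaks the strided pattern.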
  std::optional<unsigned> Stride = 0;
  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
  for (unsigned I : seq<unsigned>(1, Sz)) {
    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
    unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (*Stride == 0) {
      *Stride = Pos;
      continue;
    }
    if (Pos != *Stride * I)
      Stride.reset();
  }
  return Stride.has_value();
}

/// Checks if \p VL can be transformed into a (masked) load + compress or a
/// (masked) interleaved load.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
    VectorType *&LoadVecTy) {
  InterleaveFactor = 0;
  Type *ScalarTy = VL.front()->getType();
  const unsigned Sz = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (!Order.empty())
    inversePermutation(Order, Mask);
  // Check external uses.
  for (const auto [I, V] : enumerate(VL)) {
    if (AreAllUsersVectorized(V))
      continue;
    InstructionCost ExtractCost =
        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                               Mask.empty() ? I : Mask[I]);
    InstructionCost ScalarCost =
        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
    if (ExtractCost <= ScalarCost)
      return false;
  }
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
  if (!Diff)
    return false;
  const unsigned MaxRegSize =
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue();
  // Check for very large distances between elements.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
  IsMasked = !isSafeToLoadUnconditionally(
      Ptr0, LoadVecTy, CommonAlignment, DL,
      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
      &TLI);
  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
                                         LI->getPointerAddressSpace()))
    return false;
  // TODO: perform the analysis of each scalar load for better
  // safe-load-unconditionally analysis.
  bool IsStrided =
      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  SmallVector<Value *> OrderedPointerOps(PointerOps);
  if (!Order.empty())
    reorderScalars(OrderedPointerOps, Mask);
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
  // The cost of scalar loads.
  InstructionCost ScalarLoadsCost =
      std::accumulate(VL.begin(), VL.end(), InstructionCost(),
                      [&](InstructionCost C, Value *V) {
                        return C + TTI.getInstructionCost(cast<Instruction>(V),
                                                          CostKind);
                      }) +
      ScalarGEPCost;
  APInt DemandedElts = APInt::getAllOnes(Sz);
  InstructionCost GatherCost =
      getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
                               /*Insert=*/true,
                               /*Extract=*/false, CostKind) +
      ScalarLoadsCost;
  InstructionCost LoadCost = 0;
  if (IsMasked) {
    LoadCost =
        TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
                                  LI->getPointerAddressSpace(), CostKind);
  } else {
    CommonAlignment = LI->getAlign();
    LoadCost =
        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
                            LI->getPointerAddressSpace(), CostKind);
  }
  if (IsStrided) {
    // Check for potential segmented (interleaved) loads.
    if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
                                         CommonAlignment,
                                         LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost =
          VectorGEPCost + TTI.getInterleavedMemoryOpCost(
                              Instruction::Load, LoadVecTy, CompressMask[1],
                              std::nullopt, CommonAlignment,
                              LI->getPointerAddressSpace(), CostKind, IsMasked);
      if (!Mask.empty())
        InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                            VecTy, Mask, CostKind);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        return true;
      }
    }
  }
  if (!Order.empty()) {
    SmallVector<int> NewMask(Sz, PoisonMaskElem);
    for (unsigned I : seq<unsigned>(Sz)) {
      NewMask[I] = CompressMask[Mask[I]];
    }
    CompressMask.swap(NewMask);
  }
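  // The compress shuffle permutes the elements of the (masked) wide load into
  // the requested order.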
  InstructionCost CompressCost = ::getShuffleCost(
      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;
}

/// Checks if strided loads can be generated out of \p VL loads with pointers
/// \p PointerOps:
/// 1. Target with strided load support is detected.
/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
///    potential stride <= MaxProfitableLoadStride and the potential stride is
///    power-of-2 (to avoid perf regressions for the very small number of
///    loads) and max distance > number of loads, or the potential stride is
///    -1.
/// 3. The loads are ordered, or the number of unordered loads <=
///    MaxProfitableUnorderedLoads, or the loads are in reversed order (this
///    check is to avoid extra costs for very expensive shuffles).
/// 4. Any pointer operand is an instruction with users outside of the
///    current graph (for masked gathers extra extractelement instructions
///    might be required).
static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                          ArrayRef<unsigned> Order,
                          const TargetTransformInfo &TTI, const DataLayout &DL,
                          ScalarEvolution &SE,
                          const bool IsAnyPointerUsedOutGraph, const int Diff) {
  const unsigned Sz = VL.size();
  const unsigned AbsoluteDiff = std::abs(Diff);
  Type *ScalarTy = VL.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  if (IsAnyPointerUsedOutGraph ||
      (AbsoluteDiff > Sz &&
       (Sz > MinProfitableStridedLoads ||
        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
      Diff == -(static_cast<int>(Sz) - 1)) {
    int Stride = Diff / static_cast<int>(Sz - 1);
    if (Diff != Stride * static_cast<int>(Sz - 1))
      return false;
    Align Alignment =
        cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
            ->getAlign();
    if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
      return false;
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    // Iterate through all pointers and check if all distances are
    // unique multiples of Stride.
    SmallSet<int, 4> Dists;
    for (Value *Ptr : PointerOps) {
      int Dist = 0;
      if (Ptr == PtrN)
        Dist = Diff;
      else if (Ptr != Ptr0)
        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
      // If the strides are not the same or repeated, we can't
      // vectorize.
      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
        break;
    }
    if (Dists.size() == Sz)
      return true;
  }
  return false;
}

BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
  // from such a struct, we read/write packed bits disagreeing with the
  // unvectorized version.
  if (BestVF)
    *BestVF = 0;
  if (areKnownNonVectorizableLoads(VL))
    return LoadsState::Gather;
  Type *ScalarTy = VL0->getType();

  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const unsigned Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);

  auto *VecTy = getWidenedType(ScalarTy, Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted) {
    if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
        return LoadsState::StridedVectorize;
    }

    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
      return LoadsState::Gather;

    if (!all_of(PointerOps, [&](Value *P) {
          return arePointersCompatible(P, PointerOps.front(), *TLI);
        }))
      return LoadsState::Gather;

  } else {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<unsigned>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    // Simple check if not a strided access - clear order.
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
    // Try to generate strided load node.
|
|
auto IsAnyPointerUsedOutGraph =
|
|
IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
|
|
return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
|
|
return !isVectorized(U) && !MustGather.contains(U);
|
|
});
|
|
});
|
|
if (IsPossibleStrided &&
|
|
isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
|
|
IsAnyPointerUsedOutGraph, *Diff))
|
|
return LoadsState::StridedVectorize;
|
|
bool IsMasked;
|
|
unsigned InterleaveFactor;
|
|
SmallVector<int> CompressMask;
|
|
VectorType *LoadVecTy;
|
|
if (isMaskedLoadCompress(
|
|
VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
|
|
[&](Value *V) {
|
|
return areAllUsersVectorized(cast<Instruction>(V),
|
|
UserIgnoreList);
|
|
},
|
|
IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
|
|
return LoadsState::CompressVectorize;
|
|
}
|
|
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
|
|
TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
|
|
return LoadsState::Gather;
|
|
// Correctly identify compare the cost of loads + shuffles rather than
|
|
// strided/masked gather loads. Returns true if vectorized + shuffles
|
|
// representation is better than just gather.
|
|
auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
|
|
unsigned *BestVF,
|
|
bool ProfitableGatherPointers) {
|
|
if (BestVF)
|
|
*BestVF = 0;
|
|
// Compare masked gather cost and loads + insert subvector costs.
|
|
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
|
|
auto [ScalarGEPCost, VectorGEPCost] =
|
|
getGEPCosts(TTI, PointerOps, PointerOps.front(),
|
|
Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
|
|
// Estimate the cost of masked gather GEP. If not a splat, roughly
|
|
// estimate as a buildvector, otherwise estimate as splat.
|
|
APInt DemandedElts = APInt::getAllOnes(Sz);
|
|
Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
|
|
VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
|
|
if (static_cast<unsigned>(count_if(
|
|
PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
|
|
any_of(PointerOps, [&](Value *V) {
|
|
return getUnderlyingObject(V) !=
|
|
getUnderlyingObject(PointerOps.front());
|
|
}))
|
|
VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
|
|
DemandedElts, /*Insert=*/true,
|
|
/*Extract=*/false, CostKind);
|
|
else
|
|
VectorGEPCost +=
|
|
getScalarizationOverhead(
|
|
TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
|
|
/*Insert=*/true, /*Extract=*/false, CostKind) +
|
|
::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
|
|
// The cost of scalar loads.
|
|
InstructionCost ScalarLoadsCost =
|
|
std::accumulate(VL.begin(), VL.end(), InstructionCost(),
|
|
[&](InstructionCost C, Value *V) {
|
|
return C + TTI.getInstructionCost(
|
|
cast<Instruction>(V), CostKind);
|
|
}) +
|
|
ScalarGEPCost;
|
|
// The cost of masked gather.
|
|
InstructionCost MaskedGatherCost =
|
|
TTI.getGatherScatterOpCost(
|
|
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
|
|
/*VariableMask=*/false, CommonAlignment, CostKind) +
|
|
(ProfitableGatherPointers ? 0 : VectorGEPCost);
|
|
InstructionCost GatherCost =
|
|
getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
|
|
/*Insert=*/true,
|
|
/*Extract=*/false, CostKind) +
|
|
ScalarLoadsCost;
|
|
// The list of loads is small or perform partial check already - directly
|
|
// compare masked gather cost and gather cost.
|
|
constexpr unsigned ListLimit = 4;
|
|
if (!TryRecursiveCheck || VL.size() < ListLimit)
|
|
return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
|
|
|
|
// FIXME: The following code has not been updated for non-power-of-2
|
|
// vectors (and not whole registers). The splitting logic here does not
|
|
// cover the original vector if the vector factor is not a power of two.
|
|
if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
|
|
return false;
|
|
|
|
unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
|
|
unsigned MinVF = getMinVF(2 * Sz);
|
|
DemandedElts.clearAllBits();
|
|
// Iterate through possible vectorization factors and check if vectorized +
|
|
// shuffles is better than just gather.
|
|
for (unsigned VF =
|
|
getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
|
|
VF >= MinVF;
|
|
VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
|
|
SmallVector<LoadsState> States;
|
|
for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
|
|
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
|
|
SmallVector<unsigned> Order;
|
|
SmallVector<Value *> PointerOps;
|
|
LoadsState LS =
|
|
canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
|
|
/*TryRecursiveCheck=*/false);
|
|
// Check that the sorted loads are consecutive.
|
|
if (LS == LoadsState::Gather) {
|
|
if (BestVF) {
|
|
DemandedElts.setAllBits();
|
|
break;
|
|
}
|
|
DemandedElts.setBits(Cnt, Cnt + VF);
|
|
continue;
|
|
}
|
|
// If need the reorder - consider as high-cost masked gather for now.
|
|
if ((LS == LoadsState::Vectorize ||
|
|
LS == LoadsState::StridedVectorize ||
|
|
LS == LoadsState::CompressVectorize) &&
|
|
!Order.empty() && !isReverseOrder(Order))
|
|
LS = LoadsState::ScatterVectorize;
|
|
States.push_back(LS);
|
|
}
|
|
if (DemandedElts.isAllOnes())
|
|
// All loads gathered - try smaller VF.
|
|
continue;
|
|
// Can be vectorized later as a serie of loads/insertelements.
|
|
InstructionCost VecLdCost = 0;
|
|
if (!DemandedElts.isZero()) {
|
|
VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
|
|
/*Insert=*/true,
|
|
/*Extract=*/false, CostKind) +
|
|
ScalarGEPCost;
|
|
for (unsigned Idx : seq<unsigned>(VL.size()))
|
|
if (DemandedElts[Idx])
|
|
VecLdCost +=
|
|
TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
|
|
}
|
|
auto *SubVecTy = getWidenedType(ScalarTy, VF);
|
|
for (auto [I, LS] : enumerate(States)) {
|
|
auto *LI0 = cast<LoadInst>(VL[I * VF]);
|
|
InstructionCost VectorGEPCost =
|
|
(LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
|
|
? 0
|
|
: getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
|
|
LI0->getPointerOperand(),
|
|
Instruction::GetElementPtr, CostKind, ScalarTy,
|
|
SubVecTy)
|
|
.second;
|
|
if (LS == LoadsState::ScatterVectorize) {
|
|
if (static_cast<unsigned>(
|
|
count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
|
|
PointerOps.size() - 1 ||
|
|
any_of(PointerOps, [&](Value *V) {
|
|
return getUnderlyingObject(V) !=
|
|
getUnderlyingObject(PointerOps.front());
|
|
}))
|
|
VectorGEPCost += getScalarizationOverhead(
|
|
TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
|
|
/*Insert=*/true, /*Extract=*/false, CostKind);
|
|
else
|
|
VectorGEPCost +=
|
|
getScalarizationOverhead(
|
|
TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
|
|
/*Insert=*/true, /*Extract=*/false, CostKind) +
|
|
::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
|
|
CostKind);
|
|
}
|
|
switch (LS) {
|
|
case LoadsState::Vectorize:
|
|
VecLdCost +=
|
|
TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
|
|
LI0->getPointerAddressSpace(), CostKind,
|
|
TTI::OperandValueInfo()) +
|
|
VectorGEPCost;
|
|
break;
|
|
case LoadsState::StridedVectorize:
|
|
VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
|
|
LI0->getPointerOperand(),
|
|
/*VariableMask=*/false,
|
|
CommonAlignment, CostKind) +
|
|
VectorGEPCost;
|
|
break;
|
|
case LoadsState::CompressVectorize:
|
|
VecLdCost += TTI.getMaskedMemoryOpCost(
|
|
Instruction::Load, SubVecTy, CommonAlignment,
|
|
LI0->getPointerAddressSpace(), CostKind) +
|
|
VectorGEPCost +
|
|
::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
|
|
{}, CostKind);
|
|
break;
|
|
case LoadsState::ScatterVectorize:
|
|
VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
|
|
LI0->getPointerOperand(),
|
|
/*VariableMask=*/false,
|
|
CommonAlignment, CostKind) +
|
|
VectorGEPCost;
|
|
break;
|
|
case LoadsState::Gather:
|
|
// Gathers are already calculated - ignore.
|
|
continue;
|
|
}
|
|
SmallVector<int> ShuffleMask(VL.size());
|
|
for (int Idx : seq<int>(0, VL.size()))
|
|
ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
|
|
if (I > 0)
|
|
VecLdCost +=
|
|
::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
|
|
CostKind, I * VF, SubVecTy);
|
|
}
|
|
// If masked gather cost is higher - better to vectorize, so
|
|
// consider it as a gather node. It will be better estimated
|
|
// later.
|
|
if (MaskedGatherCost >= VecLdCost &&
|
|
VecLdCost - GatherCost < -SLPCostThreshold) {
|
|
if (BestVF)
|
|
*BestVF = VF;
|
|
return true;
|
|
}
|
|
}
|
|
return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
|
|
};
|
|
// TODO: need to improve analysis of the pointers, if not all of them are
|
|
// GEPs or have > 2 operands, we end up with a gather node, which just
|
|
// increases the cost.
|
|
Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
|
|
bool ProfitableGatherPointers =
|
|
L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
|
|
return L->isLoopInvariant(V);
|
|
})) <= Sz / 2;
|
|
if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
|
|
auto *GEP = dyn_cast<GetElementPtrInst>(P);
|
|
return (!GEP && doesNotNeedToBeScheduled(P)) ||
|
|
(GEP && GEP->getNumOperands() == 2 &&
|
|
isa<Constant, Instruction>(GEP->getOperand(1)));
|
|
})) {
|
|
// Check if potential masked gather can be represented as series
|
|
// of loads + insertsubvectors.
|
|
// If masked gather cost is higher - better to vectorize, so
|
|
// consider it as a gather node. It will be better estimated
|
|
// later.
|
|
if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
|
|
ProfitableGatherPointers))
|
|
return LoadsState::ScatterVectorize;
|
|
}
|
|
|
|
return LoadsState::Gather;
|
|
}
|
|
|
|
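/// Groups the given pointer operands by (basic block, underlying object) base
/// and sorts each group by constant offset, trying to expose runs of
/// consecutive accesses. On success, fills \p SortedIndices with the
/// permutation that places related pointers next to each other.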
static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
                                   ArrayRef<BasicBlock *> BBs, Type *ElemTy,
                                   const DataLayout &DL, ScalarEvolution &SE,
                                   SmallVectorImpl<unsigned> &SortedIndices) {
  assert(
      all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
      "Expected list of pointer operands.");
  // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
  // Ptr into, sort and return the sorted indices with values next to one
  // another.
  SmallMapVector<std::pair<BasicBlock *, Value *>,
                 SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>,
                 8>
      Bases;
  Bases
      .try_emplace(std::make_pair(
          BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
      .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);

  SortedIndices.clear();
  for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
    auto Key = std::make_pair(BBs[Cnt + 1],
                              getUnderlyingObject(Ptr, RecursionMaxDepth));
    bool Found = any_of(Bases.try_emplace(Key).first->second,
                        [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
                          std::optional<int> Diff = getPointersDiff(
                              ElemTy, std::get<0>(Base.front()), ElemTy,
                              Ptr, DL, SE,
                              /*StrictCheck=*/true);
                          if (!Diff)
                            return false;

                          Base.emplace_back(Ptr, *Diff, Cnt + 1);
                          return true;
                        });

    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return false;

      // Not found already - add a new Base.
      Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
    }
  }

  if (Bases.size() == VL.size())
    return false;

  if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
                            Bases.front().second.size() == VL.size()))
    return false;

  // For each of the bases sort the pointers by Offset and check if any of the
  // bases become consecutively allocated.
  auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
    SmallPtrSet<Value *, 13> FirstPointers;
    SmallPtrSet<Value *, 13> SecondPointers;
    Value *P1 = Ptr1;
    Value *P2 = Ptr2;
    unsigned Depth = 0;
    while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
      if (P1 == P2 || Depth > RecursionMaxDepth)
        return false;
      FirstPointers.insert(P1);
      SecondPointers.insert(P2);
      P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
      P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
      ++Depth;
    }
    assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
           "Unable to find matching root.");
    return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
  };
  for (auto &Base : Bases) {
    for (auto &Vec : Base.second) {
      if (Vec.size() > 1) {
        stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
                            const std::tuple<Value *, int, unsigned> &Y) {
          return std::get<1>(X) < std::get<1>(Y);
        });
        int InitialOffset = std::get<1>(Vec[0]);
        bool AnyConsecutive =
            all_of(enumerate(Vec), [InitialOffset](const auto &P) {
              return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
            });
        // Fill the SortedIndices array only if it looks worthwhile to sort
        // the ptrs.
        if (!AnyConsecutive)
          return false;
      }
    }
    stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
      return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
    });
  }

  for (auto &T : Bases)
    for (const auto &Vec : T.second)
      for (const auto &P : Vec)
        SortedIndices.push_back(std::get<2>(P));

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
  return true;
}

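// Tries to find a usable order for a gather node of simple loads by
// clustering their pointer operands, even if the loads themselves are not
// fully consecutive.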
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}

/// Check if two insertelement instructions are from the same buildvector.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic block.
  if (VU->getParent() != V->getParent())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}

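// Computes the preferred reordering (if any) for the node \p TE: either the
// stored ReorderIndices, an order derived from reused/extracted scalars, a
// users-based order for PHIs, or an order that makes gathered loads
// vectorizable.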
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if we need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that the reuse mask is "clustered", i.e. each scalar
    // value is used once in each submask of size <number_of_scalars>.
    // Example: 4 scalar values.
    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered,
    //                           because element 3 is used twice in the second
    //                           submask.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
                                                2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF && I + K < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of the VF size; need to reorder reuses shuffles, they
    // are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex ||
       !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      if (!DT->isReachableFromEntry(BB1))
        return false;
      if (!DT->isReachableFromEntry(BB2))
        return true;
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      if (V1->getNumUses() < V2->getNumUses())
        return true;
      if (V1->getNumUses() > V2->getNumUses())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && !IE2)
        return true;
      if (!IE1 && IE2)
        return false;
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && !EE2)
        return true;
      if (!EE1 && EE2)
        return false;
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        if (!Inst2 && !P2)
          return Inst1 || P1;
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        if (!P1 && P2)
          return false;
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(TE.getMainOp())) &&
      allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, the whole graph
    // might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
          Res == LoadsState::CompressVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
        return CurrentOrder;
  }
  return std::nullopt;
}

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}

void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses there is no need to do
  // anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}

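/// Merges \p SecondaryOrder into \p Order: positions still unset in \p Order
/// (marked with the size value) are filled from \p SecondaryOrder (or with the
/// identity index) when the corresponding slot is not already used.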
static void combineOrders(MutableArrayRef<unsigned> Order,
                          ArrayRef<unsigned> SecondaryOrder) {
  assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
         "Expected same size of orders");
  unsigned Sz = Order.size();
  SmallBitVector UsedIndices(Sz);
  for (unsigned Idx : seq<unsigned>(0, Sz)) {
    if (Order[Idx] != Sz)
      UsedIndices.set(Order[Idx]);
  }
  if (SecondaryOrder.empty()) {
    for (unsigned Idx : seq<unsigned>(0, Sz))
      if (Order[Idx] == Sz && !UsedIndices.test(Idx))
        Order[Idx] = Idx;
  } else {
    for (unsigned Idx : seq<unsigned>(0, Sz))
      if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
          !UsedIndices.test(SecondaryOrder[Idx]))
        Order[Idx] = SecondaryOrder[Idx];
  }
}

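// Heuristically decides whether reordering the graph is likely to pay off.
// Small trees are always considered profitable; trees rooted at a store/PHI
// (or a tiny ptrtoint/icmp) that consist mostly of PHIs, GEPs and binary
// operations, with at most a limited number of gather load nodes, are not.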
bool BoUpSLP::isProfitableToReorder() const {
  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only a single store and a single (unordered) load
    // node, other nodes are phis or geps/binops, combined with phis, and/or a
    // single gather load node.
    bool HasPhis = false;
    if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (!TE->hasState()) {
        if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
            all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
          continue;
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
          continue;
        return true;
      }
      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          HasLoad = false;
          continue;
        }
        if (HasLoad)
          return true;
        ++GatherLoads;
        if (GatherLoads >= GatherLoadsLimit)
          return true;
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          Instruction::isBinaryOp(TE->getOpcode()))
        continue;
      if (TE->getOpcode() != Instruction::PHI)
        return true;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      HasPhis = true;
    }
    return !HasPhis;
  }
  return true;
}

void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                                          ArrayRef<int> MaskOrder) {
  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
  SmallVector<int> NewMask(getVectorFactor());
  SmallVector<int> NewMaskOrder(getVectorFactor());
  std::iota(NewMask.begin(), NewMask.end(), 0);
  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
  if (Idx == 0) {
    copy(Mask, NewMask.begin());
    copy(MaskOrder, NewMaskOrder.begin());
  } else {
    assert(Idx == 1 && "Expected either 0 or 1 index.");
    unsigned Offset = CombinedEntriesWithIndices.back().second;
    for (unsigned I : seq<unsigned>(Mask.size())) {
      NewMask[I + Offset] = Mask[I] + Offset;
      NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
    }
  }
  reorderScalars(Scalars, NewMask);
  reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
  if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
}

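// Reorders the graph top-to-bottom: collects the preferred orders per
// vectorization factor, picks the most used order among the nodes with the
// same VF and applies it to those nodes and their operands.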
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;

  // Phi nodes can have preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;

  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction in x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;

  // Maps a TreeEntry to the reorder indices of external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering of
  // extracts.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(TE.get(),
                                         std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
    // to take into account their order when looking for the most used order.
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
      // TODO: Check the reverse order too.
    }

    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode vectorization,
      // better to reorder them during the bottom-to-top stage. If we follow the
      // order here, it causes reordering of the whole graph, though actually it
      // is profitable just to reorder the subgraph that starts from the
      // alternate opcode vectorization node. Such nodes already end up with the
      // shuffle instruction and it is just enough to change this shuffle rather
      // than rotate the scalars for the whole graph.
      unsigned Cnt = 0;
      const TreeEntry *UserTE = TE.get();
      while (UserTE && Cnt < RecursionMaxDepth) {
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
        ++Cnt;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder scalar elements in the nodes according to this
    // mostly used order.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // Delete VF entry upon exit.
    auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });

    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    SmallPtrSet<const TreeEntry *, 4> VisitedOps;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes, still need to extend and to use
      // shuffle, just need to merge reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      // Count number of orders uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use natural order,
        // it is an attempt to reorder node with reused scalars but with
        // external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if a filled identity (non-empty order) is
      // found with the same number of uses as the new candidate order, we can
      // choose this candidate order.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        combineOrders(Pair.first, BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // Need to reorder the reuses masks of the operands with smaller VF to
          // be able to find the match between the graph nodes and scalar
          // operands of the given node during vectorization/cost estimation.
          assert(
              (!TE->UserTreeIndex ||
               TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
               TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
               TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
              "All users must be of VF size.");
          if (SLPReVec) {
            assert(SLPReVec && "Only supported by REVEC.");
            // ShuffleVectorInst does not do reorderOperands (and it should not
            // because ShuffleVectorInst supports only a limited set of
            // patterns). Only do reorderNodeWithReuses if the user is not
            // ShuffleVectorInst.
            if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
                isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
              continue;
          }
          // Update ordering of the operands with the smaller VF than the given
          // one.
          reorderNodeWithReuses(*TE, Mask);
          // Update orders in user split vectorize nodes.
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
        }
        continue;
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
                InsertElementInst>(TE->getMainOp()) ||
            (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
        assert(
            (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
                                     TE->ReuseShuffleIndices.empty())) &&
            "Alternate instructions are only supported by BinaryOperator "
            "and CastInst.");
        // Build correct orders for extract{element,value}, loads,
        // stores and alternate (split) nodes.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements to avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      } else if (TE->UserTreeIndex &&
                 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
        // Update orders in user split vectorize nodes.
        TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
                                                   Mask, MaskOrder);
    }
  }
}

bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required. Similar to the gathers, so
      // simply add to the list of gathered ops.
      // If there are reused scalars, process this node as a regular vectorize
      // node, just reorder reuses mask.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          TE->State != TreeEntry::CompressVectorize &&
                          TE->State != TreeEntry::SplitVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (TE->UserTreeIndex.UserTE == UserTE &&
                       TE->UserTreeIndex.EdgeIdx == I) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}

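// Reorders the graph bottom-to-top: propagates the orders of the leaf nodes
// to their users, trying to keep at least one operand in the natural order
// and to sink the remaining reorderings into shuffles closer to the root.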
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
|
|
struct TreeEntryCompare {
|
|
bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
|
|
if (LHS->UserTreeIndex && RHS->UserTreeIndex)
|
|
return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
|
|
return LHS->Idx < RHS->Idx;
|
|
}
|
|
};
|
|
PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
|
|
DenseSet<const TreeEntry *> GathersToOrders;
|
|
// Find all reorderable leaf nodes with the given VF.
|
|
// Currently the are vectorized loads,extracts without alternate operands +
|
|
// some gathering of extracts.
|
|
SmallVector<TreeEntry *> NonVectorized;
|
|
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
|
|
if (TE->State != TreeEntry::Vectorize &&
|
|
TE->State != TreeEntry::StridedVectorize &&
|
|
TE->State != TreeEntry::CompressVectorize &&
|
|
TE->State != TreeEntry::SplitVectorize)
|
|
NonVectorized.push_back(TE.get());
|
|
if (std::optional<OrdersType> CurrentOrder =
|
|
getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
|
|
Queue.push(TE.get());
|
|
if (!(TE->State == TreeEntry::Vectorize ||
|
|
TE->State == TreeEntry::StridedVectorize ||
|
|
TE->State == TreeEntry::CompressVectorize ||
|
|
TE->State == TreeEntry::SplitVectorize) ||
|
|
!TE->ReuseShuffleIndices.empty())
|
|
GathersToOrders.insert(TE.get());
|
|
}
|
|
}
|
|
|
|
// 1. Propagate order to the graph nodes, which use only reordered nodes.
|
|
// I.e., if the node has operands, that are reordered, try to make at least
|
|
// one operand order in the natural order and reorder others + reorder the
|
|
// user node itself.
|
|
SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
|
|
while (!Queue.empty()) {
|
|
// 1. Filter out only reordered nodes.
|
|
std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
|
|
TreeEntry *TE = Queue.top();
|
|
const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
|
|
Queue.pop();
|
|
SmallVector<TreeEntry *> OrderedOps(1, TE);
|
|
while (!Queue.empty()) {
|
|
TE = Queue.top();
|
|
if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
|
|
break;
|
|
Queue.pop();
|
|
OrderedOps.push_back(TE);
|
|
}
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
        continue;
      // Build a map between user nodes and their operands order to speed up
      // the search. The graph currently does not provide this dependency
      // directly.
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    }
    if (Users.first) {
      auto &Data = Users;
      if (Data.first->State == TreeEntry::SplitVectorize) {
        assert(
            Data.second.size() <= 2 &&
            "Expected not greater than 2 operands for split vectorize node.");
        if (any_of(Data.second,
                   [](const auto &Op) { return !Op.second->UserTreeIndex; }))
          continue;
        // Update orders in user split vectorize nodes.
        assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
               "Expected exactly 2 entries.");
        for (const auto &P : Data.first->CombinedEntriesWithIndices) {
          TreeEntry &OpTE = *VectorizableTree[P.first].get();
          OrdersType Order = OpTE.ReorderIndices;
          if (Order.empty()) {
            if (!OpTE.isGather())
              continue;
            const auto BestOrder =
                getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
            if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
              continue;
            Order = *BestOrder;
          }
          fixupOrderingIndices(Order);
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          const unsigned E = Order.size();
          SmallVector<int> MaskOrder(E, PoisonMaskElem);
          transform(Order, MaskOrder.begin(), [E](unsigned I) {
            return I < E ? static_cast<int>(I) : PoisonMaskElem;
          });
          Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
          // Clear ordering of the operand.
          if (!OpTE.ReorderIndices.empty()) {
            OpTE.ReorderIndices.clear();
          } else if (!OpTE.ReuseShuffleIndices.empty()) {
            reorderReuses(OpTE.ReuseShuffleIndices, Mask);
          } else {
            assert(OpTE.isGather() && "Expected only gather/buildvector node.");
            reorderScalars(OpTE.Scalars, Mask);
          }
        }
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(Data.first);
        }
        continue;
      }
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        Visited.insert_range(llvm::make_second_range(Data.second));
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though it might not be
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false,
                                     IgnoreReorder)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully non-ordered
        // orders.
        if (Order.size() == 1)
          continue;

        // Check that the reordering does not increase number of shuffles, i.e.
        // same-value nodes have the same parents or their parents have the
        // same parents.
        if (!Order.empty() && !isIdentityOrder(Order)) {
          Value *Root = OpTE->hasState()
                            ? OpTE->getMainOp()
                            : *find_if_not(OpTE->Scalars, isConstant);
          auto GetSameNodesUsers = [&](Value *Root) {
            SmallSetVector<TreeEntry *, 4> Res;
            for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
              if (TE != OpTE && TE->UserTreeIndex &&
                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
                  TE->Scalars.size() == OpTE->Scalars.size() &&
                  ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
                   (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
                Res.insert(TE->UserTreeIndex.UserTE);
            }
            for (const TreeEntry *TE : getTreeEntries(Root)) {
              if (TE != OpTE && TE->UserTreeIndex &&
                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
                  TE->Scalars.size() == OpTE->Scalars.size() &&
                  ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
                   (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
                Res.insert(TE->UserTreeIndex.UserTE);
            }
            return Res.takeVector();
          };
          auto GetNumOperands = [](const TreeEntry *TE) {
            if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
              return CI->arg_size();
            return TE->getNumOperands();
          };
          auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                       const TreeEntry *TE) {
            Intrinsic::ID ID = Intrinsic::not_intrinsic;
            if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
              ID = getVectorIntrinsicIDForCall(CI, TLI);
            for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
              if (ID != Intrinsic::not_intrinsic &&
                  isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
                continue;
              const TreeEntry *Op = getOperandEntry(TE, Idx);
              if (Op->isGather() && Op->hasState()) {
                const TreeEntry *VecOp =
                    getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
                if (VecOp)
                  Op = VecOp;
              }
              if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
                return false;
            }
            return true;
          };
          SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
          if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
                if (!RevisitedOps.insert(UTE).second)
                  return false;
                return UTE == Data.first || !UTE->ReorderIndices.empty() ||
                       !UTE->ReuseShuffleIndices.empty() ||
                       (UTE->UserTreeIndex &&
                        UTE->UserTreeIndex.UserTE == Data.first) ||
                       (Data.first->UserTreeIndex &&
                        Data.first->UserTreeIndex.UserTE == UTE) ||
                       (IgnoreReorder && UTE->UserTreeIndex &&
                        UTE->UserTreeIndex.UserTE->Idx == 0) ||
                       NodeShouldBeReorderedWithOperands(UTE);
              }))
            continue;
          for (TreeEntry *UTE : Users) {
            Intrinsic::ID ID = Intrinsic::not_intrinsic;
            if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
              ID = getVectorIntrinsicIDForCall(CI, TLI);
            for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
              if (ID != Intrinsic::not_intrinsic &&
                  isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
                continue;
              const TreeEntry *Op = getOperandEntry(UTE, Idx);
              Visited.erase(Op);
              Queue.push(const_cast<TreeEntry *>(Op));
            }
          }
        }
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(TE))
              return !getReorderingData(*TE, /*TopToBottom=*/false,
                                        IgnoreReorder)
                          .value_or(OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        if (OpTE->UserTreeIndex) {
          TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering.
          // Currently look up just 1 level of operands to avoid increase of
          // the compile time.
          // Profitable to reorder if definitely more operands allow
          // reordering rather than those with natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           Op.second->UserTreeIndex.UserTE == UserTE;
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        Visited.insert_range(llvm::make_second_range(Data.second));
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer the identity order. But if a filled (non-empty) identity
        // order is found with the same number of uses as the new candidate
        // order, we can choose the candidate order instead.
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(BestOrder)) {
        Visited.insert_range(llvm::make_second_range(Data.second));
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            TE->State != TreeEntry::CompressVectorize &&
            TE->State != TreeEntry::SplitVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers we just need to reorder their scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        Visited.insert(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize ||
          Data.first->State == TreeEntry::CompressVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
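
// For strided loads/stores vectorized in reverse order the "root" scalar of
// an entry is the one at position ReorderIndices.front(); for all other
// entries it is simply the first scalar.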
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
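
// A scalar has an "external use" when a user instruction is not part of the
// vectorized tree and therefore still needs the scalar value, e.g.
// (illustrative IR):
//   %a0 = add i32 %x0, 1        ; vectorized as lane 0
//   %a1 = add i32 %x1, 1        ; vectorized as lane 1
//   call void @use(i32 %a0)     ; external user - lane 0 must be extracted
// Each such (scalar, user, lane) triple is recorded in ExternalUses so an
// extractelement can be emitted later.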
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
            !UseEntries.empty()) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (all_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) &&
                   "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
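
// Stores are grouped by the triple {parent basic block, stored value type,
// underlying pointer object}, keeping at most one store per lane per group,
// so every group is a candidate for a single vector store.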
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip the entry if it is already vectorized.
      if (isVectorized(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}
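
// Example (illustrative): for stores with offsets {0, 2, 1, 3} relative to
// the first store, the sorted offsets {0, 1, 2, 3} are consecutive, so the
// group can form a vector, and ReorderIndices becomes {0, 2, 1, 3}, mapping
// each store to its position in memory order.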
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
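
// Collects the reorder indices implied by external store users of the given
// tree entry. An order is produced only for store groups that cover all of
// the entry's lanes and are consecutive in memory.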
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into `ExternalReorderIndices`
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

/// Tries to find a subvector of loads and builds a new vector of only loads
/// if it can be profitable.
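/// Loads are clustered by parent block, type and underlying pointer object;
/// within a cluster each load is keyed by its pointer distance from the
/// cluster's first load (e.g., loads of p[0], p[4] and p[1] from the same
/// base pointer p form one cluster with distances {0, 4, 1}).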
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  Type *ScalarTy = getValueType(VL.front());
  if (!isValidElementType(ScalarTy))
    return;
  SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
  SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
                 getUnderlyingObject(Data.front().first->getPointerOperand(),
                                     RecursionMaxDepth) &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        SmallVector<std::pair<int, int>> Res;
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                (has_single_bit(Data.size() + NumUniques) ||
                 bit_ceil(Data.size()) <
                     bit_ceil(Data.size() + NumUniques))))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert_range(LocalToAdd);
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
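
// Gathered loads are processed per {basic block, pointer base, type} group:
// loads are sorted by decreasing pointer distance, split into maximal runs of
// consecutive distances, and each run is tried with decreasing candidate
// vector factors. E.g. (illustrative), distances {5, 4, 3, 1, 0} give one
// consecutive run {5, 4, 3} and a separate run {1, 0}.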
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                         8> &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                       const std::pair<LoadInst *, int> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
    ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
                             Loads.size());
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
                                    BoUpSLP::ValueSet &VectorizedLoads,
                                    SmallVectorImpl<LoadInst *> &NonVectorized,
                                    bool Final, unsigned MaxVF) {
    SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
    if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
      CandidateVFs.push_back(MaxVF);
    for (int NumElts = getFloorFullVectorNumberOfElements(
             *TTI, Loads.front()->getType(), MaxVF);
         NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                          *TTI, Loads.front()->getType(), NumElts - 1)) {
      CandidateVFs.push_back(NumElts);
      if (VectorizeNonPowerOf2 && NumElts > 2)
        CandidateVFs.push_back(NumElts - 1);
    }

    if (Final && CandidateVFs.empty())
      return Results;

    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
        continue;
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
        ArrayRef<LoadInst *> Slice =
            ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
            areKnownNonVectorizableLoads(Slice))
          continue;
        // Check if it is profitable to try vectorizing gathered loads. It is
        // profitable if we have more than 3 consecutive loads or if we have
        // fewer but all users are vectorized or deleted.
        bool AllowToVectorize = false;
        // Check if it is profitable to vectorize 2-element loads.
        if (NumElts == 2) {
          bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
              Slice.front()->getType(), ElementCount::getFixed(NumElts));
          auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
            for (LoadInst *LI : Slice) {
              // If single use/user - allow to vectorize.
              if (LI->hasOneUse())
                continue;
              // 1. Check if number of uses equals number of users.
              // 2. All users are deleted.
              // 3. The load broadcasts are not allowed or the load is not
              // broadcasted.
              if (static_cast<unsigned int>(std::distance(
                      LI->user_begin(), LI->user_end())) != LI->getNumUses())
                return false;
              if (!IsLegalBroadcastLoad)
                continue;
              if (LI->hasNUsesOrMore(UsesLimit))
                return false;
              for (User *U : LI->users()) {
                if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
                  continue;
                for (const TreeEntry *UTE : getTreeEntries(U)) {
                  for (int I : seq<int>(UTE->getNumOperands())) {
                    if (all_of(UTE->getOperand(I), [LI](Value *V) {
                          return V == LI || isa<PoisonValue>(V);
                        }))
                      // Found legal broadcast - do not vectorize.
                      return false;
                  }
                }
              }
            }
            return true;
          };
          AllowToVectorize = CheckIfAllowed(Slice);
        } else {
          AllowToVectorize =
              (NumElts >= 3 ||
               any_of(ValueToGatherNodes.at(Slice.front()),
                      [=](const TreeEntry *TE) {
                        return TE->Scalars.size() == 2 &&
                               ((TE->Scalars.front() == Slice.front() &&
                                 TE->Scalars.back() == Slice.back()) ||
                                (TE->Scalars.front() == Slice.back() &&
                                 TE->Scalars.back() == Slice.front()));
                      })) &&
              hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                       Slice.size());
        }
        if (AllowToVectorize) {
          SmallVector<Value *> PointerOps;
          OrdersType CurrentOrder;
          // Try to build vector load.
          ArrayRef<Value *> Values(
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
                                            PointerOps, &BestVF);
          if (LS != LoadsState::Gather ||
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (LS == LoadsState::ScatterVectorize) {
              if (MaskedGatherVectorized.empty() ||
                  Cnt >= MaskedGatherVectorized.back() + NumElts)
                MaskedGatherVectorized.push_back(Cnt);
              continue;
            }
            if (LS != LoadsState::Gather) {
              Results.emplace_back(Values, LS);
              VectorizedLoads.insert_range(Slice);
              // If we vectorized initial block, no need to try to vectorize it
              // again.
              if (Cnt == StartIdx)
                StartIdx += NumElts;
            }
            // Check if the whole array was vectorized already - exit.
            if (StartIdx >= Loads.size())
              break;
            // Erase last masked gather candidate, if another candidate within
            // the range is found to be better.
            if (!MaskedGatherVectorized.empty() &&
                Cnt < MaskedGatherVectorized.back() + NumElts)
              MaskedGatherVectorized.pop_back();
            Cnt += NumElts - 1;
            continue;
          }
        }
        if (!AllowToVectorize || BestVF == 0)
          registerNonVectorizableLoads(Slice);
      }
      // Mark masked gather candidates as vectorized, if any.
      for (unsigned Cnt : MaskedGatherVectorized) {
        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        ArrayRef<Value *> Values(
            reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
        Results.emplace_back(Values, LoadsState::ScatterVectorize);
        VectorizedLoads.insert_range(Slice);
        // If we vectorized initial block, no need to try to vectorize it again.
        if (Cnt == StartIdx)
          StartIdx += NumElts;
      }
    }
    for (LoadInst *LI : Loads) {
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
    }
    return Results;
  };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
            if (isVectorized(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<unsigned>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              all_of(Results,
                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
                       return P.second == LoadsState::ScatterVectorize;
                     })) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
              for (Value *L : Slice)
                if (!isVectorized(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }

            // Select maximum VF as a maximum of user gathered nodes and
            // distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is the part of the interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (isVectorized(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents interleaved load operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                // Segmented load detected - vectorize at maximum vector factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order,
                                      PointerOps) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes -
              // just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           const TreeEntry &TE =
                               *VectorizableTree[std::get<0>(P)];
                           ArrayRef<Value *> VL = TE.Scalars;
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           LoadsState State = canVectorizeLoads(
                               VL, VL.front(), Order, PointerOps);
                           if (State == LoadsState::ScatterVectorize ||
                               State == LoadsState::CompressVectorize)
                             return false;
                           ConsecutiveNodesSize += VL.size();
                           unsigned Start = std::distance(Slice.begin(), It);
                           unsigned Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(std::distance(Slice.begin(), It),
                                              VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (isVectorized(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry that is
                // not equal to the entry itself.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
              return S + LoadsDists.size();
            }) != NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads to other list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTree_rec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, there are no gathered-load entries to
  // handle.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}

/// Generates a key/subkey pair for the given value to provide effective
/// sorting of the values and better detection of vectorizable value
/// sequences. The keys/subkeys can be used for better sorting of the values
/// themselves (keys) and within value subgroups (subkeys).
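/// For example (illustrative), two simple loads typically share the subkey
/// produced by \p LoadsSubkeyGenerator, while a non-simple (e.g. volatile)
/// load hashes to its own unique key/subkey pair.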
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes except for CMPInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      if (AllowAlternate)
        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
      else
        Key = hash_combine(hash_value(I->getOpcode()), Key);
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
      if (isTriviallyVectorizable(ID)) {
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
      } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
        SubKey = hash_combine(hash_value(I->getOpcode()),
                              hash_value(Call->getCalledFunction()));
      } else {
        Key = hash_combine(hash_value(Call), Key);
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
      }
      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
        SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
                              hash_value(Op.Tag), SubKey);
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}

/// Checks if the specified instruction \p I is an alternate operation for
/// the given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);
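
// Alternate-opcode nodes (e.g., an interleaved fadd/fsub sequence such as
//   %a = fadd float %x0, %y0
//   %b = fsub float %x1, %y1
// ) are emitted as two vector instructions plus a shufflevector blending the
// results; the check below roughly weighs that against the buildvector
// alternative.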
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
                           Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain same values and create either perfect
  // diamond match or shuffled match.
  if (Operands.size() == 2) {
    // Do not count same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize the node if:
  // 1. At least a single operand is constant or splat.
  // 2. Operands have many loop invariants (the instructions are not loop
  //    invariant).
  // 3. At least a single unique operand is supposed to be vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) ||
                       (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
                        getSameOpcode(Op, *TLI)))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         isVectorized(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found first duplicate - need to add shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return isVectorized(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         // Do not vectorize node, if estimated number of vector instructions
         // is more than estimated number of buildvector instructions. Number
         // of vector operands is number of vector instructions + number of
         // vector instructions for operands (buildvectors). Number of
         // buildvector instructions is just number_of_operands *
         // number_of_scalars.
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}

/// Builds the argument types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
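/// E.g. (illustrative), for a call to llvm.ctlz.i32 with VF = 4 the value
/// operand widens to <4 x i32>, while the i1 "is-zero-poison" operand, being
/// a scalar operand of the intrinsic, keeps its scalar type.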
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Arg] : enumerate(CI->args())) {
    if (ID != Intrinsic::not_intrinsic) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
        ArgTys.push_back(Arg->getType());
        continue;
      }
      if (MinBW > 0) {
        ArgTys.push_back(
            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
        continue;
      }
    }
    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
  }
  return ArgTys;
}

/// Calculates the costs of a vectorized intrinsic (if possible) and a
/// vectorized function (if possible) call. Returns an invalid cost for the
/// corresponding calls if they cannot be vectorized or will be scalarized.
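/// The first member of the returned pair is the intrinsic cost, the second
/// the library-call cost; e.g., the library cost stays invalid when no
/// vector-function mapping exists for the call.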
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  auto LibCost = InstructionCost::getInvalid();
  if (!CI->isNoBuiltin() && VecFunc) {
    // Calculate the cost of the vector library call.
    // If the corresponding vector call is cheaper, return its cost.
    LibCost =
        TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
  }
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the vector intrinsic call.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  const InstructionCost ScalarLimit = 10000;
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
                                    LibCost.isValid() ? LibCost : ScalarLimit);
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
  if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
      (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
    IntrinsicCost = InstructionCost::getInvalid();

  return {IntrinsicCost, LibCost};
}
|
|
|
|
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    if (any_of(VL, [&](Value *V) {
          auto *EI = dyn_cast<ExtractElementInst>(V);
          if (!EI)
            return true;
          return isVectorized(EI->getOperand(0));
        }))
      return TreeEntry::NeedToGather;
    [[fallthrough]];
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
    // non-full registers).
    if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }

    if (any_of(VL, [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return all_of(VL, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments, and those arguments must be
      // identical across the calls for them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;

    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non-alternate shuffles.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcodes like add-sub, then do
      // not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}

namespace {
/// Helper class to correctly handle the operands of PHI nodes, based on the
/// \p Main PHINode's order of incoming basic blocks/values.
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        if (!DT.isReachableFromEntry(InBB)) {
          Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      if (!DT.isReachableFromEntry(InBB)) {
        Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
        continue;
      }
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      if (isa<PoisonValue>(V)) {
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        if (InBB == Main->getIncomingBlock(I)) {
          if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
            continue;
          Operands[I][Idx] = P->getIncomingValue(I);
          continue;
        }
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.front();
      for (unsigned I : IncomingValues.drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace

/// Returns the main/alternate instructions for the given \p VL. Unlike
/// getSameOpcode, this supports otherwise incompatible instructions for
/// better SplitVectorize node support.
/// \returns the first main/alternate instructions if \p VL contains only
/// poison values and instructions with no more than 2 distinct opcodes;
/// returns a pair of nullptrs otherwise.
static std::pair<Instruction *, Instruction *>
getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return {};
    if (!MainOp) {
      MainOp = I;
      continue;
    }
    if (MainOp->getOpcode() == I->getOpcode()) {
      if (I->getParent() != MainOp->getParent())
        return {};
      continue;
    }
    if (!AltOp) {
      AltOp = I;
      continue;
    }
    if (AltOp->getOpcode() == I->getOpcode()) {
      if (I->getParent() != AltOp->getParent())
        return {};
      continue;
    }
    return {};
  }
  if (!AltOp)
    return {};
  assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
}

/// Checks that every instruction appears once in the list and, if not, packs
/// the unique scalars, building the \p ReuseShuffleIndices mask. The list of
/// unique scalars is extended with poison values to the whole register size.
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo &TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool DoNotFail) {
  // Check that every instruction appears once in this bundle.
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
  for (Value *V : VL) {
    if (isConstant(V)) {
      ReuseShuffleIndices.emplace_back(
          isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
      UniqueValues.emplace_back(V);
      continue;
    }
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(V);
  }
  size_t NumUniqueScalarValues = UniqueValues.size();
  bool IsFullVectors = hasFullVectorsOrPowerOf2(
      TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
  if (NumUniqueScalarValues == VL.size() &&
      (VectorizeNonPowerOf2 || IsFullVectors)) {
    ReuseShuffleIndices.clear();
  } else {
    // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
        !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      return false;
    }
    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
        (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
           return isa<UndefValue>(V) || !isConstant(V);
         }))) {
      if (DoNotFail && UniquePositions.size() > 1 &&
          NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
          all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
        // Find the number of elements, which forms full vectors.
        unsigned PWSz = getFullVectorNumberOfElements(
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          ReuseShuffleIndices.clear();
        } else {
          NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
          NonUniqueValueVL.append(
              PWSz - UniqueValues.size(),
              PoisonValue::get(UniqueValues.front()->getType()));
          // Check that the operations extended with poisons are still valid
          // for vectorization (div/rem are not allowed).
          if (!getSameOpcode(NonUniqueValueVL, TLI).valid()) {
            LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
            return false;
          }
          VL = NonUniqueValueVL;
        }
        return true;
      }
      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
      return false;
    }
    VL = UniqueValues;
  }
  return true;
}
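
// Checks whether the bundle VL can form a SplitVectorize node: partitions the
// scalars into Op1 (main opcode) and Op2 (alternate opcode), computes
// ReorderIndices, and compares the estimated cost of the split against that
// of a single alternate node.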
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;

  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(VL)) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         I->getOpcode() == LocalState.getOpcode()) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, LocalState.getMainOp(),
                                 LocalState.getAltOp(), *TLI))) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(V);
  }
  Type *ScalarTy = getValueType(VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Build a split node only if the scalars do not form a legal alternate
  // instruction (like X86 addsub).
  SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
  SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
    return false;
  // Enable the split node only if all parts are power-of-2/full registers.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
  VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
  // Check for non-profitable single-register ops, which are better
  // represented as alternate ops.
  if (NumParts >= VL.size())
    return false;
  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = ::getShuffleCost(
      *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
  InstructionCost NewShuffleCost =
      ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (isa<PoisonValue>(VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                              VecTy, OriginalMask, Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
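
// Runs the legality checks for the bundle VL before a tree node is built:
// rejects catchswitch/unreachable blocks, duplicate tree entries, excessive
// recursion depth, scalable vectors, ephemeral values, and bundles unlikely
// to be profitable. TryToFindDuplicates and TrySplitVectorize tell the caller
// which fallbacks are still worth trying.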
bool BoUpSLP::isLegalToVectorizeScalars(ArrayRef<Value *> VL, unsigned Depth,
                                        const EdgeInfo &UserTreeIdx,
                                        InstructionsState &S,
                                        bool &TryToFindDuplicates,
                                        bool &TrySplitVectorize) const {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  S = getSameOpcode(VL, *TLI);
  TryToFindDuplicates = true;
  TrySplitVectorize = false;

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    // Do not try to pack to avoid extra instructions here.
    TryToFindDuplicates = false;
    return false;
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
    for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
      if (E->isSame(VL)) {
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return false;
      }
      SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
      if (all_of(VL, [&](Value *V) {
            return isa<PoisonValue>(V) || Values.contains(V);
          })) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
        return false;
      }
    }
  }

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
  // a load), in which case peek through to include it in the tree, without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return false;
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return false;
  }

  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    // Do not try to pack to avoid extra instructions here.
    TryToFindDuplicates = false;
    return false;
  }

  // If all of the operands are identical or constant we have a simple solution.
  // If we deal with insert/extract instructions, they all must have constant
  // indices, otherwise we should gather them, not try to vectorize.
  // If alternate op node with 2 elements with gathered operands - do not
  // vectorize.
  auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build a
    // regular vectorize node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S && allSameBlock(VL);
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
                       SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    if (!S) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n");
      TrySplitVectorize = true;
      return false;
    }
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    return false;
  }

  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        // Do not try to pack to avoid extra instructions here.
        TryToFindDuplicates = false;
        return false;
      }
    }
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Check whether any of the instructions in the bundle are already in the
  // tree and whether the node may not be profitable for vectorization as a
  // small alternate node.
  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
      APInt Extracted = APInt::getZero(VL.size());
      APInt Vectorized = APInt::getAllOnes(VL.size());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || doesNotNeedToBeScheduled(I) ||
            all_of(I->operands(), [&](const Use &U) {
              return isa<ExtractElementInst>(U.get());
            }))
          continue;
        if (isVectorized(I))
          Vectorized.clearBit(Idx);
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
          Extracted.setBit(Idx);
      }
      return std::make_pair(Vectorized, Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // Rough cost estimate: check whether the vector code (+ potential
      // extracts) is more profitable than scalar code + a buildvector.
      Type *ScalarTy = VL.front()->getType();
      auto *VecTy = getWidenedType(ScalarTy, VL.size());
      InstructionCost VectorizeCostEstimate =
          ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
          ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
                                     /*Insert=*/false, /*Extract=*/true, Kind);
      InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Vectorized,
          /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return false;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return false;
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = S.getMainOp();
  BB = VL0->getParent();

  if (S &&
      (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
       !DT->isReachableFromEntry(BB))) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    // Do not vectorize EH and non-returning blocks, not profitable in most
    // cases.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    return false;
  }
  return true;
}
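
// Recursively builds the vectorizable tree for the bundle VL: verifies
// legality, tries to build a split node for alternate opcodes, packs
// duplicate scalars, creates a tree entry matching the computed state, and
// recurses into the operands of each supported instruction kind.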
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx,
                            unsigned InterleaveFactor) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> NonUniqueValueVL(VL.begin(), VL.end());
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    if (tryToFindDuplicates(NonUniqueValueVL, ReuseShuffleIndices, *TTI, *TLI,
                            S, UserTreeIdx, DoNotFail)) {
      VL = NonUniqueValueVL;
      return true;
    }
    auto Invalid = ScheduleBundle::invalid();
    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
    return false;
  };

  InstructionsState S = InstructionsState::invalid();
  // Tries to build split node.
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices;
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;

    SmallVector<Value *> NewVL(VL.size());
    copy(Op1, NewVL.begin());
    copy(Op2, std::next(NewVL.begin(), Op1.size()));
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(Op, *TLI);
      if (S && (isa<LoadInst>(S.getMainOp()) ||
                getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
        // Build gather node for loads, they will be gathered later.
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        buildTree_rec(Op, Depth, {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };

  bool TryToPackDuplicates;
  bool TrySplitVectorize;
  if (!isLegalToVectorizeScalars(VL, Depth, UserTreeIdx, S, TryToPackDuplicates,
                                 TrySplitVectorize)) {
    if (TrySplitVectorize) {
      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
      // Last chance to try to vectorize alternate node.
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    if (!TryToPackDuplicates || TryToFindDuplicates(S)) {
      auto Invalid = ScheduleBundle::invalid();
      newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    }
    return;
  }

  // FIXME: investigate if there are profitable cases for VL.size() <= 4.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;

  // Perform specific checks for each particular instruction kind.
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    auto Invalid = ScheduleBundle::invalid();
    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }

  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize alternate node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    auto Invalid = ScheduleBundle::invalid();
    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI node creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTree_rec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);

    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    // Keeps the reordered operands to avoid code duplication.
    PHIHandler Handler(*DT, PH, VL);
    Handler.buildOperands();
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      TE->setOperand(I, Handler.getOperands(I));
    SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      Operands[I] = Handler.getOperands(I);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
      fixupOrderingIndices(CurrentOrder);
    }
    // Insert new order with initial value 0, if it does not exist,
    // otherwise return the iterator to the existing one.
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTree_rec() towards the operands.
    TE->setOperand(*this);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(CurrentOrder);
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      // Vectorizing non-consecutive loads with (masked)load + compress.
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with strided loads.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    TE->setOperand(*this);
    if (State == TreeEntry::ScatterVectorize)
      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperand(*this);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    ValueList Left, Right;
    VLOperands Ops(VL, S, *this);
    if (cast<CmpInst>(VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      Ops.reorder();
      Left = Ops.getVL(0);
      Right = Ops.getVL(1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (Cmp->getPredicate() != P0)
          std::swap(LHS, RHS);
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
    }
    TE->setOperand(0, Left);
    TE->setOperand(1, Right);
    buildTree_rec(Left, Depth + 1, {TE, 0});
    buildTree_rec(Right, Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    SmallVector<ValueList, 2> Operands(2);
    // Prepare the operand vector for pointer operands.
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GetElementPtrInst>(V);
      if (!GEP) {
        Operands.front().push_back(V);
        continue;
      }
      Operands.front().push_back(GEP->getPointerOperand());
    }
    TE->setOperand(0, Operands.front());
    // Need to cast all indices to the same type before vectorization to
    // avoid a crash.
    // Required to be able to find correct matches between different gather
    // nodes and reuse the vectorized values rather than trying to gather them
    // again.
    int IndexIdx = 1;
    Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
    Type *Ty = all_of(VL,
                      [VL0Ty, IndexIdx](Value *V) {
                        auto *GEP = dyn_cast<GetElementPtrInst>(V);
                        if (!GEP)
                          return true;
                        return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                      })
                   ? VL0Ty
                   : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                          ->getPointerOperandType()
                                          ->getScalarType());
    // Prepare the operand vector.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I) {
        Operands.back().push_back(
            ConstantInt::get(Ty, 0, /*isSigned=*/false));
        continue;
      }
      auto *Op = I->getOperand(IndexIdx);
      auto *CI = dyn_cast<ConstantInt>(Op);
      if (!CI)
        Operands.back().push_back(Op);
      else
        Operands.back().push_back(ConstantFoldIntegerCast(
            CI, Ty, CI->getValue().isSignBitSet(), *DL));
    }
    TE->setOperand(IndexIdx, Operands.back());

    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
          TE->dump());
    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    TE->setOperand(*this, isCommutative(VL0));
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // For scalar operands no need to create an entry since no need to
      // vectorize it.
      if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
        continue;
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
          TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (CI && any_of(VL, [](Value *V) {
          return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(S.getMainOp());
      auto *AltCI = cast<CmpInst>(S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP &&
             "Expected different main/alternate predicates.");
      ValueList Left, Right;
      // Collect operands - commute if it uses the swapped predicate or
      // alternate operation.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);

        if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
          if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        } else {
          if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        }
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      return;
    }

    TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
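
// Returns the number of scalar elements the aggregate type T would map to if
// flattened into a vector, or 0 if it cannot be mapped (empty or
// non-homogeneous struct, invalid element type, or a total size outside the
// vector register limits).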
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
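
// Checks whether the extracts in VL all read from a single source
// vector/aggregate at compatible offsets so that the source can be reused
// directly; fills CurrentOrder when the extracts form a non-identity
// permutation.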
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(
|
|
Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
|
|
return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
|
|
all_of(I->users(), [this](User *U) {
|
|
return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
|
|
(isa<ExtractElementInst>(U) && MustGather.contains(U));
|
|
});
|
|
}
|
|
|
|
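/// For example, for scalars {add, sub, add, sub} with \p IsAltOp matching the
/// subs and no reordering, the resulting mask is <0, Sz + 1, 2, Sz + 3>: main
/// operations are taken from the first vector, alternate operations from the
/// second one.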
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
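// For non-compare instructions this reduces to a plain opcode check, e.g.
// with MainOp = add and AltOp = sub every sub is classified as the alternate
// operation. Compares additionally account for swapped predicates below.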
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    if (isCmpSameOrSwapped(MainCI, CI, TLI))
      return false;
    if (isCmpSameOrSwapped(AltCI, CI, TLI))
      return true;
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
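/// For example, operands {4, 4, 4, 4} are classified as
/// OK_UniformConstantValue with OP_PowerOf2, while {1, 2, 3, 4} are
/// OK_NonUniformConstantValue with OP_None (3 is not a power of 2).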
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}

namespace {
/// The base class for shuffle instruction emission and shuffle cost
/// estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  /// \param IsStrict if true, the function returns false if the mask size
  /// does not match the vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                   ShuffleVectorInst::isIdentityMask(Slice, VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into a single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
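  /// For example, with LocalVF = 4, Mask = <1, 0, 3, 2> and
  /// ExtMask = <2, 3, 0, 1>, the combined mask is <3, 2, 1, 0>.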
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }

  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code. The function looks through the previously emitted shuffle
  /// instructions and properly marks indices in the mask as undef.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
  /// \endcode
  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// If 2 operands are of different size, the smallest one will be resized
  /// and the mask recalculated properly.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
  /// \endcode
  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations to simple vector merge, if
  /// possible.
  /// \param V The input vector which must be shuffled using the given \p
  /// Mask. If the better candidate is found, \p V is set to this best
  /// candidate vector.
  /// \param Mask The input mask for the shuffle. If the best candidate is
  /// found during the look-through-shuffles attempt, it is updated
  /// accordingly.
  /// \param SinglePermute true if the shuffle operation is originally a
  /// single-value-permutation. In this case the look-through-shuffles
  /// procedure may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in the non-resizing identity shuffle
  /// (and thus can be ignored), false - otherwise.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store the current mask in IdentityMask so that we do not lose
          // this info later, if IdentityOp is selected as the best candidate
          // for the permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this
      // Op and Mask will be used in the final shuffle.
      // Zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>,
      // which is expensive, and the analysis finds out that the source vector
      // is just a broadcast, this original mask can be transformed to the
      // identity mask <0, 1, 2, 3>.
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <3, 1, 2, 0>
      // \endcode
      // may be transformed to
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <0, 1, 2, 3>
      // \endcode
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
                            SV->getOperand(0),
                            buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
                            .all();
      bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
                            SV->getOperand(1),
                            buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
                            .all();
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
                 all_of(enumerate(Mask), [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }

  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
      Mask = NewMask;
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                   V2, buildUseMask(VF, Mask, UseMask::SecondArg))
                   .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
            isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
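  /// E.g., with Mask = <2, poison, 0>, elements 0 and 2 of \p CommonMask are
  /// remapped to their own positions (CommonMask[0] = 0, CommonMask[2] = 2),
  /// since after the emitted shuffle they are already in place.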
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I : seq<unsigned>(CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
} // namespace

/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plain wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // between them.
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since its cost is considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (whose cost is estimated separately).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}

void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if the node is small (just 2 elements), all-constant,
  // or if all instructions already have the same opcode.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(hash_value(LI->getParent()), Key);
    Value *Ptr =
        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI)) {
            hash_code SubKey = hash_value(RLI->getPointerOperand());
            return SubKey;
          }
        }
        if (LIt->second.size() > 2) {
          hash_code SubKey =
              hash_value(LIt->second.back()->getPointerOperand());
          return SubKey;
        }
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra
  // vectorized nodes.
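  // E.g., a gather like {a, x, a, y}, where a is a repeated instruction, is
  // regrouped so that equal and related scalars become adjacent, which may
  // expose a vectorizable subsequence.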
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
        !isDeleted(Inst) && !isVectorized(V)) {
      std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
                                             /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider this profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
                             Idx, getWidenedType(ScalarTy, Sz));
  }
  Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
                                   /*Insert=*/true,
                                   /*Extract=*/false, CostKind);
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V)) {
      ReorderMask[I] = PoisonMaskElem;
    } else if (isConstant(V) || DemandedElts[I]) {
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  ReorderMask.assign(Sz, PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost =
      getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
                               /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}

void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on, and off when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant
  // or
  // 2. Splats
  // or
  // 3. Results in good vectorization opportunity, i.e. may generate vector
  // nodes and reduce cost of the graph.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) || P.first == P.second;
                        }) ||
                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // Better to use the full gathered loads analysis, if there are only 2
  // gathered-load nodes, each having fewer than 16 elements.
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) == 2;

  // Checks if the scalars are used in another node.
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              function_ref<bool(Value *)> CheckContainer) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return true;
             auto *I = dyn_cast<Instruction>(V);
             if (!I)
               return false;
             return is_contained(TE->Scalars, I) || CheckContainer(I);
           });
  };
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                       return is_contained(TEs, TE);
                     });
            });
          }))
        return true;
      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                       return is_contained(TEs, TE);
                     });
            });
          }))
        return true;
    } else {
      // Check if the gather node is a full copy of a split node.
      auto *It = find_if(E.Scalars, IsaPred<Instruction>);
      if (It != E.Scalars.end()) {
        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                         return is_contained(TEs, TE);
                       });
              });
            }))
          return true;
      }
    }
    return false;
  };
  // The tree may grow here, so iterate over the nodes built before.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of other vector nodes.
      if (CheckForSameVectorNodes(E))
        continue;
      // Try to find vectorizable sequences and transform them into a series of
      // insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          // Reuse the existing node, if it fully matches the slice.
          if (isVectorized(Slice.front()) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
            continue;
          // Constant already handled effectively - skip.
          if (allConstant(Slice))
            continue;
          // Do not try to vectorize small splats (smaller than a vector
          // register and with only a single non-undef element).
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            unsigned NumRegs2VF = ::getNumberOfParts(
                *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              count(Slice, Slice.front()) ==
                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
                                                                   : 1)) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load &&
                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values or if all users are
              // vectorized. For expensive instructions extra extracts might
              // be profitable.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  TTI->getInstructionCost(S.getMainOp(), CostKind) <
                      TTI::TCC_Expensive &&
                  !all_of(Slice, [&](Value *V) {
                    if (isa<PoisonValue>(V))
                      return true;
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                 UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                LoadsState Res =
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
                // Do not vectorize gathers.
                if (Res == LoadsState::ScatterVectorize ||
                    Res == LoadsState::Gather) {
                  if (Res == LoadsState::Gather) {
                    registerNonVectorizableLoads(Slice);
                    // If this is a reduction and the scalars from the root
                    // node are analyzed - mark as a non-vectorizable
                    // reduction.
                    if (UserIgnoreList && E.Idx == 0)
                      analyzedReductionVals(Slice);
                  }
                  continue;
                }
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              S.getMainOp(),
                              cast<Instruction>(*find_if(reverse(Slice),
                                                         IsaPred<Instruction>)),
                              S))) {
                // Do not vectorize extractelements (handled effectively
                // already). Do not vectorize non-profitable instructions
                // (with low cost and non-vectorizable operands).
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {
            // If any instruction is vectorized already - do not try again.
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // Restore ordering, if no extra vectorization happened.
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
    }
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as a
      // strided load with stride -1.
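      // E.g., loads of a[3], a[2], a[1], a[0] can become a single strided
      // load starting at &a[3] with stride -1, instead of a wide load plus a
      // reverse shuffle.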
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as a
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive store
          // - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
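        // E.g., for factor 2 and 8 elements an interleave mask looks like
        // <0, 4, 1, 5, 2, 6, 3, 7>, pairing lanes from the two halves of the
        // stored vector.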
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }

          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }

  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}

/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles
/// emission, when the actual shuffle instruction is generated only if this is
/// actually required. Otherwise, the shuffle instruction emission is delayed
/// till the end of the process, to reduce the number of emitted instructions
/// and further analysis/transformations.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, still trying to estimate the cost for the same nodes and we
  /// can delay actual cost estimation (virtual shuffle instruction emission).
  /// May help better estimate the cost if same nodes must be permuted + allows
  /// to move most of the long shuffles cost estimation to TTI.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(
              IntegerType::get(Ty->getContext(),
                               DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      if (auto *VTy = dyn_cast<VectorType>(Ty))
        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
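      // E.g., a splat {x, x, x, x} is costed as a single insertelement plus
      // an SK_Broadcast shuffle when more than one lane actually uses x.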
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          return TTI.getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, {}, CostKind,
              std::distance(VL.begin(), It) * getNumElements(ScalarTy),
              cast<FixedVectorType>(ScalarTy));
        }
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }

      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                                 PoisonValue::get(VecTy), *It);
      return InsertCost + ::getShuffleCost(TTI,
                                           TargetTransformInfo::SK_Broadcast,
                                           VecTy, ShuffleMask, CostKind,
                                           /*Index=*/0, /*SubTp=*/nullptr,
                                           /*Args=*/*It);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };

  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
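    // E.g., with NumElts = 8 and EltsPerVector = 4, mask indices 0-3 fall
    // into register 0 and indices 4-7 into register 1; a sub-mask touching
    // more than 2 registers cannot be modeled as a per-register shuffle.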
auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
|
|
SmallVectorImpl<unsigned> &Indices)
|
|
-> std::optional<TTI::ShuffleKind> {
|
|
if (NumElts <= EltsPerVector)
|
|
return std::nullopt;
|
|
int OffsetReg0 =
|
|
alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
|
|
[](int S, int I) {
|
|
if (I == PoisonMaskElem)
|
|
return S;
|
|
return std::min(S, I);
|
|
}),
|
|
EltsPerVector);
|
|
int OffsetReg1 = OffsetReg0;
|
|
DenseSet<int> RegIndices;
|
|
// Check that if trying to permute same single/2 input vectors.
|
|
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
|
|
int FirstRegId = -1;
|
|
Indices.assign(1, OffsetReg0);
|
|
for (auto [Pos, I] : enumerate(Mask)) {
|
|
if (I == PoisonMaskElem)
|
|
continue;
|
|
int Idx = I - OffsetReg0;
|
|
int RegId =
|
|
(Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
|
|
if (FirstRegId < 0)
|
|
FirstRegId = RegId;
|
|
RegIndices.insert(RegId);
|
|
if (RegIndices.size() > 2)
|
|
return std::nullopt;
|
|
if (RegIndices.size() == 2) {
|
|
ShuffleKind = TTI::SK_PermuteTwoSrc;
|
|
if (Indices.size() == 1) {
|
|
OffsetReg1 = alignDown(
|
|
std::accumulate(
|
|
std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
|
|
[&](int S, int I) {
|
|
if (I == PoisonMaskElem)
|
|
return S;
|
|
int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
|
|
((I - OffsetReg0) % NumElts) / EltsPerVector;
|
|
if (RegId == FirstRegId)
|
|
return S;
|
|
return std::min(S, I);
|
|
}),
|
|
EltsPerVector);
|
|
Indices.push_back(OffsetReg1 % NumElts);
|
|
}
|
|
Idx = I - OffsetReg1;
|
|
}
|
|
I = (Idx % NumElts) % EltsPerVector +
|
|
(RegId == FirstRegId ? 0 : EltsPerVector);
|
|
}
|
|
return ShuffleKind;
|
|
};
|
|
InstructionCost Cost = 0;
|
|
|
|
// Process extracts in blocks of EltsPerVector to check if the source vector
|
|
// operand can be re-used directly. If not, add the cost of creating a
|
|
// shuffle to extract the values into a vector register.
|
|
for (unsigned Part : seq<unsigned>(NumParts)) {
|
|
if (!ShuffleKinds[Part])
|
|
continue;
|
|
ArrayRef<int> MaskSlice = Mask.slice(
|
|
Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
|
|
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
|
|
copy(MaskSlice, SubMask.begin());
|
|
SmallVector<unsigned, 2> Indices;
|
|
std::optional<TTI::ShuffleKind> RegShuffleKind =
|
|
CheckPerRegistersShuffle(SubMask, Indices);
|
|
if (!RegShuffleKind) {
|
|
if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
|
|
!ShuffleVectorInst::isIdentityMask(
|
|
MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
|
|
Cost +=
|
|
::getShuffleCost(TTI, *ShuffleKinds[Part],
|
|
getWidenedType(ScalarTy, NumElts), MaskSlice);
|
|
continue;
|
|
}
|
|
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
|
|
!ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
|
|
Cost +=
|
|
::getShuffleCost(TTI, *RegShuffleKind,
|
|
getWidenedType(ScalarTy, EltsPerVector), SubMask);
|
|
}
|
|
const unsigned BaseVF = getFullVectorNumberOfElements(
|
|
*R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
|
|
for (unsigned Idx : Indices) {
|
|
assert((Idx + EltsPerVector) <= BaseVF &&
|
|
"SK_ExtractSubvector index out of range");
|
|
Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
|
|
getWidenedType(ScalarTy, BaseVF), {}, CostKind,
|
|
Idx, getWidenedType(ScalarTy, EltsPerVector));
|
|
}
|
|
// Second attempt to check, if just a permute is better estimated than
|
|
// subvector extract.
|
|
SubMask.assign(NumElts, PoisonMaskElem);
|
|
copy(MaskSlice, SubMask.begin());
|
|
InstructionCost OriginalCost = ::getShuffleCost(
|
|
TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
|
|
if (OriginalCost < Cost)
|
|
Cost = OriginalCost;
|
|
}
|
|
return Cost;
|
|
}
|
|
/// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
|
|
/// mask \p Mask, register number \p Part, that includes \p SliceSize
|
|
/// elements.
|
|
void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
|
|
ArrayRef<int> Mask, unsigned Part,
|
|
unsigned SliceSize) {
|
|
if (SameNodesEstimated) {
|
|
// Delay the cost estimation if the same nodes are reshuffling.
|
|
// If we already requested the cost of reshuffling of E1 and E2 before, no
|
|
// need to estimate another cost with the sub-Mask, instead include this
|
|
// sub-Mask into the CommonMask to estimate it later and avoid double cost
|
|
// estimation.
|
|
if ((InVectors.size() == 2 &&
|
|
cast<const TreeEntry *>(InVectors.front()) == &E1 &&
|
|
cast<const TreeEntry *>(InVectors.back()) == E2) ||
|
|
(!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
|
|
unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
|
|
assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
|
|
[](int Idx) { return Idx == PoisonMaskElem; }) &&
|
|
"Expected all poisoned elements.");
|
|
ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
|
|
copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
|
|
return;
|
|
}
|
|
// Found non-matching nodes - need to estimate the cost for the matched
|
|
// and transform mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = dyn_cast<Value *>(P)) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }

  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }
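    // Note that the check above also treats a zero-offset extract-subvector
    // mask (e.g. {0, 1} taken from a wider source) as free, since it merely
    // selects the leading lanes unchanged.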

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // An empty mask or an identity mask is free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // An empty mask or an identity mask is free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *Ty, unsigned VF) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };

  /// Smart shuffle instruction emission: walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
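    // E.g., if the node was minimized to i16 while ScalarTy is i32, the
    // lambda above charges a <VF x i16> -> <VF x i32> sext/zext; in the
    // opposite direction it charges a trunc.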
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
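      // Distinct constant placeholders (zeroes vs. all-ones) stand in for the
      // two sources, so the cost model sees two different vectors of the
      // common widened type.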
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
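        // E.g., a mask like {0, 2, 4, 6} that picks every Factor-th lane of
        // an interleave group; the interleaved load is costed elsewhere, so
        // the pick is rewritten into an identity mask here.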
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not identity/broadcast? Try to see if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          any_of(CommonMask,
                 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
        SmallVector<int> ReorderMask;
        inversePermutation(E->ReorderIndices, ReorderMask);
        ::addMask(CommonMask, ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }

public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if the extracts can be considered as reused, i.e. if the same
    // extractelements were vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of the instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // instruction as dead and remove its cost from the final cost of the
        // vectorized tree.
        // Also, avoid adjusting the cost for extractelements with multiple uses
        // in different graph entries.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (!VEs.empty() && !is_contained(VEs, E)))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -=
                TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
                                             EE->getVectorOperandType(), Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
                             APInt::getZero(getNumElements(VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
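    // E.g., if only lanes 0 and 2 of a <4 x float> base feed this gather,
    // DemandedElts is 0b0101 and the cost of those two extracts is credited
    // back against the cost of the vectorized tree.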
    // Check that the gather of extractelements can be represented as just a
    // shuffle of one or two vectors from which the scalars are extracted,
    // i.e. that the bunch of extractelement instructions that must be
    // gathered into a vector is expressible as a permutation of the elements
    // of a single input vector or of 2 input vectors.
    // This is skipped if the same extractelements were vectorized already in
    // a previous node (PrevNodeFound).
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  /// Reset the builder to handle perfect diamond match.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
  /// Adds another input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
                  ->getNumElements());
    }
    InVectors.push_back(V1);
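    // Offset the lanes of the newly added input by VF: e.g., with VF == 4,
    // lane 1 of V1 becomes index 5 in the combined two-source mask below.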
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      for (Value *V : VL.take_front(VF)) {
        Type *ScalarTy = VLScalarTy->getScalarType();
        if (isa<PoisonValue>(V)) {
          Vals.push_back(PoisonValue::get(ScalarTy));
          continue;
        }
        if (isa<UndefValue>(V)) {
          Vals.push_back(UndefValue::get(ScalarTy));
          continue;
        }
        Vals.push_back(Constant::getNullValue(ScalarTy));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, we need to expand vector types into scalar
        // types.
        Vals = replicateMask(Vals, VecTy->getNumElements());
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
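        // Merge the common mask into the subvectors mask: occupied common
        // lanes are remapped past CommonMask.size() so that they address the
        // second source of the two-source permute below.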
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};

const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  ArrayRef<Value *> VL = E->getOperand(Idx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for a GEP bundle, which may include non-GEP values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx, VL, S))
    return VE;
  if (S || !isConstant(VL.front())) {
    for (const TreeEntry *VE :
         ValueToGatherNodes.lookup(S ? S.getMainOp() : VL.front()))
      if (VE->UserTreeIndex.EdgeIdx == Idx && VE->UserTreeIndex.UserTE == E) {
        assert(VE->isSame(VL) && "Expected gather node with same values.");
        return VE;
      }
  }
  const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
                           [&](const std::unique_ptr<TreeEntry> &TE) {
                             return (TE->isGather() ||
                                     TE->State == TreeEntry::SplitVectorize) &&
                                    TE->UserTreeIndex.EdgeIdx == Idx &&
                                    TE->UserTreeIndex.UserTE == E;
                           });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}

InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = getValueType(VL[0]);
  if (!isValidElementType(ScalarTy))
    return InstructionCost::getInvalid();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
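  // Note: EntryVF may exceed VL.size() when the entry repeats scalars via
  // ReuseShuffleIndices, so FinalVecTy (not VecTy) reflects the vector that
  // is really emitted.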

  if (E->isGather()) {
    if (allConstant(VL))
      return 0;
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                    getWidenedType(ScalarTy, CommonVF),
                                    E->getSplitMask(), CostKind);
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some instructions there is no need to calculate the cost for
          // each particular instance; we can use the cost of a single
          // instruction multiplied by the total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized when the parent node is
        // not resized.
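        // E.g., a node minimized to i8 whose user consumes i32 lanes gets an
        // extra <VF x i8> -> <VF x i32> extension charged here (sext or zext
        // according to the signedness recorded in MinBWs).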
        if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
            E->Idx != 0 &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                               CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference from vectorizing a set of GEPs.
  // A negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "CompressVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));

    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
    if (MinMaxID == Intrinsic::not_intrinsic)
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
          CanonicalType->getContext(),
          DL->getTypeSizeInBits(CanonicalType->getScalarType())));
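    // Pointer (vector) types are canonicalized to integers of pointer width
    // so that the min/max intrinsic cost query below is made on a legal
    // integer type.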

    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only users of the compares, the compares will
    // be dead, so subtract their cost.
    if (VI && SelectOnly) {
      assert((!Ty->isVectorTy() || SLPReVec) &&
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;

      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        Value *Op = PHI->getIncomingValue(I);
        Operands[I] = Op;
      }
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      auto *I = cast<Instruction>(UniqueValues[Idx]);
      if (!SrcVecTy) {
        if (ShuffleOrOp == Instruction::ExtractElement) {
          auto *EE = cast<ExtractElementInst>(I);
          SrcVecTy = EE->getVectorOperandType();
        } else {
          auto *EV = cast<ExtractValueInst>(I);
          Type *AggregateTy = EV->getAggregateOperand()->getType();
          unsigned NumElts;
          if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
            NumElts = ATy->getNumElements();
          else
            NumElts = AggregateTy->getStructNumElements();
          SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
        }
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of
          // extractelement/ext pair.
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
          // Subtract the cost of the s|zext, which is accounted for
          // separately.
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      if (DemandedElts.isZero())
        DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
      DemandedElts.setBit(*getExtractIndex(I));
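      // The extract itself is modeled as free here; its real cost is folded
      // into the scalarization overhead computed from DemandedElts in
      // GetVectorCost below.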
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
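    // E.g., inserting into lanes 3..6 of a 16-lane vector with
    // VecScalarsSz == 8 gives Offset == 0 and InsertVecSz == 4.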
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-sized vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    // TODO: Add support for Instruction::InsertValue.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    InstructionCost Cost = 0;
    Cost -=
        getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
                                 /*Insert*/ true, /*Extract*/ false, CostKind);

    // First cost - resize to actual vector size if not identity shuffle or
    // need to shift the vector.
    // Do not calculate the cost if the actual size is the register size and
    // we can merge this shuffle with the following SK_Select.
    auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                               InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // Second cost - permutation with subvector, if some elements are from the
    // initial vector or inserting a subvector.
    // TODO: Implement the analysis of the FirstInsert->getOperand(0)
    // subvector of ActualVecTy.
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
        Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
                                 CostKind, OffsetBeg - Offset, InsertVecTy);
      } else {
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
        Cost +=
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy =
            getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
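      // The narrowed integer source is known to be non-negative here, so the
      // conversion is costed as uitofp, which tends to be cheaper than sitofp
      // on some targets.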
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind, VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
           CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
                                       ? CmpInst::BAD_FCMP_PREDICATE
                                       : CmpInst::BAD_ICMP_PREDICATE;

      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
      if (IntrinsicCost.isValid())
        ScalarCost = IntrinsicCost;

      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());

      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is a fixed vector type,
          // we need to duplicate the condition value.
          VecCost += ::getShuffleCost(
              *TTI, TTI::SK_PermuteSingleSrc, CondType,
              createReplicatedMask(VecTyNumElements / CondNumElements,
                                   CondNumElements));
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
      TTI::OperandValueInfo Op2Info =
          TTI::getOperandInfo(VI->getOperand(OpIdx));
      SmallVector<const Value *> Operands(VI->operand_values());
      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                         Op1Info, Op2Info, Operands, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
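        // An 'and' with constants that keep all of the demanded low bits set
        // is a noop once the bitwidth has been minimized, so only the common
        // cost remains.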
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);

        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
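        // CompressVectorize loads a wider consecutive block (possibly via a
        // masked load) and then compresses the needed lanes with a shuffle,
        // so the cost is the load plus the compress-mask permutation.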
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(VL.begin(), VL.end());
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          reorderScalars(Scalars, Mask);
        }
        SmallVector<Value *> PointerOps(Scalars.size());
        for (auto [I, V] : enumerate(Scalars))
          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Expected to be vectorized");
        Align CommonAlignment;
        if (IsMasked)
          CommonAlignment = computeCommonAlignment<LoadInst>(VL);
        else
          CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt,
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          // TODO: include this cost into CommonCost.
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
          // TODO: include this cost into CommonCost.
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, it is not a terminal node.
    // Hence the address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate the cost of the GEPs, since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          CommonCost = 0;
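          // CommonCost (the reorder shuffle) is dropped here because the
          // interleaved-store cost below already models the lane reordering.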
|
|
VecStCost = TTI->getInterleavedMemoryOpCost(
|
|
Instruction::Store, VecTy, Factor, std::nullopt,
|
|
BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
|
|
} else {
|
|
TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
|
|
VecStCost = TTI->getMemoryOpCost(
|
|
Instruction::Store, VecTy, BaseSI->getAlign(),
|
|
BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
|
|
}
|
|
}
|
|
return VecStCost + CommonCost;
|
|
};
|
|
SmallVector<Value *> PointerOps(VL.size());
|
|
for (auto [I, V] : enumerate(VL)) {
|
|
unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
|
|
PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
|
|
}
|
|
|
|
return GetCostDiff(GetScalarCost, GetVectorCost) +
|
|
GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
|
|
}
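  // Calls: sum of scalar intrinsic/libcall costs vs the cheaper of a vector
  // intrinsic or a vector library call.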
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      }
      return TTI->getCallInstrCost(CI->getCalledFunction(),
                                   CI->getFunctionType()->getReturnType(),
                                   CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
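  // Alternate-opcode nodes and REVEC shufflevector idioms: cost both vector
  // opcodes plus the blending shuffle, reusing an existing node with equal
  // operands (a diamond match) when possible.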
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      (void)E;
      return TTI->getInstructionCost(VI, CostKind);
    };
    // Need to clear CommonCost since the final shuffle cost is included into
    // vector cost.
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      // VecCost is equal to sum of the cost of creating 2 vectors
      // and the cost of creating shuffle.
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here since we're going to reuse
        // same main/alternate vector ops, just do different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
                                  FinalVecTy, Mask, CostKind);
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
      // need to take into account their order when looking for the most used
      // order.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the
      // order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // TODO: Check the reverse order too.
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If a group uses mask in order, the shufflevector can be
            // eliminated by instcombine. Then the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              ArrayRef<Value *> Group = VL.slice(I, GroupSize);
              int NextIndex = 0;
              if (!all_of(Group, [&](Value *V) {
                    assert(isa<ShuffleVectorInst>(V) &&
                           "Not supported shufflevector usage.");
                    auto *SV = cast<ShuffleVectorInst>(V);
                    int Index;
                    [[maybe_unused]] bool IsExtractSubvectorMask =
                        SV->isExtractSubvectorMask(Index);
                    assert(IsExtractSubvectorMask &&
                           "Not supported shufflevector usage.");
                    if (NextIndex != Index)
                      return false;
                    NextIndex += SV->getShuffleMask().size();
                    return true;
                  }))
                return ::getShuffleCost(
                    *TTI, TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
                    calculateShufflevectorMask(E->Scalars));
            }
            return TTI::TCC_Free;
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}

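/// Checks if a tree of height 1 or 2 is fully vectorizable, i.e. all of its
/// nodes are either vectorizable or cheap-to-build gathers.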
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable.\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather nodes if they have fewer scalar operands than the
  // initial tree element (it may be profitable to shuffle the second gather)
  // or they are extractelements, which form a shuffle.
  SmallVector<int> Mask;
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}

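/// Checks if the chain of or/shl operations rooted at \p Root reduces to a
/// zext-of-load whose total combined bit width is a legal integer type, i.e.
/// a pattern that is likely to be folded into a single wide load by backend
/// load combining.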
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the
  // path through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;

  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /* MatchOr */ false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
      return false;
  }
  return true;
}

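// Returns true if the tree is too small to be profitable and cannot be proven
// fully vectorizable, so vectorization should be skipped.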
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (!DebugCounter::shouldExecute(VectorizedGraphs))
    return true;

  // Graph is empty - do nothing.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");

    return true;
  }

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, we can skip it, if the cost threshold is
  // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
  // gathers/buildvectors.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // Do not vectorize small tree of phis only, if all vector phis are also
  // gathered.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
                           Limit) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(TE->Scalars, [&](Value *V) {
                           return isa<PoisonValue>(V) || MustGather.contains(V);
                         }))));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return false;

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}

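// Checks if the vectorizable tree can no longer be extended, based on the
// shape of its gather nodes and the canonical graph size.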
bool BoUpSLP::isTreeNotExtendable() const {
  if (getCanonicalGraphSize() != getTreeSize()) {
    constexpr unsigned SmallTree = 3;
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return true;
    return false;
  }
  bool Res = false;
  for (unsigned Idx : seq<unsigned>(getTreeSize())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      return false;
    if (!E.isGather())
      continue;
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        (!E.hasState() &&
         all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
        (isa<ExtractElementInst>(E.Scalars.front()) &&
         getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
      return false;
    if (isSplat(E.Scalars) || allConstant(E.Scalars))
      continue;
    Res = true;
  }
  return Res;
}

InstructionCost BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).

  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return 0;

  InstructionCost Cost = 0;
  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }

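  // Treats an intrinsic as not being a real call if it is assume-like or if
  // TTI reports its cost to be lower than that of the equivalent plain call.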
  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
    InstructionCost IntrCost =
        TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
    InstructionCost CallCost = TTI->getCallInstrCost(
        nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
    return IntrCost < CallCost;
  };

  // Maps the last instruction of an entry to the last instruction of one of
  // its operand entries, plus a flag. If the flag is set, there are no calls
  // in between these instructions.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(First))
        return It->second.getInt() != 0;
      Last = Checked;
    } else if (Last == First || Last->comesBefore(First)) {
      return true;
    }
    BasicBlock::const_reverse_iterator InstIt =
                                           ++First->getIterator().getReverse(),
                                       PrevInstIt =
                                           Last->getIterator().getReverse();
    SmallVector<const Instruction *> LastInstsInRange;
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Debug information does not impact spill cost.
      // Vectorized calls, represented as vector intrinsics, do not impact
      // spill cost.
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);

      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
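  // Accounts for the cost of keeping the operand entry's vector value live
  // across a call, compensating for REVEC "scalars" that are dead vectors.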
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    if (ScalarTy->isVectorTy()) {
      // Handle revec dead vector instructions.
      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
    }
  };
  // Memoize the relationship between blocks, i.e. if there is (at least one)
  // non-vectorized call between the blocks. This allows skipping the analysis
  // of the same block paths multiple times.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    SmallVector<BasicBlock *> Worklist;
    if (Pred)
      Worklist.push_back(Pred);
    else
      Worklist.append(pred_begin(Root), pred_end(Root));
    SmallPtrSet<const BasicBlock *, 16> Visited;
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    bool Res = false;
    auto Cleanup = make_scope_exit([&]() {
      for (const auto &KeyPair : ParentsPairsToAdd) {
        assert(!ParentOpParentToPreds.contains(KeyPair) &&
               "Should not have been added before.");
        ParentOpParentToPreds.try_emplace(KeyPair, Res);
      }
    });
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        Res = It->second;
        return Res;
      }
      ParentsPairsToAdd.insert(Pair);
      unsigned BlockSize = BB->size();
      if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
        return Res;
      Budget += BlockSize;
      if (Budget > BudgetLimit)
        return Res;
      if (!CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
                                          BB->getTerminator()))
        return Res;
      Worklist.append(pred_begin(BB), pred_end(BB));
    }
    Res = true;
    return Res;
  };
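  // Walk the tree from the root entry, checking for non-vectorized calls on
  // every path between each entry and its operands; charge a spill cost for
  // every operand whose vector value must live across such a call.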
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          (Op->isGather() && allConstant(Op->Scalars)))
        continue;
      Budget = 0;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(V);
          if (!Inst)
            continue;
          if (isVectorized(V)) {
            OpParent = Inst->getParent();
            OpLastInst = Inst;
            break;
          }
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        OpParent = OpLastInst->getParent();
      }
      // Check the call instructions within the same basic blocks.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for call instructions in between blocks.
      // 1. Check entry's block to the head.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      // 2. Check op's block from the end.
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // 3. Check the predecessors of entry's block till op's block.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }

  return Cost;
}

/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  unsigned Idx1 = *getElementIndex(IE1);
  unsigned Idx2 = *getElementIndex(IE2);
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(I1).value_or(Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(I2).value_or(Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}

namespace {
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for a single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform
    // 2-vector shuffles step by step, combining shuffles between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size,
      // we can shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          if (Res1.second)
            Mask[I] = I;
        } else if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
                                     InstructionCost ReductionCost) {
  InstructionCost Cost = ReductionCost;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
                          << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    // Exclude cost of gather loads nodes which are not used. These nodes were
    // built as part of the final attempt to vectorize gathered loads.
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                      << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }

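  // If the cost is already unprofitable and no external user is an
  // insertelement (which could still reduce the total cost below), there is
  // no point in the external-use analysis; bail out early.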
  if (Cost >= -SLPCostThreshold &&
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    return Cost;

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track of {Scalar, Index, User} tuples.
  // On AArch64, this helps in fusing a mov instruction, associated with
  // extractelement, with fmul in the backend so that extractelement is free.
  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
  for (ExternalUser &EU : ExternalUses) {
    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
      continue;

    // Check if the scalar for the given user, or for all users, is already
    // accounted for.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;

    // Used in unreachable blocks or in EH pads (rarely executed), or the
    // block is terminated by an unreachable instruction.
    if (BasicBlock *UserParent =
            EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
        UserParent &&
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         isa_and_present<UnreachableInst>(UserParent->getTerminator())))
      continue;

    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // No extract cost for vector "scalar" if REVEC is disabled.
    if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
      continue;

    // If the found user is an insertelement, do not calculate extract cost
    // but try to detect it as a final shuffled/identity match.
    // TODO: what if a user is insertvalue when REVEC is enabled?
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = &EU.E;
          auto *It = find_if(
              ShuffledInserts,
              [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
                // Checks if 2 insertelements are from the same buildvector.
                InsertElementInst *VecInsert = Data.InsertElements.front();
                return areTwoInsertFromSameBuildVector(
                    VU, VecInsert, [this](InsertElementInst *II) -> Value * {
                      Value *Op0 = II->getOperand(0);
                      if (isVectorized(II) && !isVectorized(Op0))
                        return nullptr;
                      return Op0;
                    });
              });
          int VecId = -1;
          if (It == ShuffledInserts.end()) {
            auto &Data = ShuffledInserts.emplace_back();
            Data.InsertElements.emplace_back(VU);
            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
            VecId = ShuffledInserts.size() - 1;
            auto It = MinBWs.find(ScalarTE);
            if (It != MinBWs.end() &&
                VectorCasts
                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                    .second) {
              unsigned BWSz = It->second.first;
              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
              unsigned VecOpcode;
              if (DstBWSz < BWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              InstructionCost C = TTI->getCastInstrCost(
                  VecOpcode, FTy,
                  getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                                 FTy->getNumElements()),
                  TTI::CastContextHint::None, CostKind);
              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                                << " for extending externally used vector with "
                                   "non-equal minimum bitwidth.\n");
              Cost += C;
            }
          } else {
            if (isFirstInsertElement(VU, It->InsertElements.front()))
              It->InsertElements.front() = VU;
            VecId = std::distance(ShuffledInserts.begin(), It);
          }
          int InIdx = *InsertIdx;
          SmallVectorImpl<int> &Mask =
              ShuffledInserts[VecId].ValueMasks[ScalarTE];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[InIdx] = EU.Lane;
          DemandedElts[VecId].setBit(InIdx);
          continue;
        }
      }
    }

    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign extend if needed.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *ScalarTy = EU.Scalar->getType();
    auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
        MinTy = getWidenedType(MinTy, VecTy->getNumElements());
      unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
                            ? Instruction::ZExt
                            : Instruction::SExt;
      VecTy = getWidenedType(MinTy, BundleWidth);
      ExtraCost =
          getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
    } else {
      ExtraCost =
          getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
                             CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
    }
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for_each(enumerate(ExternalUses), [&](const auto &P) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            return;

          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        });
      }
      // Can use the original instruction if no operands are vectorized or
      // they are already marked as externally used.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (!isVectorized(V)) {
          // Some extractelements might be not vectorized, but
          // transformed into shuffle and removed from the function,
          // consider it here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              (isVectorized(Op) && !ValueToExtUses->contains(Op))
                  ? TTI->getInstructionCost(Op, CostKind)
                  : 0;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Try to keep the original scalar if the user is a phi node from the
        // same block as the root phis currently being vectorized. It allows
        // keeping better ordering info for the PHIs being vectorized.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            !Inst->hasNUsesOrMore(UsesLimit) &&
            none_of(Inst->users(),
                    [&](User *U) {
                      auto *PHIUser = dyn_cast<PHINode>(U);
                      return (!PHIUser ||
                              PHIUser->getParent() !=
                                  cast<Instruction>(
                                      VectorizableTree.front()->getMainOp())
                                      ->getParent()) &&
                             !isVectorized(U);
                    }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2. It may help
          // to do some extra vectorization for now.
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for_each(Inst->operands(), [&](Value *V) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid a compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          });
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the operands of the cast operand to avoid
            // a compiler crash.
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for_each(IOp->operands(), [&](Value *V) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // Replace all uses to avoid a compiler crash.
                  ExternalUses[It->second].User = nullptr;
                }
              });
            }
          }
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Insert externals for extracts of operands of casts to be emitted as
  // scalars instead of extractelement.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
    }
  }
  // Add reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
          assert(SLPReVec && "Only supported by REVEC.");
          SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
        }
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None,
                                      TTI::TCK_RecipThroughput);
      }
    }
  }

  Cost += ExtractCost;
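  // Resizes the given shuffle mask to the entry's vector factor, charging a
  // single-source permute whenever the sizes differ and the mask is not an
  // identity mask.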
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                           getWidenedType(TE->getMainOp()->getType(), VecVF),
                           OrigMask);
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      Cost += C;
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I],
        /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
    Cost -= InsertCost;
  }

  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode =
              Instruction::BitCast; // Handle it by getExtendedReductionCost
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = getCastContextHint(E);
        InstructionCost CastCost;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          TTI::TCK_RecipThroughput);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

  std::optional<InstructionCost> SpillCost;
  if (Cost < -SLPCostThreshold) {
    SpillCost = getSpillCost();
    Cost += *SpillCost;
  }
#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    if (SpillCost)
      OS << *SpillCost;
    else
      OS << "<skipped>";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}

/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    if (Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of one or two vectors from which the scalars are extracted.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from mask, if some of the extractelements were not
  // selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
  }
  return Res;
}


/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which is highly
/// likely to be detected as a shuffle of 1 or 2 input vectors. If this
/// attempt was successful, the matched scalars are replaced by poison values
/// in \p VL for future analysis.
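/// Unlike the single-register variant above, this processes \p VL per
/// register-sized part: e.g., a 16-lane \p VL with \p NumParts == 2 is
/// analyzed as lanes [0, 8) and [8, 16) separately, and the per-part
/// sub-masks are written back into \p Mask.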
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 &&
         "NumParts expected to be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan list of gathered scalars for extractelements that can be
    // represented as shuffles.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}

std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
  Entries.clear();
  // TODO: currently checking only for Scalars in the tree entry, need to count
  // reused elements too for better cost estimation.
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // Main node of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp());
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Argument InsertPt is an instruction where vector code for some other
    // tree entry (one that shares one or more scalars with TE) is going to be
    // generated. This lambda returns true if insertion point of vector code
    // for the TE dominates that point (otherwise dependency is the other way
    // around). The other node is not limited to be of a gather kind. Gather
    // nodes are not scheduled and their vector code is inserted before their
    // first user. If user is PHI, that is supposed to be at the end of a
    // predecessor block. Otherwise it is the last instruction among scalars of
    // the user node. So, instead of checking dependency between instructions
    // themselves, we check dependency between their insertion points for
    // vector code (since each scalar instruction ends up as a lane of a
    // vector instruction).
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // found - not a shuffle.
  // Here we build a set of tree nodes for each gathered value and try to
  // find the intersection between these sets. If we have at least one common
  // tree node for each gathered value - we have just a permutation of the
  // single vector. If we have 2 different sets, we're in a situation where we
  // have a permutation of 2 input vectors.
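  // Illustrative example (hypothetical entries): for VL = {a, b, a, c}, if
  // {a, c} appear only in tree entry #1 and b only in entry #2, UsedTEs ends
  // up as {{#1}, {#2}} and the gather is treated as a permutation of 2 input
  // vectors; a third disjoint set would fall back to a regular gather.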
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // The node is reused - exit.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    UsedTEs.clear();
    UsedTEs.emplace_back().insert(TEPtr);
    for (Value *V : VL) {
      if (isConstant(V))
        continue;
      UsedValuesEntry.try_emplace(V, 0);
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      PtrToIdx.try_emplace(Ptr2, EdgeIdx);
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    if (isConstant(V) || !VisitedValue.insert(V).second)
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;

      PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If the schedulable insertion point is used in multiple entries -
        // just exit, there is no known ordering at this point; it becomes
        // available only after real scheduling.
        if (!doesNotNeedToBeScheduled(InsertPt) &&
            (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
          continue;
        // If the users are the PHI nodes with the same incoming blocks - skip.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
            UseEI.UserTE->State == TreeEntry::Vectorize &&
            UseEI.UserTE->getOpcode() == Instruction::PHI &&
            TEUseEI.UserTE != UseEI.UserTE)
          continue;
        // If 2 gathers are operands of the same entry (regardless of whether
        // user is PHI or else), compare operands indices, use the earlier one
        // as the base.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // If the user instruction is used for some reason in different
        // vectorized nodes - make it depend on index.
        if (TEUseEI.UserTE != UseEI.UserTE &&
            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
             HasGatherUser(TEUseEI.UserTE)))
          continue;
        // If the user node is the operand of the other user node - skip.
        if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
          continue;
      }

      // Check if the user node of the TE comes after user node of TEPtr,
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // The node is reused - exit.
      if (CheckAndUseSameNode(TEPtr))
        break;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
            continue;
        }
        // The node is reused - exit.
        if (CheckAndUseSameNode(VTE))
          break;
        VToTEs.insert(VTE);
      }
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      // The node is reused - exit.
      if (CheckAndUseSameNode(VTE))
        break;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // On the first iteration, just insert the list of nodes into the
      // vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use
      // V. If there are no such nodes, consider that we have another input
      // vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Do we have a non-empty intersection of previously listed tree
        // entries and tree entries using current V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue analysis for the next
          // scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need to add a second set of
      // possible source vectors.
      if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2 - not a
        // permutation, fallback to the regular gather.
        // TODO: support multiple reshuffled nodes.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
    // Update mapping between values and corresponding tree entries.
    for_each(UsedValuesEntry, [&](auto &P) { P.second = 0; });
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Same, keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
    SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
    for (const TreeEntry *E : Entries)
      ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
                                            E->Scalars.end());
    // Update mapping between values and corresponding tree entries.
    for_each(UsedValuesEntry, [&](auto &P) {
      for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
        if (ValuesToEntries[Idx].contains(P.first)) {
          P.second = Idx;
          break;
        }
    });
  }

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from same parent (if they
    // are instructions).
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI))
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements (these instructions will be
  // handled by extractelements processing) or may form a vector node in the
  // future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !isVectorized(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible, if they have same/alternate opcode
  // and same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI) &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars, if they are constants, or instructions
    // that can be vectorized as a result of the following vector build
    // vectorization.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries, which can be
  // used for final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar. If it is the first entry,
    // set Pair.first to 0, otherwise to 1 (currently select at most 2 nodes).
    // These indices are used when calculating final shuffle mask as the
    // vector offset.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // We may have here 1 or 2 entries only. If the number of scalars is equal
    // to the number of entries, no need to do the analysis, it is not very
    // profitable. Since VL is not the same as TE->Scalars, it means we
    // already have some shuffles before. Cut off the not profitable case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask, check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index of
  // scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation to check whether the shuffle is more beneficial
    // than a buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
        MinElement = Idx;
      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
        MaxElement = Idx;
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
                                                 (MaxElement % VF) -
                                                     (MinElement % VF) + 1));
    if (NewVF < VF) {
      for_each(SubMask, [&](int &Idx) {
        if (Idx == PoisonMaskElem)
          return;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      });
    } else {
      NewVF = VF;
    }

    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                                 : TTI::SK_PermuteSingleSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem)
            IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem) {
            Idx -= NewVF;
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    for (auto [I, Idx] : enumerate(SubMask))
      if (Idx == PoisonMaskElem)
        DemandedElts.clearBit(I);
    InstructionCost BuildVectorCost = getScalarizationOverhead(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
        /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}

SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(TE->Scalars) ||
       (TE->hasState() &&
        getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}

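// Cost-model sketch (illustrative, not exact TTI numbers): gathering
// VL = {%a, 5, %a, poison} with ForPoisonSrc == false is priced roughly as
// one two-source shuffle blending the constant vector {poison, 5, poison,
// poison}, one insertion of the unique scalar %a, and one single-source
// shuffle replicating the duplicated %a.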
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VF);
  APInt DemandedElements = APInt::getZero(VF);
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    DemandedElements.setBit(I);
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
  };
  SmallVector<int> ShuffleMask(VF, PoisonMaskElem);
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }

    if (isConstant(V)) {
      ConstantShuffleMask[I] = I + VF;
      ShuffleMask[I] = I;
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }

    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  // FIXME: add a cost for constant vector materialization.
  bool IsAnyNonUndefConst =
      any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
  // 1. Shuffle input source vector and constant vector.
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
                             ConstantShuffleMask);
    // Update the shuffle mask for shuffling with incoming source (all
    // elements are used!) or with the constant subvector.
    for_each(enumerate(ShuffleMask), [&](auto P) {
      if ((!ForPoisonSrc && P.value() == PoisonMaskElem) ||
          ConstantShuffleMask[P.index()] != PoisonMaskElem)
        P.value() = P.index();
      else if (P.value() != PoisonMaskElem)
        P.value() += VF;
    });
  }

  // 2. Insert unique non-constants.
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
                                     /*Insert=*/true,
                                     /*Extract=*/false, CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  // 3. Shuffle duplicates.
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask, CostKind);
  return Cost;
}

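// Conceptually: for a scheduled bundle {%a = add ..., %b = add ...} (in
// program order) the last instruction is %b, and the vectorized code for the
// bundle is emitted after it (illustrative example; special cases for PHIs,
// split nodes and gathered loads are handled below).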
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    BB = LastInst->getParent();
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
      for (auto *E : Entries) {
        auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
        if (!I)
          I = &getLastInstructionInBundle(E);
        if (Res->comesBefore(I))
          Res = I;
      }
    }
    return *Res;
  }

  // Set insertpoint for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    return *Res;
  }

  // Set the insert point to the beginning of the basic block if the entry
  // should not be scheduled.
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    if (E->isGather())
      return nullptr;
    // It was found previously that the instructions do not need to be
    // scheduled.
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
        continue;
      ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((E->getOpcode() == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        (!isVectorLikeInstWithConstOps(V) &&
                         isUsedOutsideBlock(V));
               }) ||
        (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
           return isa<ExtractElementInst, UndefValue>(V) ||
                  areAllOperandsNonInsts(V);
         })))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    return *Res;
  }

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    return *Res;
  }

  // LastInst can still be null at this point if there's either not an entry
  // for BB in BlocksSchedules or there's no ScheduleData available for
  // VL.back(). This can be the case if buildTree_rec aborts for various
  // reasons (e.g., the maximum recursion depth is reached, the maximum region
  // size is reached, etc.). ScheduleData is initialized in the scheduling
  // "dry-run".
  //
  // If this happens, we can still find the last instruction by brute force.
  // We iterate forwards from Front (inclusive) until we either see all
  // instructions in the bundle or reach the end of the block. If Front is the
  // last instruction in program order, LastInst will be set to Front, and we
  // will visit all the remaining instructions in the block.
  //
  // One of the reasons we exit early from buildTree_rec is to place an upper
  // bound on compile-time. Thus, taking an additional compile-time hit here
  // is not ideal. However, this should be exceedingly rare since it requires
  // that we both exit early from buildTree_rec and that the bundle be
  // out-of-order (causing us to iterate all the way to the end of the block).
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}

void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       doesNotNeedToSchedule(E->Scalars)) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}

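// Illustrative emission (hypothetical values): gathering {%x, 7, %y} into a
// <4 x i32> produces roughly
//   %v0 = insertelement <4 x i32> poison, i32 7, i32 1
//   %v1 = insertelement <4 x i32> %v0, i32 %x, i32 0
//   %v2 = insertelement <4 x i32> %v1, i32 %y, i32 2
// with loop-resident scalars postponed to the end of the sequence to keep
// hoisting opportunities alive.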
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end
  // to make it possible to optimize loops and hoist invariant instructions
  // out of the loop's body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           isVectorized(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Value *V = Scalar;
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
          V = Op;
      }
      Scalar = Builder.CreateIntCast(
          V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      Vec =
          createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
      auto *II = dyn_cast<IntrinsicInst>(Vec);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entries.front()->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to
  // make it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}

/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles
/// emission, when the actual shuffle instruction is generated only if this is
/// actually required. Otherwise, the shuffle instruction emission is delayed
/// till the end of the process, to reduce the number of emitted instructions
/// and further analysis/transformations.
/// The class also will look through the previously emitted shuffle
/// instructions and properly mark indices in the mask as undef.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
/// If the 2 operands are of different sizes, the smallest one will be resized
/// and the mask recalculated properly.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands. If the 3rd one is going to be added, the first 2 are combined
  /// into a shuffle with \p CommonMask mask, the first operand is set to be
  /// the resulting shuffle and the second operand is set to be the newly
  /// added operand. The \p CommonMask is transformed in the proper way after
  /// that.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates permutation of the single vector operand with the given mask,
    /// if it is not identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      if (Mask.empty())
        return V1;
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes the 2 input vectors to match in size, if they are not equal
    /// yet. The smallest vector is resized to the size of the larger vector.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      if (MinVF == V1VF)
        V1 = Op;
      else
        V2 = Op;
    }
  };
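
  // Note: ShuffleIRBuilder above is only driven through
  // BaseShuffleAnalysis::createShuffle (see createShuffle below), which picks
  // between the one- and two-operand createShuffleVector overloads and the
  // identity/poison callbacks.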

  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }

  /// Cast value \p V to the vector type with the same number of elements, but
  /// the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only use is vectorized - the extractelement itself can be
      // deleted.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   (!UTEs.empty() &&
                    count_if(R.VectorizableTree,
                             [&](const std::unique_ptr<TreeEntry> &TE) {
                               return TE->UserTreeIndex.UserTE ==
                                          UTEs.front() &&
                                      is_contained(VL, EI);
                             }) != 1);
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register vector shuffle, joining them into a single
    // virtual long vector.
    // Need to shuffle each part independently and then insert all these
    // parts into a long virtual vector register, forming the original
    // vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 || all_of(seq<unsigned>(0, Part),
                                    [&](unsigned P) {
                                      ArrayRef<int> SubMask =
                                          Mask.slice(P * SliceSize,
                                                     getNumElems(Mask.size(),
                                                                 SliceSize,
                                                                 P));
                                      return all_of(SubMask, [](int Idx) {
                                        return Idx == PoisonMaskElem;
                                      });
                                    })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
/// Checks if the specified entry \p E needs to be delayed because of its
|
|
/// dependency nodes.
|
|
std::optional<Value *>
|
|
needToDelay(const TreeEntry *E,
|
|
ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
|
|
// No need to delay emission if all deps are ready.
|
|
if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
|
|
return all_of(
|
|
TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
|
|
}))
|
|
return std::nullopt;
|
|
// Postpone gather emission, will be emitted after the end of the
|
|
// process to keep correct order.
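    // Note: the aligned load from a poison pointer below only serves as a
    // typed placeholder for the delayed gather; it stands in for the value
    // until the postponed node is actually emitted.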
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }
  /// Reset the builder to handle a perfect diamond match.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
  }
  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
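    // The entries' scalar type may have been narrowed via MinBWs; the
    // signedness flag makes castToScalarTyElem use a signed extension only
    // when some non-poison scalar may be negative.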
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in form of a tree entry) and the mask for
  /// its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds one more input vector and the mask for its shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
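    // If V1 is already one of the pending inputs, only the common mask has to
    // be updated; otherwise a new input (and possibly a pre-shuffle to a
    // common type) is required.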
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check whether the second vector is required, i.e. whether the used
      // elements are not already covered by the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  /// Adds one more input vector and the mask for its shuffling.
  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
    SmallVector<int> NewMask;
    inversePermutation(Order, NewMask);
    add(V1, NewMask);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before the final
  /// application of the \p ExtMask mask.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
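    // If an Action callback is provided, flatten the pending inputs into a
    // single vector of length VF first; the callback then gets a chance to
    // rewrite that vector and the common mask before the final shuffles.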
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                  if (isa<PoisonValue>(V))
                    return false;
                  return !isKnownNonNegative(
                      V, SimplifyQuery(*R.DL));
                }));
          unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
                        _3));
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
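      // Without an extra subvectors mask the subvectors are inserted directly
      // into the current vector; otherwise they are built on a poison base
      // first and then blended in with a two-source shuffle.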
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};

BoUpSLP::TreeEntry *
BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
                                     ArrayRef<Value *> VL,
                                     const InstructionsState &S) {
  if (!S)
    return nullptr;
  for (TreeEntry *TE : ScalarToTreeEntries.lookup(S.getMainOp()))
    if (TE->UserTreeIndex.UserTE == E && TE->UserTreeIndex.EdgeIdx == NodeIdx &&
        TE->isSame(VL))
      return TE;
  return nullptr;
}

Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  ValueList &VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for a GEP bundle, which may include non-GEP values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx, VL, S)) {
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      // V may be affected by MinBWs.
      // We want ShuffleInstructionBuilder to correctly support REVEC. The key
      // factor is the number of elements, not their type.
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      unsigned NumElements = getNumElements(VL.front()->getType());
      ShuffleInstructionBuilder ShuffleBuilder(
          NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
                           : ScalarTy,
          Builder, *this);
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE);
    if (VF * getNumElements(VL[0]->getType()) !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // Reshuffle to get only unique values.
        // If some of the scalars are duplicated in the vectorization
        // tree entry, we do not vectorize them but instead generate a
        // mask for the reuses. But if there are several users of the
        // same entry, they may have different vectorization factors.
        // This is especially important for PHI nodes. In this case, we
        // need to adapt the resulting instruction for the user
        // vectorization factor and have to reshuffle it again to take
        // only unique elements of the vector. Without this code the
        // function incorrectly returns a reduced vector instruction with
        // the same elements, not with the unique ones.

        // block:
        // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
        // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
        // ... (use %2)
        // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
        // br %block
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Need to update the operand gather node if the operand is actually not a
    // vectorized node but a buildvector/gather node that matches one of the
    // vectorized nodes.
    if (VE->UserTreeIndex.UserTE != E || VE->UserTreeIndex.EdgeIdx != NodeIdx) {
      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
                         [&](const std::unique_ptr<TreeEntry> &TE) {
                           return TE->isGather() &&
                                  TE->UserTreeIndex.UserTE == E &&
                                  TE->UserTreeIndex.EdgeIdx == NodeIdx;
                         });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }

  // Find the corresponding gather entry and vectorize it.
  // This allows being more accurate with tree/graph transformations and
  // checks the correctness of the transformations in many cases.
  auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx}) ||
                             (TE->State == TreeEntry::SplitVectorize &&
                              TE->UserTreeIndex == EdgeInfo(E, NodeIdx));
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndex &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get());
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(E->ReorderIndices, SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
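  // FindReusedSplat recognizes the case where the gathered scalars form a
  // splat that can directly reuse lanes of an already available input vector;
  // on success the mask slice for part \p I is rewritten to an identity or to
  // a single repeated lane.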
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                         [=](const std::unique_ptr<TreeEntry> &TE) {
                           return TE->UserTreeIndex.UserTE == UserTE &&
                                  TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                         });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(TEs.begin(), TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
            NumParts =
                ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
          }
      }
    }
    // Gather extracts only after checking for fully matched gathers.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && isVectorized(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars, E->Idx) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        // Reset the builder(s) to correctly handle perfect diamond matched
        // nodes.
        ShuffleBuilder.resetForSameNode();
        ShuffleBuilder.add(*FrontTE, Mask);
        // Full matched entry found, no need to insert subvectors.
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
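  // TryPackScalars compacts the remaining scalars into a buildvector-friendly
  // form: unique values are kept, repeats become reuses in the mask, and a
  // splat is reduced to a single broadcast lane; replacing non-poison undefs
  // may require a final freeze (NeedFreeze).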
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes.
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(E->UserTreeIndex.UserTE->Scalars,
                                       U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by the poison, in the mask it is replaced by
          // non-poisoned scalar already.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit
        // freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(StoredGS[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
                         isGuaranteedNotToBePoison(Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize =
          getPartNumElems(E->Scalars.size(),
                          ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // The NonConstants array contains just the non-constant values, while
    // GatheredScalars contains only the constants, used to build the final
    // vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}

Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
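  // Vectorize the combined sub-entries first so that their vector values are
  // available when processBuildVector inserts them as subvectors.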
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}

/// \returns \p Inst after propagating metadata from \p VL only for
/// instructions in \p VL.
static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
  SmallVector<Value *> Insts;
  for (Value *V : VL)
    if (isa<Instruction>(V))
      Insts.push_back(V);
  return llvm::propagateMetadata(Inst, Insts);
}

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = getValueType(V);
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first].get();
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    Value *Op1 = vectorizeTree(&OpTE1);
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first].get();
    assert(
        OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    Value *Op2 = vectorizeTree(&OpTE2);
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *R) {
          if (isa<PoisonValue>(R))
            return false;
          return !isKnownNonNegative(R, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    if (cast<VectorType>(Op1->getType())->getElementType() !=
        ScalarTy->getScalarType()) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      Op1 = Builder.CreateIntCast(
          Op1,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(Op1->getType())->getNumElements()),
          GetOperandSignedness(&OpTE1));
    }
    if (cast<VectorType>(Op2->getType())->getElementType() !=
        ScalarTy->getScalarType()) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      Op2 = Builder.CreateIntCast(
          Op2,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(Op2->getType())->getNumElements()),
          GetOperandSignedness(&OpTE2));
    }
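    // Two emission strategies below: without reordering, Op2 is inserted into
    // a widened copy of Op1 at its recorded offset; with reordering, both
    // operands are widened to a common VF and combined via the split mask.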
    if (E->ReorderIndices.empty()) {
      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
          0);
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      if (ScalarTyNumElements != 1) {
        assert(SLPReVec && "Only supported by REVEC.");
        transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      Vec = createInsertVector(Builder, Vec, Op2,
                               E->CombinedEntriesWithIndices.back().second *
                                   ScalarTyNumElements);
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (getNumElements(Op1->getType()) != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
                0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (getNumElements(Op2->getType()) != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
                0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Value *V = NewPhi;

    // Adjust insertion point once all PHI's have been generated.
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    // If the phi node is fully emitted, exit.
    if (NewPhi->getNumIncomingValues() != 0)
      return NewPhi;

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
      ValueList Operands;
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return NewPhi;
      }

      if (!VisitedBBs.insert(IBB).second) {
        Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
        NewPhi->addIncoming(VecOp, IBB);
        TreeEntry *OpTE = getOperandEntry(E, I);
        OpTE->VectorizedValue = VecOp;
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }

  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create InsertVector shuffle if necessary
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create shuffle to resize vector
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
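    // Swap Mask into PrevMask (the resize mask) and rebuild Mask so that each
    // scalar is mapped to its insertelement lane relative to Offset.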
    // Create InsertVector shuffle if necessary
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insert element instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
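    // MinBWs may have changed the effective source and/or destination integer
    // widths, so the original scalar cast opcode may no longer be valid;
    // recompute it (bitcast/trunc/ext) from the actual bit widths below.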
|
|
if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
|
|
(SrcIt != MinBWs.end() || It != MinBWs.end() ||
|
|
SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
|
|
// Check if the values are candidates to demote.
|
|
unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
|
|
if (SrcIt != MinBWs.end())
|
|
SrcBWSz = SrcIt->second.first;
|
|
unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
|
|
if (BWSz == SrcBWSz) {
|
|
VecOpcode = Instruction::BitCast;
|
|
} else if (BWSz < SrcBWSz) {
|
|
VecOpcode = Instruction::Trunc;
|
|
} else if (It != MinBWs.end()) {
|
|
assert(BWSz > SrcBWSz && "Invalid cast!");
|
|
VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
|
|
} else if (SrcIt != MinBWs.end()) {
|
|
assert(BWSz > SrcBWSz && "Invalid cast!");
|
|
VecOpcode =
|
|
SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
|
|
}
|
|
} else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
|
|
!SrcIt->second.second) {
|
|
VecOpcode = Instruction::UIToFP;
|
|
}
|
|
Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
|
|
? InVec
|
|
: Builder.CreateCast(VecOpcode, InVec, VecTy);
|
|
V = FinalShuffle(V, E);
|
|
|
|
E->VectorizedValue = V;
|
|
++NumVectorInstructions;
|
|
return V;
|
|
}
|
|
case Instruction::FCmp:
|
|
case Instruction::ICmp: {
|
|
setInsertPointAfterBundle(E);
|
|
|
|
Value *L = vectorizeOperand(E, 0);
|
|
Value *R = vectorizeOperand(E, 1);
|
|
if (L->getType() != R->getType()) {
|
|
assert((getOperandEntry(E, 0)->isGather() ||
|
|
getOperandEntry(E, 1)->isGather() ||
|
|
MinBWs.contains(getOperandEntry(E, 0)) ||
|
|
MinBWs.contains(getOperandEntry(E, 1))) &&
|
|
"Expected item in MinBWs.");
|
|
if (cast<VectorType>(L->getType())
|
|
->getElementType()
|
|
->getIntegerBitWidth() < cast<VectorType>(R->getType())
|
|
->getElementType()
|
|
->getIntegerBitWidth()) {
|
|
Type *CastTy = R->getType();
|
|
L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
|
|
} else {
|
|
Type *CastTy = L->getType();
|
|
R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
|
|
}
|
|
}
|
|
|
|
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
|
|
Value *V = Builder.CreateCmp(P0, L, R);
|
|
propagateIRFlags(V, E->Scalars, VL0);
|
|
if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
|
|
ICmp->setSameSign(/*B=*/false);
|
|
// Do not cast for cmps.
|
|
VecTy = cast<FixedVectorType>(V->getType());
|
|
V = FinalShuffle(V, E);
|
|
|
|
E->VectorizedValue = V;
|
|
++NumVectorInstructions;
|
|
return V;
|
|
}
|
|
case Instruction::Select: {
|
|
setInsertPointAfterBundle(E);
|
|
|
|
Value *Cond = vectorizeOperand(E, 0);
|
|
Value *True = vectorizeOperand(E, 1);
|
|
Value *False = vectorizeOperand(E, 2);
|
|
if (True->getType() != VecTy || False->getType() != VecTy) {
|
|
assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
|
|
getOperandEntry(E, 2)->isGather() ||
|
|
MinBWs.contains(getOperandEntry(E, 1)) ||
|
|
MinBWs.contains(getOperandEntry(E, 2))) &&
|
|
"Expected item in MinBWs.");
|
|
if (True->getType() != VecTy)
|
|
True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
|
|
if (False->getType() != VecTy)
|
|
False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
|
|
}
|
|
|
|
unsigned CondNumElements = getNumElements(Cond->getType());
|
|
unsigned TrueNumElements = getNumElements(True->getType());
|
|
assert(TrueNumElements >= CondNumElements &&
|
|
TrueNumElements % CondNumElements == 0 &&
|
|
"Cannot vectorize Instruction::Select");
|
|
assert(TrueNumElements == getNumElements(False->getType()) &&
|
|
"Cannot vectorize Instruction::Select");
|
|
if (CondNumElements != TrueNumElements) {
|
|
// When the return type is i1 but the source is fixed vector type, we
|
|
// need to duplicate the condition value.
|
|
Cond = Builder.CreateShuffleVector(
|
|
Cond, createReplicatedMask(TrueNumElements / CondNumElements,
|
|
CondNumElements));
|
|
}
|
|
assert(getNumElements(Cond->getType()) == TrueNumElements &&
|
|
"Cannot vectorize Instruction::Select");
|
|
Value *V = Builder.CreateSelect(Cond, True, False);
|
|
V = FinalShuffle(V, E);
|
|
|
|
E->VectorizedValue = V;
|
|
++NumVectorInstructions;
|
|
return V;
|
|
}
|
|
case Instruction::FNeg: {
|
|
setInsertPointAfterBundle(E);
|
|
|
|
Value *Op = vectorizeOperand(E, 0);
|
|
|
|
Value *V = Builder.CreateUnOp(
|
|
static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
|
|
propagateIRFlags(V, E->Scalars, VL0);
|
|
if (auto *I = dyn_cast<Instruction>(V))
|
|
V = ::propagateMetadata(I, E->Scalars);
|
|
|
|
V = FinalShuffle(V, E);
|
|
|
|
E->VectorizedValue = V;
|
|
++NumVectorInstructions;
|
|
|
|
return V;
|
|
}
|
|
case Instruction::Freeze: {
|
|
setInsertPointAfterBundle(E);
|
|
|
|
Value *Op = vectorizeOperand(E, 0);
|
|
|
|
if (Op->getType() != VecTy) {
|
|
assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
|
|
MinBWs.contains(getOperandEntry(E, 0))) &&
|
|
"Expected item in MinBWs.");
|
|
Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
|
|
}
|
|
Value *V = Builder.CreateFreeze(Op);
|
|
V = FinalShuffle(V, E);
|
|
|
|
E->VectorizedValue = V;
|
|
++NumVectorInstructions;
|
|
|
|
return V;
|
|
}
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0);
    Value *RHS = vectorizeOperand(E, 1);
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
        RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      bool IsMasked;
      unsigned InterleaveFactor;
      SmallVector<int> CompressMask;
      VectorType *LoadVecTy;
      SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
      if (!E->ReorderIndices.empty()) {
        SmallVector<int> Mask(E->ReorderIndices.begin(),
                              E->ReorderIndices.end());
        reorderScalars(Scalars, Mask);
      }
      SmallVector<Value *> PointerOps(Scalars.size());
      for (auto [I, V] : enumerate(Scalars))
        PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
      [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
          Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
          *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
          CompressMask, LoadVecTy);
      assert(IsVectorized && "Expected to be vectorized");
      Align CommonAlignment;
      if (IsMasked)
        CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      else
        CommonAlignment = LI->getAlign();
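      // Build a boolean mask that is true only in the positions that are
      // actually loaded; the compress shuffle emitted below packs those lanes
      // together.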
      if (IsMasked) {
        SmallVector<Constant *> MaskValues(
            getNumElements(LoadVecTy) / getNumElements(LI->getType()),
            ConstantInt::getFalse(VecTy->getContext()));
        for (int I : CompressMask)
          MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
        if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
          assert(SLPReVec && "Only supported by REVEC.");
          MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        }
        Constant *MaskValue = ConstantVector::get(MaskValues);
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         MaskValue);
      } else {
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      }
      NewLI = ::propagateMetadata(NewLI, E->Scalars);
      // TODO: include this cost into CommonCost.
      if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
                                               CompressMask);
      }
      NewLI =
          cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
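      // If the distance between the first and the last pointers is known at
      // compile time, the stride is a constant; otherwise it has to be
      // computed at runtime.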
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        OrdersType Order;
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
                              &*Builder.GetInsertPoint());
        Value *NewStride =
            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
        StrideVal = Builder.CreateMul(
            NewStride,
            ConstantInt::get(
                StrideTy,
                (IsReverseOrder ? -1 : 1) *
                    static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      if (isa<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // CreateMaskedGather expects VecTy and VecPtr to have the same size.
        // We need to expand VecPtr if ScalarTy is a vector type.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            ConstantVector::get(Indices));
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
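    // For compressed loads the metadata has already been propagated to the
    // load above, so it is not propagated a second time here.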
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? NewLI
                   : ::propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
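      // Note: the element stride is negative, i.e. the lanes are stored at
      // decreasing addresses starting from the selected base pointer.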
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
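    // Prefer the intrinsic form when its cost does not exceed the cost of the
    // vector library call.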
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
      TysForDecl.push_back(VecTy);
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      ValueList OpVL;
      // Some intrinsics have scalar arguments. Such arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is signed min).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
          TysForDecl.push_back(ScalarArg->getType());
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
      OpVecs.push_back(OpVec);
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
        TysForDecl.push_back(OpVec->getType());
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(
                           static_cast<unsigned>(VecTy->getNumElements())),
                       false /*HasGlobalPred*/);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
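        // Both opcodes are casts of the same source. If the minimized
        // bitwidth is not wider than the source element width, a single
        // (possibly truncating) cast of LHS is enough and no alternate
        // shuffle is required.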
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instruction, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create shuffle to take alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to
      // each vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
      }
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}

Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  return vectorizeTree(ExternallyUsedValues);
}

Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    Instruction *ReductionRoot,
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // Clean the Entry-to-LastInstruction table; it can be invalidated by
  // scheduling, so it needs to be rebuilt.
  EntryToLastInstruction.clear();
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get());
    }
  }
  (void)vectorizeTree(VectorizableTree[0].get());
  // Run through the list of postponed gathers and emit them, replacing the
  // temporarily emitted allocas with actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were
    // some unresolved dependencies at the moment when the stub instruction
    // was emitted. If any of these dependencies turn out to be an operand of
    // another PHI coming from this same block, the position of the stub
    // instruction becomes invalid, because the source vector that is supposed
    // to feed this gather node was inserted at the end of the block [after
    // the stub instruction]. So we need to adjust the insertion point again
    // to the end of the block.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
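      // The types differ only in element bitwidth (see the assert above).
      // Determine whether the widening cast has to be signed by looking up
      // the scalars' tree entries and gather nodes in MinBWs.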
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (isVectorized(V)) {
          for (const TreeEntry *MNTE : getTreeEntries(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through gather nodes.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (!IsSigned) {
        // Final attempt - check user node.
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }

  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
                    << " values .\n");

  SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
  // Maps vector instruction to original insertelement instruction
  DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
  // Maps extract Scalar to the corresponding extractelement instruction in the
  // basic block. Only one extractelement per block should be emitted.
  DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
      ScalarToEEs;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
  SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we have already RAUWed. This happens when one
    // instruction has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is if extracting is cheaper and all
            // its operands are scalar.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            assert(SLPReVec && "FixedVectorType is not expected.");
            unsigned VecTyNumElements = VecTy->getNumElements();
            // When REVEC is enabled, we need to extract a vector.
            // Note: The element size of Scalar may be different from the
            // element size of Vec.
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend ScalarRoot
          // to the larger type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(),
                !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        // The then-branch of the previous if may produce constants, since
        // operand 0 might be a constant.
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };

    // If User == nullptr, the Scalar remains as scalar in vectorized
    // instructions or is used as extra arg. Generate ExtractElement instruction
    // and update the record for this scalar in ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           Scalar->hasNUsesOrMore(UsesLimit) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(
               Scalar->users(),
               [&, TTI = TTI](llvm::User *U) {
                 if (ExternalUsesAsOriginalScalar.contains(U))
                   return true;
                 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                 return !UseEntries.empty() &&
                        (E->State == TreeEntry::Vectorize ||
                         E->State == TreeEntry::StridedVectorize ||
                         E->State == TreeEntry::CompressVectorize) &&
                        any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                          return (UseEntry->State == TreeEntry::Vectorize ||
                                  UseEntry->State ==
                                      TreeEntry::StridedVectorize ||
                                  UseEntry->State ==
                                      TreeEntry::CompressVectorize) &&
                                 doesInTreeUserNeedToExtract(
                                     Scalar, getRootEntryInstruction(*UseEntry),
                                     TLI, TTI);
                        });
               })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getElementIndex(VU);
          if (InsertIdx) {
            auto *It = find_if(
                ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }

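  // Emits a shuffle of one or two source vectors: the combined mask indexes
  // the lanes of V1 first and the lanes of V2 starting at VF, so split it
  // into one mask per source before handing it to the shuffle builder.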
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

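  // Resizes Vec to the vector factor implied by Mask: if the mask references
  // lanes beyond its own size, apply it directly; otherwise (unless a single
  // mask was requested) keep the used lanes via an identity-style resize mask.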
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }

    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorize tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }

  SmallVector<Instruction *> RemovedInsts;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

          // It is legal to delete users in the ignorelist.
          assert((isVectorized(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            is_contained(VectorizableTree.front()->Scalars, I)) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            is_contained(VectorizableTree.front()->Scalars, I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace the condition of a logical op in select form.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with the non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
  // cache correctness.
  // NOTE: removeInstructionAndOperands only marks the instruction for deletion
  // - instructions are not deleted until later.
  removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);

  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
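  // If the root node was minimized to a bitwidth different from the one
  // expected by the reduction, cast the final vector to the reduction
  // bitwidth.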
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are
    // instructions that are defined in this basic block then we can't
    // hoist this instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or undefs.
  // E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle
  // %0, poison, <0, 0, 0, 0>.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           ::getNumberOfParts(*TTI, SI1->getType()) ==
               ::getNumberOfParts(
                   *TTI, getWidenedType(SI1->getType()->getElementType(),
                                        SM1.size() - LastUndefsCnt));
  };
|
|
// Perform O(N^2) search over the gather/shuffle sequences and merge identical
|
|
// instructions. TODO: We can further optimize this scan if we split the
|
|
// instructions into different buckets based on the insert lane.
|
|
SmallVector<Instruction *, 16> Visited;
|
|
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
|
|
assert(*I &&
|
|
(I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
|
|
"Worklist not sorted properly!");
|
|
BasicBlock *BB = (*I)->getBlock();
|
|
// For all instructions in blocks containing gather sequences:
|
|
for (Instruction &In : llvm::make_early_inc_range(*BB)) {
|
|
if (isDeleted(&In))
|
|
continue;
|
|
if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
|
|
!GatherShuffleExtractSeq.contains(&In))
|
|
continue;
|
|
|
|
// Check if we can replace this instruction with any of the
|
|
// visited instructions.
|
|
bool Replaced = false;
|
|
for (Instruction *&V : Visited) {
|
|
SmallVector<int> NewMask;
|
|
if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
|
|
DT->dominates(V->getParent(), In.getParent())) {
|
|
In.replaceAllUsesWith(V);
|
|
eraseInstruction(&In);
|
|
if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
|
|
if (!NewMask.empty())
|
|
SI->setShuffleMask(NewMask);
|
|
Replaced = true;
|
|
break;
|
|
}
|
|
if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
|
|
GatherShuffleExtractSeq.contains(V) &&
|
|
IsIdenticalOrLessDefined(V, &In, NewMask) &&
|
|
DT->dominates(In.getParent(), V->getParent())) {
|
|
In.moveAfter(V);
|
|
V->replaceAllUsesWith(&In);
|
|
eraseInstruction(V);
|
|
if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
|
|
if (!NewMask.empty())
|
|
SI->setShuffleMask(NewMask);
|
|
V = &In;
|
|
Replaced = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!Replaced) {
|
|
assert(!is_contained(Visited, &In));
|
|
Visited.push_back(&In);
|
|
}
|
|
}
|
|
}
|
|
CSEBlocks.clear();
|
|
GatherShuffleExtractSeq.clear();
|
|
}

BoUpSLP::ScheduleBundle &
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    // Group the instructions to a bundle.
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(cast<Instruction>(V))
        .first->getSecond()
        .push_back(BundlePtr.get());
  }
  assert(BundlePtr.get() && *BundlePtr.get() &&
         "Failed to find schedule bundle");
  return *BundlePtr.get();
}

// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    // It is seldom that this needs to be done a second time after adding the
    // initial bundle to the region.
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      }
      ReSchedule = true;
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                        << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are no
    // cyclic dependencies and we can schedule it. Note that it's important that
    // we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it is necessary to recalculate
      // all dependencies.
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order at the actual
      // scheduling.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
        !Bundles.empty())
      for_each(Bundles, [&](ScheduleBundle *B) { ReadyInsts.remove(B); });

    if (!BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleData *BD : Bundle.getBundle()) {
      if (BD->isReady()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    for (Value *V : VL) {
      if (doesNotNeedToBeScheduled(V))
        continue;
      ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back();
    }
    return std::nullopt;
  }
  return &Bundle;
}

BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}

bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so they are not
  // counted against the budget. Otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}

void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}

void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  SmallVector<ScheduleData *> WorkList;
  auto ProcessNode = [&](ScheduleData *BundleMember) {
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    for (User *U : BundleMember->getInst()->users()) {
      if (ScheduleData *UseSD = getScheduleData(U)) {
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        WorkList.push_back(UseSD);
      }
    }

    auto MakeControlDependent = [&](Instruction *I) {
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which precedes it.
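    // Illustrative sketch (not from the source): given
    //   call void @may_not_return()
    //   %v = sdiv i32 %a, %b   ; may trap, not speculatable
    // the sdiv must not be reordered above the call, so the loop below
    // records a control dependency between the two.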
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
          continue;

        // Add the dependency
        MakeControlDependent(I);

        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          // Everything past here must be control dependent on I.
          break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloca alloca instruction, it needs to be scheduled
      // after any preceding stacksave. We also need to prevent any alloca
      // from reordering above a preceding stackrestore.
      if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
              match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependent on BundleMember->Inst.
            break;

          if (!isa<AllocaInst>(I))
            continue;

          // Add the dependency
          MakeControlDependent(I);
        }
      }

      // In addition to the cases handled just above, we need to prevent
      // allocas and loads/stores from moving below a stacksave or a
      // stackrestore. Avoiding moving allocas below a stackrestore is
      // currently thought to be conservatism (stricter than necessary).
      // Moving loads/stores below a stackrestore can lead to incorrect code.
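      // Illustrative sketch (not from the source): in
      //   %ss = call ptr @llvm.stacksave()
      //   %a  = alloca i32
      //   store i32 %v, ptr %a
      //   call void @llvm.stackrestore(ptr %ss)
      // sinking the store past the stackrestore would write to deallocated
      // stack memory, so the dependencies added here forbid that motion.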
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            continue;

          // Add the dependency
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non-memory-affecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(DepDest) && "Expected to be in region");

      // We have two limits to reduce the complexity:
      // 1) AliasedCheckLimit: It's a small limit to reduce calls to
      //    SLP->isAliased (which is the expensive part in this loop).
      // 2) MaxMemDepDistance: It's for very large blocks and it aborts
      //    the whole loop (even if the loop is fast, it's quadratic).
      //    It's important for the loop break condition (see below) to
      //    check this limit even between two read-only instructions.
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {

        // We increment the counter only if the locations are aliased
        // (instead of counting all alias checks). This gives a better
        // balance between reduced runtime and accurate dependencies.
        NumAliased++;

        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        WorkList.push_back(DepDest);
      }

      // Example, explaining the loop break condition: Let's assume our
      // starting instruction is i0 and MaxMemDepDistance = 3.
      //
      // +--------v--v--v
      // i0,i1,i2,i3,i4,i5,i6,i7,i8
      // +--------^--^--^
      //
      // MaxMemDepDistance lets us stop alias-checking at i3 and we add
      // dependencies from i0 to i3,i4,.. (even if they are not aliased).
      // Previously we already added dependencies from i3 to i6,i7,i8
      // (because of MaxMemDepDistance). As we added a dependency from
      // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
      // and we can abort this loop at i6.
      if (DistToSrc >= 2 * MaxMemDepDistance)
        break;
      DistToSrc++;
    }
  };

  WorkList.push_back(Bundle.getBundle().front());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(SD->getInst());
    if (Bundles.empty()) {
      ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for_each(Bundles, [&](ScheduleBundle *Bundle) {
      if (!Visited.insert(Bundle).second || Bundle->hasValidDependencies())
        return;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    });
    if (InsertInReadyList && SD->isReady()) {
      for_each(Bundles, [&](ScheduleBundle *Bundle) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          return;
        ReadyInsts.insert(Bundle);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
                          << "\n");
      });
    }
  }
}

void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
    for (ScheduleBundle *Bundle : getScheduleBundles(I)) {
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleBundle not in scheduling region");
      Bundle->setScheduled(/*Scheduled=*/false);
    }
  }
  ReadyInsts.clear();
}

void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              doesNotNeedToSchedule(SDTEs.front()->Scalars)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      continue;
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleData *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        if (!Scheduled.insert(PickedInst).second)
          continue;
        if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
          PickedInst->moveAfter(LastScheduledInst->getPrevNode());
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    } else {
      auto *SD = cast<ScheduleData>(Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }
    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}

unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
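  // Illustrative sketch (not from the source): for
  //   %l = load i16, ptr %p
  //   %e = sext i16 %l to i64
  //   %a = add i64 %e, 1
  // the traversal below reaches the i16 load from %a, so the element size
  // is based on the 16-bit memory access rather than the 64-bit user.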
  SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent(), 0);
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent, Level] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip it.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    if (Level > RecursionMaxDepth)
      continue;

    // If the current instruction is a load, update Width to reflect the
    // width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent(), Level + 1);
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}

bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  // If the value is not a vectorized instruction in the expression and not
  // used by an insertelement instruction and not used in multiple vector
  // nodes, it cannot be demoted.
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    if (getTreeEntries(V).size() > 1)
      return false;
    // For the last shuffle of sext/zext with many uses, we need to check the
    // extra bit for unsigned values; otherwise we may get incorrect casts for
    // reused scalars.
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedNode)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
            SameTE)
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot)) {
            ToDemote.push_back(E.Idx);
            return true;
          }
      }
      // Check the possible extractelement instruction bases and the final
      // vector length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
              ::getNumberOfParts(
                  *TTI,
                  getWidenedType(
                      IntegerType::get(OrigScalarTy->getContext(), BitWidth),
                      VF))) {
        ToDemote.push_back(E.Idx);
        return true;
      }
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BitWidth >= OrigBitWidth) {
          if (BestFailBitwidth == 0) {
            BitWidth = OrigBitWidth;
            return false;
          }
          MaxDepthLevel = 1;
          BitWidth = BestFailBitwidth;
          NeedToExit = true;
          return true;
        }
        return false;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it; otherwise
          // exit.
          if (any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };

  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
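    // Illustrative sketch (not from the source):
    //   trunc (shl i32 %x, 3) to i16  ==  shl i16 (trunc i32 %x to i16), 3
    // whenever the shift amount is known to be smaller than the narrow
    // bitwidth (here, < 16), which is exactly what the checker verifies.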
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
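    // Illustrative sketch (not from the source):
    //   trunc (lshr i32 %x, 4) to i16
    // can become an i16 lshr when the upper 16 bits of %x are known zero,
    // so no set bits can be shifted into the narrow value.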
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
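    // Illustrative sketch (not from the source):
    //   trunc (ashr i32 %x, 2) to i16
    // is safe when %x has more than 16 sign bits: every bit shifted in then
    // replicates the sign bit of the narrow value as well.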
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
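    // Illustrative sketch (not from the source):
    //   udiv i32 %x, %y
    // can be narrowed to i16 when the upper 16 bits of both operands are
    // known zero; the quotient (or remainder) then also fits in 16 bits.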
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });

    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                  nullptr, DT);
        unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
                                                  nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(BitWidth);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
      auto VecCallCosts = getVectorCallCosts(
          IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}

static RecurKind getRdxKind(Value *V);

void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // resize to the final type.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // The reduction was already analyzed and found unprofitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(UserTE->getMainOp()) ||
                        isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    if (!E.hasState())
      return 0u;

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts = ::getNumberOfParts(
        *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision. It would be safe to truncate the
    // roots of the expression to this width.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
    // True.
    // Determine if the sign bit of all the roots is known to be zero. If not,
    // IsKnownPositive is set to False.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type. Otherwise, if we know the sign bit is zero, we will
      // zero-extend the root instead.
      //
      // FIXME: This is somewhat suboptimal, as there will be cases where
      //        adding one to the maximum bit width will yield a larger-than-
      //        necessary type. In general, we need to add an extra bit only if
      //        we can't prove that the upper bit of the original type is equal
      //        to the upper bit of the proposed smaller type. If these two
      //        bits are the same (either zero or one) we know that
      //        sign-extending from the smaller type will result in the same
      //        value. Here, since we can't yet prove this, we are just making
      //        the proposed smaller type larger to ensure correctness.
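      // Illustrative sketch (not from the source): an i32 root known to fit
      // in 7 value bits but with an unknown sign bit gets BitWidth1 = 8 after
      // the increment, so sign-extending the demoted i8 back to i32
      // reproduces the original value.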
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large but the reduced type does not improve
    // register usage, ignore it.
    if (NumParts > 1 &&
        NumParts ==
            ::getNumberOfParts(
                *TTI, getWidenedType(IntegerType::get(F->getContext(),
                                                      bit_ceil(MaxBitWidth)),
                                     VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote and
    // additional roots that require investigating in Roots.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result. That is, those seeded by truncations we will
  // modify.
  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(
              VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
              [&](Value *V) {
                auto *IC = dyn_cast<ICmpInst>(V);
                return IC && (IC->isSigned() ||
                              !isKnownNonNegative(IC->getOperand(0),
                                                  SimplifyQuery(*DL)) ||
                              !isKnownNonNegative(IC->getOperand(1),
                                                  SimplifyQuery(*DL)));
              });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      NodesToKeepBWs.insert_range(ToDemote);
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}

PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
  auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
  auto *AA = &AM.getResult<AAManager>(F);
  auto *LI = &AM.getResult<LoopAnalysis>(F);
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
  auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  if (!RunSLPVectorization)
    return false;
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  // delete instructions.

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
      continue;

    // Start new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}

std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                    << "\n");
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or there is a non-power-of-2 number of
  // unique values - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if the tree is tiny and the store itself or its value is not
  // vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}

/// Checks that the quadratic mean (RMS) deviation of the tree sizes is small
/// relative to the mean: Dev * 96 < Mean * Mean, i.e. the RMS deviation is
/// below roughly 10% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}

namespace {

/// A group of stores that we'll try to bundle together using vector ops.
/// They are ordered using the signed distance of their address operand to the
/// address of this group's BaseInstr.
struct RelatedStoreInsts {
  RelatedStoreInsts(unsigned BaseInstrIdx) { reset(BaseInstrIdx); }
  void reset(unsigned NewBaseInstr) {
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist.
  /// Does nothing if there is already a store with that \p PtrDist.
  /// \returns The previously associated Instruction index, or std::nullopt
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::optional<unsigned>(It->second);
  }

  /// The index of the Base instruction, i.e. the one with a 0 pointer
  /// distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  using DistToInstMap = std::map<int, unsigned>;
  DistToInstMap Instrs;
};
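
// Illustrative sketch (not from the source): for stores at base, base+1 and
// base+2 elements, the map holds {0 -> BaseIdx, 1 -> Idx1, 2 -> Idx2}; the
// ordered keys let the chain below be walked in increasing address order.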

} // end anonymous namespace

bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store
  // twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
        Operands.clear();
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
      // getStoreMinimumVF only supports scalar types as arguments. As a
      // result, we need to use the element types of StoreTy and ValueTy to
      // retrieve the VF and then transform it back.
      // Remember: VF is defined as the number we want to vectorize, not the
      // number of elements in the final vector.
|
|
Type *StoreScalarTy = StoreTy->getScalarType();
|
|
unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
|
|
R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
|
|
ValueTy->getScalarType()));
|
|
MinVF /= getNumElements(StoreTy);
|
|
MinVF = std::max<unsigned>(2, MinVF);
|
|
|
|
if (MaxVF < MinVF) {
|
|
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
|
|
<< ") < "
|
|
<< "MinVF (" << MinVF << ")\n");
|
|
continue;
|
|
}
|
|
|
|
unsigned NonPowerOf2VF = 0;
|
|
if (VectorizeNonPowerOf2) {
|
|
// First try vectorizing with a non-power-of-2 VF. At the moment, only
|
|
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
|
|
// lanes are used.
|
|
unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
|
|
if (has_single_bit(CandVF + 1)) {
|
|
NonPowerOf2VF = CandVF;
|
|
assert(NonPowerOf2VF != MaxVF &&
|
|
"Non-power-of-2 VF should not be equal to MaxVF");
|
|
}
|
|
}
|
|
|
|
unsigned MaxRegVF = MaxVF;
|
|
MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
|
|
if (MaxVF < MinVF) {
|
|
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
|
|
<< ") < "
|
|
<< "MinVF (" << MinVF << ")\n");
|
|
continue;
|
|
}
|
|
|
|
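      // The candidate VFs end up in decreasing order: the optional
      // non-power-of-2 VF (the widest) first, then MaxVF, MaxVF / 2, ...,
      // down to MinVF.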
      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
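      // Per store, RangeSizes keeps the largest tree size found so far for
      // register-sized VFs (first) and for VFs above MaxRegVF (second);
      // 1 means "not tried yet", 0 means "already vectorized".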
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
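      // Maps the first store of a failed slice to the {max, min} VF for
      // which scheduling was impossible; slices at least as wide as the
      // recorded minimum are skipped without re-analysis.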
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
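      // Retry with refreshed candidate VFs, up to MaxAttempts rounds, until
      // everything is vectorized or no attempt makes further progress.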
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert_range(Slice);
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized initial block, no need to try to vectorize
                // it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  if (Sz == End)
                    End = Cnt;
                  Sz = Cnt;
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for the very big VFs that we're not rebuilding same
              // trees, just with larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or if the last attempts are not
        // needed at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize max number of elements, if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };

  // Stores pair (first: index of the store into Stores array ref, address of
  // which is taken as the base, second: sorted set of pairs {index, dist},
  // which are indices of stores in the set and their store location distances
  // relative to the base address).

  // Need to store the index of the very first store separately, since the set
  // may be reordered after the insertion and the first store may be moved.
  // This container allows us to reduce the number of getPointersDiff() calls.
  SmallVector<RelatedStoreInsts> SortedStores;

  // Inserts the specified store SI with the given index Idx to the set of the
  // stores. If the store with the same distance is found already - stop
  // insertion, try to vectorize already found stores. If some stores from
  // this sequence were not vectorized - try to vectorize them with the new
  // store later. But this logic is applied only to the stores that come
  // before the previous store with the same distance.
  // Example:
  // 1. store x, %p
  // 2. store y, %p+1
  // 3. store z, %p+2
  // 4. store a, %p
  // 5. store b, %p+3
  // - Scan this from the last to first store. The very first bunch of stores
  // is {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
  // vector).
  // - The next store in the list - #1 - has the same distance from store #5
  // as the store #4.
  // - Try to vectorize sequence of stores 4,2,3,5.
  // - If all these stores are vectorized - just drop them.
  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
  // - Start new stores sequence.
  // The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from previous sequence, that were not vectorized.
  // Here we consider the stores in reverse order, rather than the order in
  // which they are used in the IR (Stores are reversed already, see
  // vectorizeStoreChains() function).
  // Store #3 can be added -> comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - comes before store #4.
  // This logic allows us to improve compile time: we assume that the stores
  // after the previous store with the same distance most likely have memory
  // dependencies, so there is no need to waste compile time trying to
  // vectorize them.
  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (RelatedStoreInsts &StoreSeq : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[StoreSeq.BaseInstrIdx]->getValueOperand()->getType(),
          Stores[StoreSeq.BaseInstrIdx]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      std::optional<unsigned> PrevInst =
          StoreSeq.insertOrLookup(/*InstrIdx=*/Idx, /*PtrDist=*/*Diff);
      if (!PrevInst) {
        // No store was associated to that distance. Keep collecting.
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(StoreSeq.Instrs);
      RelatedStoreInsts::DistToInstMap PrevSet;
      copy_if(StoreSeq.Instrs, std::inserter(PrevSet, PrevSet.end()),
              [&](const std::pair<int, unsigned> &DistAndIdx) {
                return DistAndIdx.second > *PrevInst;
              });
      StoreSeq.reset(Idx);
      // Insert stores that followed previous match to try to vectorize them
      // with this store.
      unsigned StartIdx = *PrevInst + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to previously found dup store (or this store, since they
      // store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (auto [PtrDist, InstIdx] : reverse(PrevSet)) {
        // Do not try to vectorize sequences we already tried.
        if (VectorizedStores.contains(Stores[InstIdx]))
          break;
        unsigned BI = InstIdx - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = PtrDist - *Diff;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          StoreSeq.insertOrLookup(I, Dists[BI]);
      }
      return;
    }
    // We did not find a comparable store, start a new sequence.
    SortedStores.emplace_back(Idx);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.Instrs);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.Instrs);

  return Changed;
}

void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}

bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal LLVM type name,
      // which may not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  Type *ScalarTy = getValueType(VL[0]);
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(
      getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  unsigned NextInst = 0, MaxInst = VL.size();
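  // Walk candidate VFs in decreasing order, narrowing via
  // getFloorFullVectorNumberOfElements on each step.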
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
    // No actual vectorization should happen if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is
    // used for the vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}

bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
}

namespace {

/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction instructions that has values
/// that can be put into a vector as its leaves. For example:
///
/// mul mul mul mul
///  \  /    \  /
///   +       +
///    \     /
///       +
/// This tree has "mul" as its leaf values and "+" as its reduction
/// instructions. A reduction can feed into a store or a binary operation
/// feeding a phi.
///    ...
///    \  /
///     +
///     |
///  phi +=
///
///  Or:
///    ...
///    \  /
///     +
///     |
///   *p =
///
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operation.
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;
  /// Contains vector values for reduction including their scale factor and
  /// signedness.
  SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;

  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  // And/or are potentially poison-safe logical patterns like:
  // select x, y, false
  // select x, true, y
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not
      // have to rule out -0.0 here because the intrinsic semantics do not
      // specify a fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y
    // To make that work with the normal operand processing, we skip the
    // true value operand.
    // TODO: Change the code and data structures to handle this without a hack.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    Type *OpTy = LHS->getType();
    assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
    switch (Kind) {
    case RecurKind::Or: {
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
        return Builder.CreateSelect(
            LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
            RHS, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::And: {
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
        return Builder.CreateSelect(
            LHS, RHS,
            ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }

public:
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;

    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;

    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    // TODO: If we are canonicalizing to intrinsics, we can remove several
    // special-case paths that deal with selects.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
      // %1 = extractelement <2 x i32> %a, i32 0
      // %2 = extractelement <2 x i32> %a, i32 1
      // %cond = icmp sgt i32 %1, %2
      // %3 = extractelement <2 x i32> %a, i32 0
      // %4 = extractelement <2 x i32> %a, i32 1
      // %select = select i1 %cond, i32 %3, i32 %4
      CmpPredicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      }

      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }

  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

private:
  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }

public:
  HorizontalReduction() = default;

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument,
    // which is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        // Also, do not try to reduce const values, if the operation is not
        // foldable.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, instructions - by
    // instruction op id and/or alternate op id, plus do extra analysis for
    // loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
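    // Two-level grouping: the outer key is the value-kind hash, the inner
    // key a subkey (e.g. the load-pointer group), and each candidate value
    // maps to its number of occurrences.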
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Key = hash_combine(hash_value(LI->getParent()), Key);
      Value *Ptr =
          getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(RLI->getPointerOperand());
              return SubKey;
            }
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of values kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by number of same/alternate opcode and/or
    // pointer operand.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }

  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                     const TargetLibraryInfo &TLI, AssumptionCache *AC) {
    const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce
    // to a nearby power-of-2. We can safely generate oversized
    // vectors and rely on the backend to split them to legal sizes.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit &&
        all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
          return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
        })) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case if they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());

    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    // Return new VectorizedTree, based on previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
              isGuaranteedNotToBePoison(VectorizedTree, AC) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ;
          } else if (isGuaranteedNotToBePoison(Res, AC) ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }

        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math-flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Need to track reduced vals, they may be changed during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus requiring extract if fully vectorized in other trees.
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
        Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
        // Check if the reduction value was not overridden by the
        // extractelement instruction because of the vectorization and exclude
        // it, if it is not compatible with other values.
        // Also check if the instruction was folded to constant/other value.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
             (!S || !S.isOpcodeOrAlt(Inst))) ||
            (S && !Inst))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // Check if the reduction value was not overridden by the
          // extractelement instruction because of the vectorization and
          // exclude it, if it is not compatible with other values.
          auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
          if (!Inst)
            continue;
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (Candidates.size() > 1 && allConstant(Candidates)) {
        Value *Res = Candidates.front();
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization
      // of original scalar identity operations on matched horizontal
      // reductions).
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // Used to check if the reduced values used the same number of times. In
      // this case the compiler may produce better code. E.g. if the reduced
      // values are aabbccdd (8 x values), then the first node of the tree
      // will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>.
      // Plus, the final reduction will be performed on <8 x aabbccdd>.
      // Instead, the compiler may build a <4 x abcd> tree immediately, +
      // reduction (4 x abcd) * 2.
      // Currently it only handles add/fadd/xor. and/or/min/max do not require
      // this analysis, other operations may require an extra estimation of
      // the profitability.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
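      // Shrinks the candidate width until the widened vector fits into the
      // available registers of its class (and rounds down to a power of 2
      // when it would still occupy more than half of them).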
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        ReduxWidth =
            getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = ::getNumberOfParts(TTI, Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
          NumParts = ::getNumberOfParts(TTI, Tp);
          NumRegs =
              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
        ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
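      // Slides the window over the candidates; once the window reaches the
      // end, restarts from Start with a smaller vector factor.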
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is
          // worth trying again with a smaller number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              if (!RedValI)
                return false;
              return V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        // The reduction root is used as the insertion point for new
        // instructions, so set it as externally used to prevent it from
        // being deleted.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
        InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          break;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            unsigned Offset = Pos == Start ? Pos : Pos - 1;
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of the analyzed values.
              for (unsigned VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (has_single_bit(VF) &&
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });

        Builder.setFastMathFlags(RdxFMF);

        // Emit a reduction. If the root is a select (min/max idiom), the
        // insert point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // Update TrackedToOrig mapping, since the tracked values might be
        // updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Type *ScalarTy = VL.front()->getType();
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          unsigned ScalarTyNumElements = getNumElements(ScalarTy);
          Value *ReducedSubTree = PoisonValue::get(
              getWidenedType(ScalarTy->getScalarType(), ScalarTyNumElements));
          for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
            // Do reduction for each lane.
            // e.g., do reduce add for
            // VL[0] = <4 x Ty> <a, b, c, d>
            // VL[1] = <4 x Ty> <e, f, g, h>
            // Lane[0] = <2 x Ty> <a, e>
            // Lane[1] = <2 x Ty> <b, f>
            // Lane[2] = <2 x Ty> <c, g>
            // Lane[3] = <2 x Ty> <d, h>
            // result[0] = reduce add Lane[0]
            // result[1] = reduce add Lane[1]
            // result[2] = reduce add Lane[2]
            // result[3] = reduce add Lane[3]
            SmallVector<int, 16> Mask =
                createStrideMask(I, ScalarTyNumElements, VL.size());
            Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
            Value *Val =
                createSingleOp(Builder, *TTI, Lane,
                               OptReusedScalars && SameScaleFactor
                                   ? SameValuesCounter.front().second
                                   : 1,
                               Lane->getType()->getScalarType() !=
                                       VL.front()->getType()->getScalarType()
                                   ? V.isSignedMinBitwidthRootNode()
                                   : true,
                               RdxRootInst->getType());
            ReducedSubTree =
                Builder.CreateInsertElement(ReducedSubTree, Val, I);
          }
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
        } else {
          Type *VecTy = VectorizedRoot->getType();
          Type *RedScalarTy = VecTy->getScalarType();
          VectorValuesAndScales.emplace_back(
|
|
VectorizedRoot,
|
|
OptReusedScalars && SameScaleFactor
|
|
? SameValuesCounter.front().second
|
|
: 1,
|
|
RedScalarTy != ScalarTy->getScalarType()
|
|
? V.isSignedMinBitwidthRootNode()
|
|
: true);
|
|
}
|
|
|
|
// Count vectorized reduced values to exclude them from final reduction.
|
|
for (Value *RdxVal : VL) {
|
|
Value *OrigV = TrackedToOrig.at(RdxVal);
|
|
if (IsSupportedHorRdxIdentityOp) {
|
|
VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
|
|
continue;
|
|
}
|
|
++VectorizedVals.try_emplace(OrigV).first->getSecond();
|
|
if (!V.isVectorized(RdxVal))
|
|
RequiredExtract.insert(RdxVal);
|
|
}
|
|
Pos += ReduxWidth;
|
|
Start = Pos;
|
|
ReduxWidth = NumReducedVals - Pos;
|
|
if (ReduxWidth > 1)
|
|
ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
|
|
AnyVectorized = true;
|
|
}
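      // If no vector reduction was emitted but repeated scalars were found,
      // the block below still exploits the repeat counts by emitting scaled
      // scalar ops (illustrative example: a + a + a can become 3 * a even
      // when nothing was vectorized).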
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
        continue;
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    if (VectorizedTree) {
      // Reorder operands of bool logical op in the natural order to avoid
      // possible problems with poison propagation. If it is not possible to
      // reorder (both operands are originally RHS), emit an extra freeze
      // instruction for the LHS operand.
      // I.e., if we have original code like this:
      // RedOp1 = select i1 ?, i1 LHS, i1 false
      // RedOp2 = select i1 RHS, i1 ?, i1 false

      // Then, we swap LHS/RHS to create a new op that matches the poison
      // semantics of the original code.

      // If we have original code like this and both values could be poison:
      // RedOp1 = select i1 ?, i1 LHS, i1 false
      // RedOp2 = select i1 ?, i1 RHS, i1 false

      // Then, we must freeze LHS in the new op.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                      getRdxOperand(RedOp1, 0) == LHS ||
                                      isGuaranteedNotToBePoison(LHS, AC)))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      isGuaranteedNotToBePoison(RHS, AC))) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction.
      // Need to add the extra arguments and the possibly not vectorized
      // reduction values.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
      auto FinalGen =
          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
              bool InitStep) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                                     Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Instruction *RedOp = InstVals[I + 1].first;
              Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                                RedOp, InitStep);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx", ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (ArrayRef<Value *> Candidates : ReducedVals) {
        for (Value *RdxVal : Candidates) {
          if (!Visited.insert(RdxVal).second)
            continue;
          unsigned NumOps = VectorizedVals.lookup(RdxVal);
          for (Instruction *RedOp :
               ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
            ExtraReductions.emplace_back(RedOp, RdxVal);
        }
      }
      // Iterate through all non-vectorized reduction values/extra arguments.
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining
      // uses outside the reduction tree itself. Assert that we got this
      // correct, replace internal uses with poison, and mark for eventual
      // deletion.
#ifndef NDEBUG
      SmallSet<Value *, 4> IgnoreSet;
      for (ArrayRef<Value *> RdxOps : ReductionOps)
        IgnoreSet.insert_range(RdxOps);
#endif
      for (ArrayRef<Value *> RdxOps : ReductionOps) {
        for (Value *Ignore : RdxOps) {
          if (!Ignore)
            continue;
#ifndef NDEBUG
          for (auto *U : Ignore->users()) {
            assert(IgnoreSet.count(U) &&
                   "All users must be in the reduction ops list.");
          }
#endif
          if (!Ignore->use_empty()) {
            Value *P = PoisonValue::get(Ignore->getType());
            Ignore->replaceAllUsesWith(P);
          }
        }
        V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
      }
    } else if (!CheckForReusedReductionOps) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }

private:
  /// Creates the reduction from the given \p Vec vector value with the given
  /// scale \p Scale and signedness \p IsSigned.
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    if (Rdx->getType() != DestTy->getScalarType())
      Rdx = Builder.CreateIntCast(Rdx, DestTy->getScalarType(), IsSigned);
    // Improved analysis for add/fadd/xor reductions with same scale
    // factor for all operands of reductions. We can emit scalar ops for
    // them instead.
    if (Scale > 1)
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
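
  // Note on the cost model used below: getReductionCost returns
  // (VectorCost - ScalarCost), so a negative value means the vectorized
  // reduction is expected to be cheaper than the scalar chain; the caller
  // compares the total tree cost against -SLPCostThreshold before
  // vectorizing.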

  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // A reduction operation cost is required if:
    // 1. This type is not a full register type and there is no other vector
    // with the same type in the storage (first vector with a small type).
    // 2. The storage does not have any vector with full vector use (first
    // vector with full register use).
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts) {
        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          unsigned ScalarTyNumElements = VecTy->getNumElements();
          for (unsigned I : seq<unsigned>(ReducedVals.size())) {
            VectorCost += TTI->getShuffleCost(
                TTI::SK_PermuteSingleSrc, VectorTy,
                createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
            VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
                                                          CostKind);
          }
          VectorCost += TTI->getScalarizationOverhead(
              VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
              /*Extract*/ false, TTI::TCK_RecipThroughput);
        } else if (DoesRequireReductionOp) {
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          if (RType == RedTy) {
            VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
                                                         FMF, CostKind);
          } else {
            VectorCost = TTI->getExtendedReductionCost(
                RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
                FMF, CostKind);
          }
        } else {
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
          VectorCost +=
              TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
        } else {
          // Check if the previous reduction already exists and account it as
          // series of operations + single reduction.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
          VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction of " << shortBundleName(ReducedVals)
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
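
  // Illustrative sketch of what emitReduction below produces for two partial
  // vector values %a and %b of an 'add' reduction (names are hypothetical):
  //   %c = add <4 x i32> %a, %b          ; combine the registers first
  //   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
  // i.e. a single final reduction instead of one reduction per partial value.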

  /// Splits the values, stored in VectorValuesAndScales, into registers/free
  /// sub-registers, combines them with the given reduction operation as a
  /// vector operation and then performs single (small enough) reduction.
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates reduction and combines with the previous reduction.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // Scales Vec using the given scale factor Cnt and then combines it with
    // the previous value of VecRes.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      Type *ScalarTy = Vec->getType()->getScalarType();
      // Scale Vec using the given scale factor Cnt.
      if (Cnt > 1) {
        ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
        switch (RdxKind) {
        case RecurKind::Add: {
          if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
            unsigned VF = getNumElements(Vec->getType());
            LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << " of " << Vec
                              << ". (HorRdx)\n");
            SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
            for (unsigned I : seq<unsigned>(Cnt))
              std::iota(std::next(Mask.begin(), VF * I),
                        std::next(Mask.begin(), VF * (I + 1)), 0);
            ++NumVectorInstructions;
            Vec = Builder.CreateShuffleVector(Vec, Mask);
            break;
          }
          // res = mul vv, n
          if (ScalarTy != DestTy->getScalarType())
            Vec = Builder.CreateIntCast(
                Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
                IsSigned);
          Value *Scale = ConstantVector::getSplat(
              EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateMul(Vec, Scale);
          break;
        }
        case RecurKind::Xor: {
          // res = n % 2 ? 0 : vv
          LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << Vec
                            << ". (HorRdx)\n");
          if (Cnt % 2 == 0)
            Vec = Constant::getNullValue(Vec->getType());
          break;
        }
        case RecurKind::FAdd: {
          // res = fmul v, n
          Value *Scale =
              ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateFMul(Vec, Scale);
          break;
        }
        case RecurKind::And:
        case RecurKind::Or:
        case RecurKind::SMax:
        case RecurKind::SMin:
        case RecurKind::UMax:
        case RecurKind::UMin:
        case RecurKind::FMax:
        case RecurKind::FMin:
        case RecurKind::FMaximum:
        case RecurKind::FMinimum:
          // res = vv
          break;
        case RecurKind::Mul:
        case RecurKind::FMul:
        case RecurKind::FMulAdd:
        case RecurKind::IAnyOf:
        case RecurKind::FAnyOf:
        case RecurKind::IFindLastIV:
        case RecurKind::FFindLastIV:
        case RecurKind::None:
          llvm_unreachable("Unexpected reduction kind for repeated scalar.");
        }
      }
      // Combine Vec with the previous value of VecRes.
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
      } else {
        ++NumVectorInstructions;
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
            VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
          // Handle ctpop.
          unsigned VecResVF = getNumElements(VecRes->getType());
          unsigned VecVF = getNumElements(Vec->getType());
          SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
          std::iota(Mask.begin(), Mask.end(), 0);
          // Ensure that VecRes is always larger than Vec.
          if (VecResVF < VecVF) {
            std::swap(VecRes, Vec);
            std::swap(VecResVF, VecVF);
          }
          if (VecResVF != VecVF) {
            SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
            std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF),
                      0);
            Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
          }
          VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
          return;
        }
        if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
          VecRes = Builder.CreateIntCast(
              VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
              VecResSignedness);
        if (ScalarTy != DestTy->getScalarType())
          Vec = Builder.CreateIntCast(
              Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
              IsSigned);
        unsigned VecResVF = getNumElements(VecRes->getType());
        unsigned VecVF = getNumElements(Vec->getType());
        // Ensure that VecRes is always larger than Vec.
        if (VecResVF < VecVF) {
          std::swap(VecRes, Vec);
          std::swap(VecResVF, VecVF);
        }
        // extract + op + insert
        Value *Op = VecRes;
        if (VecResVF != VecVF)
          Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
        Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
        if (VecResVF != VecVF)
          Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
        VecRes = Op;
      }
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);

    return ReducedSubTree;
  }
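
  // Illustrative example for the i1 special case handled below (names are
  // hypothetical): an 'add' reduction of <8 x i1> %v into a wider integer is
  // lowered as
  //   %m = bitcast <8 x i1> %v to i8
  //   %c = call i8 @llvm.ctpop.i8(i8 %m)
  // and the caller then zero/sign-extends %c to the destination type.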

  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
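
  // Illustrative example for emitScaleForReusedOps below: if the matched
  // scalar of an 'add' reduction repeats Cnt = 4 times, the result is
  // 'mul %v, 4' rather than four explicit additions; for 'xor' an even
  // repeat count folds to 0 because x ^ x == 0.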

  /// Emits optimized code for unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::IFindLastIV:
    case RecurKind::FFindLastIV:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }

  /// Emits actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with an even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if the 4th and 6th elements have an even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <";
                 for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::IFindLastIV:
    case RecurKind::FFindLastIV:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
} // end anonymous namespace

/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}
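
// Illustrative example for getAggregateSize below: for the homogeneous
// aggregate [2 x {float, float}] it returns 2 (array) * 2 (struct) = 4
// float elements; a non-homogeneous struct yields std::nullopt.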
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}

static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

/// Recognize construction of vectors like
///  %ra = insertelement <4 x float> poison, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
///
/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
///
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");

  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                         R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  return BuildVectorOpds.size() >= 2;
}

/// Try and get a reduction instruction from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
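
/// Matches a reduction operation: a plain binary operator or one of the
/// two-operand min/max intrinsics. On success, the operands are returned in
/// \p V0 and \p V1.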
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
  if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
    return true;
  if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
    return true;
  return false;
}

/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if not an instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}

bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and the currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized we collect
  // instructions for possible later attempts for vectorization.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands.
    // Continue analysis for the instruction from the same basic block only to
    // save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}

bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
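
/// Sorts \p Incoming with \p Comparator, splits it into runs of compatible
/// instructions (as defined by \p AreCompatible) and feeds each run to
/// \p TryToVectorizeHelper, retrying leftover candidates with relaxed
/// constraints afterwards.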
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-stage attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with
    // the size of the maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    // possible. This may produce better vectorization results than trying to
    // vectorize only instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only, this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL) {
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
      }
    } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL) {
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
        }
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}

/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and the most compatible
/// corresponding operands. If IsCompatibility is false, function implements
/// strict weak ordering relation between two cmp instructions, returning true
/// if the first instruction is "less" than the second, i.e. its predicate is
/// less than the predicate of the second or the operands IDs are less than
/// the operands IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
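
// Note: compareCmp<false> is used below as a strict weak order for sorting
// the compares, while compareCmp<true> answers whether two compares may be
// placed into the same vector bundle.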
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
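
// vectorizeInserts below makes up to three attempts per insert instruction:
// buildvector matching restricted to the maximal VF, then horizontal
// reductions, then buildvector matching at any VF.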
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // Pass 1: try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |=
          vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
    }
    // Pass 2: try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // Pass 3: try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
|
|
|
|
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
|
|
bool Changed = false;
|
|
SmallVector<Value *, 4> Incoming;
|
|
SmallPtrSet<Value *, 16> VisitedInstrs;
|
|
// Maps phi nodes to the non-phi nodes found in the use tree for each phi
|
|
// node. Allows better to identify the chains that can be vectorized in the
|
|
// better way.
|
|
DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
|
|
auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
|
|
assert(isValidElementType(V1->getType()) &&
|
|
isValidElementType(V2->getType()) &&
|
|
"Expected vectorizable types only.");
|
|
// It is fine to compare type IDs here, since we expect only vectorizable
|
|
// types, like ints, floats and pointers, we don't care about other type.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle()) {
            const auto *E1 = dyn_cast<ExtractElementInst>(I1);
            const auto *E2 = dyn_cast<ExtractElementInst>(I2);
            if (!E1 || !E2)
              continue;

            // Sort on ExtractElementInsts primarily by vector operands. Prefer
            // program order of the vector operands.
            const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
            const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
            if (V1 != V2) {
              if (!V1 || !V2)
                continue;
              if (V1->getParent() != V2->getParent())
                continue;
              return V1->comesBefore(V2);
            }
            // If we have the same vector operand, try to sort by constant
            // index.
            std::optional<unsigned> Id1 = getExtractIndex(E1);
            std::optional<unsigned> Id2 = getExtractIndex(E2);
            // Bring constants to the top.
            if (Id1 && !Id2)
              return true;
            if (!Id1 && Id2)
              return false;
            // First elements come first.
            if (Id1 && Id2)
              return *Id1 < *Id2;

            continue;
          }
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
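  // Returns true if two phis can be grouped into one vectorization list: same
  // type, same number of collected non-phi operands, and pairwise-compatible
  // operands (instructions must be in the same block with a common opcode;
  // undefs match anything; constants match constants).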
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };
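
  // For illustration, two phis such as
  //   %p0 = phi i32 [ %a0, %bb1 ], [ %c0, %bb2 ]
  //   %p1 = phi i32 [ %a1, %bb1 ], [ %c1, %bb2 ]
  // compare equal under AreCompatiblePHIs (assuming %a0/%a1 and %c0/%c1 share
  // opcodes and blocks) and may be rewritten as a single <2 x i32> phi if the
  // incoming chains themselves vectorize profitably.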
  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert_range(Incoming);
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
  // true, also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if `I` is an unused instruction of a kind that can act as a
  // vectorization root: a void-typed instruction (e.g. a terminator or a
  // store), or a call/invoke whose return value is ignored.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
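  // Walk the block looking for vectorization roots. The walk restarts from
  // the beginning of the block whenever vectorization succeeds, since deleted
  // instructions invalidate the iterator; VisitedInstrs keeps already-checked
  // instructions from being reprocessed on later passes over the block.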
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The number of elements is unknown
    // at compile time for scalable types.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the ones we have already
    // checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain feeding the store, if this is the only
        // store to its address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}

bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(llvm::from_range, GEPList);

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index was optimized to a constant
      // value. If so, they are marked as deleted, so remove them from the set
      // of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
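      // For example, given hypothetical candidates
      //   %g0 = getelementptr i32, ptr %base, i64 %i
      //   %g1 = getelementptr i32, ptr %base, i64 %j
      // where SCEV proves %j == %i + 4, both are removed: the second address
      // is just a constant offset from the first.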
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
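      // As a hypothetical IR sketch, the bundle built above would be the
      // index operands {%idx0, %idx1} of:
      //   %idx0 = sub i64 %a0, %b0
      //   %idx1 = sub i64 %a1, %b1
      //   %g0 = getelementptr inbounds i32, ptr %g, i64 %idx0
      //   %g1 = getelementptr inbounds i32, ptr %g, i64 %idx1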
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointer, and value operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
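  // The comparison keys, in order: value operand type ID, pointer operand
  // type ID, value operand scalar width, then (for instruction-defined
  // values) dominator-tree DFS-in number and opcode, and finally the value ID
  // of the stored value.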
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return getSameOpcode({I1, I2}, *TLI).valid();
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to do bottom-to-top analysis. This is important when
    // there are several stores to the same address: in that case we need to
    // follow the store order (reversed, to honor the memory dependencies).
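    // For example, with repeated stores to one address:
    //   store i32 %x, ptr %p
    //   store i32 %y, ptr %p
    // the reversed (bottom-to-top) order ensures the analysis visits the
    // store of %y first, respecting the dependency between the two stores.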
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}