mirror of
https://github.com/llvm/llvm-project.git
synced 2025-05-03 19:46:04 +00:00

WebAssembly doesn't support horizontal operations, nor does it have a way of expressing fast-math or reassoc flags, so runtimes are currently unable to use pairwise operations when generating code from the existing shuffle patterns. This patch allows the backend to select which arbitrary shuffle pattern to use per reduction intrinsic. The default behaviour is the same as the existing one, which splits the vector into a top and a bottom half. The other pattern introduced is a pairwise shuffle. WebAssembly enables pairwise reductions for int/fp add/sub.
195 lines
6.9 KiB
C++
//===- ExpandReductions.cpp - Expand reduction intrinsics -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements IR expansion for reduction intrinsics, allowing targets
// to enable the intrinsics until just before codegen.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ExpandReductions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

namespace {
bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
|
|
bool Changed = false;
|
|
SmallVector<IntrinsicInst *, 4> Worklist;
|
|
for (auto &I : instructions(F)) {
|
|
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
|
|
switch (II->getIntrinsicID()) {
|
|
default: break;
|
|
case Intrinsic::vector_reduce_fadd:
|
|
case Intrinsic::vector_reduce_fmul:
|
|
case Intrinsic::vector_reduce_add:
|
|
case Intrinsic::vector_reduce_mul:
|
|
case Intrinsic::vector_reduce_and:
|
|
case Intrinsic::vector_reduce_or:
|
|
case Intrinsic::vector_reduce_xor:
|
|
case Intrinsic::vector_reduce_smax:
|
|
case Intrinsic::vector_reduce_smin:
|
|
case Intrinsic::vector_reduce_umax:
|
|
case Intrinsic::vector_reduce_umin:
|
|
case Intrinsic::vector_reduce_fmax:
|
|
case Intrinsic::vector_reduce_fmin:
|
|
if (TTI->shouldExpandReduction(II))
|
|
Worklist.push_back(II);
|
|
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (auto *II : Worklist) {
|
|
FastMathFlags FMF =
|
|
isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
|
|
Intrinsic::ID ID = II->getIntrinsicID();
|
|
RecurKind RK = getMinMaxReductionRecurKind(ID);
|
|
TargetTransformInfo::ReductionShuffle RS =
|
|
TTI->getPreferredExpandedReductionShuffle(II);
|
|
|
|
Value *Rdx = nullptr;
|
|
IRBuilder<> Builder(II);
|
|
IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
|
|
Builder.setFastMathFlags(FMF);
|
|
switch (ID) {
|
|
default: llvm_unreachable("Unexpected intrinsic!");
|
|
case Intrinsic::vector_reduce_fadd:
|
|
case Intrinsic::vector_reduce_fmul: {
|
|
// FMFs must be attached to the call, otherwise it's an ordered reduction
|
|
// and it can't be handled by generating a shuffle sequence.
|
|
Value *Acc = II->getArgOperand(0);
|
|
Value *Vec = II->getArgOperand(1);
|
|
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
|
|
if (!FMF.allowReassoc())
|
|
Rdx = getOrderedReduction(Builder, Acc, Vec, RdxOpcode, RK);
|
|
else {
|
|
if (!isPowerOf2_32(
|
|
cast<FixedVectorType>(Vec->getType())->getNumElements()))
|
|
continue;
|
|
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
|
|
Rdx = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, Acc, Rdx,
|
|
"bin.rdx");
|
|
}
|
|
break;
|
|
}
|
|
case Intrinsic::vector_reduce_and:
|
|
case Intrinsic::vector_reduce_or: {
|
|
// Canonicalize logical or/and reductions:
|
|
// Or reduction for i1 is represented as:
|
|
// %val = bitcast <ReduxWidth x i1> to iReduxWidth
|
|
// %res = cmp ne iReduxWidth %val, 0
|
|
// And reduction for i1 is represented as:
|
|
// %val = bitcast <ReduxWidth x i1> to iReduxWidth
|
|
// %res = cmp eq iReduxWidth %val, 11111
|
|
Value *Vec = II->getArgOperand(0);
|
|
auto *FTy = cast<FixedVectorType>(Vec->getType());
|
|
unsigned NumElts = FTy->getNumElements();
|
|
if (!isPowerOf2_32(NumElts))
|
|
continue;
|
|
|
|
if (FTy->getElementType() == Builder.getInt1Ty()) {
|
|
Rdx = Builder.CreateBitCast(Vec, Builder.getIntNTy(NumElts));
|
|
if (ID == Intrinsic::vector_reduce_and) {
|
|
Rdx = Builder.CreateICmpEQ(
|
|
Rdx, ConstantInt::getAllOnesValue(Rdx->getType()));
|
|
} else {
|
|
assert(ID == Intrinsic::vector_reduce_or && "Expected or reduction.");
|
|
Rdx = Builder.CreateIsNotNull(Rdx);
|
|
}
|
|
break;
|
|
}
|
|
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
|
|
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
|
|
break;
|
|
}
|
|
case Intrinsic::vector_reduce_add:
|
|
case Intrinsic::vector_reduce_mul:
|
|
case Intrinsic::vector_reduce_xor:
|
|
case Intrinsic::vector_reduce_smax:
|
|
case Intrinsic::vector_reduce_smin:
|
|
case Intrinsic::vector_reduce_umax:
|
|
case Intrinsic::vector_reduce_umin: {
|
|
Value *Vec = II->getArgOperand(0);
|
|
if (!isPowerOf2_32(
|
|
cast<FixedVectorType>(Vec->getType())->getNumElements()))
|
|
continue;
|
|
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
|
|
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
|
|
break;
|
|
}
|
|
case Intrinsic::vector_reduce_fmax:
|
|
case Intrinsic::vector_reduce_fmin: {
|
|
// We require "nnan" to use a shuffle reduction; "nsz" is implied by the
|
|
// semantics of the reduction.
|
|
Value *Vec = II->getArgOperand(0);
|
|
if (!isPowerOf2_32(
|
|
cast<FixedVectorType>(Vec->getType())->getNumElements()) ||
|
|
!FMF.noNaNs())
|
|
continue;
|
|
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
|
|
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
|
|
break;
|
|
}
|
|
}
|
|
II->replaceAllUsesWith(Rdx);
|
|
II->eraseFromParent();
|
|
Changed = true;
|
|
}
|
|
return Changed;
|
|
}
class ExpandReductions : public FunctionPass {
|
|
public:
|
|
static char ID;
|
|
ExpandReductions() : FunctionPass(ID) {
|
|
initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
|
|
}
|
|
|
|
bool runOnFunction(Function &F) override {
|
|
const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
|
return expandReductions(F, TTI);
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<TargetTransformInfoWrapperPass>();
|
|
AU.setPreservesCFG();
|
|
}
|
|
};
} // end anonymous namespace
char ExpandReductions::ID;
|
|
INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
|
|
"Expand reduction intrinsics", false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
|
|
INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
|
|
"Expand reduction intrinsics", false, false)
FunctionPass *llvm::createExpandReductionsPass() {
|
|
return new ExpandReductions();
|
|
}
PreservedAnalyses ExpandReductionsPass::run(Function &F,
|
|
FunctionAnalysisManager &AM) {
|
|
const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
|
|
if (!expandReductions(F, &TTI))
|
|
return PreservedAnalyses::all();
|
|
PreservedAnalyses PA;
|
|
PA.preserveSet<CFGAnalyses>();
|
|
return PA;
|
|
}