llvm-project/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
Felipe Magno de Almeida e3fdc3aa81
[RISCV] Allow hoisting VXRM writes out of loops speculatively (#110044)
Change the intersect operation used by the anticipated dataflow so that it
ignores Unknown states. This effectively allows speculative VXRM writes,
because a VXRM write may now be inserted even when there are branches on
which VXRM is not needed.

This change matters because VXRM writes cause pipeline flushes on some
micro-architectures, so it makes sense to allow more aggressive hoisting
even if it causes some degradation on the slow path.

An example is this code:
```
typedef unsigned char uint8_t;
__attribute__ ((noipa))
void foo (uint8_t *dst,  int i_dst_stride,
           uint8_t *src1, int i_src1_stride,
           uint8_t *src2, int i_src2_stride,
           int i_width, int i_height )
{
   for( int y = 0; y < i_height; y++ )
     {
       for( int x = 0; x < i_width; x++ )
         dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
       dst  += i_dst_stride;
       src1 += i_src1_stride;
       src2 += i_src2_stride;
     }
}
```
With this patch, the VXRM write for the code above is hoisted out of the
outer loop.
2024-11-27 13:31:39 -08:00

505 lines
15 KiB
C++

//===-- RISCVInsertWriteVXRM.cpp - Insert Write of RISC-V VXRM CSR --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass inserts writes to the VXRM CSR as needed by vector instructions.
// Each instruction that uses VXRM carries an operand that contains its required
// VXRM value. This pass tries to optimize placement to avoid redundant writes
// to VXRM.
//
// This is done using 2 dataflow algorithms. The first is a forward data flow
// to calculate where a VXRM value is available. The second is a backwards
// dataflow to determine where a VXRM value is anticipated.
//
// Finally, we use the results of these two dataflows to insert VXRM writes
// where a value is anticipated, but not available.
//
// FIXME: This pass does not split critical edges, so there can still be some
// redundancy.
//
// FIXME: If we are willing to have writes that aren't always needed, we could
// reduce the number of VXRM writes in some cases.
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/RISCVBaseInfo.h"
#include "RISCV.h"
#include "RISCVSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <queue>
using namespace llvm;
#define DEBUG_TYPE "riscv-insert-write-vxrm"
#define RISCV_INSERT_WRITE_VXRM_NAME "RISC-V Insert Write VXRM Pass"
namespace {
/// Dataflow lattice value describing what is known about the VXRM CSR at a
/// program point.
///
/// A value is in exactly one of three states:
///  - Uninitialized: nothing has been recorded yet (isValid() is false).
///  - Static: VXRM holds the immediate stored in VXRMImm (asserted to be <= 3).
///  - Unknown: VXRM holds a value that is not statically known.
class VXRMInfo {
  uint8_t VXRMImm = 0;

  enum : uint8_t {
    Uninitialized,
    Static,
    Unknown,
  } State = Uninitialized;

public:
  VXRMInfo() {}

  /// Convenience factory for a value in the Unknown state.
  static VXRMInfo getUnknown() {
    VXRMInfo Info;
    Info.setUnknown();
    return Info;
  }

  /// True for both Static and Unknown; false only for Uninitialized.
  bool isValid() const { return State != Uninitialized; }
  void setUnknown() { State = Unknown; }
  bool isUnknown() const { return State == Unknown; }
  bool isStatic() const { return State == Static; }

  /// Record a known VXRM immediate and switch to the Static state.
  void setVXRMImm(unsigned Imm) {
    assert(Imm <= 3 && "Unexpected VXRM value");
    VXRMImm = Imm;
    State = Static;
  }

  /// Return the known immediate. Only legal in the Static state.
  unsigned getVXRMImm() const {
    assert(isStatic() && VXRMImm <= 3 && "Unexpected state");
    return VXRMImm;
  }

  bool operator==(const VXRMInfo &Other) const {
    // Uninitialized is only equal to another Uninitialized.
    if (State != Other.State)
      return false;

    if (isStatic())
      return VXRMImm == Other.VXRMImm;

    // The states match and are not Static, so both sides are either
    // Uninitialized or Unknown. Neither state carries a payload, so the values
    // are equal. The assertion must accept Uninitialized as well as Unknown,
    // otherwise comparing two Uninitialized values would trap in
    // assert-enabled builds.
    assert((!isValid() || isUnknown()) && "Unexpected state");
    return true;
  }

  bool operator!=(const VXRMInfo &Other) const { return !(*this == Other); }

  // Calculate the VXRMInfo visible to a block assuming this and Other are
  // both predecessors.
  VXRMInfo intersect(const VXRMInfo &Other) const {
    // If the new value isn't valid, ignore it.
    if (!Other.isValid())
      return *this;

    // If this value isn't valid, this must be the first predecessor, use it.
    if (!isValid())
      return Other;

    // If either is unknown, the result is unknown.
    if (isUnknown() || Other.isUnknown())
      return VXRMInfo::getUnknown();

    // If we have an exact match, return this.
    if (*this == Other)
      return *this;

    // Otherwise the result is unknown.
    return VXRMInfo::getUnknown();
  }

  // Calculate the VXRMInfo anticipated by a block assuming this and Other
  // both come from its successors (this is the meet used by the backward
  // anticipated dataflow). To allow speculatively running WriteVXRM we ignore
  // an Unknown when the other side has a valid Static value. Rationale:
  // WriteVXRM causes a pipeline flush in some uarchs and moving it outside
  // loops is very important for some workloads.
  VXRMInfo intersectAnticipated(const VXRMInfo &Other) const {
    // If the new value isn't valid, ignore it.
    if (!Other.isValid())
      return *this;

    // If this value isn't valid, this must be the first value seen, use it.
    if (!isValid())
      return Other;

    // If either is unknown, the result is the other one.
    if (isUnknown())
      return Other;
    if (Other.isUnknown())
      return *this;

    // If we have an exact match, return this.
    if (*this == Other)
      return *this;

    // Two different Static values conflict, so the result is unknown.
    return VXRMInfo::getUnknown();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Support for debugging, callable in GDB: V->dump()
  LLVM_DUMP_METHOD void dump() const {
    print(dbgs());
    dbgs() << "\n";
  }

  /// Print "{Uninitialized}", "{Unknown}" or "{<imm>}" to OS.
  void print(raw_ostream &OS) const {
    OS << '{';
    if (!isValid())
      OS << "Uninitialized";
    else if (isUnknown())
      OS << "Unknown";
    else
      OS << getVXRMImm();
    OS << '}';
  }
#endif
};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Stream insertion for VXRMInfo; forwards to VXRMInfo::print so the value can
/// be used directly in debug output chains.
LLVM_ATTRIBUTE_USED
inline raw_ostream &operator<<(raw_ostream &Stream, const VXRMInfo &Info) {
  Info.print(Stream);
  return Stream;
}
#endif
/// Per-basic-block state for the VXRM dataflow analyses. Instances live in
/// RISCVInsertWriteVXRM::BlockInfo and are looked up by block number.
struct BlockData {
  // Indicates if the block uses VXRM. Uninitialized means no use. Only the
  // first relevant instruction in the block is recorded here: a Static value
  // for an instruction that needs a VXRM value, Unknown for a call, inline asm
  // or an instruction that modifies VXRM.
  VXRMInfo VXRMUse;

  // Indicates the VXRM output from the block. Uninitialized means transparent.
  VXRMInfo VXRMOut;

  // Keeps track of the available VXRM value at the start of the basic block.
  VXRMInfo AvailableIn;

  // Keeps track of the available VXRM value at the end of the basic block.
  VXRMInfo AvailableOut;

  // Keeps track of what VXRM is anticipated at the start of the basic block.
  VXRMInfo AnticipatedIn;

  // Keeps track of what VXRM is anticipated at the end of the basic block.
  VXRMInfo AnticipatedOut;

  // Keeps track of whether the block is already in the queue. Initialized
  // explicitly so that a default-initialized BlockData never carries an
  // indeterminate flag.
  bool InQueue = false;

  BlockData() = default;
};
/// Machine function pass that inserts WriteVXRMImm instructions ahead of the
/// vector instructions that need a particular VXRM value.
class RISCVInsertWriteVXRM : public MachineFunctionPass {
  // Instruction info of the current function's subtarget. It is assigned at
  // the start of runOnMachineFunction; it starts as nullptr so the member is
  // never left indeterminate.
  const TargetInstrInfo *TII = nullptr;

  // Dataflow state for every basic block, indexed by block number. Only
  // populated while runOnMachineFunction is executing.
  std::vector<BlockData> BlockInfo;

  // Blocks waiting to be (re)processed by the dataflow phases.
  std::queue<const MachineBasicBlock *> WorkList;

public:
  static char ID;

  RISCVInsertWriteVXRM() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  // The pass only inserts instructions and adds operands; it declares the CFG
  // as preserved.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return RISCV_INSERT_WRITE_VXRM_NAME;
  }

private:
  // Phase 1: record the block-local VXRM use/output. Returns true if the block
  // contains an instruction that needs a VXRM value.
  bool computeVXRMChanges(const MachineBasicBlock &MBB);

  // Phase 2 step: recompute the available-in/out values of MBB.
  void computeAvailable(const MachineBasicBlock &MBB);

  // Phase 3 step: recompute the anticipated-out/in values of MBB.
  void computeAnticipated(const MachineFunction &MF,
                          const MachineBasicBlock &MBB);

  // Phase 4: insert the VXRM writes required by MBB.
  void emitWriteVXRM(MachineBasicBlock &MBB);
};
} // end anonymous namespace
// Storage for the pass identifier that the constructor hands to
// MachineFunctionPass.
char RISCVInsertWriteVXRM::ID = 0;
// Register the pass under DEBUG_TYPE with its human-readable name.
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only and is-analysis flags of INITIALIZE_PASS -- confirm.
INITIALIZE_PASS(RISCVInsertWriteVXRM, DEBUG_TYPE, RISCV_INSERT_WRITE_VXRM_NAME,
false, false)
/// Return true if \p MI carries a VXRM operand but should be treated as not
/// depending on VXRM. This is the case only for VNCLIP_WI / VNCLIPU_WI whose
/// operand 3 is the immediate 0.
/// NOTE(review): operand 3 is presumably the shift amount, a zero shift
/// needing no rounding -- confirm against the instruction definitions.
static bool ignoresVXRM(const MachineInstr &MI) {
  const auto MCOpcode = RISCV::getRVVMCOpcode(MI.getOpcode());

  // Every other opcode is assumed to honour its VXRM operand.
  if (MCOpcode != RISCV::VNCLIP_WI && MCOpcode != RISCV::VNCLIPU_WI)
    return false;

  return MI.getOperand(3).getImm() == 0;
}
bool RISCVInsertWriteVXRM::computeVXRMChanges(const MachineBasicBlock &MBB) {
BlockData &BBInfo = BlockInfo[MBB.getNumber()];
bool NeedVXRMWrite = false;
for (const MachineInstr &MI : MBB) {
int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc());
if (VXRMIdx >= 0 && !ignoresVXRM(MI)) {
unsigned NewVXRMImm = MI.getOperand(VXRMIdx).getImm();
if (!BBInfo.VXRMUse.isValid())
BBInfo.VXRMUse.setVXRMImm(NewVXRMImm);
BBInfo.VXRMOut.setVXRMImm(NewVXRMImm);
NeedVXRMWrite = true;
continue;
}
if (MI.isCall() || MI.isInlineAsm() ||
MI.modifiesRegister(RISCV::VXRM, /*TRI=*/nullptr)) {
if (!BBInfo.VXRMUse.isValid())
BBInfo.VXRMUse.setUnknown();
BBInfo.VXRMOut.setUnknown();
}
}
return NeedVXRMWrite;
}
void RISCVInsertWriteVXRM::computeAvailable(const MachineBasicBlock &MBB) {
BlockData &BBInfo = BlockInfo[MBB.getNumber()];
BBInfo.InQueue = false;
VXRMInfo Available;
if (MBB.pred_empty()) {
Available.setUnknown();
} else {
for (const MachineBasicBlock *P : MBB.predecessors())
Available = Available.intersect(BlockInfo[P->getNumber()].AvailableOut);
}
// If we don't have any valid available info, wait until we do.
if (!Available.isValid())
return;
if (Available != BBInfo.AvailableIn) {
BBInfo.AvailableIn = Available;
LLVM_DEBUG(dbgs() << "AvailableIn state of " << printMBBReference(MBB)
<< " changed to " << BBInfo.AvailableIn << "\n");
}
if (BBInfo.VXRMOut.isValid())
Available = BBInfo.VXRMOut;
if (Available == BBInfo.AvailableOut)
return;
BBInfo.AvailableOut = Available;
LLVM_DEBUG(dbgs() << "AvailableOut state of " << printMBBReference(MBB)
<< " changed to " << BBInfo.AvailableOut << "\n");
// Add the successors to the work list so that we can propagate.
for (MachineBasicBlock *S : MBB.successors()) {
if (!BlockInfo[S->getNumber()].InQueue) {
BlockInfo[S->getNumber()].InQueue = true;
WorkList.push(S);
}
}
}
void RISCVInsertWriteVXRM::computeAnticipated(const MachineFunction &MF, const MachineBasicBlock &MBB) {
BlockData &BBInfo = BlockInfo[MBB.getNumber()];
const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
BBInfo.InQueue = false;
VXRMInfo Anticipated;
if (MBB.succ_empty()) {
Anticipated.setUnknown();
} else {
for (const MachineBasicBlock *S : MBB.successors())
if (ST.hasVXRMPipelineFlush())
Anticipated =
Anticipated.intersectAnticipated(BlockInfo[S->getNumber()].AnticipatedIn);
else
Anticipated =
Anticipated.intersect(BlockInfo[S->getNumber()].AnticipatedIn);
}
// If we don't have any valid anticipated info, wait until we do.
if (!Anticipated.isValid())
return;
if (Anticipated != BBInfo.AnticipatedOut) {
BBInfo.AnticipatedOut = Anticipated;
LLVM_DEBUG(dbgs() << "AnticipatedOut state of " << printMBBReference(MBB)
<< " changed to " << BBInfo.AnticipatedOut << "\n");
}
// If this block reads VXRM, copy it.
if (BBInfo.VXRMUse.isValid())
Anticipated = BBInfo.VXRMUse;
if (Anticipated == BBInfo.AnticipatedIn)
return;
BBInfo.AnticipatedIn = Anticipated;
LLVM_DEBUG(dbgs() << "AnticipatedIn state of " << printMBBReference(MBB)
<< " changed to " << BBInfo.AnticipatedIn << "\n");
// Add the predecessors to the work list so that we can propagate.
for (MachineBasicBlock *P : MBB.predecessors()) {
if (!BlockInfo[P->getNumber()].InQueue) {
BlockInfo[P->getNumber()].InQueue = true;
WorkList.push(P);
}
}
}
// Phase 4: insert WriteVXRMImm instructions into MBB using the available and
// anticipated dataflow results, and add an implicit use of the VXRM register
// to every instruction in the block that needs a VXRM value.
//
// A write can be placed in two spots: immediately before an instruction whose
// required value is not known to be in effect, and before the first
// terminator when the value anticipated on exit is not yet in effect.
void RISCVInsertWriteVXRM::emitWriteVXRM(MachineBasicBlock &MBB) {
const BlockData &BBInfo = BlockInfo[MBB.getNumber()];
// Info tracks the VXRM value considered to be in effect at the current point
// of the walk; it starts from what is available on entry.
VXRMInfo Info = BBInfo.AvailableIn;
// Flag indicating that we need to insert a VXRM write. We want to delay it
// as late as possible in this block.
bool PendingInsert = false;
// Insert VXRM write if anticipated and not available.
if (BBInfo.AnticipatedIn.isStatic()) {
// If this is the entry block and the value is anticipated, insert.
if (MBB.isEntryBlock()) {
PendingInsert = true;
} else {
// Search for any predecessors that wouldn't satisfy our requirement and
// insert a write VXRM if needed.
// NOTE: If one predecessor is able to provide the requirement, but
// another isn't, it means we have a critical edge. The better placement
// would be to split the critical edge.
for (MachineBasicBlock *P : MBB.predecessors()) {
const BlockData &PInfo = BlockInfo[P->getNumber()];
// If it's available out of the predecessor, then we're ok.
if (PInfo.AvailableOut.isStatic() &&
PInfo.AvailableOut.getVXRMImm() ==
BBInfo.AnticipatedIn.getVXRMImm())
continue;
// If the predecessor anticipates this value for all its successors,
// then a write to VXRM would have already occurred before this block is
// executed.
if (PInfo.AnticipatedOut.isStatic() &&
PInfo.AnticipatedOut.getVXRMImm() ==
BBInfo.AnticipatedIn.getVXRMImm())
continue;
// This predecessor cannot guarantee the value, so this block must write it.
PendingInsert = true;
break;
}
}
// From here on the anticipated value is treated as being in effect: either
// every predecessor provides it or a write is pending in this block.
Info = BBInfo.AnticipatedIn;
}
for (MachineInstr &MI : MBB) {
int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc());
if (VXRMIdx >= 0 && !ignoresVXRM(MI)) {
unsigned NewVXRMImm = MI.getOperand(VXRMIdx).getImm();
// Write VXRM right before this instruction if a write is still pending or
// the tracked value differs from what the instruction requires.
if (PendingInsert || !Info.isStatic() ||
Info.getVXRMImm() != NewVXRMImm) {
// A pending write must be for exactly the value this first user needs.
assert((!PendingInsert ||
(Info.isStatic() && Info.getVXRMImm() == NewVXRMImm)) &&
"Pending VXRM insertion mismatch");
LLVM_DEBUG(dbgs() << "Inserting before "; MI.print(dbgs()));
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(RISCV::WriteVXRMImm))
.addImm(NewVXRMImm);
PendingInsert = false;
}
// Record the dependency on VXRM as an implicit register use.
MI.addOperand(MachineOperand::CreateReg(RISCV::VXRM, /*IsDef*/ false,
/*IsImp*/ true));
Info.setVXRMImm(NewVXRMImm);
continue;
}
// Calls, inline asm and explicit VXRM modifications invalidate the tracked
// value.
if (MI.isCall() || MI.isInlineAsm() ||
MI.modifiesRegister(RISCV::VXRM, /*TRI=*/nullptr))
Info.setUnknown();
}
// If all our successors anticipate a value, do the insert.
// NOTE: It's possible that not all predecessors of our successor provide the
// correct value. This can occur on critical edges. If we don't split the
// critical edge we'll also have a write vxrm in the successor that is
// redundant with this one.
if (PendingInsert ||
(BBInfo.AnticipatedOut.isStatic() &&
(!Info.isStatic() ||
Info.getVXRMImm() != BBInfo.AnticipatedOut.getVXRMImm()))) {
// A write still pending here means no instruction in the block consumed it,
// so the tracked value must match what is anticipated on exit.
assert((!PendingInsert ||
(Info.isStatic() && BBInfo.AnticipatedOut.isStatic() &&
Info.getVXRMImm() == BBInfo.AnticipatedOut.getVXRMImm())) &&
"Pending VXRM insertion mismatch");
LLVM_DEBUG(dbgs() << "Inserting at end of " << printMBBReference(MBB)
<< " changing to " << BBInfo.AnticipatedOut << "\n");
// Place the write before the first terminator of the block.
BuildMI(MBB, MBB.getFirstTerminator(), DebugLoc(),
TII->get(RISCV::WriteVXRMImm))
.addImm(BBInfo.AnticipatedOut.getVXRMImm());
}
}
/// Pass entry point. Runs the four phases (local scan, forward "available"
/// dataflow, backward "anticipated" dataflow, emission) over \p MF.
///
/// \returns false if the subtarget has no vector instructions or no
/// instruction in the function needs a VXRM value; true otherwise.
bool RISCVInsertWriteVXRM::runOnMachineFunction(MachineFunction &MF) {
  // Skip if the vector extension is not enabled.
  const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
  if (!Subtarget.hasVInstructions())
    return false;

  TII = Subtarget.getInstrInfo();

  assert(BlockInfo.empty() && "Expect empty block infos");
  BlockInfo.resize(MF.getNumBlockIDs());

  // Phase 1 - collect block information. Every block is scanned even after
  // the first VXRM user has been found.
  bool AnyVXRMUser = false;
  for (const MachineBasicBlock &Block : MF) {
    const bool BlockHasUser = computeVXRMChanges(Block);
    AnyVXRMUser = AnyVXRMUser || BlockHasUser;
  }

  if (!AnyVXRMUser) {
    BlockInfo.clear();
    return false;
  }

  // Seeds the work list with a block and marks it as queued.
  auto Enqueue = [this](const MachineBasicBlock &Block) {
    BlockInfo[Block.getNumber()].InQueue = true;
    WorkList.push(&Block);
  };

  // Phase 2 - Compute available VXRM using a forward walk.
  for (const MachineBasicBlock &Block : MF)
    Enqueue(Block);
  while (!WorkList.empty()) {
    const MachineBasicBlock *Next = WorkList.front();
    WorkList.pop();
    computeAvailable(*Next);
  }

  // Phase 3 - Compute anticipated VXRM using a backwards walk.
  for (const MachineBasicBlock &Block : llvm::reverse(MF))
    Enqueue(Block);
  while (!WorkList.empty()) {
    const MachineBasicBlock *Next = WorkList.front();
    WorkList.pop();
    computeAnticipated(MF, *Next);
  }

  // Phase 4 - Emit VXRM writes at the earliest place possible.
  for (MachineBasicBlock &Block : MF)
    emitWriteVXRM(Block);

  BlockInfo.clear();
  return true;
}
// Factory for the pass: returns a newly allocated RISCVInsertWriteVXRM.
// NOTE(review): ownership of the returned pointer presumably passes to the
// caller (typically the pass manager) -- confirm at the call sites.
FunctionPass *llvm::createRISCVInsertWriteVXRMPass() {
return new RISCVInsertWriteVXRM();
}