mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-26 20:26:06 +00:00
711 lines
27 KiB
C++
711 lines
27 KiB
C++
//======----------- WindowScheduler.cpp - window scheduler -------------======//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// An implementation of the Window Scheduling software pipelining algorithm.
|
|
//
|
|
// The fundamental concept of the window scheduling algorithm involves folding
|
|
// the original MBB at a specific position, followed by list scheduling on the
|
|
// folded MIs. The optimal scheduling result is then chosen from various folding
|
|
// positions as the final scheduling outcome.
|
|
//
|
|
// The primary challenge in this algorithm lies in generating the folded MIs and
|
|
// establishing their dependencies. We have innovatively employed a new MBB,
|
|
// created by copying the original MBB three times, known as TripleMBB. This
|
|
// TripleMBB enables the convenient implementation of MI folding and dependency
|
|
// establishment. To facilitate the algorithm's implementation, we have also
|
|
// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
|
|
//
|
|
// Another challenge in the algorithm is the scheduling of phis. Semantically,
|
|
// it is difficult to place the phis in the window and perform list scheduling.
|
|
// Therefore, we schedule these phis separately after each list scheduling.
|
|
//
|
|
// The provided implementation is designed for use before the Register Allocator
|
|
// (RA). If the target requires implementation after RA, it is recommended to
|
|
// reimplement analyseII(), schedulePhi(), and expand(). Additionally,
|
|
// target-specific logic can be added in initialize(), preProcess(), and
|
|
// postProcess().
|
|
//
|
|
// Lastly, it is worth mentioning that getSearchIndexes() is an important
|
|
// function. We have experimented with more complex heuristics on downstream
|
|
// target and achieved favorable results.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
#include "llvm/CodeGen/WindowScheduler.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
#include "llvm/CodeGen/LiveIntervals.h"
|
|
#include "llvm/CodeGen/MachineLoopInfo.h"
|
|
#include "llvm/CodeGen/MachinePipeliner.h"
|
|
#include "llvm/CodeGen/ModuloSchedule.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/TimeProfiler.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "pipeliner"
|
|
|
|
namespace {
|
|
STATISTIC(NumTryWindowSchedule,
|
|
"Number of loops that we attempt to use window scheduling");
|
|
STATISTIC(NumTryWindowSearch,
|
|
"Number of times that we run list schedule in the window scheduling");
|
|
STATISTIC(NumWindowSchedule,
|
|
"Number of loops that we successfully use window scheduling");
|
|
STATISTIC(NumFailAnalyseII,
|
|
"Window scheduling abort due to the failure of the II analysis");
|
|
|
|
cl::opt<unsigned>
|
|
WindowSearchNum("window-search-num",
|
|
cl::desc("The number of searches per loop in the window "
|
|
"algorithm. 0 means no search number limit."),
|
|
cl::Hidden, cl::init(6));
|
|
|
|
cl::opt<unsigned> WindowSearchRatio(
|
|
"window-search-ratio",
|
|
cl::desc("The ratio of searches per loop in the window algorithm. 100 "
|
|
"means search all positions in the loop, while 0 means not "
|
|
"performing any search."),
|
|
cl::Hidden, cl::init(40));
|
|
|
|
cl::opt<unsigned> WindowIICoeff(
|
|
"window-ii-coeff",
|
|
cl::desc(
|
|
"The coefficient used when initializing II in the window algorithm."),
|
|
cl::Hidden, cl::init(5));
|
|
|
|
cl::opt<unsigned> WindowRegionLimit(
|
|
"window-region-limit",
|
|
cl::desc(
|
|
"The lower limit of the scheduling region in the window algorithm."),
|
|
cl::Hidden, cl::init(3));
|
|
|
|
cl::opt<unsigned> WindowDiffLimit(
|
|
"window-diff-limit",
|
|
cl::desc("The lower limit of the difference between best II and base II in "
|
|
"the window algorithm. If the difference is smaller than "
|
|
"this lower limit, window scheduling will not be performed."),
|
|
cl::Hidden, cl::init(2));
|
|
} // namespace
|
|
|
|
// WindowIILimit serves as an indicator of abnormal scheduling results and could
|
|
// potentially be referenced by the derived target window scheduler.
|
|
static cl::opt<unsigned>
|
|
WindowIILimit("window-ii-limit",
|
|
cl::desc("The upper limit of II in the window algorithm."),
|
|
cl::Hidden, cl::init(1000));
|
|
|
|
WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
|
|
: Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
|
|
Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
|
|
TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
|
|
TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
|
|
createMachineScheduler(/*OnlyBuildGraph=*/true));
|
|
}
|
|
|
|
bool WindowScheduler::run() {
|
|
if (!initialize()) {
|
|
LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
|
|
return false;
|
|
}
|
|
// The window algorithm is time-consuming, and its compilation time should be
|
|
// taken into consideration.
|
|
TimeTraceScope Scope("WindowSearch");
|
|
++NumTryWindowSchedule;
|
|
// Performing the relevant processing before window scheduling.
|
|
preProcess();
|
|
// The main window scheduling begins.
|
|
std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
|
|
auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio);
|
|
for (unsigned Idx : SearchIndexes) {
|
|
OriToCycle.clear();
|
|
++NumTryWindowSearch;
|
|
// The scheduling starts with non-phi instruction, so SchedPhiNum needs to
|
|
// be added to Idx.
|
|
unsigned Offset = Idx + SchedPhiNum;
|
|
auto Range = getScheduleRange(Offset, SchedInstrNum);
|
|
SchedDAG->startBlock(MBB);
|
|
SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum);
|
|
SchedDAG->schedule();
|
|
LLVM_DEBUG(SchedDAG->dump());
|
|
unsigned II = analyseII(*SchedDAG, Offset);
|
|
if (II == WindowIILimit) {
|
|
restoreTripleMBB();
|
|
LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
|
|
++NumFailAnalyseII;
|
|
continue;
|
|
}
|
|
schedulePhi(Offset, II);
|
|
updateScheduleResult(Offset, II);
|
|
restoreTripleMBB();
|
|
LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
|
|
<< II << ".\n");
|
|
}
|
|
// Performing the relevant processing after window scheduling.
|
|
postProcess();
|
|
// Check whether the scheduling result is valid.
|
|
if (!isScheduleValid()) {
|
|
LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
|
|
return false;
|
|
}
|
|
LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
|
|
<< " and Best II is " << BestII << ".\n");
|
|
// Expand the scheduling result to prologue, kernel, and epilogue.
|
|
expand();
|
|
++NumWindowSchedule;
|
|
return true;
|
|
}
|
|
|
|
ScheduleDAGInstrs *
|
|
WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
|
|
return OnlyBuildGraph
|
|
? new ScheduleDAGMI(
|
|
Context, std::make_unique<PostGenericScheduler>(Context),
|
|
true)
|
|
: Context->TM->createMachineScheduler(Context);
|
|
}
|
|
|
|
bool WindowScheduler::initialize() {
|
|
if (!Subtarget->enableWindowScheduler()) {
|
|
LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
|
|
return false;
|
|
}
|
|
// Initialized the member variables used by window algorithm.
|
|
OriMIs.clear();
|
|
TriMIs.clear();
|
|
TriToOri.clear();
|
|
OriToCycle.clear();
|
|
SchedResult.clear();
|
|
SchedPhiNum = 0;
|
|
SchedInstrNum = 0;
|
|
BestII = UINT_MAX;
|
|
BestOffset = 0;
|
|
BaseII = 0;
|
|
// List scheduling used in the window algorithm depends on LiveIntervals.
|
|
if (!Context->LIS) {
|
|
LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
|
|
return false;
|
|
}
|
|
// Check each MI in MBB.
|
|
SmallSet<Register, 8> PrevDefs;
|
|
SmallSet<Register, 8> PrevUses;
|
|
auto IsLoopCarried = [&](MachineInstr &Phi) {
|
|
// Two cases are checked here: (1)The virtual register defined by the
|
|
// preceding phi is used by the succeeding phi;(2)The preceding phi uses the
|
|
// virtual register defined by the succeeding phi.
|
|
if (PrevUses.count(Phi.getOperand(0).getReg()))
|
|
return true;
|
|
PrevDefs.insert(Phi.getOperand(0).getReg());
|
|
for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
|
|
if (PrevDefs.count(Phi.getOperand(I).getReg()))
|
|
return true;
|
|
PrevUses.insert(Phi.getOperand(I).getReg());
|
|
}
|
|
return false;
|
|
};
|
|
auto PLI = TII->analyzeLoopForPipelining(MBB);
|
|
for (auto &MI : *MBB) {
|
|
if (MI.isMetaInstruction() || MI.isTerminator())
|
|
continue;
|
|
if (MI.isPHI()) {
|
|
if (IsLoopCarried(MI)) {
|
|
LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n");
|
|
return false;
|
|
}
|
|
++SchedPhiNum;
|
|
++BestOffset;
|
|
} else
|
|
++SchedInstrNum;
|
|
if (TII->isSchedulingBoundary(MI, MBB, *MF)) {
|
|
LLVM_DEBUG(
|
|
dbgs() << "Boundary MI is not allowed in window scheduling!\n");
|
|
return false;
|
|
}
|
|
if (PLI->shouldIgnoreForPipelining(&MI)) {
|
|
LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
|
|
"window scheduling!\n");
|
|
return false;
|
|
}
|
|
for (auto &Def : MI.all_defs())
|
|
if (Def.isReg() && Def.getReg().isPhysical()) {
|
|
LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
|
|
"window scheduling!\n");
|
|
return false;
|
|
}
|
|
}
|
|
if (SchedInstrNum <= WindowRegionLimit) {
|
|
LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void WindowScheduler::preProcess() {
|
|
// Prior to window scheduling, it's necessary to backup the original MBB,
|
|
// generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
|
|
backupMBB();
|
|
generateTripleMBB();
|
|
TripleDAG->startBlock(MBB);
|
|
TripleDAG->enterRegion(
|
|
MBB, MBB->begin(), MBB->getFirstTerminator(),
|
|
std::distance(MBB->begin(), MBB->getFirstTerminator()));
|
|
TripleDAG->buildSchedGraph(Context->AA);
|
|
}
|
|
|
|
void WindowScheduler::postProcess() {
|
|
// After window scheduling, it's necessary to clear the TripleDAG and restore
|
|
// to the original MBB.
|
|
TripleDAG->exitRegion();
|
|
TripleDAG->finishBlock();
|
|
restoreMBB();
|
|
}
|
|
|
|
void WindowScheduler::backupMBB() {
|
|
for (auto &MI : MBB->instrs())
|
|
OriMIs.push_back(&MI);
|
|
// Remove MIs and the corresponding live intervals.
|
|
for (auto &MI : make_early_inc_range(*MBB)) {
|
|
Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
|
|
MBB->remove(&MI);
|
|
}
|
|
}
|
|
|
|
void WindowScheduler::restoreMBB() {
|
|
// Erase MIs and the corresponding live intervals.
|
|
for (auto &MI : make_early_inc_range(*MBB)) {
|
|
Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
|
|
MI.eraseFromParent();
|
|
}
|
|
// Restore MBB to the state before window scheduling.
|
|
llvm::append_range(*MBB, OriMIs);
|
|
updateLiveIntervals();
|
|
}
|
|
|
|
void WindowScheduler::generateTripleMBB() {
|
|
const unsigned DuplicateNum = 3;
|
|
TriMIs.clear();
|
|
TriToOri.clear();
|
|
assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
|
|
// Step 1: Performing the first copy of MBB instructions, excluding
|
|
// terminators. At the same time, we back up the anti-register of phis.
|
|
// DefPairs hold the old and new define register pairs.
|
|
DenseMap<Register, Register> DefPairs;
|
|
for (auto *MI : OriMIs) {
|
|
if (MI->isMetaInstruction() || MI->isTerminator())
|
|
continue;
|
|
if (MI->isPHI())
|
|
if (Register AntiReg = getAntiRegister(MI))
|
|
DefPairs[MI->getOperand(0).getReg()] = AntiReg;
|
|
auto *NewMI = MF->CloneMachineInstr(MI);
|
|
MBB->push_back(NewMI);
|
|
TriMIs.push_back(NewMI);
|
|
TriToOri[NewMI] = MI;
|
|
}
|
|
// Step 2: Performing the remaining two copies of MBB instructions excluding
|
|
// phis, and the last one contains terminators. At the same time, registers
|
|
// are updated accordingly.
|
|
for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
|
|
for (auto *MI : OriMIs) {
|
|
if (MI->isPHI() || MI->isMetaInstruction() ||
|
|
(MI->isTerminator() && Cnt < DuplicateNum - 1))
|
|
continue;
|
|
auto *NewMI = MF->CloneMachineInstr(MI);
|
|
DenseMap<Register, Register> NewDefs;
|
|
// New defines are updated.
|
|
for (auto MO : NewMI->all_defs())
|
|
if (MO.isReg() && MO.getReg().isVirtual()) {
|
|
Register NewDef =
|
|
MRI->createVirtualRegister(MRI->getRegClass(MO.getReg()));
|
|
NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI);
|
|
NewDefs[MO.getReg()] = NewDef;
|
|
}
|
|
// New uses are updated.
|
|
for (auto DefRegPair : DefPairs)
|
|
if (NewMI->readsRegister(DefRegPair.first, TRI)) {
|
|
Register NewUse = DefRegPair.second;
|
|
// Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
|
|
//
|
|
// BB.3: DefPairs
|
|
// ==================================
|
|
// %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7)
|
|
// ...
|
|
// ==================================
|
|
// ...
|
|
// %4 = sub i32 %1, %3
|
|
// ...
|
|
// %7 = add i32 %5, %6
|
|
// ...
|
|
// ----------------------------------
|
|
// ...
|
|
// %8 = sub i32 %7, %3 (%1,%7),(%4,%8)
|
|
// ...
|
|
// %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9)
|
|
// ...
|
|
// ----------------------------------
|
|
// ...
|
|
// %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9)
|
|
// ... ^
|
|
// %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11)
|
|
// ...
|
|
// ==================================
|
|
// < Terminators >
|
|
// ==================================
|
|
if (auto It = DefPairs.find(NewUse); It != DefPairs.end())
|
|
NewUse = It->second;
|
|
NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI);
|
|
}
|
|
// DefPairs is updated at last.
|
|
for (auto &NewDef : NewDefs)
|
|
DefPairs[NewDef.first] = NewDef.second;
|
|
MBB->push_back(NewMI);
|
|
TriMIs.push_back(NewMI);
|
|
TriToOri[NewMI] = MI;
|
|
}
|
|
}
|
|
// Step 3: The registers used by phis are updated, and they are generated in
|
|
// the third copy of MBB.
|
|
// In the privious example, the old phi is:
|
|
// %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
|
|
// The new phi is:
|
|
// %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
|
|
for (auto &Phi : MBB->phis()) {
|
|
for (auto DefRegPair : DefPairs)
|
|
if (Phi.readsRegister(DefRegPair.first, TRI))
|
|
Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI);
|
|
}
|
|
updateLiveIntervals();
|
|
}
|
|
|
|
void WindowScheduler::restoreTripleMBB() {
|
|
// After list scheduling, the MBB is restored in one traversal.
|
|
for (size_t I = 0; I < TriMIs.size(); ++I) {
|
|
auto *MI = TriMIs[I];
|
|
auto OldPos = MBB->begin();
|
|
std::advance(OldPos, I);
|
|
auto CurPos = MI->getIterator();
|
|
if (CurPos != OldPos) {
|
|
MBB->splice(OldPos, MBB, CurPos);
|
|
Context->LIS->handleMove(*MI, /*UpdateFlags=*/false);
|
|
}
|
|
}
|
|
}
|
|
|
|
SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
|
|
unsigned SearchRatio) {
|
|
// We use SearchRatio to get the index range, and then evenly get the indexes
|
|
// according to the SearchNum. This is a simple huristic. Depending on the
|
|
// characteristics of the target, more complex algorithms can be used for both
|
|
// performance and compilation time.
|
|
assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!");
|
|
unsigned MaxIdx = SchedInstrNum * SearchRatio / 100;
|
|
unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1;
|
|
SmallVector<unsigned> SearchIndexes;
|
|
for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step)
|
|
SearchIndexes.push_back(Idx);
|
|
return SearchIndexes;
|
|
}
|
|
|
|
int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
|
|
// Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
|
|
unsigned MaxDepth = 1;
|
|
for (auto &SU : DAG.SUnits)
|
|
MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth);
|
|
return MaxDepth * WindowIICoeff;
|
|
}
|
|
|
|
int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
|
|
unsigned Offset) {
|
|
int InitII = getEstimatedII(DAG);
|
|
ResourceManager RM(Subtarget, &DAG);
|
|
RM.init(InitII);
|
|
// ResourceManager and DAG are used to calculate the maximum cycle for the
|
|
// scheduled MIs. Since MIs in the Region have already been scheduled, the
|
|
// emit cycles can be estimated in order here.
|
|
int CurCycle = 0;
|
|
auto Range = getScheduleRange(Offset, SchedInstrNum);
|
|
for (auto &MI : Range) {
|
|
auto *SU = DAG.getSUnit(&MI);
|
|
int ExpectCycle = CurCycle;
|
|
// The predecessors of current MI determine its earliest issue cycle.
|
|
for (auto &Pred : SU->Preds) {
|
|
if (Pred.isWeak())
|
|
continue;
|
|
auto *PredMI = Pred.getSUnit()->getInstr();
|
|
int PredCycle = getOriCycle(PredMI);
|
|
ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
|
|
}
|
|
// Zero cost instructions do not need to check resource.
|
|
if (!TII->isZeroCost(MI.getOpcode())) {
|
|
// ResourceManager can be used to detect resource conflicts between the
|
|
// current MI and the previously inserted MIs.
|
|
while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
|
|
++CurCycle;
|
|
if (CurCycle == (int)WindowIILimit)
|
|
return CurCycle;
|
|
}
|
|
RM.reserveResources(*SU, CurCycle);
|
|
}
|
|
OriToCycle[getOriMI(&MI)] = CurCycle;
|
|
LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
|
|
<< getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
|
|
}
|
|
LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
|
|
return CurCycle;
|
|
}
|
|
|
|
// By utilizing TripleDAG, we can easily establish dependencies between A and B.
|
|
// Based on the MaxCycle and the issue cycle of A and B, we can determine
|
|
// whether it is necessary to add a stall cycle. This is because, without
|
|
// inserting the stall cycle, the latency constraint between A and B cannot be
|
|
// satisfied. The details are as follows:
|
|
//
|
|
// New MBB:
|
|
// ========================================
|
|
// < Phis >
|
|
// ======================================== (sliding direction)
|
|
// MBB copy 1 |
|
|
// V
|
|
//
|
|
// ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window-----
|
|
// | |
|
|
// ===================V==================== |
|
|
// MBB copy 2 < MI B > |
|
|
// |
|
|
// < MI A > V
|
|
// ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------
|
|
// :
|
|
// ===================V====================
|
|
// MBB copy 3 < MI B'>
|
|
//
|
|
//
|
|
//
|
|
//
|
|
// ========================================
|
|
// < Terminators >
|
|
// ========================================
|
|
int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
|
|
int MaxStallCycle = 0;
|
|
int CurrentII = MaxCycle + 1;
|
|
auto Range = getScheduleRange(Offset, SchedInstrNum);
|
|
for (auto &MI : Range) {
|
|
auto *SU = TripleDAG->getSUnit(&MI);
|
|
int DefCycle = getOriCycle(&MI);
|
|
for (auto &Succ : SU->Succs) {
|
|
if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
|
|
continue;
|
|
// If the expected cycle does not exceed CurrentII, no check is needed.
|
|
if (DefCycle + (int)Succ.getLatency() <= CurrentII)
|
|
continue;
|
|
// If the cycle of the scheduled MI A is less than that of the scheduled
|
|
// MI B, the scheduling will fail because the lifetime of the
|
|
// corresponding register exceeds II.
|
|
auto *SuccMI = Succ.getSUnit()->getInstr();
|
|
int UseCycle = getOriCycle(SuccMI);
|
|
if (DefCycle < UseCycle)
|
|
return WindowIILimit;
|
|
// Get the stall cycle introduced by the register between two trips.
|
|
int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
|
|
MaxStallCycle = std::max(MaxStallCycle, StallCycle);
|
|
}
|
|
}
|
|
LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
|
|
return MaxStallCycle;
|
|
}
|
|
|
|
unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
|
|
LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
|
|
int MaxCycle = calculateMaxCycle(DAG, Offset);
|
|
if (MaxCycle == (int)WindowIILimit)
|
|
return MaxCycle;
|
|
int StallCycle = calculateStallCycle(Offset, MaxCycle);
|
|
if (StallCycle == (int)WindowIILimit)
|
|
return StallCycle;
|
|
// The value of II is equal to the maximum execution cycle plus 1.
|
|
return MaxCycle + StallCycle + 1;
|
|
}
|
|
|
|
void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
|
|
LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
|
|
for (auto &Phi : MBB->phis()) {
|
|
int LateCycle = INT_MAX;
|
|
auto *SU = TripleDAG->getSUnit(&Phi);
|
|
for (auto &Succ : SU->Succs) {
|
|
// Phi doesn't have any Anti successors.
|
|
if (Succ.getKind() != SDep::Data)
|
|
continue;
|
|
// Phi is scheduled before the successor of stage 0. The issue cycle of
|
|
// phi is the latest cycle in this interval.
|
|
auto *SuccMI = Succ.getSUnit()->getInstr();
|
|
int Cycle = getOriCycle(SuccMI);
|
|
if (getOriStage(getOriMI(SuccMI), Offset) == 0)
|
|
LateCycle = std::min(LateCycle, Cycle);
|
|
}
|
|
// The anti-dependency of phi need to be handled separately in the same way.
|
|
if (Register AntiReg = getAntiRegister(&Phi)) {
|
|
auto *AntiMI = MRI->getVRegDef(AntiReg);
|
|
// AntiReg may be defined outside the kernel MBB.
|
|
if (AntiMI->getParent() == MBB) {
|
|
auto AntiCycle = getOriCycle(AntiMI);
|
|
if (getOriStage(getOriMI(AntiMI), Offset) == 0)
|
|
LateCycle = std::min(LateCycle, AntiCycle);
|
|
}
|
|
}
|
|
// If there is no limit to the late cycle, a default value is given.
|
|
if (LateCycle == INT_MAX)
|
|
LateCycle = (int)(II - 1);
|
|
LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
|
|
// The issue cycle of phi is set to the latest cycle in the interval.
|
|
auto *OriPhi = getOriMI(&Phi);
|
|
OriToCycle[OriPhi] = LateCycle;
|
|
}
|
|
}
|
|
|
|
DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset,
|
|
unsigned II) {
|
|
// At each issue cycle, phi is placed before MIs in stage 0. So the simplest
|
|
// way is to put phi at the beginning of the current cycle.
|
|
DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
|
|
auto Range = getScheduleRange(Offset, SchedInstrNum);
|
|
for (auto &Phi : MBB->phis())
|
|
CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi));
|
|
for (auto &MI : Range)
|
|
CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI));
|
|
// Each MI is assigned a separate ordered Id, which is used as a sort marker
|
|
// in the following expand process.
|
|
DenseMap<MachineInstr *, int> IssueOrder;
|
|
int Id = 0;
|
|
for (int Cycle = 0; Cycle < (int)II; ++Cycle) {
|
|
auto It = CycleToMIs.find(Cycle);
|
|
if (It == CycleToMIs.end())
|
|
continue;
|
|
for (auto *MI : It->second)
|
|
IssueOrder[MI] = Id++;
|
|
}
|
|
return IssueOrder;
|
|
}
|
|
|
|
void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
|
|
// At the first update, Offset is equal to SchedPhiNum. At this time, only
|
|
// BestII, BestOffset, and BaseII need to be updated.
|
|
if (Offset == SchedPhiNum) {
|
|
BestII = II;
|
|
BestOffset = SchedPhiNum;
|
|
BaseII = II;
|
|
return;
|
|
}
|
|
// The update will only continue if the II is smaller than BestII and the II
|
|
// is sufficiently small.
|
|
if ((II >= BestII) || (II + WindowDiffLimit > BaseII))
|
|
return;
|
|
BestII = II;
|
|
BestOffset = Offset;
|
|
// Record the result of the current list scheduling, noting that each MI is
|
|
// stored unordered in SchedResult.
|
|
SchedResult.clear();
|
|
auto IssueOrder = getIssueOrder(Offset, II);
|
|
for (auto &Pair : OriToCycle) {
|
|
assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
|
|
SchedResult.push_back(std::make_tuple(Pair.first, Pair.second,
|
|
getOriStage(Pair.first, Offset),
|
|
IssueOrder[Pair.first]));
|
|
}
|
|
}
|
|
|
|
void WindowScheduler::expand() {
|
|
// The MIs in the SchedResult are sorted by the issue order ID.
|
|
llvm::stable_sort(SchedResult,
|
|
[](const std::tuple<MachineInstr *, int, int, int> &A,
|
|
const std::tuple<MachineInstr *, int, int, int> &B) {
|
|
return std::get<3>(A) < std::get<3>(B);
|
|
});
|
|
// Use the scheduling infrastructure for expansion, noting that InstrChanges
|
|
// is not supported here.
|
|
DenseMap<MachineInstr *, int> Cycles, Stages;
|
|
std::vector<MachineInstr *> OrderedInsts;
|
|
for (auto &Info : SchedResult) {
|
|
auto *MI = std::get<0>(Info);
|
|
OrderedInsts.push_back(MI);
|
|
Cycles[MI] = std::get<1>(Info);
|
|
Stages[MI] = std::get<2>(Info);
|
|
LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
|
|
<< "]: " << *MI);
|
|
}
|
|
ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
|
|
std::move(Stages));
|
|
ModuloScheduleExpander MSE(*MF, MS, *Context->LIS,
|
|
ModuloScheduleExpander::InstrChangesTy());
|
|
MSE.expand();
|
|
MSE.cleanup();
|
|
}
|
|
|
|
void WindowScheduler::updateLiveIntervals() {
|
|
SmallVector<Register, 128> UsedRegs;
|
|
for (MachineInstr &MI : *MBB)
|
|
for (const MachineOperand &MO : MI.operands()) {
|
|
if (!MO.isReg() || MO.getReg() == 0)
|
|
continue;
|
|
Register Reg = MO.getReg();
|
|
if (!is_contained(UsedRegs, Reg))
|
|
UsedRegs.push_back(Reg);
|
|
}
|
|
Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs);
|
|
}
|
|
|
|
iterator_range<MachineBasicBlock::iterator>
|
|
WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
|
|
auto RegionBegin = MBB->begin();
|
|
std::advance(RegionBegin, Offset);
|
|
auto RegionEnd = RegionBegin;
|
|
std::advance(RegionEnd, Num);
|
|
return make_range(RegionBegin, RegionEnd);
|
|
}
|
|
|
|
int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
|
|
assert(TriToOri.count(NewMI) && "Cannot find original MI!");
|
|
auto *OriMI = TriToOri[NewMI];
|
|
assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!");
|
|
return OriToCycle[OriMI];
|
|
}
|
|
|
|
MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) {
|
|
assert(TriToOri.count(NewMI) && "Cannot find original MI!");
|
|
return TriToOri[NewMI];
|
|
}
|
|
|
|
unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) {
|
|
assert(llvm::find(OriMIs, OriMI) != OriMIs.end() &&
|
|
"Cannot find OriMI in OriMIs!");
|
|
// If there is no instruction fold, all MI stages are 0.
|
|
if (Offset == SchedPhiNum)
|
|
return 0;
|
|
// For those MIs with an ID less than the Offset, their stages are set to 0,
|
|
// while the rest are set to 1.
|
|
unsigned Id = 0;
|
|
for (auto *MI : OriMIs) {
|
|
if (MI->isMetaInstruction())
|
|
continue;
|
|
if (MI == OriMI)
|
|
break;
|
|
++Id;
|
|
}
|
|
return Id >= (size_t)Offset ? 1 : 0;
|
|
}
|
|
|
|
Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
|
|
assert(Phi->isPHI() && "Expecting PHI!");
|
|
Register AntiReg;
|
|
for (auto MO : Phi->uses()) {
|
|
if (MO.isReg())
|
|
AntiReg = MO.getReg();
|
|
else if (MO.isMBB() && MO.getMBB() == MBB)
|
|
return AntiReg;
|
|
}
|
|
return 0;
|
|
}
|