mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-24 04:16:08 +00:00
[mlir][SCF] Modernize coalesceLoops
method to handle scf.for
loops with iter_args (#87019)
As part of this extension, this change also does some general cleanup: 1) Make all the methods take `RewriterBase` as an argument instead of creating their own builders, which tend to crash when used within pattern rewrites. 2) Split `coalescePerfectlyNestedLoops` into two separate methods, one for `scf.for` and the other for `affine.for`. The templatization didn't seem to be buying much there. Also a general cleanup of the tests.
This commit is contained in:
parent
fd2a5c46d8
commit
5aeb604c7c
@ -299,53 +299,8 @@ LogicalResult
|
||||
separateFullTiles(MutableArrayRef<AffineForOp> nest,
|
||||
SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
|
||||
|
||||
/// Walk either an scf.for or an affine.for to find a band to coalesce.
|
||||
template <typename LoopOpTy>
|
||||
LogicalResult coalescePerfectlyNestedLoops(LoopOpTy op) {
|
||||
LogicalResult result(failure());
|
||||
SmallVector<LoopOpTy> loops;
|
||||
getPerfectlyNestedLoops(loops, op);
|
||||
|
||||
// Look for a band of loops that can be coalesced, i.e. perfectly nested
|
||||
// loops with bounds defined above some loop.
|
||||
// 1. For each loop, find above which parent loop its operands are
|
||||
// defined.
|
||||
SmallVector<unsigned, 4> operandsDefinedAbove(loops.size());
|
||||
for (unsigned i = 0, e = loops.size(); i < e; ++i) {
|
||||
operandsDefinedAbove[i] = i;
|
||||
for (unsigned j = 0; j < i; ++j) {
|
||||
if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
|
||||
operandsDefinedAbove[i] = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Identify bands of loops such that the operands of all of them are
|
||||
// defined above the first loop in the band. Traverse the nest bottom-up
|
||||
// so that modifications don't invalidate the inner loops.
|
||||
for (unsigned end = loops.size(); end > 0; --end) {
|
||||
unsigned start = 0;
|
||||
for (; start < end - 1; ++start) {
|
||||
auto maxPos =
|
||||
*std::max_element(std::next(operandsDefinedAbove.begin(), start),
|
||||
std::next(operandsDefinedAbove.begin(), end));
|
||||
if (maxPos > start)
|
||||
continue;
|
||||
assert(maxPos == start &&
|
||||
"expected loop bounds to be known at the start of the band");
|
||||
auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
|
||||
if (succeeded(coalesceLoops(band)))
|
||||
result = success();
|
||||
break;
|
||||
}
|
||||
// If a band was found and transformed, keep looking at the loops above
|
||||
// the outermost transformed loop.
|
||||
if (start != end - 1)
|
||||
end = start + 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
/// Walk the perfect nest of affine.for ops rooted at `op` to find bands of
/// loops to coalesce. Returns success if at least one band was coalesced or if
/// the nest consists of at most one loop, and failure otherwise.
|
||||
LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);
|
||||
|
||||
} // namespace affine
|
||||
} // namespace mlir
|
||||
|
@ -100,11 +100,16 @@ getSCFMinMaxExpr(Value value, SmallVectorImpl<Value> &dims,
|
||||
/// `loops` contains a list of perfectly nested loops with bounds and steps
|
||||
/// independent of any loop induction variable involved in the nest.
|
||||
LogicalResult coalesceLoops(MutableArrayRef<scf::ForOp> loops);
|
||||
LogicalResult coalesceLoops(RewriterBase &rewriter,
|
||||
MutableArrayRef<scf::ForOp>);
|
||||
|
||||
/// Walk the perfect nest of scf.for ops rooted at `op` to find bands of loops
/// to coalesce. Returns success if at least one band was coalesced, and
/// failure otherwise.
|
||||
LogicalResult coalescePerfectlyNestedSCFForLoops(scf::ForOp op);
|
||||
|
||||
/// Take the ParallelLoop and for each set of dimension indices, combine them
|
||||
/// into a single dimension. combinedDimensions must contain each index into
|
||||
/// loops exactly once.
|
||||
void collapseParallelLoops(scf::ParallelOp loops,
|
||||
void collapseParallelLoops(RewriterBase &rewriter, scf::ParallelOp loops,
|
||||
ArrayRef<std::vector<unsigned>> combinedDimensions);
|
||||
|
||||
/// Unrolls this for operation by the specified unroll factor. Returns failure
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include "llvm/Support/TypeName.h"
|
||||
#include <optional>
|
||||
|
||||
using llvm::SmallPtrSetImpl;
|
||||
namespace mlir {
|
||||
|
||||
class PatternRewriter;
|
||||
@ -704,6 +705,8 @@ public:
|
||||
return user != exceptedUser;
|
||||
});
|
||||
}
|
||||
void replaceAllUsesExcept(Value from, Value to,
|
||||
const SmallPtrSetImpl<Operation *> &preservedUsers);
|
||||
|
||||
/// Used to notify the listener that the IR failed to be rewritten because of
|
||||
/// a match failure, and provide a callback to populate a diagnostic with the
|
||||
|
@ -39,9 +39,9 @@ struct LoopCoalescingPass
|
||||
func::FuncOp func = getOperation();
|
||||
func.walk<WalkOrder::PreOrder>([](Operation *op) {
|
||||
if (auto scfForOp = dyn_cast<scf::ForOp>(op))
|
||||
(void)coalescePerfectlyNestedLoops(scfForOp);
|
||||
(void)coalescePerfectlyNestedSCFForLoops(scfForOp);
|
||||
else if (auto affineForOp = dyn_cast<AffineForOp>(op))
|
||||
(void)coalescePerfectlyNestedLoops(affineForOp);
|
||||
(void)coalescePerfectlyNestedAffineLoops(affineForOp);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
@ -2765,3 +2765,51 @@ mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
|
||||
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Looks through the perfect nest of affine.for ops rooted at `op` for bands
/// of loops that can be coalesced and coalesces each band found. A nest of at
/// most one loop is reported as success without any rewrite; otherwise the
/// result is success only if at least one band was coalesced.
LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
  SmallVector<AffineForOp> nest;
  getPerfectlyNestedLoops(nest, op);
  const unsigned numLoops = nest.size();
  if (numLoops <= 1)
    return success();

  // A band can be coalesced when its loops are perfectly nested and all of
  // their operands are defined above the first loop of the band.
  //
  // Step 1: for each loop, record the position of the outermost loop of the
  // nest above which all of its operands are defined. When no enclosing loop
  // qualifies, the loop maps to its own position.
  SmallVector<unsigned> definedAbove(numLoops);
  for (unsigned inner = 0; inner < numLoops; ++inner) {
    unsigned outer = 0;
    while (outer != inner &&
           !areValuesDefinedAbove(nest[inner].getOperands(),
                                  nest[outer].getRegion()))
      ++outer;
    definedAbove[inner] = outer;
  }

  // Step 2: carve the nest into bands whose operands are all defined above
  // the band's first loop. Bands are visited bottom-up so that rewriting one
  // band does not invalidate the loops that are still to be inspected.
  LogicalResult status = failure();
  for (unsigned bandEnd = numLoops; bandEnd > 0; --bandEnd) {
    const unsigned lastIdx = bandEnd - 1;
    unsigned bandBegin = 0;
    while (bandBegin < lastIdx) {
      const unsigned deepestDef = *std::max_element(
          definedAbove.begin() + bandBegin, definedAbove.begin() + bandEnd);
      if (deepestDef <= bandBegin) {
        assert(deepestDef == bandBegin &&
               "expected loop bounds to be known at the start of the band");
        auto band =
            llvm::MutableArrayRef(nest.data() + bandBegin, bandEnd - bandBegin);
        if (succeeded(coalesceLoops(band)))
          status = success();
        break;
      }
      ++bandBegin;
    }
    // A band starting at `bandBegin` was handled: resume the search with the
    // loops above it (the loop decrement brings `bandEnd` down to `bandBegin`).
    if (bandBegin != lastIdx)
      bandEnd = bandBegin + 1;
  }
  return status;
}
|
||||
|
@ -332,9 +332,9 @@ transform::LoopCoalesceOp::applyToOne(transform::TransformRewriter &rewriter,
|
||||
transform::TransformState &state) {
|
||||
LogicalResult result(failure());
|
||||
if (scf::ForOp scfForOp = dyn_cast<scf::ForOp>(op))
|
||||
result = coalescePerfectlyNestedLoops(scfForOp);
|
||||
result = coalescePerfectlyNestedSCFForLoops(scfForOp);
|
||||
else if (AffineForOp affineForOp = dyn_cast<AffineForOp>(op))
|
||||
result = coalescePerfectlyNestedLoops(affineForOp);
|
||||
result = coalescePerfectlyNestedAffineLoops(affineForOp);
|
||||
|
||||
results.push_back(op);
|
||||
if (failed(result)) {
|
||||
|
@ -28,6 +28,7 @@ namespace {
|
||||
struct TestSCFParallelLoopCollapsing
|
||||
: public impl::TestSCFParallelLoopCollapsingBase<
|
||||
TestSCFParallelLoopCollapsing> {
|
||||
|
||||
void runOnOperation() override {
|
||||
Operation *module = getOperation();
|
||||
|
||||
@ -88,6 +89,7 @@ struct TestSCFParallelLoopCollapsing
|
||||
// Only apply the transformation on parallel loops where the specified
|
||||
// transformation is valid, but do NOT early abort in the case of invalid
|
||||
// loops.
|
||||
IRRewriter rewriter(&getContext());
|
||||
module->walk([&](scf::ParallelOp op) {
|
||||
if (flattenedCombinedLoops.size() != op.getNumLoops()) {
|
||||
op.emitOpError("has ")
|
||||
@ -97,7 +99,7 @@ struct TestSCFParallelLoopCollapsing
|
||||
<< flattenedCombinedLoops.size() << " iter args.";
|
||||
return;
|
||||
}
|
||||
collapseParallelLoops(op, combinedLoops);
|
||||
collapseParallelLoops(rewriter, op, combinedLoops);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "mlir/Dialect/SCF/Utils/Utils.h"
|
||||
#include "mlir/Analysis/SliceAnalysis.h"
|
||||
#include "mlir/Dialect/Arith/IR/Arith.h"
|
||||
#include "mlir/Dialect/Arith/Utils/Utils.h"
|
||||
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
||||
#include "mlir/Dialect/SCF/IR/SCF.h"
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
@ -472,61 +473,6 @@ LogicalResult mlir::loopUnrollByFactor(
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Return the new lower bound, upper bound, and step in that order. Insert any
|
||||
/// additional bounds calculations before the given builder and any additional
|
||||
/// conversion back to the original loop induction value inside the given Block.
|
||||
static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
|
||||
OpBuilder &insideLoopBuilder, Location loc,
|
||||
Value lowerBound, Value upperBound, Value step,
|
||||
Value inductionVar) {
|
||||
// Check if the loop is already known to have a constant zero lower bound or
|
||||
// a constant one step.
|
||||
bool isZeroBased = false;
|
||||
if (auto ubCst = getConstantIntValue(lowerBound))
|
||||
isZeroBased = ubCst.value() == 0;
|
||||
|
||||
bool isStepOne = false;
|
||||
if (auto stepCst = getConstantIntValue(step))
|
||||
isStepOne = stepCst.value() == 1;
|
||||
|
||||
// Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
|
||||
// assuming the step is strictly positive. Update the bounds and the step
|
||||
// of the loop to go from 0 to the number of iterations, if necessary.
|
||||
if (isZeroBased && isStepOne)
|
||||
return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
|
||||
/*step=*/step};
|
||||
|
||||
Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
|
||||
Value newUpperBound =
|
||||
boundsBuilder.create<arith::CeilDivSIOp>(loc, diff, step);
|
||||
|
||||
Value newLowerBound =
|
||||
isZeroBased ? lowerBound
|
||||
: boundsBuilder.create<arith::ConstantOp>(
|
||||
loc, boundsBuilder.getZeroAttr(lowerBound.getType()));
|
||||
Value newStep =
|
||||
isStepOne ? step
|
||||
: boundsBuilder.create<arith::ConstantOp>(
|
||||
loc, boundsBuilder.getIntegerAttr(step.getType(), 1));
|
||||
|
||||
// Insert code computing the value of the original loop induction variable
|
||||
// from the "normalized" one.
|
||||
Value scaled =
|
||||
isStepOne
|
||||
? inductionVar
|
||||
: insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
|
||||
Value shifted =
|
||||
isZeroBased
|
||||
? scaled
|
||||
: insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
|
||||
|
||||
SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
|
||||
shifted.getDefiningOp()};
|
||||
inductionVar.replaceAllUsesExcept(shifted, preserve);
|
||||
return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
|
||||
/*step=*/newStep};
|
||||
}
|
||||
|
||||
/// Transform a loop with a strictly positive step
|
||||
/// for %i = %lb to %ub step %s
|
||||
/// into a 0-based loop with step 1
|
||||
@ -536,19 +482,107 @@ static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
|
||||
/// expected to be either `loop` or another loop perfectly nested under `loop`.
|
||||
/// Insert the definition of new bounds immediate before `outer`, which is
|
||||
/// expected to be either `loop` or its parent in the loop nest.
|
||||
static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
|
||||
OpBuilder builder(outer);
|
||||
OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
|
||||
auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
|
||||
loop.getLowerBound(), loop.getUpperBound(),
|
||||
loop.getStep(), loop.getInductionVar());
|
||||
static LoopParams emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc,
|
||||
Value lb, Value ub, Value step) {
|
||||
// For non-index types, generate `arith` instructions
|
||||
// Check if the loop is already known to have a constant zero lower bound or
|
||||
// a constant one step.
|
||||
bool isZeroBased = false;
|
||||
if (auto lbCst = getConstantIntValue(lb))
|
||||
isZeroBased = lbCst.value() == 0;
|
||||
|
||||
loop.setLowerBound(loopPieces.lowerBound);
|
||||
loop.setUpperBound(loopPieces.upperBound);
|
||||
loop.setStep(loopPieces.step);
|
||||
bool isStepOne = false;
|
||||
if (auto stepCst = getConstantIntValue(step))
|
||||
isStepOne = stepCst.value() == 1;
|
||||
|
||||
// Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
|
||||
// assuming the step is strictly positive. Update the bounds and the step
|
||||
// of the loop to go from 0 to the number of iterations, if necessary.
|
||||
if (isZeroBased && isStepOne)
|
||||
return {lb, ub, step};
|
||||
|
||||
Value diff = isZeroBased ? ub : rewriter.create<arith::SubIOp>(loc, ub, lb);
|
||||
Value newUpperBound =
|
||||
isStepOne ? diff : rewriter.create<arith::CeilDivSIOp>(loc, diff, step);
|
||||
|
||||
Value newLowerBound = isZeroBased
|
||||
? lb
|
||||
: rewriter.create<arith::ConstantOp>(
|
||||
loc, rewriter.getZeroAttr(lb.getType()));
|
||||
Value newStep = isStepOne
|
||||
? step
|
||||
: rewriter.create<arith::ConstantOp>(
|
||||
loc, rewriter.getIntegerAttr(step.getType(), 1));
|
||||
|
||||
return {newLowerBound, newUpperBound, newStep};
|
||||
}
|
||||
|
||||
LogicalResult mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
|
||||
/// Get back the original induction variable values after loop normalization
|
||||
static void denormalizeInductionVariable(RewriterBase &rewriter, Location loc,
|
||||
Value normalizedIv, Value origLb,
|
||||
Value origStep) {
|
||||
Value denormalizedIv;
|
||||
SmallPtrSet<Operation *, 2> preserve;
|
||||
bool isStepOne = isConstantIntValue(origStep, 1);
|
||||
bool isZeroBased = isConstantIntValue(origLb, 0);
|
||||
|
||||
Value scaled = normalizedIv;
|
||||
if (!isStepOne) {
|
||||
scaled = rewriter.create<arith::MulIOp>(loc, normalizedIv, origStep);
|
||||
preserve.insert(scaled.getDefiningOp());
|
||||
}
|
||||
denormalizedIv = scaled;
|
||||
if (!isZeroBased) {
|
||||
denormalizedIv = rewriter.create<arith::AddIOp>(loc, scaled, origLb);
|
||||
preserve.insert(denormalizedIv.getDefiningOp());
|
||||
}
|
||||
|
||||
rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve);
|
||||
}
|
||||
|
||||
/// Helper function to multiply a sequence of values.
|
||||
static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc,
|
||||
ArrayRef<Value> values) {
|
||||
assert(!values.empty() && "unexpected empty list");
|
||||
Value productOf = values.front();
|
||||
for (auto v : values.drop_front()) {
|
||||
productOf = rewriter.create<arith::MulIOp>(loc, productOf, v);
|
||||
}
|
||||
return productOf;
|
||||
}
|
||||
|
||||
/// For each original loop, the value of the
|
||||
/// induction variable can be obtained by dividing the induction variable of
|
||||
/// the linearized loop by the total number of iterations of the loops nested
|
||||
/// in it modulo the number of iterations in this loop (remove the values
|
||||
/// related to the outer loops):
|
||||
/// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
|
||||
/// Compute these iteratively from the innermost loop by creating a "running
|
||||
/// quotient" of division by the range.
|
||||
static std::pair<SmallVector<Value>, SmallPtrSet<Operation *, 2>>
|
||||
delinearizeInductionVariable(RewriterBase &rewriter, Location loc,
|
||||
Value linearizedIv, ArrayRef<Value> ubs) {
|
||||
Value previous = linearizedIv;
|
||||
SmallVector<Value> delinearizedIvs(ubs.size());
|
||||
SmallPtrSet<Operation *, 2> preservedUsers;
|
||||
for (unsigned i = 0, e = ubs.size(); i < e; ++i) {
|
||||
unsigned idx = ubs.size() - i - 1;
|
||||
if (i != 0) {
|
||||
previous = rewriter.create<arith::DivSIOp>(loc, previous, ubs[idx + 1]);
|
||||
preservedUsers.insert(previous.getDefiningOp());
|
||||
}
|
||||
Value iv = previous;
|
||||
if (i != e - 1) {
|
||||
iv = rewriter.create<arith::RemSIOp>(loc, previous, ubs[idx]);
|
||||
preservedUsers.insert(iv.getDefiningOp());
|
||||
}
|
||||
delinearizedIvs[idx] = iv;
|
||||
}
|
||||
return {delinearizedIvs, preservedUsers};
|
||||
}
|
||||
|
||||
LogicalResult mlir::coalesceLoops(RewriterBase &rewriter,
|
||||
MutableArrayRef<scf::ForOp> loops) {
|
||||
if (loops.size() < 2)
|
||||
return failure();
|
||||
|
||||
@ -557,57 +591,148 @@ LogicalResult mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
|
||||
|
||||
// 1. Make sure all loops iterate from 0 to upperBound with step 1. This
|
||||
// allows the following code to assume upperBound is the number of iterations.
|
||||
for (auto loop : loops)
|
||||
normalizeLoop(loop, outermost, innermost);
|
||||
for (auto loop : loops) {
|
||||
OpBuilder::InsertionGuard g(rewriter);
|
||||
rewriter.setInsertionPoint(outermost);
|
||||
Value lb = loop.getLowerBound();
|
||||
Value ub = loop.getUpperBound();
|
||||
Value step = loop.getStep();
|
||||
auto newLoopParams =
|
||||
emitNormalizedLoopBounds(rewriter, loop.getLoc(), lb, ub, step);
|
||||
|
||||
rewriter.modifyOpInPlace(loop, [&]() {
|
||||
loop.setLowerBound(newLoopParams.lowerBound);
|
||||
loop.setUpperBound(newLoopParams.upperBound);
|
||||
loop.setStep(newLoopParams.step);
|
||||
});
|
||||
|
||||
rewriter.setInsertionPointToStart(innermost.getBody());
|
||||
denormalizeInductionVariable(rewriter, loop.getLoc(),
|
||||
loop.getInductionVar(), lb, step);
|
||||
}
|
||||
|
||||
// 2. Emit code computing the upper bound of the coalesced loop as product
|
||||
// of the number of iterations of all loops.
|
||||
OpBuilder builder(outermost);
|
||||
OpBuilder::InsertionGuard g(rewriter);
|
||||
rewriter.setInsertionPoint(outermost);
|
||||
Location loc = outermost.getLoc();
|
||||
Value upperBound = outermost.getUpperBound();
|
||||
for (auto loop : loops.drop_front())
|
||||
upperBound =
|
||||
builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
|
||||
SmallVector<Value> upperBounds = llvm::map_to_vector(
|
||||
loops, [](auto loop) { return loop.getUpperBound(); });
|
||||
Value upperBound = getProductOfIntsOrIndexes(rewriter, loc, upperBounds);
|
||||
outermost.setUpperBound(upperBound);
|
||||
|
||||
builder.setInsertionPointToStart(outermost.getBody());
|
||||
rewriter.setInsertionPointToStart(innermost.getBody());
|
||||
auto [delinearizeIvs, preservedUsers] = delinearizeInductionVariable(
|
||||
rewriter, loc, outermost.getInductionVar(), upperBounds);
|
||||
rewriter.replaceAllUsesExcept(outermost.getInductionVar(), delinearizeIvs[0],
|
||||
preservedUsers);
|
||||
|
||||
// 3. Remap induction variables. For each original loop, the value of the
|
||||
// induction variable can be obtained by dividing the induction variable of
|
||||
// the linearized loop by the total number of iterations of the loops nested
|
||||
// in it modulo the number of iterations in this loop (remove the values
|
||||
// related to the outer loops):
|
||||
// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
|
||||
// Compute these iteratively from the innermost loop by creating a "running
|
||||
// quotient" of division by the range.
|
||||
Value previous = outermost.getInductionVar();
|
||||
for (unsigned i = 0, e = loops.size(); i < e; ++i) {
|
||||
unsigned idx = loops.size() - i - 1;
|
||||
if (i != 0)
|
||||
previous = builder.create<arith::DivSIOp>(loc, previous,
|
||||
loops[idx + 1].getUpperBound());
|
||||
for (int i = loops.size() - 1; i > 0; --i) {
|
||||
auto outerLoop = loops[i - 1];
|
||||
auto innerLoop = loops[i];
|
||||
|
||||
Value iv = (i == e - 1) ? previous
|
||||
: builder.create<arith::RemSIOp>(
|
||||
loc, previous, loops[idx].getUpperBound());
|
||||
replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
|
||||
loops.back().getRegion());
|
||||
Operation *innerTerminator = innerLoop.getBody()->getTerminator();
|
||||
auto yieldedVals = llvm::to_vector(innerTerminator->getOperands());
|
||||
rewriter.eraseOp(innerTerminator);
|
||||
|
||||
SmallVector<Value> innerBlockArgs;
|
||||
innerBlockArgs.push_back(delinearizeIvs[i]);
|
||||
llvm::append_range(innerBlockArgs, outerLoop.getRegionIterArgs());
|
||||
rewriter.inlineBlockBefore(innerLoop.getBody(), outerLoop.getBody(),
|
||||
Block::iterator(innerLoop), innerBlockArgs);
|
||||
rewriter.replaceOp(innerLoop, yieldedVals);
|
||||
}
|
||||
|
||||
// 4. Move the operations from the innermost just above the second-outermost
|
||||
// loop, delete the extra terminator and the second-outermost loop.
|
||||
scf::ForOp second = loops[1];
|
||||
innermost.getBody()->back().erase();
|
||||
outermost.getBody()->getOperations().splice(
|
||||
Block::iterator(second.getOperation()),
|
||||
innermost.getBody()->getOperations());
|
||||
second.erase();
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Convenience overload for callers that do not have a rewriter: builds an
/// IRRewriter on the context of the first loop and forwards to the
/// rewriter-based overload. Fails if `loops` is empty.
LogicalResult mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
  if (loops.empty())
    return failure();

  scf::ForOp firstLoop = loops.front();
  IRRewriter rewriter(firstLoop.getContext());
  return coalesceLoops(rewriter, loops);
}
|
||||
|
||||
/// Looks through the perfect nest of scf.for ops rooted at `op` for bands of
/// loops that can be coalesced and coalesces each band found. Returns success
/// if at least one band was coalesced, failure otherwise.
LogicalResult mlir::coalescePerfectlyNestedSCFForLoops(scf::ForOp op) {
  SmallVector<scf::ForOp> nest;
  getPerfectlyNestedLoops(nest, op);
  const unsigned numLoops = nest.size();

  // A band can be coalesced when its loops are perfectly nested, their bounds
  // are defined above the first loop of the band, and their iter_args are
  // simply threaded from one loop to the next.

  // Step 1: for each loop, record the position of the outermost loop of the
  // nest above which its lower bound, upper bound and step are all defined.
  // When no enclosing loop qualifies, the loop maps to its own position.
  SmallVector<unsigned> boundsDefinedAbove(numLoops);
  for (unsigned inner = 0; inner < numLoops; ++inner) {
    SmallVector<Value> bounds = {nest[inner].getLowerBound(),
                                 nest[inner].getUpperBound(),
                                 nest[inner].getStep()};
    unsigned outer = 0;
    while (outer != inner &&
           !areValuesDefinedAbove(bounds, nest[outer].getRegion()))
      ++outer;
    boundsDefinedAbove[inner] = outer;
  }

  // Step 2: for each loop, record where its iter_args chain starts. A loop
  // extends the chain of its parent when the parent's region iter_args are
  // exactly the loop's init args and the loop's results are exactly what the
  // parent yields; otherwise the loop starts a new chain at its own position.
  SmallVector<unsigned> chainStart(numLoops);
  chainStart[0] = 0;
  for (unsigned pos = 1; pos < numLoops; ++pos) {
    scf::ForOp parent = nest[pos - 1];
    scf::ForOp child = nest[pos];
    const bool extendsParentChain =
        parent.getNumRegionIterArgs() == child.getNumRegionIterArgs() &&
        llvm::equal(parent.getRegionIterArgs(), child.getInitArgs()) &&
        llvm::equal(parent.getBody()->getTerminator()->getOperands(),
                    child.getResults());
    chainStart[pos] = extendsParentChain ? chainStart[pos - 1] : pos;
  }

  // Step 3: carve the nest into bands that satisfy both conditions above.
  // Bands are visited bottom-up so that rewriting one band does not invalidate
  // the loops that are still to be inspected.
  LogicalResult status = failure();
  for (unsigned bandEnd = numLoops; bandEnd > 0; --bandEnd) {
    const unsigned lastIdx = bandEnd - 1;
    unsigned bandBegin = 0;
    while (bandBegin < lastIdx) {
      const unsigned deepestDef =
          *std::max_element(boundsDefinedAbove.begin() + bandBegin,
                            boundsDefinedAbove.begin() + bandEnd);
      const bool boundsAvailable = deepestDef <= bandBegin;
      if (boundsAvailable && chainStart[lastIdx] <= bandBegin) {
        auto band =
            llvm::MutableArrayRef(nest.data() + bandBegin, bandEnd - bandBegin);
        if (succeeded(coalesceLoops(band)))
          status = success();
        break;
      }
      ++bandBegin;
    }
    // A band starting at `bandBegin` was handled: resume the search with the
    // loops above it (the loop decrement brings `bandEnd` down to `bandBegin`).
    if (bandBegin != lastIdx)
      bandEnd = bandBegin + 1;
  }
  return status;
}
|
||||
|
||||
void mlir::collapseParallelLoops(
|
||||
scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
|
||||
OpBuilder outsideBuilder(loops);
|
||||
RewriterBase &rewriter, scf::ParallelOp loops,
|
||||
ArrayRef<std::vector<unsigned>> combinedDimensions) {
|
||||
OpBuilder::InsertionGuard g(rewriter);
|
||||
rewriter.setInsertionPoint(loops);
|
||||
Location loc = loops.getLoc();
|
||||
|
||||
// Presort combined dimensions.
|
||||
@ -619,25 +744,29 @@ void mlir::collapseParallelLoops(
|
||||
SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
|
||||
normalizedUpperBounds;
|
||||
for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
|
||||
OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
|
||||
auto resultBounds =
|
||||
normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
|
||||
loops.getLowerBound()[i], loops.getUpperBound()[i],
|
||||
loops.getStep()[i], loops.getBody()->getArgument(i));
|
||||
OpBuilder::InsertionGuard g2(rewriter);
|
||||
rewriter.setInsertionPoint(loops);
|
||||
Value lb = loops.getLowerBound()[i];
|
||||
Value ub = loops.getUpperBound()[i];
|
||||
Value step = loops.getStep()[i];
|
||||
auto newLoopParams = emitNormalizedLoopBounds(rewriter, loc, lb, ub, step);
|
||||
normalizedLowerBounds.push_back(newLoopParams.lowerBound);
|
||||
normalizedUpperBounds.push_back(newLoopParams.upperBound);
|
||||
normalizedSteps.push_back(newLoopParams.step);
|
||||
|
||||
normalizedLowerBounds.push_back(resultBounds.lowerBound);
|
||||
normalizedUpperBounds.push_back(resultBounds.upperBound);
|
||||
normalizedSteps.push_back(resultBounds.step);
|
||||
rewriter.setInsertionPointToStart(loops.getBody());
|
||||
denormalizeInductionVariable(rewriter, loc, loops.getInductionVars()[i], lb,
|
||||
step);
|
||||
}
|
||||
|
||||
// Combine iteration spaces.
|
||||
SmallVector<Value, 3> lowerBounds, upperBounds, steps;
|
||||
auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
|
||||
auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
auto cst0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
|
||||
auto cst1 = rewriter.create<arith::ConstantIndexOp>(loc, 1);
|
||||
for (auto &sortedDimension : sortedDimensions) {
|
||||
Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
|
||||
Value newUpperBound = rewriter.create<arith::ConstantIndexOp>(loc, 1);
|
||||
for (auto idx : sortedDimension) {
|
||||
newUpperBound = outsideBuilder.create<arith::MulIOp>(
|
||||
newUpperBound = rewriter.create<arith::MulIOp>(
|
||||
loc, newUpperBound, normalizedUpperBounds[idx]);
|
||||
}
|
||||
lowerBounds.push_back(cst0);
|
||||
@ -651,7 +780,7 @@ void mlir::collapseParallelLoops(
|
||||
// value. The remainders then determine based on that range, which iteration
|
||||
// of the original induction value this represents. This is a normalized value
|
||||
// that is un-normalized already by the previous logic.
|
||||
auto newPloop = outsideBuilder.create<scf::ParallelOp>(
|
||||
auto newPloop = rewriter.create<scf::ParallelOp>(
|
||||
loc, lowerBounds, upperBounds, steps,
|
||||
[&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
|
||||
for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "mlir/IR/IRMapping.h"
|
||||
#include "mlir/IR/Iterators.h"
|
||||
#include "mlir/IR/RegionKindInterface.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
@ -250,6 +251,14 @@ void RewriterBase::finalizeOpModification(Operation *op) {
|
||||
rewriteListener->notifyOperationModified(op);
|
||||
}
|
||||
|
||||
/// Replaces every use of `from` with `to`, except for uses whose owning
/// operation is contained in `preservedUsers`. Implemented on top of
/// `replaceUsesWithIf`.
void RewriterBase::replaceAllUsesExcept(
    Value from, Value to, const SmallPtrSetImpl<Operation *> &preservedUsers) {
  auto isReplaceable = [&preservedUsers](OpOperand &use) {
    return !preservedUsers.contains(use.getOwner());
  };
  replaceUsesWithIf(from, to, isReplaceable);
}
|
||||
|
||||
void RewriterBase::replaceUsesWithIf(Value from, Value to,
|
||||
function_ref<bool(OpOperand &)> functor,
|
||||
bool *allUsesReplaced) {
|
||||
|
@ -1,4 +1,4 @@
|
||||
// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing %s | FileCheck %s
|
||||
// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing --cse %s | FileCheck %s
|
||||
|
||||
// CHECK-LABEL: @one_3d_nest
|
||||
func.func @one_3d_nest() {
|
||||
@ -239,19 +239,15 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
|
||||
}
|
||||
return
|
||||
}
|
||||
// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]]
|
||||
// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
|
||||
// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]]
|
||||
// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]]
|
||||
// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]]
|
||||
// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]]
|
||||
// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]]
|
||||
// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T6]]]
|
||||
// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]]
|
||||
// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T4]]]
|
||||
// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
|
||||
// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]]
|
||||
// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T0]]]
|
||||
// CHECK: affine.for %[[IV:.*]] = 0 to %[[T2]]
|
||||
// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T0]]]
|
||||
// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T0]]]
|
||||
// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T0]]]
|
||||
// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T0]]]
|
||||
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
|
||||
// CHECK-NEXT: }
|
||||
// CHECK-NEXT: return
|
||||
@ -277,18 +273,16 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
|
||||
}
|
||||
return
|
||||
}
|
||||
// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK-DAG: %[[T2:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]]
|
||||
// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
|
||||
// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]]
|
||||
// CHECK-DAG: %[[T5:.*]] = affine.apply #[[SIXTY_FOUR]]()
|
||||
// CHECK-DAG: %[[T6:.*]] = affine.apply #[[PRODUCT]](%[[T4]])[%[[T5]]]
|
||||
// CHECK: affine.for %[[IV:.*]] = 0 to %[[T6]]
|
||||
// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T5]]]
|
||||
// CHECK-DAG: %[[T8:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T5]]]
|
||||
// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T8]])[%[[T3]]]
|
||||
// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T8]])[%[[T3]]]
|
||||
// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
|
||||
// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]]
|
||||
// CHECK-DAG: %[[T2:.*]] = affine.apply #[[SIXTY_FOUR]]()
|
||||
// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T2]]]
|
||||
// CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]]
|
||||
// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T2]]]
|
||||
// CHECK-DAG: %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T2]]]
|
||||
// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T0]]]
|
||||
// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T0]]]
|
||||
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
|
||||
// CHECK-NEXT: }
|
||||
// CHECK-NEXT: return
|
||||
@ -316,19 +310,16 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
|
||||
}
|
||||
return
|
||||
}
|
||||
// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK-DAG: %[[T3:.*]] = affine.min #[[MAP0]]()[%[[T0]]]
|
||||
// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
|
||||
// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]]
|
||||
// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]]
|
||||
// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]]
|
||||
// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]]
|
||||
// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]]
|
||||
// CHECK-DAG: %[[T9:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T6]]]
|
||||
// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]]
|
||||
// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T9]])[%[[T4]]]
|
||||
// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
|
||||
// CHECK-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[DIM]]]
|
||||
// CHECK-DAG: %[[T1:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
|
||||
// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]]
|
||||
// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T1]]]
|
||||
// CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]]
|
||||
// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T1]]]
|
||||
// CHECK-DAG: %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T1]]]
|
||||
// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T1]]]
|
||||
// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T1]]]
|
||||
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
|
||||
// CHECK-NEXT: }
|
||||
// CHECK-NEXT: return
|
||||
@ -342,12 +333,14 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
|
||||
func.func @test_loops_do_not_get_coalesced() {
|
||||
affine.for %i = 0 to 7 {
|
||||
affine.for %j = #map0(%i) to min #map1(%i) {
|
||||
"use"(%i, %j) : (index, index) -> ()
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
// CHECK: affine.for %[[IV0:.*]] = 0 to 7
|
||||
// CHECK-NEXT: affine.for %[[IV1:.*]] = #[[MAP0]](%[[IV0]]) to min #[[MAP1]](%[[IV0]])
|
||||
// CHECK-NEXT: "use"(%[[IV0]], %[[IV1]])
|
||||
// CHECK-NEXT: }
|
||||
// CHECK-NEXT: }
|
||||
// CHECK-NEXT: return
|
||||
|
@ -1,4 +1,4 @@
|
||||
// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics | FileCheck %s
|
||||
// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect --cse | FileCheck %s
|
||||
|
||||
func.func @coalesce_inner() {
|
||||
%c0 = arith.constant 0 : index
|
||||
@ -14,7 +14,7 @@ func.func @coalesce_inner() {
|
||||
scf.for %k = %i to %j step %c1 {
|
||||
// Inner loop must have been removed.
|
||||
scf.for %l = %i to %j step %c1 {
|
||||
arith.addi %i, %j : index
|
||||
"use"(%i, %j) : (index, index) -> ()
|
||||
}
|
||||
} {coalesce}
|
||||
}
|
||||
@ -33,13 +33,19 @@ module attributes {transform.with_named_sequence} {
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-DAG: #[[MAP:.+]] = affine_map<() -> (64)>
|
||||
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 * s0)>
|
||||
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
|
||||
// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
|
||||
func.func @coalesce_outer(%arg1: memref<64x64xf32, 1>, %arg2: memref<64x64xf32, 1>, %arg3: memref<64x64xf32, 1>) attributes {} {
|
||||
// CHECK: %[[T0:.+]] = affine.apply #[[MAP]]()
|
||||
// CHECK: %[[UB:.+]] = affine.apply #[[MAP1]](%[[T0]])[%[[T0]]]
|
||||
// CHECK: affine.for %[[IV1:.+]] = 0 to %[[UB:.+]] {
|
||||
// CHECK-NOT: affine.for %[[IV2:.+]]
|
||||
affine.for %arg4 = 0 to 64 {
|
||||
affine.for %arg5 = 0 to 64 {
|
||||
// CHECK: %[[IDX0:.+]] = affine.apply #[[MAP0:.+]](%[[IV1]])[%{{.+}}]
|
||||
// CHECK: %[[IDX1:.+]] = affine.apply #[[MAP1:.+]](%[[IV1]])[%{{.+}}]
|
||||
// CHECK: %[[IDX0:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%{{.+}}]
|
||||
// CHECK: %[[IDX1:.+]] = affine.apply #[[MAP3]](%[[IV1]])[%{{.+}}]
|
||||
// CHECK-NEXT: %{{.+}} = affine.load %{{.+}}[%[[IDX1]], %[[IDX0]]] : memref<64x64xf32, 1>
|
||||
%0 = affine.load %arg1[%arg4, %arg5] : memref<64x64xf32, 1>
|
||||
%1 = affine.load %arg2[%arg4, %arg5] : memref<64x64xf32, 1>
|
||||
@ -96,3 +102,200 @@ module attributes {transform.with_named_sequence} {
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func.func @tensor_loops(%arg0 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
|
||||
%lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> tensor<?x?xf32> {
|
||||
%0 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg1 = %arg0) -> tensor<?x?xf32> {
|
||||
%1 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg2 = %arg1) -> tensor<?x?xf32> {
|
||||
%2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg3 = %arg2) -> tensor<?x?xf32> {
|
||||
%3 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>)
|
||||
scf.yield %3 : tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %2 : tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %1 : tensor<?x?xf32>
|
||||
} {coalesce}
|
||||
return %0 : tensor<?x?xf32>
|
||||
}
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
|
||||
%1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
|
||||
%2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
// CHECK: func.func @tensor_loops(
|
||||
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK: %[[NEWUB0_DIFF:.+]] = arith.subi %[[UB0]], %[[LB0]]
|
||||
// CHECK-DAG: %[[NEWUB0:.+]] = arith.ceildivsi %[[NEWUB0_DIFF]], %[[STEP0]]
|
||||
// CHECK-DAG: %[[C0:.+]] = arith.constant 0
|
||||
// CHECK-DAG: %[[C1:.+]] = arith.constant 1
|
||||
// CHECK: %[[NEWUB1_DIFF:.+]] = arith.subi %[[UB1]], %[[LB1]]
|
||||
// CHECK-DAG: %[[NEWUB1:.+]] = arith.ceildivsi %[[NEWUB1_DIFF]], %[[STEP1]]
|
||||
// CHECK: %[[NEWUB2_DIFF:.+]] = arith.subi %[[UB2]], %[[LB2]]
|
||||
// CHECK-DAG: %[[NEWUB2:.+]] = arith.ceildivsi %[[NEWUB2_DIFF]], %[[STEP2]]
|
||||
// CHECK: %[[PROD1:.+]] = arith.muli %[[NEWUB0]], %[[NEWUB1]]
|
||||
// CHECK: %[[NEWUB:.+]] = arith.muli %[[PROD1]], %[[NEWUB2]]
|
||||
// CHECK: %[[RESULT:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[NEWUB]] step %[[C1]] iter_args(%[[ITER_ARG:.+]] = %[[ARG0]])
|
||||
// CHECK: %[[IV2:.+]] = arith.remsi %[[IV]], %[[NEWUB2]]
|
||||
// CHECK: %[[PREVIOUS:.+]] = arith.divsi %[[IV]], %[[NEWUB2]]
|
||||
// CHECK: %[[IV1:.+]] = arith.remsi %[[PREVIOUS]], %[[NEWUB1]]
|
||||
// CHECK: %[[IV0:.+]] = arith.divsi %[[PREVIOUS]], %[[NEWUB1]]
|
||||
// CHECK: %[[K_STEP:.+]] = arith.muli %[[IV2]], %[[STEP2]]
|
||||
// CHECK: %[[K:.+]] = arith.addi %[[K_STEP]], %[[LB2]]
|
||||
// CHECK: %[[J_STEP:.+]] = arith.muli %[[IV1]], %[[STEP1]]
|
||||
// CHECK: %[[J:.+]] = arith.addi %[[J_STEP]], %[[LB1]]
|
||||
// CHECK: %[[I_STEP:.+]] = arith.muli %[[IV0]], %[[STEP0]]
|
||||
// CHECK: %[[I:.+]] = arith.addi %[[I_STEP]], %[[LB0]]
|
||||
// CHECK: %[[USE:.+]] = "use"(%[[ITER_ARG]], %[[I]], %[[J]], %[[K]])
|
||||
// CHECK: scf.yield %[[USE]]
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
// Coalesce only the first two loops, but not the last, since the iter_args don't line up.
|
||||
func.func @tensor_loops_first_two(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
|
||||
%lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg5, %arg7 = %arg4) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%3:2 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
scf.yield %3#0, %3#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %2#0, %2#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %1#0, %1#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
} {coalesce}
|
||||
return %0#0, %0#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
|
||||
%1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
|
||||
%2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
// CHECK: func.func @tensor_loops_first_two(
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK: scf.for
|
||||
// CHECK: arith.remsi
|
||||
// CHECK: arith.divsi
|
||||
// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]]
|
||||
// CHECK-NOT: scf.for
|
||||
// CHECK: transform.named_sequence
|
||||
|
||||
// -----
|
||||
|
||||
// Coalesce only the first two loops, but not the last, since the yields don't match up.
|
||||
func.func @tensor_loops_first_two_2(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
|
||||
%lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg4, %arg7 = %arg5) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%3:2 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
scf.yield %3#0, %3#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %2#1, %2#0 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %1#0, %1#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
} {coalesce}
|
||||
return %0#0, %0#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
|
||||
%1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
|
||||
%2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
// CHECK: func.func @tensor_loops_first_two_2(
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK: scf.for
|
||||
// CHECK: arith.remsi
|
||||
// CHECK: arith.divsi
|
||||
// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]]
|
||||
// CHECK-NOT: scf.for
|
||||
// CHECK: transform.named_sequence
|
||||
|
||||
// -----
|
||||
|
||||
// Coalesce only the last two loops, but not the first, since the yields don't match up.
|
||||
func.func @tensor_loops_last_two(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
|
||||
%lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg4, %arg7 = %arg5) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%3:2 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
scf.yield %3#0, %3#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %2#0, %2#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %1#1, %1#0 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
} {coalesce}
|
||||
return %0#0, %0#1 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
|
||||
%1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
|
||||
%2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
// CHECK: func.func @tensor_loops_last_two(
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB0]] to %[[UB0]] step %[[STEP0]]
|
||||
// CHECK: arith.subi
|
||||
// CHECK: arith.ceildivsi
|
||||
// CHECK: arith.subi
|
||||
// CHECK: arith.ceildivsi
|
||||
// CHECK: scf.for
|
||||
// CHECK: arith.remsi
|
||||
// CHECK: arith.divsi
|
||||
// CHECK-NOT: scf.for
|
||||
// CHECK: transform.named_sequence
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' | FileCheck %s
|
||||
|
||||
// CHECK-LABEL: func @parallel_many_dims() {
|
||||
// CHECK: func @parallel_many_dims() {
|
||||
func.func @parallel_many_dims() {
|
||||
%c0 = arith.constant 0 : index
|
||||
%c1 = arith.constant 1 : index
|
||||
@ -28,19 +28,19 @@ func.func @parallel_many_dims() {
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-DAG: [[C12:%.*]] = arith.constant 12 : index
|
||||
// CHECK-DAG: [[C10:%.*]] = arith.constant 10 : index
|
||||
// CHECK-DAG: [[C9:%.*]] = arith.constant 9 : index
|
||||
// CHECK-DAG: [[C6:%.*]] = arith.constant 6 : index
|
||||
// CHECK-DAG: [[C4:%.*]] = arith.constant 4 : index
|
||||
// CHECK-DAG: [[C3:%.*]] = arith.constant 3 : index
|
||||
// CHECK-DAG: [[C2:%.*]] = arith.constant 2 : index
|
||||
// CHECK-DAG: [[C1:%.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
|
||||
// CHECK: scf.parallel ([[NEW_I0:%.*]]) = ([[C0]]) to ([[C4]]) step ([[C1]]) {
|
||||
// CHECK: [[V0:%.*]] = arith.remsi [[NEW_I0]], [[C2]] : index
|
||||
// CHECK: [[I0:%.*]] = arith.divsi [[NEW_I0]], [[C2]] : index
|
||||
// CHECK: [[V2:%.*]] = arith.muli [[V0]], [[C10]] : index
|
||||
// CHECK: [[I3:%.*]] = arith.addi [[V2]], [[C9]] : index
|
||||
// CHECK: "magic.op"([[I0]], [[C3]], [[C6]], [[I3]], [[C12]]) : (index, index, index, index, index) -> index
|
||||
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
|
||||
// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index
|
||||
// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
|
||||
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
|
||||
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index
|
||||
// CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index
|
||||
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
|
||||
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
|
||||
// CHECK: scf.parallel (%[[NEW_I0:.*]]) = (%[[C0]]) to (%[[C4]]) step (%[[C1]]) {
|
||||
// CHECK: %[[V0:.*]] = arith.remsi %[[NEW_I0]], %[[C2]] : index
|
||||
// CHECK: %[[I0:.*]] = arith.divsi %[[NEW_I0]], %[[C2]] : index
|
||||
// CHECK: %[[V2:.*]] = arith.muli %[[V0]], %[[C10]]
|
||||
// CHECK: %[[I3:.*]] = arith.addi %[[V2]], %[[C9]]
|
||||
// CHECK: "magic.op"(%[[I0]], %[[C3]], %[[C6]], %[[I3]], %[[C12]]) : (index, index, index, index, index) -> index
|
||||
// CHECK: scf.reduce
|
||||
|
@ -13,22 +13,22 @@ func.func @collapse_to_single() {
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @collapse_to_single() {
|
||||
// CHECK-DAG: [[C18:%.*]] = arith.constant 18 : index
|
||||
// CHECK-DAG: [[C6:%.*]] = arith.constant 6 : index
|
||||
// CHECK-DAG: [[C3:%.*]] = arith.constant 3 : index
|
||||
// CHECK-DAG: [[C7:%.*]] = arith.constant 7 : index
|
||||
// CHECK-DAG: [[C4:%.*]] = arith.constant 4 : index
|
||||
// CHECK-DAG: [[C1:%.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
|
||||
// CHECK: scf.parallel ([[NEW_I:%.*]]) = ([[C0]]) to ([[C18]]) step ([[C1]]) {
|
||||
// CHECK: [[I0_COUNT:%.*]] = arith.remsi [[NEW_I]], [[C6]] : index
|
||||
// CHECK: [[I1_COUNT:%.*]] = arith.divsi [[NEW_I]], [[C6]] : index
|
||||
// CHECK: [[V0:%.*]] = arith.muli [[I0_COUNT]], [[C4]] : index
|
||||
// CHECK: [[I1:%.*]] = arith.addi [[V0]], [[C7]] : index
|
||||
// CHECK: [[V1:%.*]] = arith.muli [[I1_COUNT]], [[C3]] : index
|
||||
// CHECK: [[I0:%.*]] = arith.addi [[V1]], [[C3]] : index
|
||||
// CHECK: "magic.op"([[I0]], [[I1]]) : (index, index) -> index
|
||||
// CHECK: func @collapse_to_single() {
|
||||
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
|
||||
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
|
||||
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
|
||||
// CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index
|
||||
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
|
||||
// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index
|
||||
// CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index
|
||||
// CHECK: scf.parallel (%[[NEW_I:.*]]) = (%[[C0]]) to (%[[C18]]) step (%[[C1]]) {
|
||||
// CHECK: %[[I0_COUNT:.*]] = arith.remsi %[[NEW_I]], %[[C6]] : index
|
||||
// CHECK: %[[I1_COUNT:.*]] = arith.divsi %[[NEW_I]], %[[C6]] : index
|
||||
// CHECK: %[[V0:.*]] = arith.muli %[[I0_COUNT]], %[[C4]]
|
||||
// CHECK: %[[I1:.*]] = arith.addi %[[V0]], %[[C7]]
|
||||
// CHECK: %[[V1:.*]] = arith.muli %[[I1_COUNT]], %[[C3]]
|
||||
// CHECK: %[[I0:.*]] = arith.addi %[[V1]], %[[C3]]
|
||||
// CHECK: "magic.op"(%[[I0]], %[[I1]]) : (index, index) -> index
|
||||
// CHECK: scf.reduce
|
||||
// CHECK-NEXT: }
|
||||
// CHECK-NEXT: return
|
||||
|
Loading…
x
Reference in New Issue
Block a user