llvm-project/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
Matthias Springer 5cc0f76d34
[mlir][IR] Add rewriter API for moving operations (#78988)
The pattern rewriter documentation states that "*all* IR mutations [...]
are required to be performed via the `PatternRewriter`." This commit
adds two functions that were missing from the rewriter API:
`moveOpBefore` and `moveOpAfter`.

After an operation was moved, the `notifyOperationInserted` callback is
triggered. This allows listeners such as the greedy pattern rewrite
driver to react to IR changes.

This commit narrows the discrepancy between the kind of IR modification
that can be performed and the kind of IR modifications that can be
listened to.
2024-01-25 11:01:28 +01:00

1071 lines
43 KiB
C++

//===- BufferizableOpInterfaceImpl.cpp - Impl. of BufferizableOpInterface -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Bufferization/IR/DstBufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Operation.h"
using namespace mlir;
using namespace mlir::bufferization;
using namespace mlir::tensor;
namespace mlir {
namespace tensor {
namespace {
/// Bufferization of tensor.cast. Replace with memref.cast, or forward the
/// source buffer unchanged if its type already equals the result buffer type.
struct CastOpInterface
    : public BufferizableOpInterface::ExternalModel<CastOpInterface,
                                                    tensor::CastOp> {
  /// The operand is reported as not being read.
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    return false;
  }

  /// The operand is reported as not being written.
  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// The op result is reported as equivalent to the operand.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {{op->getResult(0), BufferRelation::Equivalent}};
  }

  /// Computes the buffer type of the cast result from the buffer type of the
  /// source. Returns failure if the source buffer type cannot be computed.
  FailureOr<BaseMemRefType>
  getBufferType(Operation *op, Value value, const BufferizationOptions &options,
                SmallVector<Value> &invocationStack) const {
    auto castOp = cast<tensor::CastOp>(op);
    auto maybeSrcBufferType = bufferization::getBufferType(
        castOp.getSource(), options, invocationStack);
    if (failed(maybeSrcBufferType))
      return failure();
    // The result buffer lives in the same memory space as the source buffer.
    Attribute memorySpace = maybeSrcBufferType->getMemorySpace();

    // Note: `getMemRefTypeWithFullyDynamicLayout` returns an unranked memref
    // type in case the input is an unranked tensor type.

    // Case 1: Casting an unranked tensor
    if (isa<UnrankedTensorType>(castOp.getSource().getType())) {
      // When casting to a ranked tensor, we cannot infer any static offset or
      // strides from the source. Assume fully dynamic.
      return getMemRefTypeWithFullyDynamicLayout(castOp.getType(), memorySpace);
    }

    // Case 2: Casting to an unranked tensor type
    if (isa<UnrankedTensorType>(castOp.getType())) {
      return getMemRefTypeWithFullyDynamicLayout(castOp.getType(), memorySpace);
    }

    // Case 3: Ranked tensor -> ranked tensor. The offsets and strides do not
    // change.
    auto rankedResultType = cast<RankedTensorType>(castOp.getType());
    return MemRefType::get(
        rankedResultType.getShape(), rankedResultType.getElementType(),
        llvm::cast<MemRefType>(*maybeSrcBufferType).getLayout(), memorySpace);
  }

  /// Rewrites the op in terms of the buffer of its source. Returns failure if
  /// the source buffer or the result buffer type cannot be obtained.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto castOp = cast<tensor::CastOp>(op);

    // The result buffer still has the old (pre-cast) type.
    FailureOr<Value> resultBuffer =
        getBuffer(rewriter, castOp.getSource(), options);
    if (failed(resultBuffer))
      return failure();

    // Compute the new type.
    auto resultMemRefType =
        bufferization::getBufferType(castOp.getResult(), options);
    if (failed(resultMemRefType))
      return failure();
    if (resultBuffer->getType() == *resultMemRefType) {
      // This cast is a no-op.
      replaceOpWithBufferizedValues(rewriter, op, *resultBuffer);
      return success();
    }

    // Replace the op with a memref.cast.
    assert(memref::CastOp::areCastCompatible(resultBuffer->getType(),
                                             *resultMemRefType) &&
           "CastOp::bufferize: cast incompatible");
    replaceOpWithNewBufferizedOp<memref::CastOp>(
        rewriter, op, *resultMemRefType, *resultBuffer);
    return success();
  }
};
/// Bufferization of tensor.collapse_shape. Replace with memref.collapse_shape.
/// If the source buffer is not guaranteed to be collapsible, the source is
/// first copied into a newly allocated buffer.
struct CollapseShapeOpInterface
    : public BufferizableOpInterface::ExternalModel<CollapseShapeOpInterface,
                                                    tensor::CollapseShapeOp> {
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    // tensor.collapse_shape may reallocate, at which point the source buffer is
    // copied. I.e., there will be a memory read side effect on the bufferized
    // source. This function conservatively returns "true" because whether a
    // copy will be created or not is not known at this point.
    return true;
  }

  /// The operand is reported as not being written.
  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// The op result is reported as equivalent to the operand.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    // TODO: CollapseShapeOp may allocate at runtime.
    return {{op->getOpResult(0), BufferRelation::Equivalent}};
  }

  /// Computes the result buffer type from the source buffer type. Returns
  /// failure if the source buffer type cannot be computed.
  FailureOr<BaseMemRefType>
  getBufferType(Operation *op, Value value, const BufferizationOptions &options,
                SmallVector<Value> &invocationStack) const {
    auto collapseShapeOp = cast<tensor::CollapseShapeOp>(op);
    auto maybeSrcBufferType = bufferization::getBufferType(
        collapseShapeOp.getSrc(), options, invocationStack);
    if (failed(maybeSrcBufferType))
      return failure();
    auto srcBufferType = llvm::cast<MemRefType>(*maybeSrcBufferType);
    bool canBeCollapsed = memref::CollapseShapeOp::isGuaranteedCollapsible(
        srcBufferType, collapseShapeOp.getReassociationIndices());

    if (!canBeCollapsed) {
      // If dims cannot be collapsed, this op bufferizes to a new allocation.
      RankedTensorType tensorResultType = collapseShapeOp.getResultType();
      return bufferization::getMemRefTypeWithStaticIdentityLayout(
          tensorResultType, srcBufferType.getMemorySpace());
    }

    return memref::CollapseShapeOp::computeCollapsedType(
        srcBufferType, collapseShapeOp.getReassociationIndices());
  }

  /// Rewrites the op as memref.collapse_shape on the source buffer (or on a
  /// copy of it). Returns failure if a required buffer, allocation or
  /// stride/offset query fails.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto collapseShapeOp = cast<tensor::CollapseShapeOp>(op);
    RankedTensorType tensorResultType = collapseShapeOp.getResultType();
    FailureOr<Value> maybeBuffer =
        getBuffer(rewriter, collapseShapeOp.getSrc(), options);
    if (failed(maybeBuffer))
      return failure();
    Value buffer = *maybeBuffer;
    auto bufferType = cast<MemRefType>(buffer.getType());

    if (tensorResultType.getRank() == 0) {
      // 0-d collapses must go through a different op builder.
      MemRefType resultType;

      if (bufferType.getLayout().isIdentity()) {
        // Standard layout: result type has no offset.
        MemRefLayoutAttrInterface layout;
        resultType = MemRefType::get({}, tensorResultType.getElementType(),
                                     layout, bufferType.getMemorySpace());
      } else {
        // Source memref has a layout map: result type has the same offset as
        // the source type.
        SmallVector<int64_t> strides;
        int64_t offset;
        if (failed(getStridesAndOffset(bufferType, strides, offset)))
          return failure();
        resultType = MemRefType::get(
            {}, tensorResultType.getElementType(),
            StridedLayoutAttr::get(op->getContext(), offset, {}),
            bufferType.getMemorySpace());
      }

      replaceOpWithNewBufferizedOp<memref::CollapseShapeOp>(
          rewriter, op, resultType, buffer, collapseShapeOp.getReassociation());
      return success();
    }

    // If the dims are not collapsible (due to an incompatible source layout
    // map), force an out-of-place bufferization, i.e., a buffer copy. This
    // newly allocated buffer will have no layout map and thus be collapsible.
    bool canBeCollapsed = memref::CollapseShapeOp::isGuaranteedCollapsible(
        bufferType, collapseShapeOp.getReassociationIndices());
    if (!canBeCollapsed) {
      // TODO: Create alloc_tensor ops during TensorCopyInsertion.
      FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
          rewriter, op->getLoc(), collapseShapeOp.getSrc(), options);
      if (failed(tensorAlloc))
        return failure();
      auto memrefType =
          MemRefType::get(collapseShapeOp.getSrcType().getShape(),
                          collapseShapeOp.getSrcType().getElementType(),
                          AffineMap(), bufferType.getMemorySpace());
      buffer = rewriter.create<bufferization::ToMemrefOp>(
          op->getLoc(), memrefType, *tensorAlloc);
    }

    // Result type is inferred by the builder.
    replaceOpWithNewBufferizedOp<memref::CollapseShapeOp>(
        rewriter, op, buffer, collapseShapeOp.getReassociationIndices());
    return success();
  }
};
/// Bufferization of tensor.dim. The op is rewritten to memref.dim on the
/// buffer of its source.
struct DimOpInterface
    : public BufferizableOpInterface::ExternalModel<DimOpInterface,
                                                    tensor::DimOp> {
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    // Only the metadata of the tensor is queried; its contents are not read.
    return false;
  }

  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// No result is reported as aliasing the operand.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {};
  }

  /// Replaces the op with memref.dim. Fails if the source buffer cannot be
  /// obtained.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto tensorDimOp = cast<tensor::DimOp>(op);
    FailureOr<Value> sourceBuffer =
        getBuffer(rewriter, tensorDimOp.getSource(), options);
    if (failed(sourceBuffer))
      return failure();

    replaceOpWithNewBufferizedOp<memref::DimOp>(rewriter, op, *sourceBuffer,
                                                tensorDimOp.getIndex());
    return success();
  }
};
/// Bufferization of "tensor.empty". The op is replaced with a tensor
/// allocation ("bufferization.alloc_tensor"), or erased if it is unused.
struct EmptyOpInterface
    : public BufferizableOpInterface::ExternalModel<EmptyOpInterface,
                                                    tensor::EmptyOp> {
  bool bufferizesToAllocation(Operation *op, Value value) const { return true; }

  bool resultBufferizesToMemoryWrite(Operation *op, OpResult opResult,
                                     const AnalysisState &state) const {
    // The contents of the returned tensor are unspecified.
    return false;
  }

  /// Erases the op if it has no uses; otherwise replaces it with a newly
  /// allocated tensor. Fails if the allocation cannot be created.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto tensorEmptyOp = cast<tensor::EmptyOp>(op);

    // Optimization: An op without uses is simply folded away.
    if (op->getUses().empty()) {
      rewriter.eraseOp(op);
      return success();
    }

    // Emit a "bufferization.alloc_tensor" op (without copy) for the result.
    FailureOr<Value> newTensor =
        allocateTensorForShapedValue(rewriter, op->getLoc(),
                                     tensorEmptyOp.getResult(), options,
                                     /*copy=*/false);
    if (failed(newTensor))
      return failure();

    rewriter.replaceOp(op, *newTensor);
    return success();
  }
};
/// Bufferization of tensor.expand_shape. The op is rewritten to
/// memref.expand_shape on the buffer of its source.
struct ExpandShapeOpInterface
    : public BufferizableOpInterface::ExternalModel<ExpandShapeOpInterface,
                                                    tensor::ExpandShapeOp> {
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    // Unlike tensor.collapse_shape, bufferizing this op never requires a
    // copy.
    return false;
  }

  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// The op result is reported as equivalent to the operand.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {{op->getOpResult(0), BufferRelation::Equivalent}};
  }

  /// Computes the expanded memref type from the source buffer type, the
  /// result shape and the reassociation indices. Fails if either the source
  /// buffer type or the expanded type cannot be computed.
  FailureOr<BaseMemRefType>
  getBufferType(Operation *op, Value value, const BufferizationOptions &options,
                SmallVector<Value> &invocationStack) const {
    auto expandOp = cast<tensor::ExpandShapeOp>(op);

    auto sourceBufferType = bufferization::getBufferType(
        expandOp.getSrc(), options, invocationStack);
    if (failed(sourceBufferType))
      return failure();

    auto expandedType = memref::ExpandShapeOp::computeExpandedType(
        llvm::cast<MemRefType>(*sourceBufferType),
        expandOp.getResultType().getShape(),
        expandOp.getReassociationIndices());
    if (failed(expandedType))
      return failure();
    return *expandedType;
  }

  /// Replaces the op with memref.expand_shape. Fails if the source buffer
  /// cannot be obtained.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto expandOp = cast<tensor::ExpandShapeOp>(op);
    auto resultTensorType = expandOp.getResultType();

    FailureOr<Value> sourceBuffer =
        getBuffer(rewriter, expandOp.getSrc(), options);
    if (failed(sourceBuffer))
      return failure();

    // The builder infers the memref result type from the result shape and the
    // reassociation indices.
    replaceOpWithNewBufferizedOp<memref::ExpandShapeOp>(
        rewriter, op, resultTensorType.getShape(), *sourceBuffer,
        expandOp.getReassociationIndices());
    return success();
  }
};
/// Bufferization of tensor.extract_slice. The op is rewritten to
/// memref.subview on the buffer of its source.
struct ExtractSliceOpInterface
    : public BufferizableOpInterface::ExternalModel<ExtractSliceOpInterface,
                                                    tensor::ExtractSliceOp> {
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    return false;
  }

  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// The op result aliases the operand with an unknown buffer relation.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {{op->getOpResult(0), BufferRelation::Unknown}};
  }

  /// Creates a memref.subview with the offsets, sizes and strides of the op
  /// and replaces the op with it. Fails if the source buffer or the result
  /// buffer type cannot be obtained.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto sliceOp = cast<tensor::ExtractSliceOp>(op);
    Location loc = sliceOp.getLoc();

    // Look up the buffer of the source tensor.
    FailureOr<Value> sourceBuffer =
        getBuffer(rewriter, sliceOp.getSource(), options);
    if (failed(sourceBuffer))
      return failure();

    // Determine the memref type of the slice.
    auto sliceBufferType =
        bufferization::getBufferType(sliceOp.getResult(), options);
    if (failed(sliceBufferType))
      return failure();

    // Take a subview of the source buffer.
    SmallVector<OpFoldResult> offsets = sliceOp.getMixedOffsets();
    SmallVector<OpFoldResult> sizes = sliceOp.getMixedSizes();
    SmallVector<OpFoldResult> strides = sliceOp.getMixedStrides();
    Value subView = rewriter.create<memref::SubViewOp>(
        loc, llvm::cast<MemRefType>(*sliceBufferType), *sourceBuffer, offsets,
        sizes, strides);

    replaceOpWithBufferizedValues(rewriter, op, subView);
    return success();
  }

  /// Infers the (rank-reduced) subview type from the source buffer type and
  /// the offsets, sizes and strides of the op. Fails if the source buffer
  /// type cannot be computed.
  FailureOr<BaseMemRefType>
  getBufferType(Operation *op, Value value, const BufferizationOptions &options,
                SmallVector<Value> &invocationStack) const {
    auto sliceOp = cast<tensor::ExtractSliceOp>(op);
    assert(value == sliceOp.getResult() && "invalid value");

    auto sourceBufferType = bufferization::getBufferType(
        sliceOp.getSource(), options, invocationStack);
    if (failed(sourceBufferType))
      return failure();

    SmallVector<OpFoldResult> offsets = sliceOp.getMixedOffsets();
    SmallVector<OpFoldResult> sizes = sliceOp.getMixedSizes();
    SmallVector<OpFoldResult> strides = sliceOp.getMixedStrides();
    return cast<BaseMemRefType>(memref::SubViewOp::inferRankReducedResultType(
        sliceOp.getType().getShape(),
        llvm::cast<MemRefType>(*sourceBufferType), offsets, sizes, strides));
  }
};
/// Bufferization of tensor.extract. The op is rewritten to memref.load on the
/// buffer of its tensor operand.
struct ExtractOpInterface
    : public BufferizableOpInterface::ExternalModel<ExtractOpInterface,
                                                    tensor::ExtractOp> {
  /// The operand is reported as being read.
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    return true;
  }

  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// No result is reported as aliasing the operand.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {};
  }

  /// Replaces the op with memref.load using the same indices. Fails if the
  /// buffer of the tensor operand cannot be obtained.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto tensorExtractOp = cast<tensor::ExtractOp>(op);
    FailureOr<Value> tensorBuffer =
        getBuffer(rewriter, tensorExtractOp.getTensor(), options);
    if (failed(tensorBuffer))
      return failure();

    replaceOpWithNewBufferizedOp<memref::LoadOp>(
        rewriter, op, *tensorBuffer, tensorExtractOp.getIndices());
    return success();
  }
};
// Recursively walks the index space described by `shape`, starting at
// dimension `dim`, and emits one memref.store into `buffer` per visited index
// tuple. The stored values are taken from `elementIt`, which is advanced by
// one after every store. `constants[i]` is used as the index value `i`, and
// `indices` holds the index tuple that is currently being built.
static void createStores(RewriterBase &rewriter, Location loc, int dim,
                         Value buffer, ArrayRef<int64_t> shape,
                         ArrayRef<Value> constants,
                         OperandRange::iterator &elementIt,
                         SmallVectorImpl<Value> &indices) {
  const int innermostDim = static_cast<int>(shape.size()) - 1;

  // Outer dimension: fix the index of this dimension and descend.
  if (dim != innermostDim) {
    for (int i = 0; i < shape[dim]; ++i) {
      indices[dim] = constants[i];
      createStores(rewriter, loc, dim + 1, buffer, shape, constants, elementIt,
                   indices);
    }
    return;
  }

  // Innermost dimension: store one element per position.
  for (int i = 0; i < shape.back(); ++i) {
    indices.back() = constants[i];
    rewriter.create<memref::StoreOp>(loc, *elementIt, buffer, indices);
    ++elementIt;
  }
}
/// Bufferization of tensor.from_elements. A buffer is allocated for the result
/// and every element is written into it with a memref.store.
struct FromElementsOpInterface
    : public BufferizableOpInterface::ExternalModel<FromElementsOpInterface,
                                                    tensor::FromElementsOp> {
  bool bufferizesToAllocation(Operation *op, Value value) const { return true; }

  /// Allocates the result buffer and fills it element by element. Emits an
  /// error if a default memory space is set, and fails if the allocation
  /// cannot be created.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto fromElementsOp = cast<tensor::FromElementsOp>(op);

    // TODO: Implement memory space for this op.
    if (options.defaultMemorySpace != Attribute())
      return op->emitError("memory space not implemented yet");

    Location loc = op->getLoc();
    auto resultTensorType = cast<RankedTensorType>(fromElementsOp.getType());
    auto resultShape = resultTensorType.getShape();

    // Allocate the result buffer.
    // TODO: Create alloc_tensor ops during TensorCopyInsertion.
    FailureOr<Value> allocatedTensor = allocateTensorForShapedValue(
        rewriter, loc, fromElementsOp.getResult(), options,
        /*copy=*/false);
    if (failed(allocatedTensor))
      return failure();
    auto resultMemRefType =
        MemRefType::get(resultShape, resultTensorType.getElementType());
    Value resultBuffer = rewriter.create<bufferization::ToMemrefOp>(
        loc, resultMemRefType, *allocatedTensor);

    // Every path below ends by replacing the op with the filled buffer.
    auto replaceWithResultBuffer = [&]() {
      replaceOpWithBufferizedValues(rewriter, op, resultBuffer);
      return success();
    };

    // Case: tensor<0xelem_type>. There is nothing to store.
    if (fromElementsOp.getElements().empty())
      return replaceWithResultBuffer();

    // Case: tensor<elem_type>. A single store without indices.
    if (resultShape.empty()) {
      rewriter.create<memref::StoreOp>(
          loc, fromElementsOp.getElements().front(), resultBuffer);
      return replaceWithResultBuffer();
    }

    // Materialize index constants for the range [0, max{shape_i}).
    auto largestExtent =
        *std::max_element(resultShape.begin(), resultShape.end());
    SmallVector<Value, 2> indexConstants;
    indexConstants.reserve(largestExtent);
    for (int i = 0; i < largestExtent; ++i)
      indexConstants.push_back(
          rewriter.create<arith::ConstantIndexOp>(loc, i));

    // Visit all elements and create the `memref.store` ops.
    auto nextElement = fromElementsOp.getElements().begin();
    SmallVector<Value, 2> currentIndices(resultTensorType.getRank(),
                                         indexConstants[0]);
    createStores(rewriter, loc, /*dim=*/0, resultBuffer, resultShape,
                 indexConstants, nextElement, currentIndices);

    return replaceWithResultBuffer();
  }
};
/// Lower the body of a tensor.generate-like op (one index-typed bbArg per
/// dim) to a linalg.map that uses `tensorDestination` as its init operand.
/// The single block of `generateBody` is moved into the linalg.map, its block
/// arguments are replaced by linalg.index ops, and its tensor.yield terminator
/// is replaced by linalg.yield. Returns the result of the linalg.map.
///
/// Note: `dynamicSizes` is not read by this function.
///
/// Example:
/// ```
/// %r = tensor.generate %x, %y {
///   ^bb0(%arg0: index, %arg1: index):
///   %0 = "some_op"(%arg0, %arg1) : (index, index) -> (index)
///   tensor.yield %0 : index
/// } : tensor<?x?xindex>
/// ```
///
/// Is lowered to:
/// ```
/// linalg.map ins() outs(%dest) {
///   %d0 = linalg.index 0 : index
///   %d1 = linalg.index 1 : index
///   %0 = "some_op"(%d0, %d1) : (index, index) -> (index)
///   linalg.yield %0 : index
/// }
/// ```
static Value lowerGenerateLikeOpBody(RewriterBase &rewriter, Location loc,
                                     Value tensorDestination,
                                     ValueRange dynamicSizes,
                                     Region &generateBody) {
  assert(generateBody.hasOneBlock() && "expected body with single block");
  auto destType = cast<RankedTensorType>(tensorDestination.getType());
  assert(generateBody.getNumArguments() == destType.getRank() &&
         "rank mismatch");

  // The insertion point is restored when this function returns.
  OpBuilder::InsertionGuard guard(rewriter);

  // Build a linalg.map without inputs that writes into the destination.
  auto mapOp =
      rewriter.create<linalg::MapOp>(loc, destType, /*inputs=*/ValueRange(),
                                     /*init=*/tensorDestination);
  Block &mapBody = mapOp.getMapper().emplaceBlock();

  // One linalg.index per dimension; these stand in for the block arguments.
  rewriter.setInsertionPointToStart(&mapBody);
  SmallVector<Value> dimIndices;
  for (int64_t d = 0; d < destType.getRank(); ++d)
    dimIndices.push_back(rewriter.create<linalg::IndexOp>(loc, d));

  // Move the generate body into the map body and swap the terminator.
  rewriter.mergeBlocks(&generateBody.front(), &mapBody, dimIndices);
  auto tensorYield = cast<tensor::YieldOp>(mapBody.getTerminator());
  rewriter.replaceOpWithNewOp<linalg::YieldOp>(tensorYield,
                                               tensorYield.getValue());

  return mapOp.getResult()[0];
}
/// Bufferization of tensor.generate. A tensor is allocated for the result and
/// the body of the op is lowered via `lowerGenerateLikeOpBody`.
struct GenerateOpInterface
    : public BufferizableOpInterface::ExternalModel<GenerateOpInterface,
                                                    tensor::GenerateOp> {
  bool bufferizesToAllocation(Operation *op, Value value) const { return true; }

  /// Emits an error if a default memory space is set, and fails if the
  /// allocation cannot be created.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto tensorGenerateOp = cast<tensor::GenerateOp>(op);

    // TODO: Implement memory space for this op.
    if (options.defaultMemorySpace != Attribute())
      return op->emitError("memory space not implemented yet");

    // Allocate the destination tensor.
    Location loc = op->getLoc();
    FailureOr<Value> destTensor = allocateTensorForShapedValue(
        rewriter, loc, tensorGenerateOp.getResult(), options,
        /*copy=*/false);
    if (failed(destTensor))
      return failure();

    // Lower the body and replace the op with the lowered result.
    Value lowered = lowerGenerateLikeOpBody(
        rewriter, loc, *destTensor, tensorGenerateOp.getDynamicExtents(),
        tensorGenerateOp.getBody());
    rewriter.replaceOp(tensorGenerateOp, lowered);
    return success();
  }
};
/// Bufferization of tensor.insert. A memref.store into the destination buffer
/// is created and the op is replaced with that buffer.
///
/// Note: DstBufferizableOpInterfaceExternalModel provides many default method
/// implementations for DestinationStyle ops.
struct InsertOpInterface
    : public DstBufferizableOpInterfaceExternalModel<InsertOpInterface,
                                                     tensor::InsertOp> {
  /// Fails if the destination buffer cannot be obtained.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto tensorInsertOp = cast<tensor::InsertOp>(op);

    FailureOr<Value> destBuffer =
        getBuffer(rewriter, tensorInsertOp.getDest(), options);
    if (failed(destBuffer))
      return failure();

    // Write the scalar into the destination buffer at the given indices.
    rewriter.create<memref::StoreOp>(tensorInsertOp.getLoc(),
                                     tensorInsertOp.getScalar(), *destBuffer,
                                     tensorInsertOp.getIndices());
    replaceOpWithBufferizedValues(rewriter, op, *destBuffer);
    return success();
  }
};
/// Bufferization of tensor.insert_slice. The source buffer is copied into a
/// subview of the destination buffer. Under certain circumstances, this op can
/// also be a no-op.
///
/// Note: DstBufferizableOpInterfaceExternalModel provides many default method
/// implementations for DestinationStyle ops.
struct InsertSliceOpInterface
    : public DstBufferizableOpInterfaceExternalModel<InsertSliceOpInterface,
                                                     tensor::InsertSliceOp> {
  /// The source operand is always read. The destination operand is read
  /// unless the op statically overwrites the entire destination.
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    auto sliceOp = cast<tensor::InsertSliceOp>(op);
    RankedTensorType destTensorType = sliceOp.getDestType();

    // The source is always read.
    if (opOperand == sliceOp.getSourceMutable())
      return true;

    // For the destination, it depends...
    assert(opOperand == sliceOp.getDestMutable() && "expected dest");

    // Dest is not read if it is entirely overwritten. E.g.:
    // tensor.insert_slice %a into %t[0][10][1] : ... into tensor<10xf32>
    auto isZero = [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); };
    auto isOne = [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); };
    auto matchesDestDim = [&](const auto &it) {
      return getConstantIntValue(it.value()) ==
             destTensorType.getDimSize(it.index());
    };

    bool startsAtOrigin = llvm::all_of(sliceOp.getMixedOffsets(), isZero);
    bool coversAllDims =
        llvm::all_of(llvm::enumerate(sliceOp.getMixedSizes()), matchesDestDim);
    bool isContiguous = llvm::all_of(sliceOp.getMixedStrides(), isOne);

    bool overwritesEntireDest =
        startsAtOrigin && coversAllDims && isContiguous;
    return !overwritesEntireDest;
  }

  /// Creates a subview of the destination buffer and copies the source buffer
  /// into it. Fails if a buffer cannot be obtained or the copy cannot be
  /// created.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    // insert_slice ops arise from tiling and bufferizing them out-of-place is
    // generally a deal breaker. When used with loops, this ends up cloning the
    // whole tensor on every single iteration and is a symptom of a
    // catastrophically bad scheduling decision.
    // TODO: be very loud about it or even consider failing the pass.
    auto sliceOp = cast<tensor::InsertSliceOp>(op);
    Location loc = sliceOp.getLoc();
    SmallVector<OpFoldResult> offsets = sliceOp.getMixedOffsets();
    SmallVector<OpFoldResult> sizes = sliceOp.getMixedSizes();
    SmallVector<OpFoldResult> strides = sliceOp.getMixedStrides();

    // Look up the destination buffer.
    FailureOr<Value> destBuffer =
        getBuffer(rewriter, sliceOp.getDest(), options);
    if (failed(destBuffer))
      return failure();

    // Create a subview of the destination buffer that matches the source.
    auto destBufferType = cast<MemRefType>(destBuffer->getType());
    auto subViewType =
        cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
            sliceOp.getSourceType().getShape(), destBufferType, offsets, sizes,
            strides));
    Value destSubView = rewriter.create<memref::SubViewOp>(
        loc, subViewType, *destBuffer, offsets, sizes, strides);

    // Copy tensor. If this tensor.insert_slice has a matching
    // tensor.extract_slice, the copy operation will eventually fold away.
    FailureOr<Value> sourceBuffer =
        getBuffer(rewriter, sliceOp.getSource(), options);
    if (failed(sourceBuffer))
      return failure();
    if (failed(options.createMemCpy(rewriter, loc, *sourceBuffer, destSubView)))
      return failure();

    replaceOpWithBufferizedValues(rewriter, op, *destBuffer);
    return success();
  }
};
/// Bufferization of tensor.pad. The op is replaced with a tensor allocation +
/// linalg.map + tensor.insert_slice.
/// For best performance, vectorize before bufferization (better performance in
/// case of padding with a constant).
struct PadOpInterface
    : public BufferizableOpInterface::ExternalModel<PadOpInterface,
                                                    tensor::PadOp> {
  bool bufferizesToAllocation(Operation *op, Value value) const { return true; }

  /// The operand is reported as being read.
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    return true;
  }

  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// No result is reported as aliasing the operand.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {};
  }

  /// Returns a memref type with the result shape and element type, a default
  /// (null) layout, and the memory space of the source buffer. Fails if the
  /// source buffer type cannot be computed.
  FailureOr<BaseMemRefType>
  getBufferType(Operation *op, Value value, const BufferizationOptions &options,
                SmallVector<Value> &invocationStack) const {
    auto tensorPadOp = cast<tensor::PadOp>(op);

    // The memory space is inferred from the source tensor.
    auto sourceBufferType = bufferization::getBufferType(
        tensorPadOp.getSource(), options, invocationStack);
    if (failed(sourceBufferType))
      return failure();

    RankedTensorType paddedType = tensorPadOp.getResultType();
    MemRefLayoutAttrInterface defaultLayout;
    return MemRefType::get(paddedType.getShape(), paddedType.getElementType(),
                           defaultLayout, sourceBufferType->getMemorySpace());
  }

  /// Allocates the padded tensor, fills it by lowering the pad body, and
  /// inserts the source tensor at the low-padding offsets. Fails if the
  /// allocation cannot be created.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto tensorPadOp = cast<tensor::PadOp>(op);
    Location loc = tensorPadOp.getLoc();
    RankedTensorType paddedType = tensorPadOp.getResultType();
    RankedTensorType sourceType = tensorPadOp.getSourceType();

    // Turns an OpFoldResult into a Value, materializing an index constant for
    // static entries.
    auto materialize = [&](OpFoldResult ofr) -> Value {
      if (ofr.is<Value>())
        return ofr.get<Value>();
      return rewriter
          .create<arith::ConstantIndexOp>(loc, *getConstantIntValue(ofr))
          .getResult();
    };

    // Compute the dynamic result dimensions as: source dim + low + high.
    SmallVector<OpFoldResult> lowPads = tensorPadOp.getMixedLowPad();
    SmallVector<OpFoldResult> highPads = tensorPadOp.getMixedHighPad();
    SmallVector<Value> dynamicSizes;
    for (int64_t dim = 0; dim < paddedType.getRank(); ++dim) {
      if (paddedType.isDynamicDim(dim)) {
        Value sourceDim =
            rewriter.create<tensor::DimOp>(loc, tensorPadOp.getSource(), dim);
        Value low = materialize(lowPads[dim]);
        Value high = materialize(highPads[dim]);
        AffineExpr s0, s1, s2;
        bindSymbols(op->getContext(), s0, s1, s2);
        AffineExpr paddedDimExpr = s0 + s1 + s2;
        Value paddedDim = rewriter.create<affine::AffineApplyOp>(
            loc, paddedDimExpr, ValueRange{sourceDim, low, high});
        dynamicSizes.push_back(paddedDim);
      }
    }

    // Allocate a tensor for the padded result.
    FailureOr<Value> paddedAlloc = allocateTensorForShapedValue(
        rewriter, loc, tensorPadOp.getResult(), options,
        /*copy=*/false);
    if (failed(paddedAlloc))
      return failure();

    // tensor::PadOp is like tensor::GenerateOp: The only difference is that
    // only a part of the generated tensor is needed. For simplicity, we reuse
    // the same functionality here.
    Value filled =
        lowerGenerateLikeOpBody(rewriter, loc, *paddedAlloc, dynamicSizes,
                                tensorPadOp.getBodyRegion());

    // Insert the source into the filled tensor with a tensor::InsertSliceOp.
    SmallVector<OpFoldResult> sourceSizes =
        getMixedSizes(rewriter, loc, tensorPadOp.getSource());
    SmallVector<OpFoldResult> unitStrides(sourceType.getRank(),
                                          rewriter.getIndexAttr(1));
    rewriter.replaceOpWithNewOp<tensor::InsertSliceOp>(
        tensorPadOp, tensorPadOp.getSource(), filled,
        /*offsets=*/lowPads, sourceSizes, unitStrides);
    return success();
  }
};
/// Bufferization of tensor.rank. The op is rewritten to memref.rank on the
/// buffer of its tensor operand.
struct RankOpInterface
    : public BufferizableOpInterface::ExternalModel<RankOpInterface,
                                                    tensor::RankOp> {
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    // Only the metadata of the tensor is queried; its contents are not read.
    return false;
  }

  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// No result is reported as aliasing the operand.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {};
  }

  /// Replaces the op with memref.rank of the same result type. Fails if the
  /// buffer of the tensor operand cannot be obtained.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto tensorRankOp = cast<tensor::RankOp>(op);
    FailureOr<Value> tensorBuffer =
        getBuffer(rewriter, tensorRankOp.getTensor(), options);
    if (failed(tensorBuffer))
      return failure();

    replaceOpWithNewBufferizedOp<memref::RankOp>(
        rewriter, op, tensorRankOp.getType(), *tensorBuffer);
    return success();
  }
};
/// Bufferization of tensor.reshape. The op is rewritten to memref.reshape. A
/// source buffer without identity layout is first copied into a new buffer.
struct ReshapeOpInterface
    : public BufferizableOpInterface::ExternalModel<ReshapeOpInterface,
                                                    tensor::ReshapeOp> {
  /// Only the shape operand is reported as being read.
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    // Depending on the layout map, the source buffer may have to be copied.
    // NOTE(review): the source operand is nevertheless not reported as read
    // here; confirm that this is intended.
    auto tensorReshapeOp = cast<tensor::ReshapeOp>(op);
    return opOperand == tensorReshapeOp.getShapeMutable();
  }

  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    return false;
  }

  /// The op result is reported as equivalent to the operand.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {{op->getOpResult(0), BufferRelation::Equivalent}};
  }

  /// Replaces the op with memref.reshape. Fails if a buffer, the result
  /// buffer type, or a required allocation cannot be obtained.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    auto tensorReshapeOp = cast<tensor::ReshapeOp>(op);

    // Both buffers are requested before either result is checked.
    FailureOr<Value> sourceBuffer =
        getBuffer(rewriter, tensorReshapeOp.getSource(), options);
    FailureOr<Value> shapeOperandBuffer =
        getBuffer(rewriter, tensorReshapeOp.getShape(), options);
    if (failed(sourceBuffer) || failed(shapeOperandBuffer))
      return failure();

    auto resultBufferType =
        bufferization::getBufferType(tensorReshapeOp.getResult(), options);
    if (failed(resultBufferType))
      return failure();

    // memref.reshape requires the source buffer to have an identity layout.
    // If the source memref does not have an identity layout, copy the source
    // into a new buffer with an identity layout.
    auto rankedSourceType = llvm::dyn_cast<MemRefType>(sourceBuffer->getType());
    bool needsIdentityCopy =
        rankedSourceType && !rankedSourceType.getLayout().isIdentity();
    if (needsIdentityCopy) {
      FailureOr<Value> sourceCopy = allocateTensorForShapedValue(
          rewriter, op->getLoc(), tensorReshapeOp.getSource(), options);
      if (failed(sourceCopy))
        return failure();
      auto identityType = MemRefType::get(
          rankedSourceType.getShape(), rankedSourceType.getElementType(),
          AffineMap(),
          cast<BaseMemRefType>(sourceBuffer->getType()).getMemorySpace());
      sourceBuffer = rewriter
                         .create<bufferization::ToMemrefOp>(
                             op->getLoc(), identityType, *sourceCopy)
                         .getResult();
    }

    replaceOpWithNewBufferizedOp<memref::ReshapeOp>(
        rewriter, op, resultBufferType.value(), *sourceBuffer,
        *shapeOperandBuffer);
    return success();
  }

  /// Returns a memref type with static identity layout for the result tensor
  /// type, in the memory space of the source buffer. Fails if the source
  /// buffer type cannot be computed.
  FailureOr<BaseMemRefType>
  getBufferType(Operation *op, Value value, const BufferizationOptions &options,
                SmallVector<Value> &invocationStack) const {
    auto tensorReshapeOp = cast<tensor::ReshapeOp>(op);
    assert(value == tensorReshapeOp.getResult() && "unexpected value provided");

    auto sourceBufferType = bufferization::getBufferType(
        tensorReshapeOp.getSource(), options, invocationStack);
    if (failed(sourceBufferType))
      return failure();

    return getMemRefTypeWithStaticIdentityLayout(
        tensorReshapeOp.getResult().getType(),
        cast<BaseMemRefType>(sourceBufferType.value()).getMemorySpace());
  }
};
/// Analysis and bufferization of ParallelInsertSliceOp.
struct ParallelInsertSliceOpInterface
    : public BufferizableOpInterface::ExternalModel<
          ParallelInsertSliceOpInterface, ParallelInsertSliceOp> {
  /// Reports no aliasing values for any operand of this op.
  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
                                      const AnalysisState &state) const {
    return {};
  }

  /// Every operand is reported as being read.
  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                              const AnalysisState &state) const {
    return true;
  }

  /// Only the destination operand is reported as being written.
  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
    auto insertOp = cast<ParallelInsertSliceOp>(op);
    return opOperand == insertOp.getDestMutable();
  }

  /// Replaces the op with a subview of the destination buffer plus a memcpy
  /// of the source buffer into that subview. The new ops are created in front
  /// of the parallel combining parent; the original op is erased.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    OpBuilder::InsertionGuard guard(rewriter);
    auto insertOp = cast<ParallelInsertSliceOp>(op);
    ParallelCombiningOpInterface combiningParent =
        insertOp.getParallelCombiningParent();
    Location loc = insertOp.getLoc();

    // All new IR goes outside of the parallel combining terminator.
    rewriter.setInsertionPoint(combiningParent);

    // Look up the destination buffer first, then the source buffer.
    FailureOr<Value> destBuffer =
        getBuffer(rewriter, insertOp.getDest(), options);
    if (failed(destBuffer))
      return failure();
    FailureOr<Value> srcBuffer =
        getBuffer(rewriter, insertOp.getSource(), options);
    if (failed(srcBuffer))
      return failure();

    // Carve the target region out of the destination buffer.
    SmallVector<OpFoldResult> offsets = insertOp.getMixedOffsets();
    SmallVector<OpFoldResult> sizes = insertOp.getMixedSizes();
    SmallVector<OpFoldResult> strides = insertOp.getMixedStrides();
    auto destType = cast<MemRefType>(destBuffer->getType());
    auto subviewType =
        cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
            insertOp.getSourceType().getShape(), destType, offsets, sizes,
            strides));
    Value destSlice = rewriter.create<memref::SubViewOp>(
        loc, subviewType, *destBuffer, offsets, sizes, strides);

    // This memcpy will fold away if everything bufferizes in-place.
    if (failed(options.createMemCpy(rewriter, loc, *srcBuffer, destSlice)))
      return failure();

    // In case the source was allocated in the same block, make sure that the
    // deallocation op (if any) appears after the memcpy. By default, deallocs
    // are placed before the terminator, but this does not work for ForallOp
    // because the terminator does more than just yielding a value.
    //
    // Note: This is not a problem for the destination buffer because these are
    // assumed to always bufferize in-place.
    for (Operation *user : srcBuffer->getUsers()) {
      if (!hasEffect<MemoryEffects::Free>(user))
        continue;
      // Only the first freeing user is considered; it is moved only if it
      // lives in the same block as the parallel combining parent.
      Block *userBlock = user->getBlock();
      if (userBlock == combiningParent->getBlock())
        rewriter.moveOpBefore(user, userBlock->getTerminator());
      break;
    }

    // The original op is no longer needed.
    rewriter.eraseOp(op);
    return success();
  }
};
/// Bufferization of tensor.splat. Bufferizes to a new allocation that is filled
/// with a linalg.map. Similar to tensor.generate.
struct SplatOpInterface
    : public BufferizableOpInterface::ExternalModel<SplatOpInterface,
                                                    tensor::SplatOp> {
  /// The result of this op is always reported as a new allocation.
  bool bufferizesToAllocation(Operation *op, Value value) const { return true; }

  /// Allocates a tensor for the result and replaces the op with a linalg.map
  /// (without inputs) whose body yields the splat input. Emits an error if a
  /// non-default memory space is configured.
  LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                          const BufferizationOptions &options) const {
    OpBuilder::InsertionGuard guard(rewriter);
    auto splat = cast<tensor::SplatOp>(op);

    // TODO: Implement memory space for this op.
    if (options.defaultMemorySpace != Attribute())
      return op->emitError("memory space not implemented yet");

    // Allocate the result tensor; nothing is copied into it.
    Location loc = op->getLoc();
    FailureOr<Value> allocated = allocateTensorForShapedValue(
        rewriter, loc, splat.getResult(), options, /*copy=*/false);
    if (failed(allocated))
      return failure();

    // Build a linalg.map that writes into the allocation.
    auto resultType = cast<RankedTensorType>(allocated->getType());
    auto mapOp = rewriter.create<linalg::MapOp>(
        loc, resultType, /*inputs=*/ValueRange(), /*init=*/*allocated);
    Block &mapperBlock = mapOp.getMapper().emplaceBlock();

    // The mapper body consists of a single yield of the splat input.
    rewriter.setInsertionPointToStart(&mapperBlock);
    rewriter.create<linalg::YieldOp>(loc, splat.getInput());

    rewriter.replaceOp(splat, mapOp->getResult(0));
    return success();
  }
};
} // namespace
} // namespace tensor
} // namespace mlir
/// Registers the BufferizableOpInterface external models of the tensor dialect
/// ops with `registry`, together with the subset op interface models that
/// bufferization depends on.
void mlir::tensor::registerBufferizableOpInterfaceExternalModels(
    DialectRegistry &registry) {
  registry.addExtension(
      +[](MLIRContext *context, tensor::TensorDialect * /*dialect*/) {
        // Attach one external model per supported tensor op.
        CastOp::attachInterface<CastOpInterface>(*context);
        CollapseShapeOp::attachInterface<CollapseShapeOpInterface>(*context);
        DimOp::attachInterface<DimOpInterface>(*context);
        EmptyOp::attachInterface<EmptyOpInterface>(*context);
        ExpandShapeOp::attachInterface<ExpandShapeOpInterface>(*context);
        ExtractSliceOp::attachInterface<ExtractSliceOpInterface>(*context);
        ExtractOp::attachInterface<ExtractOpInterface>(*context);
        FromElementsOp::attachInterface<FromElementsOpInterface>(*context);
        GenerateOp::attachInterface<GenerateOpInterface>(*context);
        InsertOp::attachInterface<InsertOpInterface>(*context);
        InsertSliceOp::attachInterface<InsertSliceOpInterface>(*context);
        PadOp::attachInterface<PadOpInterface>(*context);
        ParallelInsertSliceOp::attachInterface<ParallelInsertSliceOpInterface>(
            *context);
        RankOp::attachInterface<RankOpInterface>(*context);
        ReshapeOp::attachInterface<ReshapeOpInterface>(*context);
        SplatOp::attachInterface<SplatOpInterface>(*context);

        // Load additional dialects of which ops may get created.
        context->loadDialect<arith::ArithDialect, linalg::LinalgDialect>();
      });

  // Bufferization requires SubsetInsertionOpInterface models. Make sure that
  // they are registered.
  tensor::registerSubsetOpInterfaceExternalModels(registry);
}