//===- VectorAnalysis.cpp - Analysis for Vectorization --------------------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

#include "mlir/Analysis/VectorAnalysis.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Statements.h"
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Support/Functional.h"
#include "mlir/Support/STLExtras.h"

///
/// Implements Analysis functions specific to vectors which support
/// the vectorization and vectorization materialization passes.
///

using namespace mlir;

Optional<SmallVector<unsigned, 4>> mlir::shapeRatio(ArrayRef<int> superShape,
                                                    ArrayRef<int> subShape) {
  if (superShape.size() < subShape.size()) {
    return Optional<SmallVector<unsigned, 4>>();
  }

  // Starting from the end, compute the integer divisors.
  // Set the boolean `divides` to false if integral division is not possible.
  std::vector<unsigned> result;
  result.reserve(superShape.size());
  bool divides = true;
  auto divide = [&divides, &result](int superSize, int subSize) {
    assert(superSize > 0 && "superSize must be > 0");
    assert(subSize > 0 && "subSize must be > 0");
    divides &= (superSize % subSize == 0);
    result.push_back(superSize / subSize);
  };
  functional::zipApply(
      divide, SmallVector<int, 8>{superShape.rbegin(), superShape.rend()},
      SmallVector<int, 8>{subShape.rbegin(), subShape.rend()});

  // If integral division does not occur, return and let the caller decide.
  if (!divides) {
    return None;
  }

  // At this point we have computed the ratio (in reverse) for the common
  // suffix. Fill in the remaining entries from the super-vector shape (still
  // in reverse).
  int commonSize = subShape.size();
  std::copy(superShape.rbegin() + commonSize, superShape.rend(),
            std::back_inserter(result));

  assert(result.size() == superShape.size() &&
         "super to sub shape ratio is not of the same size as the super rank");

  // Reverse again to get it back in the proper order and return.
  return SmallVector<unsigned, 4>{result.rbegin(), result.rend()};
}
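
// Illustrative sketch (hypothetical shapes, not part of the original source):
// for superShape {4, 16, 32} and subShape {8, 16}, the trailing dimensions
// divide as 32 / 16 = 2 and 16 / 8 = 2, and the unmatched leading dimension 4
// is carried over, so:
//
//   auto ratio = shapeRatio({4, 16, 32}, {8, 16});
//   // *ratio == SmallVector<unsigned, 4>{4, 2, 2}
//
// With subShape {5, 16} instead, 16 % 5 != 0 and the call returns None.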

Optional<SmallVector<unsigned, 4>> mlir::shapeRatio(VectorType superVectorType,
                                                    VectorType subVectorType) {
  assert(superVectorType.getElementType() == subVectorType.getElementType() &&
         "NYI: vector types must be of the same elemental type");
  return shapeRatio(superVectorType.getShape(), subVectorType.getShape());
}

AffineMap mlir::makePermutationMap(MemRefType memrefType,
                                   VectorType vectorType) {
  unsigned memRefRank = memrefType.getRank();
  unsigned vectorRank = vectorType.getRank();
  assert(memRefRank >= vectorRank && "Broadcast not supported");
  unsigned offset = memRefRank - vectorRank;
  SmallVector<AffineExpr, 4> perm;
  perm.reserve(memRefRank);
  for (unsigned i = 0; i < vectorRank; ++i) {
    perm.push_back(getAffineDimExpr(offset + i, memrefType.getContext()));
  }
  return AffineMap::get(memRefRank, 0, perm, {});
}
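
// Illustrative sketch (hypothetical types, not part of the original source):
// for a memref<?x?x?x?xf32> (rank 4) and a vector<16x32xf32> (rank 2), the
// offset is 4 - 2 = 2 and the resulting map is:
//
//   (d0, d1, d2, d3) -> (d2, d3)
//
// i.e. the vector dimensions are aligned with the innermost memref dimensions;
// this helper does not yet produce an actual transposition.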

bool mlir::matcher::operatesOnStrictSuperVectors(const OperationStmt &opStmt,
                                                 VectorType subVectorType) {
  // First, extract the vector type and distinguish between:
  //   a. ops that *must* lower a super-vector (i.e. vector_transfer_read,
  //      vector_transfer_write); and
  //   b. ops that *may* lower a super-vector (all other ops).
  // The ops that *may* lower a super-vector only do so if the super-vector to
  // sub-vector ratio is strictly greater than 1. The ops that *must* lower a
  // super-vector are explicitly checked for this property.
  /// TODO(ntv): there should be a single function for all ops to do this so we
  /// do not have to special case. Maybe a trait, or just a method; unclear atm.
  bool mustDivide = false;
  VectorType superVectorType;
  if (auto read = opStmt.dyn_cast<VectorTransferReadOp>()) {
    superVectorType = read->getResultType();
    mustDivide = true;
  } else if (auto write = opStmt.dyn_cast<VectorTransferWriteOp>()) {
    superVectorType = write->getVectorType();
    mustDivide = true;
  } else if (opStmt.getNumResults() == 0) {
    assert(opStmt.isa<ReturnOp>() &&
           "NYI: assuming only return statements can have 0 results at this "
           "point");
    return false;
  } else if (opStmt.getNumResults() == 1) {
    if (auto v = opStmt.getResult(0)->getType().dyn_cast<VectorType>()) {
      superVectorType = v;
    } else {
      // Not a vector type.
      return false;
    }
  } else {
    // Not a vector_transfer and has more than 1 result; fail hard for now to
    // wake us up when something changes.
    assert(false && "NYI: statement has more than 1 result");
    return false;
  }

  // Get the ratio.
  auto ratio = shapeRatio(superVectorType, subVectorType);

  // Sanity check.
  assert((ratio.hasValue() || !mustDivide) &&
         "NYI: vector_transfer instruction in which super-vector size is not an"
         " integer multiple of sub-vector size");

  // This catches cases where divisibility is not strictly required but the
  // super-vector shape still is not divisible by the sub-vector shape.
  // This could be useful information if we wanted to reshape at the level of
  // the vector type (but we would have to look at the compute and distinguish
  // between parallel, reduction and possibly other cases).
  if (!ratio.hasValue()) {
    return false;
  }

  // A strict super-vector contains at least 2 sub-vectors.
  for (auto m : *ratio) {
    if (m > 1) {
      return true;
    }
  }

  // Not a strict super-vector.
  return false;
}
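
// Illustrative sketch (hypothetical types, not part of the original source):
// for an op whose single result is a vector<16x32x64xf32>, matched against a
// subVectorType of vector<32x64xf32>, the shape ratio is {16, 1, 1}; the
// entry 16 > 1 makes this a strict super-vector, so the matcher returns true.
// Matched against vector<16x32x64xf32> itself, the ratio is {1, 1, 1} and the
// matcher returns false: a vector is not a strict super-vector of itself.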