Make vectorization aware of loop semantics

Now that we have a dependence analysis, we can check that loops are indeed parallel and make vectorization correct.

PiperOrigin-RevId: 240682727
parent 21547ace87
commit 4dc7af9da8
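In outline: the pass now computes, once per Function, the set of loops that the dependence analysis proves parallel, and threads that set through pattern construction so the per-loop filter rejects any loop not in it. A condensed sketch of that flow, pieced together from the Vectorize.cpp hunks below (same API spellings as in the diff; this assumes the pass boilerplate and includes of that file and is not a compilable standalone unit):

// Condensed from the diff below (MLIR, March 2019); sketch only.
void Vectorize::runOnFunction() {
  Function &f = getFunction();

  // 1. Record which affine.for ops the dependence analysis proves parallel.
  llvm::DenseSet<Operation *> parallelLoops;
  f.walkPostOrder([&parallelLoops](Operation *op) {
    if (auto loop = op->dyn_cast<AffineForOp>())
      if (isLoopParallel(loop))
        parallelLoops.insert(op);
  });

  // 2. Thread the set through pattern construction; the per-loop filter
  //    built by isVectorizableLoopPtrFactory rejects loops not in the set.
  for (auto &pat : makePatterns(parallelLoops)) {
    // ... match the pattern and vectorize the matched loop nests ...
  }
}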
@@ -85,20 +85,20 @@ bool isAccessInvariant(Value &iv, Value &index);
 llvm::DenseSet<Value *, llvm::DenseMapInfo<Value *>>
 getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices);
 
+using VectorizableLoopFun = std::function<bool(AffineForOp)>;
+
 /// Checks whether the loop is structurally vectorizable; i.e.:
-///   1. the loop has proper dependence semantics (parallel, reduction, etc);
-///   2. no conditionals are nested under the loop;
-///   3. all nested load/stores are to scalar MemRefs.
-/// TODO(ntv): implement dependence semantics
+///   1. no conditionals are nested under the loop;
+///   2. all nested load/stores are to scalar MemRefs.
 /// TODO(ntv): relax the no-conditionals restriction
-bool isVectorizableLoop(AffineForOp loop);
+bool isVectorizableLoopBody(AffineForOp loop);
 
 /// Checks whether the loop is structurally vectorizable and that all the LoadOp
 /// and StoreOp matched have access indexing functions that are are either:
 ///   1. invariant along the loop induction variable created by 'loop';
 ///   2. varying along the 'fastestVaryingDim' memory dimension.
-bool isVectorizableLoopAlongFastestVaryingMemRefDim(AffineForOp loop,
-                                                    unsigned fastestVaryingDim);
+bool isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
+    AffineForOp loop, unsigned fastestVaryingDim);
 
 /// Checks where SSA dominance would be violated if a for inst's body
 /// operations are shifted by the specified shifts. This method checks if a
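The rename from isVectorizableLoop to isVectorizableLoopBody reflects that these predicates now check only the loop body's structure; loop-carried dependence is the caller's concern. A hypothetical call site (not part of the commit) showing how the two declarations above compose:

// Hypothetical usage of the declarations above; `loop` is any AffineForOp.
void sketch(AffineForOp loop) {
  // Structural checks only: no nested conditionals, scalar MemRef accesses.
  bool bodyOk = isVectorizableLoopBody(loop);
  // Additionally require every matched load/store to be invariant along the
  // induction variable or contiguous along memory dimension 0.
  bool dim0Ok = isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
      loop, /*fastestVaryingDim=*/0);
  (void)bodyOk;
  (void)dim0Ok;
}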
@@ -274,20 +274,17 @@ static bool isVectorTransferReadOrWrite(Operation &op) {
   return op.isa<VectorTransferReadOp>() || op.isa<VectorTransferWriteOp>();
 }
 
-using VectorizableInstFun = std::function<bool(AffineForOp, Operation &)>;
+using VectorizableOpFun = std::function<bool(AffineForOp, Operation &)>;
 
-static bool isVectorizableLoopWithCond(AffineForOp loop,
-                                       VectorizableInstFun isVectorizableInst) {
-  auto *forInst = loop.getOperation();
-  if (!matcher::isParallelLoop(*forInst) &&
-      !matcher::isReductionLoop(*forInst)) {
-    return false;
-  }
+static bool
+isVectorizableLoopBodyWithOpCond(AffineForOp loop,
+                                 VectorizableOpFun isVectorizableOp) {
+  auto *forOp = loop.getOperation();
 
   // No vectorization across conditionals for now.
   auto conditionals = matcher::If();
   SmallVector<NestedMatch, 8> conditionalsMatched;
-  conditionals.match(forInst, &conditionalsMatched);
+  conditionals.match(forOp, &conditionalsMatched);
   if (!conditionalsMatched.empty()) {
     return false;
   }
@@ -298,21 +295,21 @@ static bool isVectorizableLoopWithCond(AffineForOp loop,
            !(op.isa<AffineIfOp>() || op.isa<AffineForOp>());
   });
   SmallVector<NestedMatch, 8> regionsMatched;
-  regions.match(forInst, &regionsMatched);
+  regions.match(forOp, &regionsMatched);
   if (!regionsMatched.empty()) {
     return false;
   }
 
   auto vectorTransfers = matcher::Op(isVectorTransferReadOrWrite);
   SmallVector<NestedMatch, 8> vectorTransfersMatched;
-  vectorTransfers.match(forInst, &vectorTransfersMatched);
+  vectorTransfers.match(forOp, &vectorTransfersMatched);
   if (!vectorTransfersMatched.empty()) {
     return false;
   }
 
   auto loadAndStores = matcher::Op(matcher::isLoadOrStore);
   SmallVector<NestedMatch, 8> loadAndStoresMatched;
-  loadAndStores.match(forInst, &loadAndStoresMatched);
+  loadAndStores.match(forOp, &loadAndStoresMatched);
   for (auto ls : loadAndStoresMatched) {
     auto *op = ls.getMatchedOperation();
     auto load = op->dyn_cast<LoadOp>();
@@ -324,16 +321,16 @@ static bool isVectorizableLoopWithCond(AffineForOp loop,
     if (vector) {
       return false;
     }
-    if (!isVectorizableInst(loop, *op)) {
+    if (isVectorizableOp && !isVectorizableOp(loop, *op)) {
       return false;
     }
   }
   return true;
 }
 
-bool mlir::isVectorizableLoopAlongFastestVaryingMemRefDim(
+bool mlir::isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
     AffineForOp loop, unsigned fastestVaryingDim) {
-  VectorizableInstFun fun([fastestVaryingDim](AffineForOp loop, Operation &op) {
+  VectorizableOpFun fun([fastestVaryingDim](AffineForOp loop, Operation &op) {
     auto load = op.dyn_cast<LoadOp>();
     auto store = op.dyn_cast<StoreOp>();
     return load ? isContiguousAccess(*loop.getInductionVar(), load,
@@ -341,14 +338,11 @@ bool mlir::isVectorizableLoopAlongFastestVaryingMemRefDim(
                 : isContiguousAccess(*loop.getInductionVar(), store,
                                      fastestVaryingDim);
   });
-  return isVectorizableLoopWithCond(loop, fun);
+  return isVectorizableLoopBodyWithOpCond(loop, fun);
 }
 
-bool mlir::isVectorizableLoop(AffineForOp loop) {
-  VectorizableInstFun fun(
-      // TODO: implement me
-      [](AffineForOp loop, Operation &op) { return true; });
-  return isVectorizableLoopWithCond(loop, fun);
+bool mlir::isVectorizableLoopBody(AffineForOp loop) {
+  return isVectorizableLoopBodyWithOpCond(loop, nullptr);
 }
 
 /// Checks whether SSA dominance would be violated if a for op's body
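Note how the new isVectorizableLoopBody simply forwards a null callback, and the guard `isVectorizableOp && !isVectorizableOp(loop, *op)` skips the per-op condition when none is supplied. A minimal, self-contained illustration of that optional-callback idiom (generic C++, not MLIR code):

#include <functional>
#include <vector>

using OpCond = std::function<bool(int)>;

// Mirrors the guard above: a null std::function converts to false, so
// passing nullptr means "structural checks only, no per-op condition".
static bool allOpsPass(const std::vector<int> &ops, const OpCond &cond) {
  for (int op : ops)
    if (cond && !cond(op))
      return false;
  return true;
}

int main() {
  std::vector<int> ops{1, 2, 3};
  bool structuralOnly = allOpsPass(ops, nullptr);                  // true
  bool withCond = allOpsPass(ops, [](int op) { return op != 2; }); // false
  return structuralOnly && !withCond ? 0 : 1;
}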
@@ -153,18 +153,6 @@ NestedPattern For(FilterFunctionType filter, ArrayRef<NestedPattern> nested) {
       nested, [=](Operation &op) { return isAffineForOp(op) && filter(op); });
 }
 
-// TODO(ntv): parallel annotation on loops.
-bool isParallelLoop(Operation &op) {
-  auto loop = op.cast<AffineForOp>();
-  return loop || true; // loop->isParallel();
-};
-
-// TODO(ntv): reduction annotation on loops.
-bool isReductionLoop(Operation &op) {
-  auto loop = op.cast<AffineForOp>();
-  return loop || true; // loop->isReduction();
-};
-
 bool isLoadOrStore(Operation &op) {
   return op.isa<LoadOp>() || op.isa<StoreOp>();
 };
@@ -24,6 +24,7 @@
 #include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Analysis/NestedMatcher.h"
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/Utils.h"
 #include "mlir/Analysis/VectorAnalysis.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Builders.h"
@@ -565,7 +566,8 @@ static llvm::cl::list<int> clFastestVaryingPattern(
 
 /// Forward declaration.
 static FilterFunctionType
-isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension);
+isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
+                             unsigned fastestVaryingMemRefDimension);
 
 // Build a bunch of predetermined patterns that will be traversed in order.
 // Due to the recursive nature of NestedPatterns, this captures
@@ -573,77 +575,84 @@ isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension);
 /// Note that this currently only matches 2 nested loops and will be extended.
 // TODO(ntv): support 3-D loop patterns with a common reduction loop that can
 // be matched to GEMMs.
-static std::vector<NestedPattern> defaultPatterns() {
+static std::vector<NestedPattern>
+defaultPatterns(const llvm::DenseSet<Operation *> &parallelLoops) {
   using matcher::For;
   return std::vector<NestedPattern>{
       // 3-D patterns
-      For(isVectorizableLoopPtrFactory(2),
-          For(isVectorizableLoopPtrFactory(1),
-              For(isVectorizableLoopPtrFactory(0)))),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 2),
+          For(isVectorizableLoopPtrFactory(parallelLoops, 1),
+              For(isVectorizableLoopPtrFactory(parallelLoops, 0)))),
       // for i { for j { A[??f(not i, not j), f(i, not j), f(not i, j)];}}
       // test independently with:
       //   --test-fastest-varying=1 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(1),
-          For(isVectorizableLoopPtrFactory(0))),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 1),
+          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
       // for i { for j { A[??f(not i, not j), f(i, not j), ?, f(not i, j)];}}
       // test independently with:
       //   --test-fastest-varying=2 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(2),
-          For(isVectorizableLoopPtrFactory(0))),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 2),
+          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
       // for i { for j { A[??f(not i, not j), f(i, not j), ?, ?, f(not i, j)];}}
       // test independently with:
       //   --test-fastest-varying=3 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(3),
-          For(isVectorizableLoopPtrFactory(0))),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 3),
+          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
       // for i { for j { A[??f(not i, not j), f(not i, j), f(i, not j)];}}
       // test independently with:
       //   --test-fastest-varying=0 --test-fastest-varying=1
-      For(isVectorizableLoopPtrFactory(0),
-          For(isVectorizableLoopPtrFactory(1))),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
+          For(isVectorizableLoopPtrFactory(parallelLoops, 1))),
       // for i { for j { A[??f(not i, not j), f(not i, j), ?, f(i, not j)];}}
       // test independently with:
       //   --test-fastest-varying=0 --test-fastest-varying=2
-      For(isVectorizableLoopPtrFactory(0),
-          For(isVectorizableLoopPtrFactory(2))),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
+          For(isVectorizableLoopPtrFactory(parallelLoops, 2))),
       // for i { for j { A[??f(not i, not j), f(not i, j), ?, ?, f(i, not j)];}}
       // test independently with:
       //   --test-fastest-varying=0 --test-fastest-varying=3
-      For(isVectorizableLoopPtrFactory(0),
-          For(isVectorizableLoopPtrFactory(3))),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
+          For(isVectorizableLoopPtrFactory(parallelLoops, 3))),
       // for i { A[??f(not i) , f(i)];}
       // test independently with: --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(0)),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 0)),
       // for i { A[??f(not i) , f(i), ?];}
       // test independently with: --test-fastest-varying=1
-      For(isVectorizableLoopPtrFactory(1)),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 1)),
       // for i { A[??f(not i) , f(i), ?, ?];}
       // test independently with: --test-fastest-varying=2
-      For(isVectorizableLoopPtrFactory(2)),
+      For(isVectorizableLoopPtrFactory(parallelLoops, 2)),
       // for i { A[??f(not i) , f(i), ?, ?, ?];}
       // test independently with: --test-fastest-varying=3
-      For(isVectorizableLoopPtrFactory(3))};
+      For(isVectorizableLoopPtrFactory(parallelLoops, 3))};
 }
 
 /// Creates a vectorization pattern from the command line arguments.
 /// Up to 3-D patterns are supported.
 /// If the command line argument requests a pattern of higher order, returns an
 /// empty pattern list which will conservatively result in no vectorization.
-static std::vector<NestedPattern> makePatterns() {
+static std::vector<NestedPattern>
+makePatterns(const llvm::DenseSet<Operation *> &parallelLoops) {
   using matcher::For;
   if (clFastestVaryingPattern.empty()) {
-    return defaultPatterns();
+    return defaultPatterns(parallelLoops);
   }
   switch (clFastestVaryingPattern.size()) {
   case 1:
-    return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]))};
+    return {For(isVectorizableLoopPtrFactory(parallelLoops,
+                                             clFastestVaryingPattern[0]))};
   case 2:
-    return {For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]),
-                For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1])))};
+    return {For(
+        isVectorizableLoopPtrFactory(parallelLoops, clFastestVaryingPattern[0]),
+        For(isVectorizableLoopPtrFactory(parallelLoops,
+                                         clFastestVaryingPattern[1])))};
   case 3:
     return {For(
-        isVectorizableLoopPtrFactory(clFastestVaryingPattern[0]),
-        For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[1]),
-            For(isVectorizableLoopPtrFactory(clFastestVaryingPattern[2]))))};
+        isVectorizableLoopPtrFactory(parallelLoops, clFastestVaryingPattern[0]),
+        For(isVectorizableLoopPtrFactory(parallelLoops,
+                                         clFastestVaryingPattern[1]),
+            For(isVectorizableLoopPtrFactory(parallelLoops,
+                                             clFastestVaryingPattern[2]))))};
   default:
     return std::vector<NestedPattern>();
   }
@@ -905,10 +914,14 @@ static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
 /// once we understand better the performance implications and we are confident
 /// we can build a cost model and a search procedure.
 static FilterFunctionType
-isVectorizableLoopPtrFactory(unsigned fastestVaryingMemRefDimension) {
-  return [fastestVaryingMemRefDimension](Operation &forOp) {
+isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
+                             unsigned fastestVaryingMemRefDimension) {
+  return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
     auto loop = forOp.cast<AffineForOp>();
-    return isVectorizableLoopAlongFastestVaryingMemRefDim(
+    auto parallelIt = parallelLoops.find(loop);
+    if (parallelIt == parallelLoops.end())
+      return false;
+    return isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
         loop, fastestVaryingMemRefDimension);
   };
 }
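One subtlety: the filter returned here captures parallelLoops by reference, so the DenseSet must outlive every NestedPattern built from it; in this commit it is a local of runOnFunction that spans the whole matching loop. A minimal standalone illustration of that lifetime constraint, using a standard container in place of the MLIR types (hypothetical names, not from the commit):

#include <functional>
#include <set>

// Returns a predicate that refers to, but does not copy, `allowed`.
std::function<bool(int)> makeFilter(const std::set<int> &allowed) {
  return [&allowed](int v) { return allowed.count(v) != 0; };
}

int main() {
  std::set<int> allowed{1, 2}; // must outlive the filter...
  auto filter = makeFilter(allowed);
  return filter(1) && !filter(3) ? 0 : 1; // ...as it does here
}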
@@ -1168,7 +1181,7 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
   // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
   // TODO(ntv): implement a non-greedy profitability analysis that keeps only
   // non-intersecting patterns.
-  if (!isVectorizableLoop(loop)) {
+  if (!isVectorizableLoopBody(loop)) {
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
     return failure();
   }
@@ -1240,7 +1253,16 @@ void Vectorize::runOnFunction() {
   NestedPatternContext mlContext;
 
   Function &f = getFunction();
-  for (auto &pat : makePatterns()) {
+  llvm::DenseSet<Operation *> parallelLoops;
+  f.walkPostOrder([&parallelLoops](Operation *op) {
+    if (auto loop = op->dyn_cast<AffineForOp>()) {
+      if (isLoopParallel(loop)) {
+        parallelLoops.insert(op);
+      }
+    }
+  });
+
+  for (auto &pat : makePatterns(parallelLoops)) {
     LLVM_DEBUG(dbgs() << "\n******************************************");
     LLVM_DEBUG(dbgs() << "\n******************************************");
     LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n");
@@ -11,6 +11,7 @@
 #set0 = (i) : (i >= 0)
 
 // Maps introduced to vectorize fastest varying memory index.
+// CHECK-LABEL: vec1d
 func @vec1d(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 // CHECK-DAG: [[C0:%[a-z0-9_]+]] = constant 0 : index
 // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %arg0, 0 : memref<?x?xf32>
@@ -133,6 +134,7 @@ func @vec1d(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
   return
 }
 
+// CHECK-LABEL: vector_add_2d
 func @vector_add_2d(%M : index, %N : index) -> f32 {
   %A = alloc (%M, %N) : memref<?x?xf32, 0>
   %B = alloc (%M, %N) : memref<?x?xf32, 0>
@@ -201,3 +203,17 @@ func @vec_rejected(%A : memref<?x?xf32>, %C : memref<?x?xf32>) {
   }
   return
 }
+
+// This should not vectorize due to the sequential dependence in the loop.
+// CHECK-LABEL: @vec_rejected_sequential
+func @vec_rejected_sequential(%A : memref<?xf32>) {
+  %N = dim %A, 0 : memref<?xf32>
+  affine.for %i = 0 to %N {
+    // CHECK-NOT: vector
+    %a = load %A[%i] : memref<?xf32>
+    // CHECK-NOT: vector
+    %ip1 = affine.apply (d0)->(d0 + 1) (%i)
+    store %a, %A[%ip1] : memref<?xf32>
+  }
+  return
+}
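The new negative test is exactly the case the dependence check must reject: every iteration stores the value loaded from A[%i] into A[%i + 1], a loop-carried read-after-write. A small scalar analogue (hypothetical, not part of the commit) shows how executing all loads up front, as a vector load would, changes the result:

#include <array>
#include <cstdio>

int main() {
  // Sequential execution: iteration i+1 reads what iteration i just wrote,
  // so seq[0] propagates through the whole array -> 1 1 1 1 1.
  std::array<float, 5> seq{1, 2, 3, 4, 5};
  for (int i = 0; i + 1 < 5; ++i)
    seq[i + 1] = seq[i];

  // "Vectorized" execution: all loads happen before any store (as a wide
  // vector load would), so the array merely shifts by one -> 1 1 2 3 4.
  std::array<float, 5> vec{1, 2, 3, 4, 5};
  std::array<float, 5> loads = vec;
  for (int i = 0; i + 1 < 5; ++i)
    vec[i + 1] = loads[i];

  std::printf("seq: %g %g %g %g %g\n", seq[0], seq[1], seq[2], seq[3], seq[4]);
  std::printf("vec: %g %g %g %g %g\n", vec[0], vec[1], vec[2], vec[3], vec[4]);
}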