// RUN: mlir-opt -pass-pipeline="builtin.module(func.func(convert-linalg-to-loops,lower-affine,convert-scf-to-cf,convert-arith-to-llvm),convert-vector-to-llvm,finalize-memref-to-llvm,convert-func-to-llvm,convert-cf-to-llvm,reconcile-unrealized-casts)" %s | mlir-runner -O3 -e main -entry-point-result=void -shared-libs=%mlir_c_runner_utils | FileCheck %s
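
// Times repeated runs of a naive 16x16 SGEMM kernel and prints the achieved
// flops/s using helpers from the runner utils library named in the RUN line.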
func.func @main() {
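  // Allocate the three 16x16 matrices for C += A * B.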
  %A = memref.alloc() : memref<16x16xf32>
  %B = memref.alloc() : memref<16x16xf32>
  %C = memref.alloc() : memref<16x16xf32>

  %cf1 = arith.constant 1.00000e+00 : f32
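
  // Fill both inputs with ones.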
  linalg.fill ins(%cf1 : f32) outs(%A : memref<16x16xf32>)
  linalg.fill ins(%cf1 : f32) outs(%B : memref<16x16xf32>)

  %num_reps = arith.constant 5 : index
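
  // Time %num_reps runs of the kernel; %C is re-initialized to ones before
  // each run so every iteration performs the same work.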
  %t_start = call @rtclock() : () -> f64
  affine.for %arg0 = 0 to %num_reps {
    linalg.fill ins(%cf1 : f32) outs(%C : memref<16x16xf32>)
    func.call @sgemm_naive(%A, %B, %C) : (memref<16x16xf32>, memref<16x16xf32>, memref<16x16xf32>) -> ()
  }
  %t_end = call @rtclock() : () -> f64
  %t = arith.subf %t_end, %t_start : f64
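
  // With A, B, and the initial C all ones, every element of C ends up as
  // 1 + 16 * 1 * 1 = 17.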
  %res = affine.load %C[0, 0] : memref<16x16xf32>
  vector.print %res : f32

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
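
  // Recover M, N, and K from the memref shapes for the flop count.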
  %M = memref.dim %C, %c0 : memref<16x16xf32>
  %N = memref.dim %C, %c1 : memref<16x16xf32>
  %K = memref.dim %A, %c1 : memref<16x16xf32>

  // num_flops_per_iter = 2*M*N*K: one multiply and one add per innermost
  // loop iteration.
  %f1 = arith.muli %M, %N : index
  %f2 = arith.muli %f1, %K : index
  %num_flops_per_iter = arith.muli %c2, %f2 : index

  // num_flops_total = num_flops_per_iter * num_reps
  %num_flops_total = arith.muli %num_flops_per_iter, %num_reps : index

  // Print the number of flops per second.
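  // Widen to i64 before the float conversion so larger problem sizes cannot
  // overflow the integer flop count.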
  %num_flops_total_i = arith.index_cast %num_flops_total : index to i64
  %num_flops_total_f = arith.uitofp %num_flops_total_i : i64 to f64
  %flops_per_s = arith.divf %num_flops_total_f, %t : f64
  call @printFlops(%flops_per_s) : (f64) -> ()

  memref.dealloc %A : memref<16x16xf32>
  memref.dealloc %B : memref<16x16xf32>
  memref.dealloc %C : memref<16x16xf32>
  return
}
// CHECK: 17
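
// Naive triple-loop SGEMM computing C += A * B; the running sum for each
// element is accumulated in a one-element scratch buffer.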
func.func @sgemm_naive(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf32>, %arg2: memref<16x16xf32>) {
  %c0 = arith.constant 0 : index
  affine.for %arg3 = 0 to 16 {
    affine.for %arg4 = 0 to 16 {
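      // Seed the scratch cell with the current value of C[i, j].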
      %m = memref.alloc() : memref<1xf32>
      %v = affine.load %arg2[%arg3, %arg4] : memref<16x16xf32>
      affine.store %v, %m[%c0] : memref<1xf32>
      affine.for %arg5 = 0 to 16 {
        %3 = affine.load %arg0[%arg3, %arg5] : memref<16x16xf32>
        %4 = affine.load %arg1[%arg5, %arg4] : memref<16x16xf32>
        %5 = affine.load %m[0] : memref<1xf32>
        %6 = arith.mulf %3, %4 : f32
        %7 = arith.addf %6, %5 : f32
        affine.store %7, %m[0] : memref<1xf32>
      }
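      // Write the accumulated dot product back to C[i, j].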
      %s = affine.load %m[%c0] : memref<1xf32>
      affine.store %s, %arg2[%arg3, %arg4] : memref<16x16xf32>
      memref.dealloc %m : memref<1xf32>
    }
  }
  return
}
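
// Provided at runtime by the library passed via -shared-libs.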
func.func private @printFlops(f64)
func.func private @rtclock() -> f64