[doc] Fix HowToManuallyUseTheIndividualPiecesOfPolly

Also remove compiled binaries. llvm-svn: 343119
2025-04-24 15:06:06 +00:00 · 2018-09-26 15:22:39 +00:00 · 2018-09-26 15:22:39 +00:00 · 3b4d331d8c
commit 3b4d331d8c
parent fe7bd34b79
39 changed files with 2702 additions and 2110 deletions
--- a/polly/docs/HowToManuallyUseTheIndividualPiecesOfPolly.rst
+++ b/polly/docs/HowToManuallyUseTheIndividualPiecesOfPolly.rst
@ -21,7 +21,7 @@ performance improvement can be expected by an optimal automatic optimizer.

        .. code-block:: console

-                clang -S -emit-llvm matmul.c -o matmul.s
+                clang -S -emit-llvm matmul.c -Xclang -disable-O0-optnone -o matmul.ll


 2. **Prepare the LLVM-IR for Polly**
@ -34,7 +34,7 @@ performance improvement can be expected by an optimal automatic optimizer.

        .. code-block:: console

-                opt -S -polly-canonicalize matmul.s > matmul.preopt.ll
+                opt -S -polly-canonicalize matmul.ll -o matmul.preopt.ll

 3. **Show the SCoPs detected by Polly (optional)**
 --------------------------------------------------
@ -45,7 +45,7 @@ performance improvement can be expected by an optimal automatic optimizer.

        .. code-block:: console

-                $ opt -polly-ast -analyze -q matmul.preopt.ll -polly-process-unprofitable
+                $ opt -basicaa -polly-ast -analyze matmul.preopt.ll -polly-process-unprofitable -polly-use-llvm-names

        .. code-block:: guess

@ -84,8 +84,8 @@ performance improvement can be expected by an optimal automatic optimizer.

        .. code-block:: console

-                $ opt -view-scops -disable-output matmul.preopt.ll
-                $ opt -view-scops-only -disable-output matmul.preopt.ll
+                $ opt -polly-use-llvm-names -basicaa -view-scops -disable-output matmul.preopt.ll
+                $ opt -polly-use-llvm-names -basicaa -view-scops-only -disable-output matmul.preopt.ll

        The output for the different functions:

@ -104,7 +104,7 @@ performance improvement can be expected by an optimal automatic optimizer.

        .. code-block:: console

-                $ opt -polly-scops -analyze matmul.preopt.ll -polly-process-unprofitable
+                $ opt -polly-use-llvm-names -basicaa -polly-scops -analyze matmul.preopt.ll -polly-process-unprofitable

        .. code-block:: guess

@ -194,7 +194,7 @@ performance improvement can be expected by an optimal automatic optimizer.

        .. code-block:: console

-	        $ opt -polly-dependences -analyze matmul.preopt.ll -polly-process-unprofitable
+	        $ opt -basicaa -polly-use-llvm-names -polly-dependences -analyze matmul.preopt.ll -polly-process-unprofitable

        .. code-block:: guess

@ -226,7 +226,7 @@ performance improvement can be expected by an optimal automatic optimizer.

        .. code-block:: console

-        	$ opt -polly-export-jscop matmul.preopt.ll -polly-process-unprofitable
+        	$ opt -basicaa -polly-use-llvm-names -polly-export-jscop matmul.preopt.ll -polly-process-unprofitable

        .. code-block:: guess

@ -254,7 +254,7 @@ performance improvement can be expected by an optimal automatic optimizer.

 	.. code-block:: console

-		$ opt matmul.preopt.ll -polly-import-jscop -polly-ast -analyze -polly-process-unprofitable
+		$ opt -basicaa -polly-use-llvm-names matmul.preopt.ll -polly-import-jscop -polly-ast -analyze -polly-process-unprofitable

 	.. code-block:: c

@ -282,7 +282,7 @@ performance improvement can be expected by an optimal automatic optimizer.

 	.. code-block:: console

-		$ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged -polly-ast -analyze -polly-process-unprofitable
+		$ opt -basicaa -polly-use-llvm-names matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged -polly-ast -analyze -polly-process-unprofitable

 	.. code-block:: c

@ -311,7 +311,7 @@ performance improvement can be expected by an optimal automatic optimizer.

 	.. code-block:: console

-		$ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-ast -analyze -polly-process-unprofitable
+		$ opt -basicaa -polly-use-llvm-names matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-ast -analyze -polly-process-unprofitable

 	.. code-block:: c

@ -346,7 +346,7 @@ performance improvement can be expected by an optimal automatic optimizer.

 	.. code-block:: console

-		$ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-ast -analyze -polly-process-unprofitable
+		$ opt -basicaa -polly-use-llvm-names matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-ast -analyze -polly-process-unprofitable

 	.. code-block:: c

@ -383,11 +383,11 @@ performance improvement can be expected by an optimal automatic optimizer.

 	.. code-block:: console

-		$ opt matmul.preopt.ll | opt -O3 > matmul.normalopt.ll
+		$ opt -S matmul.preopt.ll | opt -S -O3 -o matmul.normalopt.ll
 		
 	.. code-block:: console

-		$ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged -polly-codegen -polly-process-unprofitable | opt -O3 > matmul.polly.interchanged.ll
+		$ opt -S matmul.preopt.ll -basicaa -polly-use-llvm-names -polly-import-jscop -polly-import-jscop-postfix=interchanged -polly-codegen -polly-process-unprofitable | opt -S -O3 -o matmul.polly.interchanged.ll

 	.. code-block:: guess

@ -397,7 +397,7 @@ performance improvement can be expected by an optimal automatic optimizer.

 	.. code-block:: console

-		$ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-codegen -polly-process-unprofitable | opt -O3 > matmul.polly.interchanged+tiled.ll
+		$ opt -S matmul.preopt.ll -basicaa -polly-use-llvm-names -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled -polly-codegen -polly-process-unprofitable | opt -S -O3 -o matmul.polly.interchanged+tiled.ll
 		
 	.. code-block:: guess

@ -407,7 +407,7 @@ performance improvement can be expected by an optimal automatic optimizer.

 	.. code-block:: console

-		$ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen -polly-vectorizer=polly -polly-process-unprofitable | opt -O3 > matmul.polly.interchanged+tiled+vector.ll
+		$ opt -S matmul.preopt.ll -basicaa -polly-use-llvm-names -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen -polly-vectorizer=polly -polly-process-unprofitable | opt -S -O3 -o matmul.polly.interchanged+tiled+vector.ll

 	.. code-block:: guess

@ -417,7 +417,7 @@ performance improvement can be expected by an optimal automatic optimizer.

 	.. code-block:: console

-		$ opt matmul.preopt.ll -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen -polly-vectorizer=polly -polly-parallel -polly-process-unprofitable | opt -O3 > matmul.polly.interchanged+tiled+openmp.ll
+		$ opt -S matmul.preopt.ll -basicaa -polly-use-llvm-names -polly-import-jscop -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen -polly-vectorizer=polly -polly-parallel -polly-process-unprofitable | opt -S -O3 -o matmul.polly.interchanged+tiled+vector+openmp.ll

 	.. code-block:: guess

@ -431,11 +431,16 @@ performance improvement can be expected by an optimal automatic optimizer.

        .. code-block:: console

-	        $ llc matmul.normalopt.ll -o matmul.normalopt.s && gcc matmul.normalopt.s -o matmul.normalopt.exe
-	        $ llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s && gcc matmul.polly.interchanged.s -o matmul.polly.interchanged.exe
-	        $ llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s && gcc matmul.polly.interchanged+tiled.s -o matmul.polly.interchanged+tiled.exe
-	        $ llc matmul.polly.interchanged+tiled+vector.ll -o matmul.polly.interchanged+tiled+vector.s && gcc matmul.polly.interchanged+tiled+vector.s -o matmul.polly.interchanged+tiled+vector.exe
-        	$ llc matmul.polly.interchanged+tiled+vector+openmp.ll -o matmul.polly.interchanged+tiled+vector+openmp.s && gcc -fopenmp matmul.polly.interchanged+tiled+vector+openmp.s -o matmul.polly.interchanged+tiled+vector+openmp.exe
+	        $ llc matmul.normalopt.ll -o matmul.normalopt.s -relocation-model=pic
+	        $ gcc matmul.normalopt.s -o matmul.normalopt.exe
+	        $ llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s -relocation-model=pic
+	        $ gcc matmul.polly.interchanged.s -o matmul.polly.interchanged.exe
+	        $ llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s -relocation-model=pic
+	        $ gcc matmul.polly.interchanged+tiled.s -o matmul.polly.interchanged+tiled.exe
+	        $ llc matmul.polly.interchanged+tiled+vector.ll -o matmul.polly.interchanged+tiled+vector.s -relocation-model=pic
+	        $ gcc matmul.polly.interchanged+tiled+vector.s -o matmul.polly.interchanged+tiled+vector.exe
+        	$ llc matmul.polly.interchanged+tiled+vector+openmp.ll -o matmul.polly.interchanged+tiled+vector+openmp.s -relocation-model=pic
+        	$ gcc matmul.polly.interchanged+tiled+vector+openmp.s -lgomp -o matmul.polly.interchanged+tiled+vector+openmp.exe

 11. **Compare the runtime of the executables**
 ----------------------------------------------
--- a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop
+++ b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop
@ -1,33 +1,39 @@
 {
-   "arrays" : [
+   "arrays": [
      {
-         "name" : "MemRef_A",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_A",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_B",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_B",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      }
   ],
-   "context" : "{  :  }",
-   "name" : "%for.cond1.preheader---%for.end19",
-   "statements" : [
+   "context": "{  :  }",
+   "name": "%for.cond1.preheader---%for.end19",
+   "statements": [
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }"
            },
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
-         "name" : "Stmt_for_body3",
-         "schedule" : "{ Stmt_for_body3[i0, i1] -> [i0, i1] }"
+         "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
+         "name": "Stmt_for_body3",
+         "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1] }"
      }
   ]
-}
+}
--- a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged
+++ b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged
@ -0,0 +1,39 @@
+{
+   "arrays": [
+      {
+         "name": "MemRef_A",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
+      },
+      {
+         "name": "MemRef_B",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
+      }
+   ],
+   "context": "{  :  }",
+   "name": "%for.cond1.preheader---%for.end19",
+   "statements": [
+      {
+         "accesses": [
+            {
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }"
+            },
+            {
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }"
+            }
+         ],
+         "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
+         "name": "Stmt_for_body3",
+         "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1] }"
+      }
+   ]
+}
--- a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled
+++ b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled
@ -0,0 +1,39 @@
+{
+   "arrays": [
+      {
+         "name": "MemRef_A",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
+      },
+      {
+         "name": "MemRef_B",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
+      }
+   ],
+   "context": "{  :  }",
+   "name": "%for.cond1.preheader---%for.end19",
+   "statements": [
+      {
+         "accesses": [
+            {
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }"
+            },
+            {
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }"
+            }
+         ],
+         "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
+         "name": "Stmt_for_body3",
+         "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1] }"
+      }
+   ]
+}
--- a/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled+vector
+++ b/polly/docs/experiments/matmul/init_array___%for.cond1.preheader---%for.end19.jscop.interchanged+tiled+vector
@ -0,0 +1,39 @@
+{
+   "arrays": [
+      {
+         "name": "MemRef_A",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
+      },
+      {
+         "name": "MemRef_B",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
+      }
+   ],
+   "context": "{  :  }",
+   "name": "%for.cond1.preheader---%for.end19",
+   "statements": [
+      {
+         "accesses": [
+            {
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_A[i0, i1] }"
+            },
+            {
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_B[i0, i1] }"
+            }
+         ],
+         "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
+         "name": "Stmt_for_body3",
+         "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1] }"
+      }
+   ]
+}
--- a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop
+++ b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop
@ -1,57 +1,66 @@
 {
-   "arrays" : [
+   "arrays": [
      {
-         "name" : "MemRef_C",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_C",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_A",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_A",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_B",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_B",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      }
   ],
-   "context" : "{  :  }",
-   "name" : "%for.cond1.preheader---%for.end30",
-   "statements" : [
+   "context": "{  :  }",
+   "name": "%for.cond1.preheader---%for.end30",
+   "statements": [
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
-         "name" : "Stmt_for_body3",
-         "schedule" : "{ Stmt_for_body3[i0, i1] -> [i0, i1, 0, 0] }"
+         "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
+         "name": "Stmt_for_body3",
+         "schedule": "{ Stmt_for_body3[i0, i1] -> [i0, i1, 0, 0] }"
      },
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
            },
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }"
            },
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }"
            },
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }",
-         "name" : "Stmt_for_body8",
-         "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> [i0, i1, 1, i2] }"
+         "domain": "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }",
+         "name": "Stmt_for_body8",
+         "schedule": "{ Stmt_for_body8[i0, i1, i2] -> [i0, i1, 1, i2] }"
      }
   ]
-}
+}
--- a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged
+++ b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged
@ -1,57 +1,66 @@
 {
-   "arrays" : [
+   "arrays": [
      {
-         "name" : "MemRef_C",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_C",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_A",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_A",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_B",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_B",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      }
   ],
-   "context" : "{  :  }",
-   "name" : "%for.cond1.preheader---%for.end30",
-   "statements" : [
+   "context": "{  :  }",
+   "name": "%for.cond1.preheader---%for.end30",
+   "statements": [
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
-         "name" : "Stmt_for_body3",
-         "schedule" : "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0] }"
+         "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
+         "name": "Stmt_for_body3",
+         "schedule": "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0] }"
      },
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
            },
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }"
            },
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }"
            },
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }",
-         "name" : "Stmt_for_body8",
-         "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> [1, i0, i2, i1] }"
+         "domain": "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }",
+         "name": "Stmt_for_body8",
+         "schedule": "{ Stmt_for_body8[i0, i1, i2] -> [1, i0, i2, i1] }"
      }
   ]
-}
+}
--- a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled
+++ b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled
@ -1,57 +1,66 @@
 {
-   "arrays" : [
+   "arrays": [
      {
-         "name" : "MemRef_C",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_C",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_A",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_A",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_B",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_B",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      }
   ],
-   "context" : "{  :  }",
-   "name" : "%for.cond1.preheader---%for.end30",
-   "statements" : [
+   "context": "{  :  }",
+   "name": "%for.cond1.preheader---%for.end30",
+   "statements": [
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
-         "name" : "Stmt_for_body3",
-         "schedule" : "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0, 0, 0, 0 ] }"
+         "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
+         "name": "Stmt_for_body3",
+         "schedule": "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0, 0, 0, 0 ] }"
      },
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
            },
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }"
            },
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }"
            },
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }",
-         "name" : "Stmt_for_body8",
-         "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> [1, o0, o1, o2, i0, i2, i1]: o0 <= i0 < o0 + 64 and o1 <= i1 < o1 + 64 and o2 <= i2 < o2 + 64 and o0 % 64 = 0 and o1 % 64 = 0 and o2 % 64 = 0 }"
+         "domain": "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }",
+         "name": "Stmt_for_body8",
+         "schedule": "{ Stmt_for_body8[i0, i1, i2] -> [1, o0, o1, o2, i0, i2, i1]: o0 <= i0 < o0 + 64 and o1 <= i1 < o1 + 64 and o2 <= i2 < o2 + 64 and o0 % 64 = 0 and o1 % 64 = 0 and o2 % 64 = 0 }"
      }
   ]
-}
+}
--- a/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled+vector
+++ b/polly/docs/experiments/matmul/main___%for.cond1.preheader---%for.end30.jscop.interchanged+tiled+vector
@ -1,57 +1,66 @@
 {
-   "arrays" : [
+   "arrays": [
      {
-         "name" : "MemRef_C",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_C",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_A",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_A",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      },
      {
-         "name" : "MemRef_B",
-         "sizes" : [ "1536" ],
-         "type" : "float"
+         "name": "MemRef_B",
+         "sizes": [
+            "*",
+            "1536"
+         ],
+         "type": "float"
      }
   ],
-   "context" : "{  :  }",
-   "name" : "%for.cond1.preheader---%for.end30",
-   "statements" : [
+   "context": "{  :  }",
+   "name": "%for.cond1.preheader---%for.end30",
+   "statements": [
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body3[i0, i1] -> MemRef_C[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
-         "name" : "Stmt_for_body3",
-         "schedule" : "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0, 0, 0, 0, 0 ] }"
+         "domain": "{ Stmt_for_body3[i0, i1] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 }",
+         "name": "Stmt_for_body3",
+         "schedule": "{ Stmt_for_body3[i0, i1] -> [0, i0, i1, 0, 0, 0, 0, 0 ] }"
      },
      {
-         "accesses" : [
+         "accesses": [
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
            },
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_A[i0, i2] }"
            },
            {
-               "kind" : "read",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }"
+               "kind": "read",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_B[i2, i1] }"
            },
            {
-               "kind" : "write",
-               "relation" : "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
+               "kind": "write",
+               "relation": "{ Stmt_for_body8[i0, i1, i2] -> MemRef_C[i0, i1] }"
            }
         ],
-         "domain" : "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }",
-         "name" : "Stmt_for_body8",
-         "schedule" : "{ Stmt_for_body8[i0, i1, i2] -> [1, o0, o1, o2, i0, i2, oo1, i1]: o0 <= i0 < o0 + 64 and o1 <= oo1 < o1 + 64 and o2 <= i2 < o2 + 64 and oo1 <= i1 < oo1 + 4 and o0 % 64 = 0 and o1 % 64 = 0 and o2 % 64 = 0 and oo1 % 4 = 0 }"
+         "domain": "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1535 and 0 <= i1 <= 1535 and 0 <= i2 <= 1535 }",
+         "name": "Stmt_for_body8",
+         "schedule": "{ Stmt_for_body8[i0, i1, i2] -> [1, o0, o1, o2, i0, i2, oo1, i1]: o0 <= i0 < o0 + 64 and o1 <= oo1 < o1 + 64 and o2 <= i2 < o2 + 64 and oo1 <= i1 < oo1 + 4 and o0 % 64 = 0 and o1 % 64 = 0 and o2 % 64 = 0 and oo1 % 4 = 0 }"
      }
   ]
-}
+}
--- a/polly/docs/experiments/matmul/matmul.ll
+++ b/polly/docs/experiments/matmul/matmul.ll
@ -6,15 +6,15 @@ target triple = "x86_64-unknown-linux-gnu"
 %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
 %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }

-@A = common global [1536 x [1536 x float]] zeroinitializer, align 16
-@B = common global [1536 x [1536 x float]] zeroinitializer, align 16
-@stdout = external global %struct._IO_FILE*, align 8
+@A = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16
+@B = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16
+@stdout = external dso_local global %struct._IO_FILE*, align 8
@.str = private unnamed_addr constant [5 x i8] c"%lf \00", align 1
-@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@C = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16
@.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1

-; Function Attrs: nounwind uwtable
-define void @init_array() #0 {
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init_array() #0 {
 entry:
  %i = alloca i32, align 4
  %j = alloca i32, align 4
@ -44,12 +44,12 @@ for.body3:                                        ; preds = %for.cond1
  %conv = sitofp i32 %add to double
  %div = fdiv double %conv, 2.000000e+00
  %conv4 = fptrunc double %div to float
-  %4 = load i32, i32* %j, align 4
+  %4 = load i32, i32* %i, align 4
  %idxprom = sext i32 %4 to i64
-  %5 = load i32, i32* %i, align 4
+  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom
+  %5 = load i32, i32* %j, align 4
  %idxprom5 = sext i32 %5 to i64
-  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom5
-  %arrayidx6 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom
+  %arrayidx6 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom5
  store float %conv4, float* %arrayidx6, align 4
  %6 = load i32, i32* %i, align 4
  %7 = load i32, i32* %j, align 4
@ -59,12 +59,12 @@ for.body3:                                        ; preds = %for.cond1
  %conv10 = sitofp i32 %add9 to double
  %div11 = fdiv double %conv10, 2.000000e+00
  %conv12 = fptrunc double %div11 to float
-  %8 = load i32, i32* %j, align 4
+  %8 = load i32, i32* %i, align 4
  %idxprom13 = sext i32 %8 to i64
-  %9 = load i32, i32* %i, align 4
-  %idxprom14 = sext i32 %9 to i64
-  %arrayidx15 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom14
-  %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx15, i64 0, i64 %idxprom13
+  %arrayidx14 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom13
+  %9 = load i32, i32* %j, align 4
+  %idxprom15 = sext i32 %9 to i64
+  %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx14, i64 0, i64 %idxprom15
  store float %conv12, float* %arrayidx16, align 4
  br label %for.inc

@ -87,8 +87,8 @@ for.end19:                                        ; preds = %for.cond
  ret void
 }

-; Function Attrs: nounwind uwtable
-define void @print_array() #0 {
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @print_array() #0 {
 entry:
  %i = alloca i32, align 4
  %j = alloca i32, align 4
@ -111,12 +111,12 @@ for.cond1:                                        ; preds = %for.inc, %for.body

 for.body3:                                        ; preds = %for.cond1
  %2 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
-  %3 = load i32, i32* %j, align 4
+  %3 = load i32, i32* %i, align 4
  %idxprom = sext i32 %3 to i64
-  %4 = load i32, i32* %i, align 4
+  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom
+  %4 = load i32, i32* %j, align 4
  %idxprom4 = sext i32 %4 to i64
-  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom4
-  %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom
+  %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom4
  %5 = load float, float* %arrayidx5, align 4
  %conv = fpext float %5 to double
  %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), double %conv)
@ -154,10 +154,10 @@ for.end12:                                        ; preds = %for.cond
  ret void
 }

-declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1
+declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1

-; Function Attrs: nounwind uwtable
-define i32 @main() #0 {
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
 entry:
  %retval = alloca i32, align 4
  %i = alloca i32, align 4
@ -185,12 +185,12 @@ for.cond1:                                        ; preds = %for.inc25, %for.bod
  br i1 %cmp2, label %for.body3, label %for.end27

 for.body3:                                        ; preds = %for.cond1
-  %2 = load i32, i32* %j, align 4
+  %2 = load i32, i32* %i, align 4
  %idxprom = sext i32 %2 to i64
-  %3 = load i32, i32* %i, align 4
+  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom
+  %3 = load i32, i32* %j, align 4
  %idxprom4 = sext i32 %3 to i64
-  %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom4
-  %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom
+  %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom4
  store float 0.000000e+00, float* %arrayidx5, align 4
  store i32 0, i32* %k, align 4
  br label %for.cond6
@ -201,35 +201,35 @@ for.cond6:                                        ; preds = %for.inc, %for.body3
  br i1 %cmp7, label %for.body8, label %for.end

 for.body8:                                        ; preds = %for.cond6
-  %5 = load i32, i32* %j, align 4
+  %5 = load i32, i32* %i, align 4
  %idxprom9 = sext i32 %5 to i64
-  %6 = load i32, i32* %i, align 4
-  %idxprom10 = sext i32 %6 to i64
-  %arrayidx11 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom10
-  %arrayidx12 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx11, i64 0, i64 %idxprom9
+  %arrayidx10 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom9
+  %6 = load i32, i32* %j, align 4
+  %idxprom11 = sext i32 %6 to i64
+  %arrayidx12 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx10, i64 0, i64 %idxprom11
  %7 = load float, float* %arrayidx12, align 4
-  %8 = load i32, i32* %k, align 4
+  %8 = load i32, i32* %i, align 4
  %idxprom13 = sext i32 %8 to i64
-  %9 = load i32, i32* %i, align 4
-  %idxprom14 = sext i32 %9 to i64
-  %arrayidx15 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom14
-  %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx15, i64 0, i64 %idxprom13
+  %arrayidx14 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom13
+  %9 = load i32, i32* %k, align 4
+  %idxprom15 = sext i32 %9 to i64
+  %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx14, i64 0, i64 %idxprom15
  %10 = load float, float* %arrayidx16, align 4
-  %11 = load i32, i32* %j, align 4
+  %11 = load i32, i32* %k, align 4
  %idxprom17 = sext i32 %11 to i64
-  %12 = load i32, i32* %k, align 4
-  %idxprom18 = sext i32 %12 to i64
-  %arrayidx19 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom18
-  %arrayidx20 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx19, i64 0, i64 %idxprom17
+  %arrayidx18 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom17
+  %12 = load i32, i32* %j, align 4
+  %idxprom19 = sext i32 %12 to i64
+  %arrayidx20 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx18, i64 0, i64 %idxprom19
  %13 = load float, float* %arrayidx20, align 4
  %mul = fmul float %10, %13
  %add = fadd float %7, %mul
-  %14 = load i32, i32* %j, align 4
+  %14 = load i32, i32* %i, align 4
  %idxprom21 = sext i32 %14 to i64
-  %15 = load i32, i32* %i, align 4
-  %idxprom22 = sext i32 %15 to i64
-  %arrayidx23 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom22
-  %arrayidx24 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx23, i64 0, i64 %idxprom21
+  %arrayidx22 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom21
+  %15 = load i32, i32* %j, align 4
+  %idxprom23 = sext i32 %15 to i64
+  %arrayidx24 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx22, i64 0, i64 %idxprom23
  store float %add, float* %arrayidx24, align 4
  br label %for.inc

@ -261,9 +261,11 @@ for.end30:                                        ; preds = %for.cond
  ret i32 0
 }

-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

-!llvm.ident = !{!0}
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}

-!0 = !{!"clang version 4.0.0 (http://llvm.org/git/clang.git 081569d9a29c7bc827b2d41f8e62891bbc895e2f) (http://llvm.org/git/llvm.git e117e506536626352e8e47f6c72cd6e2a276622c)"}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"}
--- a/polly/docs/experiments/matmul/matmul.normalopt.exe
+++ b/polly/docs/experiments/matmul/matmul.normalopt.exe
--- a/polly/docs/experiments/matmul/matmul.normalopt.ll
+++ b/polly/docs/experiments/matmul/matmul.normalopt.ll
--- a/polly/docs/experiments/matmul/matmul.normalopt.s
+++ b/polly/docs/experiments/matmul/matmul.normalopt.s
@ -1,263 +1,235 @@
-	.file	"matmul.normalopt.ll"
+	.text
+	.file	"matmul.c"
 	.section	.rodata.cst8,"aM",@progbits,8
-	.align	8
+	.p2align	3               # -- Begin function init_array
 .LCPI0_0:
 	.quad	4602678819172646912     # double 0.5
 	.text
 	.globl	init_array
-	.align	16, 0x90
+	.p2align	4, 0x90
 	.type	init_array,@function
 init_array:                             # @init_array
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp2:
 	.cfi_def_cfa_offset 16
-.Ltmp3:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp4:
 	.cfi_def_cfa_register %rbp
+	leaq	B(%rip), %rax
+	leaq	A(%rip), %rcx
 	xorl	%r8d, %r8d
-	vmovsd	.LCPI0_0(%rip), %xmm0
-	.align	16, 0x90
+	movsd	.LCPI0_0(%rip), %xmm0   # xmm0 = mem[0],zero
+	xorl	%r9d, %r9d
+	.p2align	4, 0x90
 .LBB0_1:                                # %for.cond1.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
-	xorl	%ecx, %ecx
-	.align	16, 0x90
+	movl	$1, %edi
+	xorl	%edx, %edx
+	.p2align	4, 0x90
 .LBB0_2:                                # %for.body3
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
-	movl	%ecx, %edx
-	imull	%r8d, %edx
 	movl	%edx, %esi
-	sarl	$31, %esi
-	shrl	$22, %esi
-	addl	%edx, %esi
-	andl	$-1024, %esi            # imm = 0xFFFFFFFFFFFFFC00
-	negl	%esi
-	movq	%r8, %rax
-	shlq	$11, %rax
-	leal	1(%rdx,%rsi), %edi
-	leaq	(%rax,%rax,2), %rsi
-	leaq	1(%rcx), %rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	vcvtsi2sdl	%edi, %xmm0, %xmm1
-	vmulsd	%xmm0, %xmm1, %xmm1
-	vcvtsd2ss	%xmm1, %xmm1, %xmm1
-	vmovss	%xmm1, A(%rsi,%rcx,4)
-	vmovss	%xmm1, B(%rsi,%rcx,4)
-	movq	%rdx, %rcx
+	andl	$1022, %esi             # imm = 0x3FE
+	orl	$1, %esi
+	xorps	%xmm1, %xmm1
+	cvtsi2sdl	%esi, %xmm1
+	mulsd	%xmm0, %xmm1
+	cvtsd2ss	%xmm1, %xmm1
+	movss	%xmm1, -4(%rcx,%rdi,4)
+	movss	%xmm1, -4(%rax,%rdi,4)
+	leal	(%r9,%rdx), %esi
+	andl	$1023, %esi             # imm = 0x3FF
+	addl	$1, %esi
+	xorps	%xmm1, %xmm1
+	cvtsi2sdl	%esi, %xmm1
+	mulsd	%xmm0, %xmm1
+	cvtsd2ss	%xmm1, %xmm1
+	movss	%xmm1, (%rcx,%rdi,4)
+	movss	%xmm1, (%rax,%rdi,4)
+	addq	$2, %rdi
+	addl	%r8d, %edx
+	cmpq	$1537, %rdi             # imm = 0x601
 	jne	.LBB0_2
-# BB#3:                                 # %for.inc17
+# %bb.3:                                # %for.inc17
                                        #   in Loop: Header=BB0_1 Depth=1
-	incq	%r8
-	cmpq	$1536, %r8              # imm = 0x600
+	addq	$1, %r9
+	addq	$6144, %rax             # imm = 0x1800
+	addq	$6144, %rcx             # imm = 0x1800
+	addl	$2, %r8d
+	cmpq	$1536, %r9              # imm = 0x600
 	jne	.LBB0_1
-# BB#4:                                 # %for.end19
+# %bb.4:                                # %for.end19
 	popq	%rbp
-	ret
-.Ltmp5:
-	.size	init_array, .Ltmp5-init_array
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end0:
+	.size	init_array, .Lfunc_end0-init_array
 	.cfi_endproc
-
-	.globl	print_array
-	.align	16, 0x90
+                                        # -- End function
+	.globl	print_array             # -- Begin function print_array
+	.p2align	4, 0x90
 	.type	print_array,@function
 print_array:                            # @print_array
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp9:
 	.cfi_def_cfa_offset 16
-.Ltmp10:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp11:
 	.cfi_def_cfa_register %rbp
 	pushq	%r15
 	pushq	%r14
+	pushq	%r13
 	pushq	%r12
 	pushq	%rbx
-.Ltmp12:
-	.cfi_offset %rbx, -48
-.Ltmp13:
-	.cfi_offset %r12, -40
-.Ltmp14:
+	pushq	%rax
+	.cfi_offset %rbx, -56
+	.cfi_offset %r12, -48
+	.cfi_offset %r13, -40
 	.cfi_offset %r14, -32
-.Ltmp15:
 	.cfi_offset %r15, -24
-	xorl	%r14d, %r14d
-	movl	$C, %r15d
-	.align	16, 0x90
+	leaq	C(%rip), %r13
+	xorl	%eax, %eax
+	movl	$3435973837, %r12d      # imm = 0xCCCCCCCD
+	leaq	.L.str(%rip), %r14
+	.p2align	4, 0x90
 .LBB1_1:                                # %for.cond1.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB1_2 Depth 2
-	movq	stdout(%rip), %rax
-	movq	%r15, %r12
+	movq	%rax, -48(%rbp)         # 8-byte Spill
+	movq	stdout(%rip), %rsi
 	xorl	%ebx, %ebx
-	.align	16, 0x90
+	.p2align	4, 0x90
 .LBB1_2:                                # %for.body3
                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
-	vmovss	(%r12), %xmm0
-	vcvtss2sd	%xmm0, %xmm0, %xmm0
-	movq	%rax, %rdi
-	movl	$.L.str, %esi
+	movl	%ebx, %eax
+	imulq	%r12, %rax
+	shrq	$38, %rax
+	leal	(%rax,%rax,4), %r15d
+	shll	$4, %r15d
+	addl	$79, %r15d
+	movss	(%r13,%rbx,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
+	cvtss2sd	%xmm0, %xmm0
 	movb	$1, %al
+	movq	%rsi, %rdi
+	movq	%r14, %rsi
 	callq	fprintf
-	movslq	%ebx, %rax
-	imulq	$1717986919, %rax, %rcx # imm = 0x66666667
-	movq	%rcx, %rdx
-	shrq	$63, %rdx
-	sarq	$37, %rcx
-	addl	%edx, %ecx
-	imull	$80, %ecx, %ecx
-	subl	%ecx, %eax
-	cmpl	$79, %eax
+	cmpl	%ebx, %r15d
 	jne	.LBB1_4
-# BB#3:                                 # %if.then
+# %bb.3:                                # %if.then
                                        #   in Loop: Header=BB1_2 Depth=2
 	movq	stdout(%rip), %rsi
 	movl	$10, %edi
-	callq	fputc
+	callq	fputc@PLT
 .LBB1_4:                                # %for.inc
                                        #   in Loop: Header=BB1_2 Depth=2
-	addq	$4, %r12
-	incq	%rbx
-	movq	stdout(%rip), %rax
+	addq	$1, %rbx
+	movq	stdout(%rip), %rsi
 	cmpq	$1536, %rbx             # imm = 0x600
 	jne	.LBB1_2
-# BB#5:                                 # %for.end
+# %bb.5:                                # %for.end
                                        #   in Loop: Header=BB1_1 Depth=1
 	movl	$10, %edi
-	movq	%rax, %rsi
-	callq	fputc
-	addq	$6144, %r15             # imm = 0x1800
-	incq	%r14
-	cmpq	$1536, %r14             # imm = 0x600
+	callq	fputc@PLT
+	movq	-48(%rbp), %rax         # 8-byte Reload
+	addq	$1, %rax
+	addq	$6144, %r13             # imm = 0x1800
+	cmpq	$1536, %rax             # imm = 0x600
 	jne	.LBB1_1
-# BB#6:                                 # %for.end12
+# %bb.6:                                # %for.end12
+	addq	$8, %rsp
 	popq	%rbx
 	popq	%r12
+	popq	%r13
 	popq	%r14
 	popq	%r15
 	popq	%rbp
-	ret
-.Ltmp16:
-	.size	print_array, .Ltmp16-print_array
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end1:
+	.size	print_array, .Lfunc_end1-print_array
 	.cfi_endproc
-
-	.section	.rodata.cst8,"aM",@progbits,8
-	.align	8
-.LCPI2_0:
-	.quad	4602678819172646912     # double 0.5
-	.text
-	.globl	main
-	.align	16, 0x90
+                                        # -- End function
+	.globl	main                    # -- Begin function main
+	.p2align	4, 0x90
 	.type	main,@function
 main:                                   # @main
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp19:
 	.cfi_def_cfa_offset 16
-.Ltmp20:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp21:
 	.cfi_def_cfa_register %rbp
-	xorl	%r8d, %r8d
-	vmovsd	.LCPI2_0(%rip), %xmm0
-	.align	16, 0x90
-.LBB2_1:                                # %for.cond1.preheader.i
+	callq	init_array
+	leaq	A(%rip), %rax
+	xorl	%r10d, %r10d
+	leaq	B(%rip), %r8
+	leaq	C(%rip), %r9
+	.p2align	4, 0x90
+.LBB2_1:                                # %for.cond1.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB2_2 Depth 2
-	xorl	%ecx, %ecx
-	.align	16, 0x90
-.LBB2_2:                                # %for.body3.i
+                                        #       Child Loop BB2_3 Depth 3
+	movq	%r8, %rsi
+	xorl	%edx, %edx
+	.p2align	4, 0x90
+.LBB2_2:                                # %for.body3
                                        #   Parent Loop BB2_1 Depth=1
-                                        # =>  This Inner Loop Header: Depth=2
-	movl	%ecx, %edx
-	imull	%r8d, %edx
-	movl	%edx, %esi
-	sarl	$31, %esi
-	shrl	$22, %esi
-	addl	%edx, %esi
-	andl	$-1024, %esi            # imm = 0xFFFFFFFFFFFFFC00
-	negl	%esi
-	movq	%r8, %rax
-	shlq	$11, %rax
-	leal	1(%rdx,%rsi), %edi
-	leaq	(%rax,%rax,2), %rsi
-	leaq	1(%rcx), %rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	vcvtsi2sdl	%edi, %xmm0, %xmm1
-	vmulsd	%xmm0, %xmm1, %xmm1
-	vcvtsd2ss	%xmm1, %xmm1, %xmm1
-	vmovss	%xmm1, A(%rsi,%rcx,4)
-	vmovss	%xmm1, B(%rsi,%rcx,4)
-	movq	%rdx, %rcx
-	jne	.LBB2_2
-# BB#3:                                 # %for.inc17.i
-                                        #   in Loop: Header=BB2_1 Depth=1
-	incq	%r8
-	cmpq	$1536, %r8              # imm = 0x600
-	jne	.LBB2_1
-# BB#4:
-	xorl	%r8d, %r8d
-	movl	$A, %r9d
-	.align	16, 0x90
-.LBB2_5:                                # %for.cond1.preheader
-                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB2_6 Depth 2
-                                        #       Child Loop BB2_7 Depth 3
-	leaq	(%r8,%r8,2), %rdx
-	shlq	$11, %rdx
-	leaq	C(%rdx), %rsi
-	xorl	%edi, %edi
-	.align	16, 0x90
-.LBB2_6:                                # %for.body3
-                                        #   Parent Loop BB2_5 Depth=1
                                        # =>  This Loop Header: Depth=2
-                                        #       Child Loop BB2_7 Depth 3
-	movl	$0, (%rsi)
-	vxorps	%xmm0, %xmm0, %xmm0
-	movq	$-9437184, %rax         # imm = 0xFFFFFFFFFF700000
-	movq	%r9, %rcx
-	.align	16, 0x90
-.LBB2_7:                                # %for.body8
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_6 Depth=2
+                                        #       Child Loop BB2_3 Depth 3
+	leaq	(%r10,%r10,2), %rcx
+	shlq	$11, %rcx
+	addq	%r9, %rcx
+	leaq	(%rcx,%rdx,4), %r11
+	movl	$0, (%rcx,%rdx,4)
+	xorps	%xmm0, %xmm0
+	movl	$2, %ecx
+	movq	%rsi, %rdi
+	.p2align	4, 0x90
+.LBB2_3:                                # %for.body8
+                                        #   Parent Loop BB2_1 Depth=1
+                                        #     Parent Loop BB2_2 Depth=2
                                        # =>    This Inner Loop Header: Depth=3
-	vmovss	(%rcx), %xmm1
-	vmulss	B+9437184(%rax,%rdi,4), %xmm1, %xmm1
-	vaddss	%xmm1, %xmm0, %xmm0
-	addq	$4, %rcx
+	movss	-8(%rax,%rcx,4), %xmm1  # xmm1 = mem[0],zero,zero,zero
+	mulss	(%rdi), %xmm1
+	movss	-4(%rax,%rcx,4), %xmm2  # xmm2 = mem[0],zero,zero,zero
+	addss	%xmm0, %xmm1
+	mulss	6144(%rdi), %xmm2
+	addss	%xmm1, %xmm2
+	movss	(%rax,%rcx,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
+	mulss	12288(%rdi), %xmm0
+	addss	%xmm2, %xmm0
+	addq	$3, %rcx
+	addq	$18432, %rdi            # imm = 0x4800
+	cmpq	$1538, %rcx             # imm = 0x602
+	jne	.LBB2_3
+# %bb.4:                                # %for.inc25
+                                        #   in Loop: Header=BB2_2 Depth=2
+	movss	%xmm0, (%r11)
+	addq	$1, %rdx
+	addq	$4, %rsi
+	cmpq	$1536, %rdx             # imm = 0x600
+	jne	.LBB2_2
+# %bb.5:                                # %for.inc28
+                                        #   in Loop: Header=BB2_1 Depth=1
+	addq	$1, %r10
 	addq	$6144, %rax             # imm = 0x1800
-	jne	.LBB2_7
-# BB#8:                                 # %for.inc25
-                                        #   in Loop: Header=BB2_6 Depth=2
-	vmovss	%xmm0, (%rsi)
-	leaq	C+4(%rdx,%rdi,4), %rsi
-	incq	%rdi
-	cmpq	$1536, %rdi             # imm = 0x600
-	jne	.LBB2_6
-# BB#9:                                 # %for.inc28
-                                        #   in Loop: Header=BB2_5 Depth=1
-	addq	$6144, %r9              # imm = 0x1800
-	incq	%r8
-	cmpq	$1536, %r8              # imm = 0x600
-	jne	.LBB2_5
-# BB#10:                                # %for.end30
+	cmpq	$1536, %r10             # imm = 0x600
+	jne	.LBB2_1
+# %bb.6:                                # %for.end30
 	xorl	%eax, %eax
 	popq	%rbp
-	ret
-.Ltmp22:
-	.size	main, .Ltmp22-main
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end2:
+	.size	main, .Lfunc_end2-main
 	.cfi_endproc
-
+                                        # -- End function
 	.type	A,@object               # @A
 	.comm	A,9437184,16
 	.type	B,@object               # @B
@ -265,10 +237,11 @@ main:                                   # @main
 	.type	.L.str,@object          # @.str
 	.section	.rodata.str1.1,"aMS",@progbits,1
 .L.str:
-	.asciz	 "%lf "
+	.asciz	"%lf "
 	.size	.L.str, 5

 	.type	C,@object               # @C
 	.comm	C,9437184,16

+	.ident	"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
 	.section	".note.GNU-stack","",@progbits
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s
@ -1,385 +1,645 @@
-	.file	"matmul.polly.interchanged+tiled+vector.ll"
+	.text
+	.file	"matmul.c"
 	.section	.rodata.cst8,"aM",@progbits,8
-	.align	8
+	.p2align	3               # -- Begin function init_array
 .LCPI0_0:
 	.quad	4602678819172646912     # double 0.5
 	.text
 	.globl	init_array
-	.align	16, 0x90
+	.p2align	4, 0x90
 	.type	init_array,@function
 init_array:                             # @init_array
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp2:
 	.cfi_def_cfa_offset 16
-.Ltmp3:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp4:
 	.cfi_def_cfa_register %rbp
+	leaq	B(%rip), %rax
+	leaq	A(%rip), %rcx
 	xorl	%r8d, %r8d
-	vmovsd	.LCPI0_0(%rip), %xmm0
-	.align	16, 0x90
-.LBB0_1:                                # %polly.loop_preheader3
+	movsd	.LCPI0_0(%rip), %xmm0   # xmm0 = mem[0],zero
+	xorl	%r9d, %r9d
+	.p2align	4, 0x90
+.LBB0_1:                                # %polly.loop_header
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
-	xorl	%ecx, %ecx
-	.align	16, 0x90
-.LBB0_2:                                # %polly.loop_header2
+	movl	$1, %edi
+	xorl	%edx, %edx
+	.p2align	4, 0x90
+.LBB0_2:                                # %polly.loop_header1
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
-	movl	%ecx, %edx
-	imull	%r8d, %edx
 	movl	%edx, %esi
-	sarl	$31, %esi
-	shrl	$22, %esi
-	addl	%edx, %esi
-	andl	$-1024, %esi            # imm = 0xFFFFFFFFFFFFFC00
-	negl	%esi
-	movq	%r8, %rax
-	shlq	$11, %rax
-	leal	1(%rdx,%rsi), %edi
-	leaq	(%rax,%rax,2), %rsi
-	leaq	1(%rcx), %rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	vcvtsi2sdl	%edi, %xmm0, %xmm1
-	vmulsd	%xmm0, %xmm1, %xmm1
-	vcvtsd2ss	%xmm1, %xmm1, %xmm1
-	vmovss	%xmm1, A(%rsi,%rcx,4)
-	vmovss	%xmm1, B(%rsi,%rcx,4)
-	movq	%rdx, %rcx
+	andl	$1022, %esi             # imm = 0x3FE
+	orl	$1, %esi
+	xorps	%xmm1, %xmm1
+	cvtsi2sdl	%esi, %xmm1
+	mulsd	%xmm0, %xmm1
+	cvtsd2ss	%xmm1, %xmm1
+	movss	%xmm1, -4(%rcx,%rdi,4)
+	movss	%xmm1, -4(%rax,%rdi,4)
+	leal	(%r9,%rdx), %esi
+	andl	$1023, %esi             # imm = 0x3FF
+	addl	$1, %esi
+	xorps	%xmm1, %xmm1
+	cvtsi2sdl	%esi, %xmm1
+	mulsd	%xmm0, %xmm1
+	cvtsd2ss	%xmm1, %xmm1
+	movss	%xmm1, (%rcx,%rdi,4)
+	movss	%xmm1, (%rax,%rdi,4)
+	addq	$2, %rdi
+	addl	%r8d, %edx
+	cmpq	$1537, %rdi             # imm = 0x601
 	jne	.LBB0_2
-# BB#3:                                 # %polly.loop_exit4
+# %bb.3:                                # %polly.loop_exit3
                                        #   in Loop: Header=BB0_1 Depth=1
-	incq	%r8
-	cmpq	$1536, %r8              # imm = 0x600
+	addq	$1, %r9
+	addq	$6144, %rax             # imm = 0x1800
+	addq	$6144, %rcx             # imm = 0x1800
+	addl	$2, %r8d
+	cmpq	$1536, %r9              # imm = 0x600
 	jne	.LBB0_1
-# BB#4:                                 # %polly.loop_exit
+# %bb.4:                                # %polly.exiting
 	popq	%rbp
-	ret
-.Ltmp5:
-	.size	init_array, .Ltmp5-init_array
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end0:
+	.size	init_array, .Lfunc_end0-init_array
 	.cfi_endproc
-
-	.globl	print_array
-	.align	16, 0x90
+                                        # -- End function
+	.globl	print_array             # -- Begin function print_array
+	.p2align	4, 0x90
 	.type	print_array,@function
 print_array:                            # @print_array
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp9:
 	.cfi_def_cfa_offset 16
-.Ltmp10:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp11:
-	.cfi_def_cfa_register %rbp
-	pushq	%r15
-	pushq	%r14
-	pushq	%r12
-	pushq	%rbx
-.Ltmp12:
-	.cfi_offset %rbx, -48
-.Ltmp13:
-	.cfi_offset %r12, -40
-.Ltmp14:
-	.cfi_offset %r14, -32
-.Ltmp15:
-	.cfi_offset %r15, -24
-	xorl	%r14d, %r14d
-	movl	$C, %r15d
-	.align	16, 0x90
-.LBB1_1:                                # %for.cond1.preheader
-                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB1_2 Depth 2
-	movq	stdout(%rip), %rax
-	movq	%r15, %r12
-	xorl	%ebx, %ebx
-	.align	16, 0x90
-.LBB1_2:                                # %for.body3
-                                        #   Parent Loop BB1_1 Depth=1
-                                        # =>  This Inner Loop Header: Depth=2
-	vmovss	(%r12), %xmm0
-	vcvtss2sd	%xmm0, %xmm0, %xmm0
-	movq	%rax, %rdi
-	movl	$.L.str, %esi
-	movb	$1, %al
-	callq	fprintf
-	movslq	%ebx, %rax
-	imulq	$1717986919, %rax, %rcx # imm = 0x66666667
-	movq	%rcx, %rdx
-	shrq	$63, %rdx
-	sarq	$37, %rcx
-	addl	%edx, %ecx
-	imull	$80, %ecx, %ecx
-	subl	%ecx, %eax
-	cmpl	$79, %eax
-	jne	.LBB1_4
-# BB#3:                                 # %if.then
-                                        #   in Loop: Header=BB1_2 Depth=2
-	movq	stdout(%rip), %rsi
-	movl	$10, %edi
-	callq	fputc
-.LBB1_4:                                # %for.inc
-                                        #   in Loop: Header=BB1_2 Depth=2
-	addq	$4, %r12
-	incq	%rbx
-	movq	stdout(%rip), %rax
-	cmpq	$1536, %rbx             # imm = 0x600
-	jne	.LBB1_2
-# BB#5:                                 # %for.end
-                                        #   in Loop: Header=BB1_1 Depth=1
-	movl	$10, %edi
-	movq	%rax, %rsi
-	callq	fputc
-	addq	$6144, %r15             # imm = 0x1800
-	incq	%r14
-	cmpq	$1536, %r14             # imm = 0x600
-	jne	.LBB1_1
-# BB#6:                                 # %for.end12
-	popq	%rbx
-	popq	%r12
-	popq	%r14
-	popq	%r15
-	popq	%rbp
-	ret
-.Ltmp16:
-	.size	print_array, .Ltmp16-print_array
-	.cfi_endproc
-
-	.section	.rodata.cst8,"aM",@progbits,8
-	.align	8
-.LCPI2_0:
-	.quad	4602678819172646912     # double 0.5
-	.text
-	.globl	main
-	.align	16, 0x90
-	.type	main,@function
-main:                                   # @main
-	.cfi_startproc
-# BB#0:                                 # %entry
-	pushq	%rbp
-.Ltmp20:
-	.cfi_def_cfa_offset 16
-.Ltmp21:
-	.cfi_offset %rbp, -16
-	movq	%rsp, %rbp
-.Ltmp22:
 	.cfi_def_cfa_register %rbp
 	pushq	%r15
 	pushq	%r14
 	pushq	%r13
 	pushq	%r12
 	pushq	%rbx
-	subq	$56, %rsp
-.Ltmp23:
+	pushq	%rax
 	.cfi_offset %rbx, -56
-.Ltmp24:
 	.cfi_offset %r12, -48
-.Ltmp25:
 	.cfi_offset %r13, -40
-.Ltmp26:
 	.cfi_offset %r14, -32
-.Ltmp27:
 	.cfi_offset %r15, -24
-	xorl	%ebx, %ebx
-	vmovsd	.LCPI2_0(%rip), %xmm0
-	.align	16, 0x90
-.LBB2_1:                                # %polly.loop_preheader3.i
-                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB2_2 Depth 2
-	xorl	%ecx, %ecx
-	.align	16, 0x90
-.LBB2_2:                                # %polly.loop_header2.i
-                                        #   Parent Loop BB2_1 Depth=1
-                                        # =>  This Inner Loop Header: Depth=2
-	movl	%ecx, %edx
-	imull	%ebx, %edx
-	movl	%edx, %esi
-	sarl	$31, %esi
-	shrl	$22, %esi
-	addl	%edx, %esi
-	andl	$-1024, %esi            # imm = 0xFFFFFFFFFFFFFC00
-	negl	%esi
-	movq	%rbx, %rax
-	shlq	$11, %rax
-	leal	1(%rdx,%rsi), %edi
-	leaq	(%rax,%rax,2), %rsi
-	leaq	1(%rcx), %rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	vcvtsi2sdl	%edi, %xmm0, %xmm1
-	vmulsd	%xmm0, %xmm1, %xmm1
-	vcvtsd2ss	%xmm1, %xmm1, %xmm1
-	vmovss	%xmm1, A(%rsi,%rcx,4)
-	vmovss	%xmm1, B(%rsi,%rcx,4)
-	movq	%rdx, %rcx
-	jne	.LBB2_2
-# BB#3:                                 # %polly.loop_exit4.i
-                                        #   in Loop: Header=BB2_1 Depth=1
-	incq	%rbx
-	cmpq	$1536, %rbx             # imm = 0x600
-	jne	.LBB2_1
-# BB#4:                                 # %polly.loop_preheader3.preheader
-	movl	$C, %edi
-	xorl	%esi, %esi
-	movl	$9437184, %edx          # imm = 0x900000
-	callq	memset
-	xorl	%esi, %esi
-	movl	$C+16, %eax
-	movq	%rax, -88(%rbp)         # 8-byte Spill
-	.align	16, 0x90
-.LBB2_5:                                # %polly.loop_preheader17
-                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB2_15 Depth 2
-                                        #       Child Loop BB2_8 Depth 3
-                                        #         Child Loop BB2_11 Depth 4
-                                        #           Child Loop BB2_17 Depth 5
-                                        #             Child Loop BB2_18 Depth 6
-	movq	%rsi, -56(%rbp)         # 8-byte Spill
-	movq	%rsi, %rax
-	orq	$63, %rax
-	movq	%rax, -72(%rbp)         # 8-byte Spill
-	leaq	-1(%rax), %rax
-	movq	%rax, -48(%rbp)         # 8-byte Spill
-	xorl	%edx, %edx
-	.align	16, 0x90
-.LBB2_15:                               # %polly.loop_preheader24
-                                        #   Parent Loop BB2_5 Depth=1
-                                        # =>  This Loop Header: Depth=2
-                                        #       Child Loop BB2_8 Depth 3
-                                        #         Child Loop BB2_11 Depth 4
-                                        #           Child Loop BB2_17 Depth 5
-                                        #             Child Loop BB2_18 Depth 6
-	movq	%rdx, -80(%rbp)         # 8-byte Spill
-	leaq	-4(%rdx), %rcx
-	movq	%rdx, %rax
-	decq	%rax
-	cmovsq	%rcx, %rax
-	movq	%rax, %r15
-	sarq	$63, %r15
-	shrq	$62, %r15
-	addq	%rax, %r15
-	andq	$-4, %r15
-	movq	%rdx, %r13
-	orq	$63, %r13
-	leaq	-4(%r13), %rdx
-	xorl	%r10d, %r10d
-	movq	-88(%rbp), %rax         # 8-byte Reload
-	leaq	(%rax,%r15,4), %rax
-	movq	%rax, -64(%rbp)         # 8-byte Spill
-	leaq	B+16(,%r15,4), %rbx
-	leaq	4(%r15), %r12
-	.align	16, 0x90
-.LBB2_8:                                # %polly.loop_header23
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_15 Depth=2
-                                        # =>    This Loop Header: Depth=3
-                                        #         Child Loop BB2_11 Depth 4
-                                        #           Child Loop BB2_17 Depth 5
-                                        #             Child Loop BB2_18 Depth 6
-	cmpq	-72(%rbp), %rsi         # 8-byte Folded Reload
-	jg	.LBB2_13
-# BB#9:                                 # %polly.loop_header30.preheader
-                                        #   in Loop: Header=BB2_8 Depth=3
-	movq	%r10, %rax
-	orq	$63, %rax
-	cmpq	%rax, %r10
-	jg	.LBB2_13
-# BB#10:                                #   in Loop: Header=BB2_8 Depth=3
-	decq	%rax
-	movq	-64(%rbp), %r14         # 8-byte Reload
-	movq	-56(%rbp), %r11         # 8-byte Reload
-	.align	16, 0x90
-.LBB2_11:                               # %polly.loop_header37.preheader
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_15 Depth=2
-                                        #       Parent Loop BB2_8 Depth=3
-                                        # =>      This Loop Header: Depth=4
-                                        #           Child Loop BB2_17 Depth 5
-                                        #             Child Loop BB2_18 Depth 6
-	cmpq	%r13, %r12
-	movq	%rbx, %r8
-	movq	%r10, %rsi
-	jg	.LBB2_12
-	.align	16, 0x90
-.LBB2_17:                               # %polly.loop_header46.preheader
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_15 Depth=2
-                                        #       Parent Loop BB2_8 Depth=3
-                                        #         Parent Loop BB2_11 Depth=4
-                                        # =>        This Loop Header: Depth=5
-                                        #             Child Loop BB2_18 Depth 6
-	leaq	(%r11,%r11,2), %rcx
-	shlq	$11, %rcx
-	vbroadcastss	A(%rcx,%rsi,4), %xmm0
-	movq	%r14, %rdi
-	movq	%r8, %r9
-	movq	%r15, %rcx
-.LBB2_18:                               # %polly.loop_header46
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_15 Depth=2
-                                        #       Parent Loop BB2_8 Depth=3
-                                        #         Parent Loop BB2_11 Depth=4
-                                        #           Parent Loop BB2_17 Depth=5
-                                        # =>          This Inner Loop Header: Depth=6
-	vmulps	(%r9), %xmm0, %xmm1
-	vaddps	(%rdi), %xmm1, %xmm1
-	vmovaps	%xmm1, (%rdi)
-	addq	$16, %rdi
-	addq	$16, %r9
-	addq	$4, %rcx
-	cmpq	%rdx, %rcx
-	jle	.LBB2_18
-# BB#16:                                # %polly.loop_exit48
-                                        #   in Loop: Header=BB2_17 Depth=5
-	addq	$6144, %r8              # imm = 0x1800
-	cmpq	%rax, %rsi
-	leaq	1(%rsi), %rsi
-	jle	.LBB2_17
-	.align	16, 0x90
-.LBB2_12:                               # %polly.loop_exit39
-                                        #   in Loop: Header=BB2_11 Depth=4
-	addq	$6144, %r14             # imm = 0x1800
-	cmpq	-48(%rbp), %r11         # 8-byte Folded Reload
-	leaq	1(%r11), %r11
-	jle	.LBB2_11
-	.align	16, 0x90
-.LBB2_13:                               # %polly.loop_exit32
-                                        #   in Loop: Header=BB2_8 Depth=3
-	addq	$393216, %rbx           # imm = 0x60000
-	cmpq	$1472, %r10             # imm = 0x5C0
-	leaq	64(%r10), %r10
-	movq	-56(%rbp), %rsi         # 8-byte Reload
-	jl	.LBB2_8
-# BB#14:                                # %polly.loop_exit25
-                                        #   in Loop: Header=BB2_15 Depth=2
-	movq	-80(%rbp), %rdx         # 8-byte Reload
-	cmpq	$1472, %rdx             # imm = 0x5C0
-	leaq	64(%rdx), %rdx
-	jl	.LBB2_15
-# BB#6:                                 # %polly.loop_exit18
-                                        #   in Loop: Header=BB2_5 Depth=1
-	addq	$393216, -88(%rbp)      # 8-byte Folded Spill
-                                        # imm = 0x60000
-	cmpq	$1472, %rsi             # imm = 0x5C0
-	leaq	64(%rsi), %rsi
-	jl	.LBB2_5
-# BB#7:                                 # %polly.loop_exit11
+	leaq	C(%rip), %r13
 	xorl	%eax, %eax
-	addq	$56, %rsp
+	movl	$3435973837, %r12d      # imm = 0xCCCCCCCD
+	leaq	.L.str(%rip), %r14
+	.p2align	4, 0x90
+.LBB1_1:                                # %for.cond1.preheader
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB1_2 Depth 2
+	movq	%rax, -48(%rbp)         # 8-byte Spill
+	movq	stdout(%rip), %rsi
+	xorl	%ebx, %ebx
+	.p2align	4, 0x90
+.LBB1_2:                                # %for.body3
+                                        #   Parent Loop BB1_1 Depth=1
+                                        # =>  This Inner Loop Header: Depth=2
+	movl	%ebx, %eax
+	imulq	%r12, %rax
+	shrq	$38, %rax
+	leal	(%rax,%rax,4), %r15d
+	shll	$4, %r15d
+	addl	$79, %r15d
+	movss	(%r13,%rbx,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
+	cvtss2sd	%xmm0, %xmm0
+	movb	$1, %al
+	movq	%rsi, %rdi
+	movq	%r14, %rsi
+	callq	fprintf
+	cmpl	%ebx, %r15d
+	jne	.LBB1_4
+# %bb.3:                                # %if.then
+                                        #   in Loop: Header=BB1_2 Depth=2
+	movq	stdout(%rip), %rsi
+	movl	$10, %edi
+	callq	fputc@PLT
+.LBB1_4:                                # %for.inc
+                                        #   in Loop: Header=BB1_2 Depth=2
+	addq	$1, %rbx
+	movq	stdout(%rip), %rsi
+	cmpq	$1536, %rbx             # imm = 0x600
+	jne	.LBB1_2
+# %bb.5:                                # %for.end
+                                        #   in Loop: Header=BB1_1 Depth=1
+	movl	$10, %edi
+	callq	fputc@PLT
+	movq	-48(%rbp), %rax         # 8-byte Reload
+	addq	$1, %rax
+	addq	$6144, %r13             # imm = 0x1800
+	cmpq	$1536, %rax             # imm = 0x600
+	jne	.LBB1_1
+# %bb.6:                                # %for.end12
+	addq	$8, %rsp
 	popq	%rbx
 	popq	%r12
 	popq	%r13
 	popq	%r14
 	popq	%r15
 	popq	%rbp
-	ret
-.Ltmp28:
-	.size	main, .Ltmp28-main
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end1:
+	.size	print_array, .Lfunc_end1-print_array
 	.cfi_endproc
-
+                                        # -- End function
+	.globl	main                    # -- Begin function main
+	.p2align	4, 0x90
+	.type	main,@function
+main:                                   # @main
+	.cfi_startproc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$264, %rsp              # imm = 0x108
+	.cfi_offset %rbx, -56
+	.cfi_offset %r12, -48
+	.cfi_offset %r13, -40
+	.cfi_offset %r14, -32
+	.cfi_offset %r15, -24
+	callq	init_array
+	leaq	C(%rip), %rdi
+	xorl	%eax, %eax
+	movq	%rax, -48(%rbp)         # 8-byte Spill
+	xorl	%esi, %esi
+	movl	$9437184, %edx          # imm = 0x900000
+	callq	memset@PLT
+	movl	$64, %eax
+	movq	%rax, -80(%rbp)         # 8-byte Spill
+	leaq	A(%rip), %rax
+	movq	%rax, -72(%rbp)         # 8-byte Spill
+	.p2align	4, 0x90
+.LBB2_1:                                # %polly.loop_header8
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB2_2 Depth 2
+                                        #       Child Loop BB2_3 Depth 3
+                                        #         Child Loop BB2_4 Depth 4
+                                        #           Child Loop BB2_5 Depth 5
+	leaq	B+192(%rip), %r9
+	xorl	%edi, %edi
+	xorl	%eax, %eax
+	.p2align	4, 0x90
+.LBB2_2:                                # %polly.loop_header14
+                                        #   Parent Loop BB2_1 Depth=1
+                                        # =>  This Loop Header: Depth=2
+                                        #       Child Loop BB2_3 Depth 3
+                                        #         Child Loop BB2_4 Depth 4
+                                        #           Child Loop BB2_5 Depth 5
+	movq	%rax, -168(%rbp)        # 8-byte Spill
+	movq	%rdi, -176(%rbp)        # 8-byte Spill
+	shlq	$6, %rdi
+	leaq	16(%rdi), %rdx
+	leaq	32(%rdi), %rsi
+	leaq	48(%rdi), %rcx
+	movq	-72(%rbp), %r12         # 8-byte Reload
+	movq	%r9, -184(%rbp)         # 8-byte Spill
+	xorl	%eax, %eax
+	.p2align	4, 0x90
+.LBB2_3:                                # %polly.loop_header20
+                                        #   Parent Loop BB2_1 Depth=1
+                                        #     Parent Loop BB2_2 Depth=2
+                                        # =>    This Loop Header: Depth=3
+                                        #         Child Loop BB2_4 Depth 4
+                                        #           Child Loop BB2_5 Depth 5
+	movq	%rax, -192(%rbp)        # 8-byte Spill
+	movq	%r12, -200(%rbp)        # 8-byte Spill
+	movq	-48(%rbp), %r14         # 8-byte Reload
+	.p2align	4, 0x90
+.LBB2_4:                                # %polly.loop_header26
+                                        #   Parent Loop BB2_1 Depth=1
+                                        #     Parent Loop BB2_2 Depth=2
+                                        #       Parent Loop BB2_3 Depth=3
+                                        # =>      This Loop Header: Depth=4
+                                        #           Child Loop BB2_5 Depth 5
+	leaq	(%r14,%r14,2), %rbx
+	shlq	$11, %rbx
+	leaq	C(%rip), %rax
+	addq	%rax, %rbx
+	leaq	(%rbx,%rdi,4), %r8
+	leaq	(%rbx,%rdx,4), %r15
+	leaq	(%rbx,%rsi,4), %r10
+	leaq	(%rbx,%rcx,4), %r11
+	movups	(%rbx,%rdi,4), %xmm8
+	movups	16(%rbx,%rdi,4), %xmm0
+	movaps	%xmm0, -144(%rbp)       # 16-byte Spill
+	movups	32(%rbx,%rdi,4), %xmm6
+	movups	48(%rbx,%rdi,4), %xmm1
+	movups	(%rbx,%rdx,4), %xmm15
+	movups	16(%rbx,%rdx,4), %xmm0
+	movaps	%xmm0, -64(%rbp)        # 16-byte Spill
+	movups	32(%rbx,%rdx,4), %xmm0
+	movaps	%xmm0, -96(%rbp)        # 16-byte Spill
+	movups	48(%rbx,%rdx,4), %xmm0
+	movaps	%xmm0, -112(%rbp)       # 16-byte Spill
+	movups	(%rbx,%rsi,4), %xmm11
+	movups	16(%rbx,%rsi,4), %xmm0
+	movaps	%xmm0, -160(%rbp)       # 16-byte Spill
+	movups	32(%rbx,%rsi,4), %xmm12
+	movups	48(%rbx,%rsi,4), %xmm0
+	movaps	%xmm0, -128(%rbp)       # 16-byte Spill
+	movups	(%rbx,%rcx,4), %xmm9
+	movups	16(%rbx,%rcx,4), %xmm13
+	movups	32(%rbx,%rcx,4), %xmm2
+	movups	48(%rbx,%rcx,4), %xmm3
+	movq	%r9, %rbx
+	movl	$0, %r13d
+	.p2align	4, 0x90
+.LBB2_5:                                # %vector.ph
+                                        #   Parent Loop BB2_1 Depth=1
+                                        #     Parent Loop BB2_2 Depth=2
+                                        #       Parent Loop BB2_3 Depth=3
+                                        #         Parent Loop BB2_4 Depth=4
+                                        # =>        This Inner Loop Header: Depth=5
+	movaps	%xmm12, -240(%rbp)      # 16-byte Spill
+	movaps	%xmm2, -256(%rbp)       # 16-byte Spill
+	movaps	%xmm3, -272(%rbp)       # 16-byte Spill
+	movaps	%xmm8, %xmm10
+	movaps	-144(%rbp), %xmm7       # 16-byte Reload
+	unpcklps	%xmm7, %xmm10   # xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
+	movaps	%xmm1, %xmm4
+	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
+	shufps	$36, %xmm4, %xmm10      # xmm10 = xmm10[0,1],xmm4[2,0]
+	movaps	%xmm7, %xmm5
+	shufps	$17, %xmm8, %xmm5       # xmm5 = xmm5[1,0],xmm8[1,0]
+	movaps	%xmm6, %xmm4
+	unpcklps	%xmm1, %xmm4    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+	shufps	$226, %xmm4, %xmm5      # xmm5 = xmm5[2,0],xmm4[2,3]
+	movaps	%xmm8, %xmm12
+	unpckhps	%xmm7, %xmm12   # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+	movaps	%xmm1, %xmm4
+	shufps	$34, %xmm6, %xmm4       # xmm4 = xmm4[2,0],xmm6[2,0]
+	shufps	$36, %xmm4, %xmm12      # xmm12 = xmm12[0,1],xmm4[2,0]
+	shufps	$51, %xmm8, %xmm7       # xmm7 = xmm7[3,0],xmm8[3,0]
+	unpckhps	%xmm1, %xmm6    # xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+	shufps	$226, %xmm6, %xmm7      # xmm7 = xmm7[2,0],xmm6[2,3]
+	movaps	-160(%rbx), %xmm0
+	movaps	-144(%rbx), %xmm1
+	movaps	%xmm1, %xmm6
+	shufps	$0, %xmm0, %xmm6        # xmm6 = xmm6[0,0],xmm0[0,0]
+	movaps	-192(%rbx), %xmm3
+	movaps	-176(%rbx), %xmm4
+	movaps	%xmm3, %xmm8
+	unpcklps	%xmm4, %xmm8    # xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+	shufps	$36, %xmm6, %xmm8       # xmm8 = xmm8[0,1],xmm6[2,0]
+	movaps	%xmm0, %xmm2
+	unpcklps	%xmm1, %xmm2    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+	movaps	%xmm4, %xmm6
+	shufps	$17, %xmm3, %xmm6       # xmm6 = xmm6[1,0],xmm3[1,0]
+	shufps	$226, %xmm2, %xmm6      # xmm6 = xmm6[2,0],xmm2[2,3]
+	movaps	%xmm1, %xmm2
+	shufps	$34, %xmm0, %xmm2       # xmm2 = xmm2[2,0],xmm0[2,0]
+	movaps	%xmm3, %xmm14
+	unpckhps	%xmm4, %xmm14   # xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+	shufps	$36, %xmm2, %xmm14      # xmm14 = xmm14[0,1],xmm2[2,0]
+	unpckhps	%xmm1, %xmm0    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+	shufps	$51, %xmm3, %xmm4       # xmm4 = xmm4[3,0],xmm3[3,0]
+	shufps	$226, %xmm0, %xmm4      # xmm4 = xmm4[2,0],xmm0[2,3]
+	movss	(%r12,%r13,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
+	shufps	$0, %xmm0, %xmm0        # xmm0 = xmm0[0,0,0,0]
+	mulps	%xmm0, %xmm8
+	addps	%xmm10, %xmm8
+	mulps	%xmm0, %xmm6
+	addps	%xmm5, %xmm6
+	mulps	%xmm0, %xmm14
+	addps	%xmm12, %xmm14
+	mulps	%xmm0, %xmm4
+	movaps	%xmm0, %xmm5
+	addps	%xmm7, %xmm4
+	movaps	%xmm14, %xmm0
+	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+	movaps	%xmm6, %xmm1
+	shufps	$51, %xmm8, %xmm1       # xmm1 = xmm1[3,0],xmm8[3,0]
+	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
+	movaps	%xmm1, -304(%rbp)       # 16-byte Spill
+	movaps	%xmm4, %xmm0
+	shufps	$34, %xmm14, %xmm0      # xmm0 = xmm0[2,0],xmm14[2,0]
+	movaps	%xmm8, %xmm1
+	unpckhps	%xmm6, %xmm1    # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+	shufps	$36, %xmm0, %xmm1       # xmm1 = xmm1[0,1],xmm0[2,0]
+	movaps	%xmm1, -288(%rbp)       # 16-byte Spill
+	movaps	%xmm14, %xmm0
+	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+	movaps	%xmm6, %xmm1
+	shufps	$17, %xmm8, %xmm1       # xmm1 = xmm1[1,0],xmm8[1,0]
+	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
+	movaps	%xmm1, -144(%rbp)       # 16-byte Spill
+	shufps	$0, %xmm14, %xmm4       # xmm4 = xmm4[0,0],xmm14[0,0]
+	unpcklps	%xmm6, %xmm8    # xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
+	shufps	$36, %xmm4, %xmm8       # xmm8 = xmm8[0,1],xmm4[2,0]
+	movaps	%xmm15, %xmm14
+	movaps	-64(%rbp), %xmm4        # 16-byte Reload
+	unpcklps	%xmm4, %xmm14   # xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+	movaps	-112(%rbp), %xmm1       # 16-byte Reload
+	movaps	%xmm1, %xmm0
+	movaps	-96(%rbp), %xmm3        # 16-byte Reload
+	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
+	shufps	$36, %xmm0, %xmm14      # xmm14 = xmm14[0,1],xmm0[2,0]
+	movaps	%xmm4, %xmm12
+	shufps	$17, %xmm15, %xmm12     # xmm12 = xmm12[1,0],xmm15[1,0]
+	movaps	%xmm3, %xmm2
+	unpcklps	%xmm1, %xmm2    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+	shufps	$226, %xmm2, %xmm12     # xmm12 = xmm12[2,0],xmm2[2,3]
+	movaps	%xmm15, %xmm7
+	unpckhps	%xmm4, %xmm7    # xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+	movaps	%xmm1, %xmm2
+	shufps	$34, %xmm3, %xmm2       # xmm2 = xmm2[2,0],xmm3[2,0]
+	shufps	$36, %xmm2, %xmm7       # xmm7 = xmm7[0,1],xmm2[2,0]
+	shufps	$51, %xmm15, %xmm4      # xmm4 = xmm4[3,0],xmm15[3,0]
+	unpckhps	%xmm1, %xmm3    # xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+	shufps	$226, %xmm3, %xmm4      # xmm4 = xmm4[2,0],xmm3[2,3]
+	movaps	%xmm4, -64(%rbp)        # 16-byte Spill
+	movaps	-96(%rbx), %xmm2
+	movaps	-80(%rbx), %xmm1
+	movaps	%xmm1, %xmm4
+	shufps	$0, %xmm2, %xmm4        # xmm4 = xmm4[0,0],xmm2[0,0]
+	movaps	-112(%rbx), %xmm10
+	movaps	-128(%rbx), %xmm0
+	movaps	%xmm0, %xmm15
+	unpcklps	%xmm10, %xmm15  # xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
+	shufps	$36, %xmm4, %xmm15      # xmm15 = xmm15[0,1],xmm4[2,0]
+	movaps	%xmm2, %xmm4
+	unpcklps	%xmm1, %xmm4    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+	movaps	%xmm10, %xmm6
+	shufps	$17, %xmm0, %xmm6       # xmm6 = xmm6[1,0],xmm0[1,0]
+	shufps	$226, %xmm4, %xmm6      # xmm6 = xmm6[2,0],xmm4[2,3]
+	movaps	%xmm1, %xmm3
+	shufps	$34, %xmm2, %xmm3       # xmm3 = xmm3[2,0],xmm2[2,0]
+	movaps	%xmm0, %xmm4
+	unpckhps	%xmm10, %xmm4   # xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
+	shufps	$36, %xmm3, %xmm4       # xmm4 = xmm4[0,1],xmm3[2,0]
+	unpckhps	%xmm1, %xmm2    # xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+	shufps	$51, %xmm0, %xmm10      # xmm10 = xmm10[3,0],xmm0[3,0]
+	shufps	$226, %xmm2, %xmm10     # xmm10 = xmm10[2,0],xmm2[2,3]
+	movaps	%xmm5, -224(%rbp)       # 16-byte Spill
+	mulps	%xmm5, %xmm15
+	addps	%xmm14, %xmm15
+	mulps	%xmm5, %xmm6
+	addps	%xmm12, %xmm6
+	mulps	%xmm5, %xmm4
+	addps	%xmm7, %xmm4
+	mulps	%xmm5, %xmm10
+	addps	-64(%rbp), %xmm10       # 16-byte Folded Reload
+	movaps	%xmm4, %xmm0
+	unpckhps	%xmm10, %xmm0   # xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
+	movaps	%xmm6, %xmm1
+	shufps	$51, %xmm15, %xmm1      # xmm1 = xmm1[3,0],xmm15[3,0]
+	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
+	movaps	%xmm1, -112(%rbp)       # 16-byte Spill
+	movaps	%xmm10, %xmm0
+	shufps	$34, %xmm4, %xmm0       # xmm0 = xmm0[2,0],xmm4[2,0]
+	movaps	%xmm15, %xmm1
+	unpckhps	%xmm6, %xmm1    # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+	shufps	$36, %xmm0, %xmm1       # xmm1 = xmm1[0,1],xmm0[2,0]
+	movaps	%xmm1, -96(%rbp)        # 16-byte Spill
+	movaps	%xmm4, %xmm0
+	unpcklps	%xmm10, %xmm0   # xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
+	movaps	%xmm6, %xmm1
+	shufps	$17, %xmm15, %xmm1      # xmm1 = xmm1[1,0],xmm15[1,0]
+	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
+	movaps	%xmm1, -64(%rbp)        # 16-byte Spill
+	shufps	$0, %xmm4, %xmm10       # xmm10 = xmm10[0,0],xmm4[0,0]
+	unpcklps	%xmm6, %xmm15   # xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
+	shufps	$36, %xmm10, %xmm15     # xmm15 = xmm15[0,1],xmm10[2,0]
+	movaps	%xmm11, %xmm10
+	movaps	-160(%rbp), %xmm14      # 16-byte Reload
+	unpcklps	%xmm14, %xmm10  # xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
+	movaps	-128(%rbp), %xmm2       # 16-byte Reload
+	movaps	%xmm2, %xmm0
+	movaps	-240(%rbp), %xmm3       # 16-byte Reload
+	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
+	shufps	$36, %xmm0, %xmm10      # xmm10 = xmm10[0,1],xmm0[2,0]
+	movaps	%xmm14, %xmm12
+	shufps	$17, %xmm11, %xmm12     # xmm12 = xmm12[1,0],xmm11[1,0]
+	movaps	%xmm3, %xmm0
+	unpcklps	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+	shufps	$226, %xmm0, %xmm12     # xmm12 = xmm12[2,0],xmm0[2,3]
+	movaps	%xmm11, %xmm0
+	unpckhps	%xmm14, %xmm0   # xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+	movaps	%xmm2, %xmm1
+	shufps	$34, %xmm3, %xmm1       # xmm1 = xmm1[2,0],xmm3[2,0]
+	shufps	$36, %xmm1, %xmm0       # xmm0 = xmm0[0,1],xmm1[2,0]
+	shufps	$51, %xmm11, %xmm14     # xmm14 = xmm14[3,0],xmm11[3,0]
+	unpckhps	%xmm2, %xmm3    # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+	shufps	$226, %xmm3, %xmm14     # xmm14 = xmm14[2,0],xmm3[2,3]
+	movaps	-32(%rbx), %xmm1
+	movaps	-16(%rbx), %xmm2
+	movaps	%xmm2, %xmm3
+	shufps	$0, %xmm1, %xmm3        # xmm3 = xmm3[0,0],xmm1[0,0]
+	movaps	-48(%rbx), %xmm4
+	movaps	-64(%rbx), %xmm5
+	movaps	%xmm5, %xmm11
+	unpcklps	%xmm4, %xmm11   # xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
+	shufps	$36, %xmm3, %xmm11      # xmm11 = xmm11[0,1],xmm3[2,0]
+	movaps	%xmm1, %xmm3
+	unpcklps	%xmm2, %xmm3    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+	movaps	%xmm4, %xmm7
+	shufps	$17, %xmm5, %xmm7       # xmm7 = xmm7[1,0],xmm5[1,0]
+	shufps	$226, %xmm3, %xmm7      # xmm7 = xmm7[2,0],xmm3[2,3]
+	movaps	%xmm2, %xmm3
+	shufps	$34, %xmm1, %xmm3       # xmm3 = xmm3[2,0],xmm1[2,0]
+	movaps	%xmm5, %xmm6
+	unpckhps	%xmm4, %xmm6    # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+	shufps	$36, %xmm3, %xmm6       # xmm6 = xmm6[0,1],xmm3[2,0]
+	unpckhps	%xmm2, %xmm1    # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+	shufps	$51, %xmm5, %xmm4       # xmm4 = xmm4[3,0],xmm5[3,0]
+	shufps	$226, %xmm1, %xmm4      # xmm4 = xmm4[2,0],xmm1[2,3]
+	movaps	-224(%rbp), %xmm1       # 16-byte Reload
+	mulps	%xmm1, %xmm11
+	addps	%xmm10, %xmm11
+	mulps	%xmm1, %xmm7
+	addps	%xmm12, %xmm7
+	mulps	%xmm1, %xmm6
+	addps	%xmm0, %xmm6
+	mulps	%xmm1, %xmm4
+	addps	%xmm14, %xmm4
+	movaps	%xmm6, %xmm0
+	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+	movaps	%xmm7, %xmm1
+	shufps	$51, %xmm11, %xmm1      # xmm1 = xmm1[3,0],xmm11[3,0]
+	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
+	movaps	%xmm1, -128(%rbp)       # 16-byte Spill
+	movaps	%xmm4, %xmm0
+	shufps	$34, %xmm6, %xmm0       # xmm0 = xmm0[2,0],xmm6[2,0]
+	movaps	%xmm11, %xmm12
+	unpckhps	%xmm7, %xmm12   # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+	shufps	$36, %xmm0, %xmm12      # xmm12 = xmm12[0,1],xmm0[2,0]
+	movaps	%xmm6, %xmm0
+	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+	movaps	%xmm7, %xmm1
+	shufps	$17, %xmm11, %xmm1      # xmm1 = xmm1[1,0],xmm11[1,0]
+	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
+	movaps	%xmm1, -160(%rbp)       # 16-byte Spill
+	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
+	unpcklps	%xmm7, %xmm11   # xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
+	shufps	$36, %xmm4, %xmm11      # xmm11 = xmm11[0,1],xmm4[2,0]
+	movaps	%xmm9, %xmm10
+	unpcklps	%xmm13, %xmm10  # xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
+	movaps	-272(%rbp), %xmm2       # 16-byte Reload
+	movaps	%xmm2, %xmm0
+	movaps	-256(%rbp), %xmm3       # 16-byte Reload
+	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
+	shufps	$36, %xmm0, %xmm10      # xmm10 = xmm10[0,1],xmm0[2,0]
+	movaps	%xmm13, %xmm14
+	shufps	$17, %xmm9, %xmm14      # xmm14 = xmm14[1,0],xmm9[1,0]
+	movaps	%xmm3, %xmm0
+	unpcklps	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+	shufps	$226, %xmm0, %xmm14     # xmm14 = xmm14[2,0],xmm0[2,3]
+	movaps	%xmm9, %xmm0
+	unpckhps	%xmm13, %xmm0   # xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+	movaps	%xmm2, %xmm1
+	shufps	$34, %xmm3, %xmm1       # xmm1 = xmm1[2,0],xmm3[2,0]
+	shufps	$36, %xmm1, %xmm0       # xmm0 = xmm0[0,1],xmm1[2,0]
+	shufps	$51, %xmm9, %xmm13      # xmm13 = xmm13[3,0],xmm9[3,0]
+	unpckhps	%xmm2, %xmm3    # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+	shufps	$226, %xmm3, %xmm13     # xmm13 = xmm13[2,0],xmm3[2,3]
+	movaps	32(%rbx), %xmm1
+	movaps	48(%rbx), %xmm2
+	movaps	%xmm2, %xmm3
+	shufps	$0, %xmm1, %xmm3        # xmm3 = xmm3[0,0],xmm1[0,0]
+	movaps	16(%rbx), %xmm4
+	movaps	(%rbx), %xmm5
+	movaps	%xmm5, %xmm9
+	unpcklps	%xmm4, %xmm9    # xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
+	shufps	$36, %xmm3, %xmm9       # xmm9 = xmm9[0,1],xmm3[2,0]
+	movaps	%xmm1, %xmm3
+	unpcklps	%xmm2, %xmm3    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+	movaps	%xmm4, %xmm7
+	shufps	$17, %xmm5, %xmm7       # xmm7 = xmm7[1,0],xmm5[1,0]
+	shufps	$226, %xmm3, %xmm7      # xmm7 = xmm7[2,0],xmm3[2,3]
+	movaps	%xmm2, %xmm3
+	shufps	$34, %xmm1, %xmm3       # xmm3 = xmm3[2,0],xmm1[2,0]
+	movaps	%xmm5, %xmm6
+	unpckhps	%xmm4, %xmm6    # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+	shufps	$36, %xmm3, %xmm6       # xmm6 = xmm6[0,1],xmm3[2,0]
+	unpckhps	%xmm2, %xmm1    # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+	shufps	$51, %xmm5, %xmm4       # xmm4 = xmm4[3,0],xmm5[3,0]
+	shufps	$226, %xmm1, %xmm4      # xmm4 = xmm4[2,0],xmm1[2,3]
+	movaps	-224(%rbp), %xmm1       # 16-byte Reload
+	mulps	%xmm1, %xmm9
+	addps	%xmm10, %xmm9
+	mulps	%xmm1, %xmm7
+	addps	%xmm14, %xmm7
+	mulps	%xmm1, %xmm6
+	addps	%xmm0, %xmm6
+	mulps	%xmm1, %xmm4
+	addps	%xmm13, %xmm4
+	movaps	%xmm6, %xmm0
+	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+	movaps	%xmm7, %xmm3
+	shufps	$51, %xmm9, %xmm3       # xmm3 = xmm3[3,0],xmm9[3,0]
+	shufps	$226, %xmm0, %xmm3      # xmm3 = xmm3[2,0],xmm0[2,3]
+	movaps	%xmm4, %xmm0
+	shufps	$34, %xmm6, %xmm0       # xmm0 = xmm0[2,0],xmm6[2,0]
+	movaps	%xmm9, %xmm2
+	unpckhps	%xmm7, %xmm2    # xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+	shufps	$36, %xmm0, %xmm2       # xmm2 = xmm2[0,1],xmm0[2,0]
+	movaps	%xmm6, %xmm0
+	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+	movaps	%xmm7, %xmm13
+	shufps	$17, %xmm9, %xmm13      # xmm13 = xmm13[1,0],xmm9[1,0]
+	shufps	$226, %xmm0, %xmm13     # xmm13 = xmm13[2,0],xmm0[2,3]
+	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
+	movaps	-288(%rbp), %xmm6       # 16-byte Reload
+	movaps	-304(%rbp), %xmm1       # 16-byte Reload
+	unpcklps	%xmm7, %xmm9    # xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+	shufps	$36, %xmm4, %xmm9       # xmm9 = xmm9[0,1],xmm4[2,0]
+	addq	$1, %r13
+	addq	$6144, %rbx             # imm = 0x1800
+	cmpq	$64, %r13
+	jne	.LBB2_5
+# %bb.6:                                # %polly.loop_exit34
+                                        #   in Loop: Header=BB2_4 Depth=4
+	movups	%xmm8, (%r8)
+	movaps	-144(%rbp), %xmm0       # 16-byte Reload
+	movups	%xmm0, 16(%r8)
+	movups	%xmm6, 32(%r8)
+	movups	%xmm1, 48(%r8)
+	movaps	-112(%rbp), %xmm0       # 16-byte Reload
+	movups	%xmm0, 48(%r15)
+	movaps	-96(%rbp), %xmm0        # 16-byte Reload
+	movups	%xmm0, 32(%r15)
+	movaps	-64(%rbp), %xmm0        # 16-byte Reload
+	movups	%xmm0, 16(%r15)
+	movups	%xmm15, (%r15)
+	movaps	-128(%rbp), %xmm0       # 16-byte Reload
+	movups	%xmm0, 48(%r10)
+	movaps	-160(%rbp), %xmm0       # 16-byte Reload
+	movups	%xmm0, 16(%r10)
+	movups	%xmm11, (%r10)
+	movups	%xmm12, 32(%r10)
+	movups	%xmm3, 48(%r11)
+	movups	%xmm13, 16(%r11)
+	movups	%xmm9, (%r11)
+	movups	%xmm2, 32(%r11)
+	addq	$1, %r14
+	addq	$6144, %r12             # imm = 0x1800
+	cmpq	-80(%rbp), %r14         # 8-byte Folded Reload
+	jne	.LBB2_4
+# %bb.7:                                # %polly.loop_exit28
+                                        #   in Loop: Header=BB2_3 Depth=3
+	movq	-192(%rbp), %rax        # 8-byte Reload
+	addq	$64, %rax
+	addq	$393216, %r9            # imm = 0x60000
+	movq	-200(%rbp), %r12        # 8-byte Reload
+	addq	$256, %r12              # imm = 0x100
+	cmpq	$1536, %rax             # imm = 0x600
+	jb	.LBB2_3
+# %bb.8:                                # %polly.loop_exit22
+                                        #   in Loop: Header=BB2_2 Depth=2
+	movq	-168(%rbp), %rax        # 8-byte Reload
+	addq	$64, %rax
+	movq	-176(%rbp), %rdi        # 8-byte Reload
+	addq	$1, %rdi
+	movq	-184(%rbp), %r9         # 8-byte Reload
+	addq	$256, %r9               # imm = 0x100
+	cmpq	$1536, %rax             # imm = 0x600
+	jb	.LBB2_2
+# %bb.9:                                # %polly.loop_exit16
+                                        #   in Loop: Header=BB2_1 Depth=1
+	movq	-48(%rbp), %rax         # 8-byte Reload
+	movq	%rax, %rcx
+	addq	$64, %rcx
+	addq	$64, -80(%rbp)          # 8-byte Folded Spill
+	addq	$393216, -72(%rbp)      # 8-byte Folded Spill
+                                        # imm = 0x60000
+	movq	%rcx, %rax
+	movq	%rcx, -48(%rbp)         # 8-byte Spill
+	cmpq	$1536, %rcx             # imm = 0x600
+	jb	.LBB2_1
+# %bb.10:                               # %polly.exiting
+	xorl	%eax, %eax
+	addq	$264, %rsp              # imm = 0x108
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end2:
+	.size	main, .Lfunc_end2-main
+	.cfi_endproc
+                                        # -- End function
 	.type	A,@object               # @A
 	.comm	A,9437184,16
 	.type	B,@object               # @B
@ -387,10 +647,11 @@ main:                                   # @main
 	.type	.L.str,@object          # @.str
 	.section	.rodata.str1.1,"aMS",@progbits,1
 .L.str:
-	.asciz	 "%lf "
+	.asciz	"%lf "
 	.size	.L.str, 5

 	.type	C,@object               # @C
 	.comm	C,9437184,16

+	.ident	"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
 	.section	".note.GNU-stack","",@progbits
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exe
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exe
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s
@ -1,379 +1,495 @@
-	.file	"matmul.polly.interchanged+tiled.ll"
+	.text
+	.file	"matmul.c"
 	.section	.rodata.cst8,"aM",@progbits,8
-	.align	8
+	.p2align	3               # -- Begin function init_array
 .LCPI0_0:
 	.quad	4602678819172646912     # double 0.5
 	.text
 	.globl	init_array
-	.align	16, 0x90
+	.p2align	4, 0x90
 	.type	init_array,@function
 init_array:                             # @init_array
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp2:
 	.cfi_def_cfa_offset 16
-.Ltmp3:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp4:
 	.cfi_def_cfa_register %rbp
+	leaq	B(%rip), %rax
+	leaq	A(%rip), %rcx
 	xorl	%r8d, %r8d
-	vmovsd	.LCPI0_0(%rip), %xmm0
-	.align	16, 0x90
-.LBB0_1:                                # %polly.loop_preheader3
+	movsd	.LCPI0_0(%rip), %xmm0   # xmm0 = mem[0],zero
+	xorl	%r9d, %r9d
+	.p2align	4, 0x90
+.LBB0_1:                                # %polly.loop_header
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
-	xorl	%ecx, %ecx
-	.align	16, 0x90
-.LBB0_2:                                # %polly.loop_header2
+	movl	$1, %edi
+	xorl	%edx, %edx
+	.p2align	4, 0x90
+.LBB0_2:                                # %polly.loop_header1
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
-	movl	%ecx, %edx
-	imull	%r8d, %edx
 	movl	%edx, %esi
-	sarl	$31, %esi
-	shrl	$22, %esi
-	addl	%edx, %esi
-	andl	$-1024, %esi            # imm = 0xFFFFFFFFFFFFFC00
-	negl	%esi
-	movq	%r8, %rax
-	shlq	$11, %rax
-	leal	1(%rdx,%rsi), %edi
-	leaq	(%rax,%rax,2), %rsi
-	leaq	1(%rcx), %rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	vcvtsi2sdl	%edi, %xmm0, %xmm1
-	vmulsd	%xmm0, %xmm1, %xmm1
-	vcvtsd2ss	%xmm1, %xmm1, %xmm1
-	vmovss	%xmm1, A(%rsi,%rcx,4)
-	vmovss	%xmm1, B(%rsi,%rcx,4)
-	movq	%rdx, %rcx
+	andl	$1022, %esi             # imm = 0x3FE
+	orl	$1, %esi
+	xorps	%xmm1, %xmm1
+	cvtsi2sdl	%esi, %xmm1
+	mulsd	%xmm0, %xmm1
+	cvtsd2ss	%xmm1, %xmm1
+	movss	%xmm1, -4(%rcx,%rdi,4)
+	movss	%xmm1, -4(%rax,%rdi,4)
+	leal	(%r9,%rdx), %esi
+	andl	$1023, %esi             # imm = 0x3FF
+	addl	$1, %esi
+	xorps	%xmm1, %xmm1
+	cvtsi2sdl	%esi, %xmm1
+	mulsd	%xmm0, %xmm1
+	cvtsd2ss	%xmm1, %xmm1
+	movss	%xmm1, (%rcx,%rdi,4)
+	movss	%xmm1, (%rax,%rdi,4)
+	addq	$2, %rdi
+	addl	%r8d, %edx
+	cmpq	$1537, %rdi             # imm = 0x601
 	jne	.LBB0_2
-# BB#3:                                 # %polly.loop_exit4
+# %bb.3:                                # %polly.loop_exit3
                                        #   in Loop: Header=BB0_1 Depth=1
-	incq	%r8
-	cmpq	$1536, %r8              # imm = 0x600
+	addq	$1, %r9
+	addq	$6144, %rax             # imm = 0x1800
+	addq	$6144, %rcx             # imm = 0x1800
+	addl	$2, %r8d
+	cmpq	$1536, %r9              # imm = 0x600
 	jne	.LBB0_1
-# BB#4:                                 # %polly.loop_exit
+# %bb.4:                                # %polly.exiting
 	popq	%rbp
-	ret
-.Ltmp5:
-	.size	init_array, .Ltmp5-init_array
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end0:
+	.size	init_array, .Lfunc_end0-init_array
 	.cfi_endproc
-
-	.globl	print_array
-	.align	16, 0x90
+                                        # -- End function
+	.globl	print_array             # -- Begin function print_array
+	.p2align	4, 0x90
 	.type	print_array,@function
 print_array:                            # @print_array
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp9:
 	.cfi_def_cfa_offset 16
-.Ltmp10:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp11:
-	.cfi_def_cfa_register %rbp
-	pushq	%r15
-	pushq	%r14
-	pushq	%r12
-	pushq	%rbx
-.Ltmp12:
-	.cfi_offset %rbx, -48
-.Ltmp13:
-	.cfi_offset %r12, -40
-.Ltmp14:
-	.cfi_offset %r14, -32
-.Ltmp15:
-	.cfi_offset %r15, -24
-	xorl	%r14d, %r14d
-	movl	$C, %r15d
-	.align	16, 0x90
-.LBB1_1:                                # %for.cond1.preheader
-                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB1_2 Depth 2
-	movq	stdout(%rip), %rax
-	movq	%r15, %r12
-	xorl	%ebx, %ebx
-	.align	16, 0x90
-.LBB1_2:                                # %for.body3
-                                        #   Parent Loop BB1_1 Depth=1
-                                        # =>  This Inner Loop Header: Depth=2
-	vmovss	(%r12), %xmm0
-	vcvtss2sd	%xmm0, %xmm0, %xmm0
-	movq	%rax, %rdi
-	movl	$.L.str, %esi
-	movb	$1, %al
-	callq	fprintf
-	movslq	%ebx, %rax
-	imulq	$1717986919, %rax, %rcx # imm = 0x66666667
-	movq	%rcx, %rdx
-	shrq	$63, %rdx
-	sarq	$37, %rcx
-	addl	%edx, %ecx
-	imull	$80, %ecx, %ecx
-	subl	%ecx, %eax
-	cmpl	$79, %eax
-	jne	.LBB1_4
-# BB#3:                                 # %if.then
-                                        #   in Loop: Header=BB1_2 Depth=2
-	movq	stdout(%rip), %rsi
-	movl	$10, %edi
-	callq	fputc
-.LBB1_4:                                # %for.inc
-                                        #   in Loop: Header=BB1_2 Depth=2
-	addq	$4, %r12
-	incq	%rbx
-	movq	stdout(%rip), %rax
-	cmpq	$1536, %rbx             # imm = 0x600
-	jne	.LBB1_2
-# BB#5:                                 # %for.end
-                                        #   in Loop: Header=BB1_1 Depth=1
-	movl	$10, %edi
-	movq	%rax, %rsi
-	callq	fputc
-	addq	$6144, %r15             # imm = 0x1800
-	incq	%r14
-	cmpq	$1536, %r14             # imm = 0x600
-	jne	.LBB1_1
-# BB#6:                                 # %for.end12
-	popq	%rbx
-	popq	%r12
-	popq	%r14
-	popq	%r15
-	popq	%rbp
-	ret
-.Ltmp16:
-	.size	print_array, .Ltmp16-print_array
-	.cfi_endproc
-
-	.section	.rodata.cst8,"aM",@progbits,8
-	.align	8
-.LCPI2_0:
-	.quad	4602678819172646912     # double 0.5
-	.text
-	.globl	main
-	.align	16, 0x90
-	.type	main,@function
-main:                                   # @main
-	.cfi_startproc
-# BB#0:                                 # %entry
-	pushq	%rbp
-.Ltmp20:
-	.cfi_def_cfa_offset 16
-.Ltmp21:
-	.cfi_offset %rbp, -16
-	movq	%rsp, %rbp
-.Ltmp22:
 	.cfi_def_cfa_register %rbp
 	pushq	%r15
 	pushq	%r14
 	pushq	%r13
 	pushq	%r12
 	pushq	%rbx
-	subq	$56, %rsp
-.Ltmp23:
+	pushq	%rax
 	.cfi_offset %rbx, -56
-.Ltmp24:
 	.cfi_offset %r12, -48
-.Ltmp25:
 	.cfi_offset %r13, -40
-.Ltmp26:
 	.cfi_offset %r14, -32
-.Ltmp27:
 	.cfi_offset %r15, -24
+	leaq	C(%rip), %r13
+	xorl	%eax, %eax
+	movl	$3435973837, %r12d      # imm = 0xCCCCCCCD
+	leaq	.L.str(%rip), %r14
+	.p2align	4, 0x90
+.LBB1_1:                                # %for.cond1.preheader
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB1_2 Depth 2
+	movq	%rax, -48(%rbp)         # 8-byte Spill
+	movq	stdout(%rip), %rsi
 	xorl	%ebx, %ebx
-	vmovsd	.LCPI2_0(%rip), %xmm0
-	.align	16, 0x90
-.LBB2_1:                                # %polly.loop_preheader3.i
-                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB2_2 Depth 2
-	xorl	%ecx, %ecx
-	.align	16, 0x90
-.LBB2_2:                                # %polly.loop_header2.i
-                                        #   Parent Loop BB2_1 Depth=1
+	.p2align	4, 0x90
+.LBB1_2:                                # %for.body3
+                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
-	movl	%ecx, %edx
-	imull	%ebx, %edx
-	movl	%edx, %esi
-	sarl	$31, %esi
-	shrl	$22, %esi
-	addl	%edx, %esi
-	andl	$-1024, %esi            # imm = 0xFFFFFFFFFFFFFC00
-	negl	%esi
-	movq	%rbx, %rax
-	shlq	$11, %rax
-	leal	1(%rdx,%rsi), %edi
-	leaq	(%rax,%rax,2), %rsi
-	leaq	1(%rcx), %rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	vcvtsi2sdl	%edi, %xmm0, %xmm1
-	vmulsd	%xmm0, %xmm1, %xmm1
-	vcvtsd2ss	%xmm1, %xmm1, %xmm1
-	vmovss	%xmm1, A(%rsi,%rcx,4)
-	vmovss	%xmm1, B(%rsi,%rcx,4)
-	movq	%rdx, %rcx
-	jne	.LBB2_2
-# BB#3:                                 # %polly.loop_exit4.i
-                                        #   in Loop: Header=BB2_1 Depth=1
-	incq	%rbx
+	movl	%ebx, %eax
+	imulq	%r12, %rax
+	shrq	$38, %rax
+	leal	(%rax,%rax,4), %r15d
+	shll	$4, %r15d
+	addl	$79, %r15d
+	movss	(%r13,%rbx,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
+	cvtss2sd	%xmm0, %xmm0
+	movb	$1, %al
+	movq	%rsi, %rdi
+	movq	%r14, %rsi
+	callq	fprintf
+	cmpl	%ebx, %r15d
+	jne	.LBB1_4
+# %bb.3:                                # %if.then
+                                        #   in Loop: Header=BB1_2 Depth=2
+	movq	stdout(%rip), %rsi
+	movl	$10, %edi
+	callq	fputc@PLT
+.LBB1_4:                                # %for.inc
+                                        #   in Loop: Header=BB1_2 Depth=2
+	addq	$1, %rbx
+	movq	stdout(%rip), %rsi
 	cmpq	$1536, %rbx             # imm = 0x600
-	jne	.LBB2_1
-# BB#4:                                 # %polly.loop_preheader3.preheader
-	movl	$C, %ebx
-	movl	$C, %edi
-	xorl	%esi, %esi
-	movl	$9437184, %edx          # imm = 0x900000
-	callq	memset
-	xorl	%eax, %eax
-	.align	16, 0x90
-.LBB2_5:                                # %polly.loop_preheader17
-                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB2_15 Depth 2
-                                        #       Child Loop BB2_8 Depth 3
-                                        #         Child Loop BB2_11 Depth 4
-                                        #           Child Loop BB2_17 Depth 5
-                                        #             Child Loop BB2_18 Depth 6
-	movq	%rax, -56(%rbp)         # 8-byte Spill
-	movq	%rbx, -88(%rbp)         # 8-byte Spill
-	movq	%rax, %rcx
-	orq	$63, %rcx
-	movq	%rcx, -72(%rbp)         # 8-byte Spill
-	leaq	-1(%rcx), %rcx
-	movq	%rcx, -48(%rbp)         # 8-byte Spill
-	movq	$-1, %r15
-	movl	$B, %ecx
-	movq	%rbx, -64(%rbp)         # 8-byte Spill
-	xorl	%r12d, %r12d
-	.align	16, 0x90
-.LBB2_15:                               # %polly.loop_preheader24
-                                        #   Parent Loop BB2_5 Depth=1
-                                        # =>  This Loop Header: Depth=2
-                                        #       Child Loop BB2_8 Depth 3
-                                        #         Child Loop BB2_11 Depth 4
-                                        #           Child Loop BB2_17 Depth 5
-                                        #             Child Loop BB2_18 Depth 6
-	movq	%rcx, -80(%rbp)         # 8-byte Spill
-	movq	%r12, %r13
-	orq	$63, %r13
-	leaq	-1(%r13), %rbx
-	xorl	%r9d, %r9d
-	movq	%rcx, %rdx
-	.align	16, 0x90
-.LBB2_8:                                # %polly.loop_header23
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_15 Depth=2
-                                        # =>    This Loop Header: Depth=3
-                                        #         Child Loop BB2_11 Depth 4
-                                        #           Child Loop BB2_17 Depth 5
-                                        #             Child Loop BB2_18 Depth 6
-	cmpq	-72(%rbp), %rax         # 8-byte Folded Reload
-	jg	.LBB2_13
-# BB#9:                                 # %polly.loop_header30.preheader
-                                        #   in Loop: Header=BB2_8 Depth=3
-	movq	%r9, %rax
-	orq	$63, %rax
-	cmpq	%rax, %r9
-	jg	.LBB2_13
-# BB#10:                                #   in Loop: Header=BB2_8 Depth=3
-	decq	%rax
-	movq	-64(%rbp), %r10         # 8-byte Reload
-	movq	-56(%rbp), %r11         # 8-byte Reload
-	.align	16, 0x90
-.LBB2_11:                               # %polly.loop_header37.preheader
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_15 Depth=2
-                                        #       Parent Loop BB2_8 Depth=3
-                                        # =>      This Loop Header: Depth=4
-                                        #           Child Loop BB2_17 Depth 5
-                                        #             Child Loop BB2_18 Depth 6
-	cmpq	%r13, %r12
-	movq	%rdx, %r14
-	movq	%r9, %rcx
-	jg	.LBB2_12
-	.align	16, 0x90
-.LBB2_17:                               # %polly.loop_header46.preheader
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_15 Depth=2
-                                        #       Parent Loop BB2_8 Depth=3
-                                        #         Parent Loop BB2_11 Depth=4
-                                        # =>        This Loop Header: Depth=5
-                                        #             Child Loop BB2_18 Depth 6
-	leaq	(%r11,%r11,2), %rsi
-	shlq	$11, %rsi
-	vmovss	A(%rsi,%rcx,4), %xmm0
-	movq	%r10, %rdi
-	movq	%r14, %r8
-	movq	%r15, %rsi
-.LBB2_18:                               # %polly.loop_header46
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_15 Depth=2
-                                        #       Parent Loop BB2_8 Depth=3
-                                        #         Parent Loop BB2_11 Depth=4
-                                        #           Parent Loop BB2_17 Depth=5
-                                        # =>          This Inner Loop Header: Depth=6
-	vmulss	(%r8), %xmm0, %xmm1
-	vaddss	(%rdi), %xmm1, %xmm1
-	vmovss	%xmm1, (%rdi)
-	addq	$4, %rdi
-	addq	$4, %r8
-	incq	%rsi
-	cmpq	%rbx, %rsi
-	jle	.LBB2_18
-# BB#16:                                # %polly.loop_exit48
-                                        #   in Loop: Header=BB2_17 Depth=5
-	addq	$6144, %r14             # imm = 0x1800
-	cmpq	%rax, %rcx
-	leaq	1(%rcx), %rcx
-	jle	.LBB2_17
-	.align	16, 0x90
-.LBB2_12:                               # %polly.loop_exit39
-                                        #   in Loop: Header=BB2_11 Depth=4
-	addq	$6144, %r10             # imm = 0x1800
-	cmpq	-48(%rbp), %r11         # 8-byte Folded Reload
-	leaq	1(%r11), %r11
-	jle	.LBB2_11
-	.align	16, 0x90
-.LBB2_13:                               # %polly.loop_exit32
-                                        #   in Loop: Header=BB2_8 Depth=3
-	addq	$393216, %rdx           # imm = 0x60000
-	cmpq	$1472, %r9              # imm = 0x5C0
-	leaq	64(%r9), %r9
-	movq	-56(%rbp), %rax         # 8-byte Reload
-	jl	.LBB2_8
-# BB#14:                                # %polly.loop_exit25
-                                        #   in Loop: Header=BB2_15 Depth=2
-	addq	$256, -64(%rbp)         # 8-byte Folded Spill
-                                        # imm = 0x100
-	movq	-80(%rbp), %rcx         # 8-byte Reload
-	addq	$256, %rcx              # imm = 0x100
-	addq	$64, %r15
-	cmpq	$1472, %r12             # imm = 0x5C0
-	leaq	64(%r12), %r12
-	jl	.LBB2_15
-# BB#6:                                 # %polly.loop_exit18
-                                        #   in Loop: Header=BB2_5 Depth=1
-	movq	-88(%rbp), %rbx         # 8-byte Reload
-	addq	$393216, %rbx           # imm = 0x60000
-	cmpq	$1472, %rax             # imm = 0x5C0
-	leaq	64(%rax), %rax
-	jl	.LBB2_5
-# BB#7:                                 # %polly.loop_exit11
-	xorl	%eax, %eax
-	addq	$56, %rsp
+	jne	.LBB1_2
+# %bb.5:                                # %for.end
+                                        #   in Loop: Header=BB1_1 Depth=1
+	movl	$10, %edi
+	callq	fputc@PLT
+	movq	-48(%rbp), %rax         # 8-byte Reload
+	addq	$1, %rax
+	addq	$6144, %r13             # imm = 0x1800
+	cmpq	$1536, %rax             # imm = 0x600
+	jne	.LBB1_1
+# %bb.6:                                # %for.end12
+	addq	$8, %rsp
 	popq	%rbx
 	popq	%r12
 	popq	%r13
 	popq	%r14
 	popq	%r15
 	popq	%rbp
-	ret
-.Ltmp28:
-	.size	main, .Ltmp28-main
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end1:
+	.size	print_array, .Lfunc_end1-print_array
 	.cfi_endproc
-
+                                        # -- End function
+	.globl	main                    # -- Begin function main
+	.p2align	4, 0x90
+	.type	main,@function
+main:                                   # @main
+	.cfi_startproc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$344, %rsp              # imm = 0x158
+	.cfi_offset %rbx, -56
+	.cfi_offset %r12, -48
+	.cfi_offset %r13, -40
+	.cfi_offset %r14, -32
+	.cfi_offset %r15, -24
+	callq	init_array
+	leaq	C(%rip), %rdi
+	xorl	%eax, %eax
+	movq	%rax, -48(%rbp)         # 8-byte Spill
+	xorl	%esi, %esi
+	movl	$9437184, %edx          # imm = 0x900000
+	callq	memset@PLT
+	movl	$64, %eax
+	movq	%rax, -64(%rbp)         # 8-byte Spill
+	leaq	A(%rip), %rax
+	movq	%rax, -56(%rbp)         # 8-byte Spill
+	.p2align	4, 0x90
+.LBB2_1:                                # %polly.loop_header8
+                                        # =>This Loop Header: Depth=1
+                                        #     Child Loop BB2_2 Depth 2
+                                        #       Child Loop BB2_3 Depth 3
+                                        #         Child Loop BB2_4 Depth 4
+                                        #           Child Loop BB2_5 Depth 5
+	leaq	B+240(%rip), %rax
+	xorl	%edi, %edi
+	.p2align	4, 0x90
+.LBB2_2:                                # %polly.loop_header14
+                                        #   Parent Loop BB2_1 Depth=1
+                                        # =>  This Loop Header: Depth=2
+                                        #       Child Loop BB2_3 Depth 3
+                                        #         Child Loop BB2_4 Depth 4
+                                        #           Child Loop BB2_5 Depth 5
+	movq	%rdi, %rcx
+	orq	$4, %rcx
+	movq	%rcx, -80(%rbp)         # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$8, %rcx
+	movq	%rcx, -264(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$12, %rcx
+	movq	%rcx, -256(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$16, %rcx
+	movq	%rcx, -248(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$20, %rcx
+	movq	%rcx, -240(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$24, %rcx
+	movq	%rcx, -232(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$28, %rcx
+	movq	%rcx, -224(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$32, %rcx
+	movq	%rcx, -216(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$36, %rcx
+	movq	%rcx, -208(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$40, %rcx
+	movq	%rcx, -200(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$44, %rcx
+	movq	%rcx, -192(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$48, %rcx
+	movq	%rcx, -184(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$52, %rcx
+	movq	%rcx, -176(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$56, %rcx
+	movq	%rcx, -168(%rbp)        # 8-byte Spill
+	movq	%rdi, %rcx
+	orq	$60, %rcx
+	movq	%rcx, -160(%rbp)        # 8-byte Spill
+	movq	-56(%rbp), %rdx         # 8-byte Reload
+	movq	%rax, -136(%rbp)        # 8-byte Spill
+	movq	%rax, -72(%rbp)         # 8-byte Spill
+	xorl	%eax, %eax
+	movq	%rdi, -272(%rbp)        # 8-byte Spill
+	.p2align	4, 0x90
+.LBB2_3:                                # %polly.loop_header20
+                                        #   Parent Loop BB2_1 Depth=1
+                                        #     Parent Loop BB2_2 Depth=2
+                                        # =>    This Loop Header: Depth=3
+                                        #         Child Loop BB2_4 Depth 4
+                                        #           Child Loop BB2_5 Depth 5
+	movq	%rax, -144(%rbp)        # 8-byte Spill
+	movq	%rdx, -152(%rbp)        # 8-byte Spill
+	movq	-48(%rbp), %rax         # 8-byte Reload
+	.p2align	4, 0x90
+.LBB2_4:                                # %polly.loop_header26
+                                        #   Parent Loop BB2_1 Depth=1
+                                        #     Parent Loop BB2_2 Depth=2
+                                        #       Parent Loop BB2_3 Depth=3
+                                        # =>      This Loop Header: Depth=4
+                                        #           Child Loop BB2_5 Depth 5
+	movq	%rax, -376(%rbp)        # 8-byte Spill
+	leaq	(%rax,%rax,2), %rax
+	shlq	$11, %rax
+	leaq	C(%rip), %rsi
+	addq	%rsi, %rax
+	leaq	(%rax,%rdi,4), %rcx
+	movq	%rcx, -368(%rbp)        # 8-byte Spill
+	movq	-80(%rbp), %rcx         # 8-byte Reload
+	leaq	(%rax,%rcx,4), %rcx
+	movq	%rcx, -360(%rbp)        # 8-byte Spill
+	movq	-264(%rbp), %rbx        # 8-byte Reload
+	leaq	(%rax,%rbx,4), %rcx
+	movq	%rcx, -352(%rbp)        # 8-byte Spill
+	movq	-256(%rbp), %r8         # 8-byte Reload
+	movq	%rdi, %rsi
+	leaq	(%rax,%r8,4), %rdi
+	movq	%rdi, -344(%rbp)        # 8-byte Spill
+	movq	-248(%rbp), %rdi        # 8-byte Reload
+	leaq	(%rax,%rdi,4), %rcx
+	movq	%rcx, -336(%rbp)        # 8-byte Spill
+	movq	-240(%rbp), %r9         # 8-byte Reload
+	leaq	(%rax,%r9,4), %rcx
+	movq	%rcx, -328(%rbp)        # 8-byte Spill
+	movq	-232(%rbp), %r10        # 8-byte Reload
+	leaq	(%rax,%r10,4), %rcx
+	movq	%rcx, -320(%rbp)        # 8-byte Spill
+	movq	-224(%rbp), %r14        # 8-byte Reload
+	leaq	(%rax,%r14,4), %rcx
+	movq	%rcx, -312(%rbp)        # 8-byte Spill
+	movq	-216(%rbp), %r15        # 8-byte Reload
+	leaq	(%rax,%r15,4), %rcx
+	movq	%rcx, -304(%rbp)        # 8-byte Spill
+	movq	-208(%rbp), %r12        # 8-byte Reload
+	leaq	(%rax,%r12,4), %rcx
+	movq	%rcx, -296(%rbp)        # 8-byte Spill
+	movq	-200(%rbp), %r13        # 8-byte Reload
+	leaq	(%rax,%r13,4), %rcx
+	movq	%rcx, -288(%rbp)        # 8-byte Spill
+	movq	-192(%rbp), %r11        # 8-byte Reload
+	leaq	(%rax,%r11,4), %rcx
+	movq	%rcx, -280(%rbp)        # 8-byte Spill
+	movaps	(%rax,%rsi,4), %xmm15
+	movq	-80(%rbp), %rcx         # 8-byte Reload
+	movaps	(%rax,%rcx,4), %xmm14
+	movaps	(%rax,%rbx,4), %xmm13
+	movaps	(%rax,%r8,4), %xmm12
+	movaps	(%rax,%rdi,4), %xmm11
+	movaps	(%rax,%r9,4), %xmm10
+	movaps	(%rax,%r10,4), %xmm9
+	movaps	(%rax,%r14,4), %xmm8
+	movaps	(%rax,%r15,4), %xmm7
+	movaps	(%rax,%r12,4), %xmm6
+	movaps	(%rax,%r13,4), %xmm5
+	movaps	(%rax,%r11,4), %xmm4
+	movq	-184(%rbp), %rcx        # 8-byte Reload
+	movaps	(%rax,%rcx,4), %xmm3
+	movq	-176(%rbp), %rsi        # 8-byte Reload
+	movaps	(%rax,%rsi,4), %xmm0
+	movaps	%xmm0, -96(%rbp)        # 16-byte Spill
+	movq	-168(%rbp), %rbx        # 8-byte Reload
+	movaps	(%rax,%rbx,4), %xmm0
+	movaps	%xmm0, -112(%rbp)       # 16-byte Spill
+	movq	-160(%rbp), %rdi        # 8-byte Reload
+	movaps	(%rax,%rdi,4), %xmm0
+	movaps	%xmm0, -128(%rbp)       # 16-byte Spill
+	leaq	(%rax,%rcx,4), %r8
+	leaq	(%rax,%rsi,4), %rcx
+	leaq	(%rax,%rbx,4), %rsi
+	leaq	(%rax,%rdi,4), %rax
+	movq	-72(%rbp), %r9          # 8-byte Reload
+	movl	$0, %r10d
+	.p2align	4, 0x90
+.LBB2_5:                                # %vector.ph
+                                        #   Parent Loop BB2_1 Depth=1
+                                        #     Parent Loop BB2_2 Depth=2
+                                        #       Parent Loop BB2_3 Depth=3
+                                        #         Parent Loop BB2_4 Depth=4
+                                        # =>        This Inner Loop Header: Depth=5
+	movss	(%rdx,%r10,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
+	shufps	$0, %xmm0, %xmm0        # xmm0 = xmm0[0,0,0,0]
+	movaps	-240(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm15
+	movaps	-224(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm14
+	movaps	-208(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm13
+	movaps	-192(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm12
+	movaps	-176(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm11
+	movaps	-160(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm10
+	movaps	-144(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm9
+	movaps	-128(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm8
+	movaps	-112(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm7
+	movaps	-96(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm6
+	movaps	-80(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm5
+	movaps	-64(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm4
+	movaps	-48(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	addps	%xmm1, %xmm3
+	movaps	-32(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	movaps	-96(%rbp), %xmm2        # 16-byte Reload
+	addps	%xmm1, %xmm2
+	movaps	%xmm2, -96(%rbp)        # 16-byte Spill
+	movaps	-16(%r9), %xmm1
+	mulps	%xmm0, %xmm1
+	movaps	-112(%rbp), %xmm2       # 16-byte Reload
+	addps	%xmm1, %xmm2
+	movaps	%xmm2, -112(%rbp)       # 16-byte Spill
+	mulps	(%r9), %xmm0
+	movaps	-128(%rbp), %xmm1       # 16-byte Reload
+	addps	%xmm0, %xmm1
+	movaps	%xmm1, -128(%rbp)       # 16-byte Spill
+	addq	$1, %r10
+	addq	$6144, %r9              # imm = 0x1800
+	cmpq	$64, %r10
+	jne	.LBB2_5
+# %bb.6:                                # %polly.loop_exit34
+                                        #   in Loop: Header=BB2_4 Depth=4
+	movq	-368(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm15, (%rdi)
+	movq	-360(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm14, (%rdi)
+	movq	-352(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm13, (%rdi)
+	movq	-344(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm12, (%rdi)
+	movq	-336(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm11, (%rdi)
+	movq	-328(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm10, (%rdi)
+	movq	-320(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm9, (%rdi)
+	movq	-312(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm8, (%rdi)
+	movq	-304(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm7, (%rdi)
+	movq	-296(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm6, (%rdi)
+	movq	-288(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm5, (%rdi)
+	movq	-280(%rbp), %rdi        # 8-byte Reload
+	movaps	%xmm4, (%rdi)
+	movaps	%xmm3, (%r8)
+	movaps	-96(%rbp), %xmm0        # 16-byte Reload
+	movaps	%xmm0, (%rcx)
+	movaps	-112(%rbp), %xmm0       # 16-byte Reload
+	movaps	%xmm0, (%rsi)
+	movaps	-128(%rbp), %xmm0       # 16-byte Reload
+	movaps	%xmm0, (%rax)
+	movq	-376(%rbp), %rax        # 8-byte Reload
+	addq	$1, %rax
+	addq	$6144, %rdx             # imm = 0x1800
+	cmpq	-64(%rbp), %rax         # 8-byte Folded Reload
+	movq	-272(%rbp), %rdi        # 8-byte Reload
+	jne	.LBB2_4
+# %bb.7:                                # %polly.loop_exit28
+                                        #   in Loop: Header=BB2_3 Depth=3
+	movq	-144(%rbp), %rax        # 8-byte Reload
+	addq	$64, %rax
+	addq	$393216, -72(%rbp)      # 8-byte Folded Spill
+                                        # imm = 0x60000
+	movq	-152(%rbp), %rdx        # 8-byte Reload
+	addq	$256, %rdx              # imm = 0x100
+	cmpq	$1536, %rax             # imm = 0x600
+	jb	.LBB2_3
+# %bb.8:                                # %polly.loop_exit22
+                                        #   in Loop: Header=BB2_2 Depth=2
+	addq	$64, %rdi
+	movq	-136(%rbp), %rax        # 8-byte Reload
+	addq	$256, %rax              # imm = 0x100
+	cmpq	$1536, %rdi             # imm = 0x600
+	jb	.LBB2_2
+# %bb.9:                                # %polly.loop_exit16
+                                        #   in Loop: Header=BB2_1 Depth=1
+	movq	-48(%rbp), %rax         # 8-byte Reload
+	movq	%rax, %rcx
+	addq	$64, %rcx
+	addq	$64, -64(%rbp)          # 8-byte Folded Spill
+	addq	$393216, -56(%rbp)      # 8-byte Folded Spill
+                                        # imm = 0x60000
+	movq	%rcx, %rax
+	movq	%rcx, -48(%rbp)         # 8-byte Spill
+	cmpq	$1536, %rcx             # imm = 0x600
+	jb	.LBB2_1
+# %bb.10:                               # %polly.exiting
+	xorl	%eax, %eax
+	addq	$344, %rsp              # imm = 0x158
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end2:
+	.size	main, .Lfunc_end2-main
+	.cfi_endproc
+                                        # -- End function
 	.type	A,@object               # @A
 	.comm	A,9437184,16
 	.type	B,@object               # @B
@ -381,10 +497,11 @@ main:                                   # @main
 	.type	.L.str,@object          # @.str
 	.section	.rodata.str1.1,"aMS",@progbits,1
 .L.str:
-	.asciz	 "%lf "
+	.asciz	"%lf "
 	.size	.L.str, 5

 	.type	C,@object               # @C
 	.comm	C,9437184,16

+	.ident	"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
 	.section	".note.GNU-stack","",@progbits
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged.exe
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.exe
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged.ll
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged.s
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.s
@ -1,275 +1,248 @@
-	.file	"matmul.polly.interchanged.ll"
+	.text
+	.file	"matmul.c"
 	.section	.rodata.cst8,"aM",@progbits,8
-	.align	8
+	.p2align	3               # -- Begin function init_array
 .LCPI0_0:
 	.quad	4602678819172646912     # double 0.5
 	.text
 	.globl	init_array
-	.align	16, 0x90
+	.p2align	4, 0x90
 	.type	init_array,@function
 init_array:                             # @init_array
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp2:
 	.cfi_def_cfa_offset 16
-.Ltmp3:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp4:
 	.cfi_def_cfa_register %rbp
+	leaq	B(%rip), %rax
+	leaq	A(%rip), %rcx
 	xorl	%r8d, %r8d
-	vmovsd	.LCPI0_0(%rip), %xmm0
-	.align	16, 0x90
-.LBB0_1:                                # %polly.loop_preheader3
+	movsd	.LCPI0_0(%rip), %xmm0   # xmm0 = mem[0],zero
+	xorl	%r9d, %r9d
+	.p2align	4, 0x90
+.LBB0_1:                                # %polly.loop_header
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
-	xorl	%ecx, %ecx
-	.align	16, 0x90
-.LBB0_2:                                # %polly.loop_header2
+	movl	$1, %edi
+	xorl	%edx, %edx
+	.p2align	4, 0x90
+.LBB0_2:                                # %polly.loop_header1
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
-	movl	%ecx, %edx
-	imull	%r8d, %edx
 	movl	%edx, %esi
-	sarl	$31, %esi
-	shrl	$22, %esi
-	addl	%edx, %esi
-	andl	$-1024, %esi            # imm = 0xFFFFFFFFFFFFFC00
-	negl	%esi
-	movq	%r8, %rax
-	shlq	$11, %rax
-	leal	1(%rdx,%rsi), %edi
-	leaq	(%rax,%rax,2), %rsi
-	leaq	1(%rcx), %rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	vcvtsi2sdl	%edi, %xmm0, %xmm1
-	vmulsd	%xmm0, %xmm1, %xmm1
-	vcvtsd2ss	%xmm1, %xmm1, %xmm1
-	vmovss	%xmm1, A(%rsi,%rcx,4)
-	vmovss	%xmm1, B(%rsi,%rcx,4)
-	movq	%rdx, %rcx
+	andl	$1022, %esi             # imm = 0x3FE
+	orl	$1, %esi
+	xorps	%xmm1, %xmm1
+	cvtsi2sdl	%esi, %xmm1
+	mulsd	%xmm0, %xmm1
+	cvtsd2ss	%xmm1, %xmm1
+	movss	%xmm1, -4(%rcx,%rdi,4)
+	movss	%xmm1, -4(%rax,%rdi,4)
+	leal	(%r9,%rdx), %esi
+	andl	$1023, %esi             # imm = 0x3FF
+	addl	$1, %esi
+	xorps	%xmm1, %xmm1
+	cvtsi2sdl	%esi, %xmm1
+	mulsd	%xmm0, %xmm1
+	cvtsd2ss	%xmm1, %xmm1
+	movss	%xmm1, (%rcx,%rdi,4)
+	movss	%xmm1, (%rax,%rdi,4)
+	addq	$2, %rdi
+	addl	%r8d, %edx
+	cmpq	$1537, %rdi             # imm = 0x601
 	jne	.LBB0_2
-# BB#3:                                 # %polly.loop_exit4
+# %bb.3:                                # %polly.loop_exit3
                                        #   in Loop: Header=BB0_1 Depth=1
-	incq	%r8
-	cmpq	$1536, %r8              # imm = 0x600
+	addq	$1, %r9
+	addq	$6144, %rax             # imm = 0x1800
+	addq	$6144, %rcx             # imm = 0x1800
+	addl	$2, %r8d
+	cmpq	$1536, %r9              # imm = 0x600
 	jne	.LBB0_1
-# BB#4:                                 # %polly.loop_exit
+# %bb.4:                                # %polly.exiting
 	popq	%rbp
-	ret
-.Ltmp5:
-	.size	init_array, .Ltmp5-init_array
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end0:
+	.size	init_array, .Lfunc_end0-init_array
 	.cfi_endproc
-
-	.globl	print_array
-	.align	16, 0x90
+                                        # -- End function
+	.globl	print_array             # -- Begin function print_array
+	.p2align	4, 0x90
 	.type	print_array,@function
 print_array:                            # @print_array
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp9:
 	.cfi_def_cfa_offset 16
-.Ltmp10:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp11:
 	.cfi_def_cfa_register %rbp
 	pushq	%r15
 	pushq	%r14
+	pushq	%r13
 	pushq	%r12
 	pushq	%rbx
-.Ltmp12:
-	.cfi_offset %rbx, -48
-.Ltmp13:
-	.cfi_offset %r12, -40
-.Ltmp14:
+	pushq	%rax
+	.cfi_offset %rbx, -56
+	.cfi_offset %r12, -48
+	.cfi_offset %r13, -40
 	.cfi_offset %r14, -32
-.Ltmp15:
 	.cfi_offset %r15, -24
-	xorl	%r14d, %r14d
-	movl	$C, %r15d
-	.align	16, 0x90
+	leaq	C(%rip), %r13
+	xorl	%eax, %eax
+	movl	$3435973837, %r12d      # imm = 0xCCCCCCCD
+	leaq	.L.str(%rip), %r14
+	.p2align	4, 0x90
 .LBB1_1:                                # %for.cond1.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB1_2 Depth 2
-	movq	stdout(%rip), %rax
-	movq	%r15, %r12
+	movq	%rax, -48(%rbp)         # 8-byte Spill
+	movq	stdout(%rip), %rsi
 	xorl	%ebx, %ebx
-	.align	16, 0x90
+	.p2align	4, 0x90
 .LBB1_2:                                # %for.body3
                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
-	vmovss	(%r12), %xmm0
-	vcvtss2sd	%xmm0, %xmm0, %xmm0
-	movq	%rax, %rdi
-	movl	$.L.str, %esi
+	movl	%ebx, %eax
+	imulq	%r12, %rax
+	shrq	$38, %rax
+	leal	(%rax,%rax,4), %r15d
+	shll	$4, %r15d
+	addl	$79, %r15d
+	movss	(%r13,%rbx,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
+	cvtss2sd	%xmm0, %xmm0
 	movb	$1, %al
+	movq	%rsi, %rdi
+	movq	%r14, %rsi
 	callq	fprintf
-	movslq	%ebx, %rax
-	imulq	$1717986919, %rax, %rcx # imm = 0x66666667
-	movq	%rcx, %rdx
-	shrq	$63, %rdx
-	sarq	$37, %rcx
-	addl	%edx, %ecx
-	imull	$80, %ecx, %ecx
-	subl	%ecx, %eax
-	cmpl	$79, %eax
+	cmpl	%ebx, %r15d
 	jne	.LBB1_4
-# BB#3:                                 # %if.then
+# %bb.3:                                # %if.then
                                        #   in Loop: Header=BB1_2 Depth=2
 	movq	stdout(%rip), %rsi
 	movl	$10, %edi
-	callq	fputc
+	callq	fputc@PLT
 .LBB1_4:                                # %for.inc
                                        #   in Loop: Header=BB1_2 Depth=2
-	addq	$4, %r12
-	incq	%rbx
-	movq	stdout(%rip), %rax
+	addq	$1, %rbx
+	movq	stdout(%rip), %rsi
 	cmpq	$1536, %rbx             # imm = 0x600
 	jne	.LBB1_2
-# BB#5:                                 # %for.end
+# %bb.5:                                # %for.end
                                        #   in Loop: Header=BB1_1 Depth=1
 	movl	$10, %edi
-	movq	%rax, %rsi
-	callq	fputc
-	addq	$6144, %r15             # imm = 0x1800
-	incq	%r14
-	cmpq	$1536, %r14             # imm = 0x600
+	callq	fputc@PLT
+	movq	-48(%rbp), %rax         # 8-byte Reload
+	addq	$1, %rax
+	addq	$6144, %r13             # imm = 0x1800
+	cmpq	$1536, %rax             # imm = 0x600
 	jne	.LBB1_1
-# BB#6:                                 # %for.end12
+# %bb.6:                                # %for.end12
+	addq	$8, %rsp
 	popq	%rbx
 	popq	%r12
+	popq	%r13
 	popq	%r14
 	popq	%r15
 	popq	%rbp
-	ret
-.Ltmp16:
-	.size	print_array, .Ltmp16-print_array
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end1:
+	.size	print_array, .Lfunc_end1-print_array
 	.cfi_endproc
-
-	.section	.rodata.cst8,"aM",@progbits,8
-	.align	8
-.LCPI2_0:
-	.quad	4602678819172646912     # double 0.5
-	.text
-	.globl	main
-	.align	16, 0x90
+                                        # -- End function
+	.globl	main                    # -- Begin function main
+	.p2align	4, 0x90
 	.type	main,@function
 main:                                   # @main
 	.cfi_startproc
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
 	pushq	%rbp
-.Ltmp20:
 	.cfi_def_cfa_offset 16
-.Ltmp21:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
-.Ltmp22:
 	.cfi_def_cfa_register %rbp
 	pushq	%r14
 	pushq	%rbx
-.Ltmp23:
 	.cfi_offset %rbx, -32
-.Ltmp24:
 	.cfi_offset %r14, -24
-	xorl	%ebx, %ebx
-	vmovsd	.LCPI2_0(%rip), %xmm0
-	.align	16, 0x90
-.LBB2_1:                                # %polly.loop_preheader3.i
-                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB2_2 Depth 2
-	xorl	%ecx, %ecx
-	.align	16, 0x90
-.LBB2_2:                                # %polly.loop_header2.i
-                                        #   Parent Loop BB2_1 Depth=1
-                                        # =>  This Inner Loop Header: Depth=2
-	movl	%ecx, %edx
-	imull	%ebx, %edx
-	movl	%edx, %esi
-	sarl	$31, %esi
-	shrl	$22, %esi
-	addl	%edx, %esi
-	andl	$-1024, %esi            # imm = 0xFFFFFFFFFFFFFC00
-	negl	%esi
-	movq	%rbx, %rax
-	shlq	$11, %rax
-	leal	1(%rdx,%rsi), %edi
-	leaq	(%rax,%rax,2), %rsi
-	leaq	1(%rcx), %rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	vcvtsi2sdl	%edi, %xmm0, %xmm1
-	vmulsd	%xmm0, %xmm1, %xmm1
-	vcvtsd2ss	%xmm1, %xmm1, %xmm1
-	vmovss	%xmm1, A(%rsi,%rcx,4)
-	vmovss	%xmm1, B(%rsi,%rcx,4)
-	movq	%rdx, %rcx
-	jne	.LBB2_2
-# BB#3:                                 # %polly.loop_exit4.i
-                                        #   in Loop: Header=BB2_1 Depth=1
-	incq	%rbx
-	cmpq	$1536, %rbx             # imm = 0x600
-	jne	.LBB2_1
-# BB#4:                                 # %polly.loop_preheader3.preheader
-	movl	$C, %r14d
-	movl	$C, %edi
+	callq	init_array
+	leaq	C(%rip), %rbx
+	xorl	%r14d, %r14d
 	xorl	%esi, %esi
 	movl	$9437184, %edx          # imm = 0x900000
-	callq	memset
-	xorl	%eax, %eax
-	.align	16, 0x90
-.LBB2_5:                                # %polly.loop_preheader17
+	movq	%rbx, %rdi
+	callq	memset@PLT
+	leaq	B(%rip), %rax
+	leaq	A(%rip), %rcx
+	.p2align	4, 0x90
+.LBB2_1:                                # %polly.loop_header8
                                        # =>This Loop Header: Depth=1
-                                        #     Child Loop BB2_10 Depth 2
-                                        #       Child Loop BB2_8 Depth 3
-	movl	$B, %ebx
-	xorl	%edx, %edx
-	.align	16, 0x90
-.LBB2_10:                               # %polly.loop_preheader24
-                                        #   Parent Loop BB2_5 Depth=1
+                                        #     Child Loop BB2_2 Depth 2
+                                        #       Child Loop BB2_3 Depth 3
+	movq	%rax, %rdx
+	xorl	%esi, %esi
+	.p2align	4, 0x90
+.LBB2_2:                                # %polly.loop_header14
+                                        #   Parent Loop BB2_1 Depth=1
                                        # =>  This Loop Header: Depth=2
-                                        #       Child Loop BB2_8 Depth 3
-	leaq	(%rax,%rax,2), %rcx
-	shlq	$11, %rcx
-	vmovss	A(%rcx,%rdx,4), %xmm0
-	movl	$1536, %esi             # imm = 0x600
-	movq	%r14, %rdi
-	movq	%rbx, %rcx
-	.align	16, 0x90
-.LBB2_8:                                # %polly.loop_header23
-                                        #   Parent Loop BB2_5 Depth=1
-                                        #     Parent Loop BB2_10 Depth=2
+                                        #       Child Loop BB2_3 Depth 3
+	leaq	(%r14,%r14,2), %rdi
+	shlq	$11, %rdi
+	addq	%rcx, %rdi
+	movss	(%rdi,%rsi,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
+	shufps	$0, %xmm0, %xmm0        # xmm0 = xmm0[0,0,0,0]
+	movl	$12, %edi
+	.p2align	4, 0x90
+.LBB2_3:                                # %vector.body
+                                        #   Parent Loop BB2_1 Depth=1
+                                        #     Parent Loop BB2_2 Depth=2
                                        # =>    This Inner Loop Header: Depth=3
-	vmulss	(%rcx), %xmm0, %xmm1
-	vaddss	(%rdi), %xmm1, %xmm1
-	vmovss	%xmm1, (%rdi)
-	addq	$4, %rdi
-	addq	$4, %rcx
-	decq	%rsi
-	jne	.LBB2_8
-# BB#9:                                 # %polly.loop_exit25
-                                        #   in Loop: Header=BB2_10 Depth=2
+	movaps	-48(%rdx,%rdi,4), %xmm1
+	mulps	%xmm0, %xmm1
+	movaps	-32(%rdx,%rdi,4), %xmm2
+	mulps	%xmm0, %xmm2
+	addps	-48(%rbx,%rdi,4), %xmm1
+	addps	-32(%rbx,%rdi,4), %xmm2
+	movaps	%xmm1, -48(%rbx,%rdi,4)
+	movaps	%xmm2, -32(%rbx,%rdi,4)
+	movaps	-16(%rdx,%rdi,4), %xmm1
+	mulps	%xmm0, %xmm1
+	movaps	(%rdx,%rdi,4), %xmm2
+	mulps	%xmm0, %xmm2
+	addps	-16(%rbx,%rdi,4), %xmm1
+	addps	(%rbx,%rdi,4), %xmm2
+	movaps	%xmm1, -16(%rbx,%rdi,4)
+	movaps	%xmm2, (%rbx,%rdi,4)
+	addq	$16, %rdi
+	cmpq	$1548, %rdi             # imm = 0x60C
+	jne	.LBB2_3
+# %bb.4:                                # %polly.loop_exit22
+                                        #   in Loop: Header=BB2_2 Depth=2
+	addq	$1, %rsi
+	addq	$6144, %rdx             # imm = 0x1800
+	cmpq	$1536, %rsi             # imm = 0x600
+	jne	.LBB2_2
+# %bb.5:                                # %polly.loop_exit16
+                                        #   in Loop: Header=BB2_1 Depth=1
+	addq	$1, %r14
 	addq	$6144, %rbx             # imm = 0x1800
-	incq	%rdx
-	cmpq	$1536, %rdx             # imm = 0x600
-	jne	.LBB2_10
-# BB#6:                                 # %polly.loop_exit18
-                                        #   in Loop: Header=BB2_5 Depth=1
-	addq	$6144, %r14             # imm = 0x1800
-	incq	%rax
-	cmpq	$1536, %rax             # imm = 0x600
-	jne	.LBB2_5
-# BB#7:                                 # %polly.loop_exit11
+	cmpq	$1536, %r14             # imm = 0x600
+	jne	.LBB2_1
+# %bb.6:                                # %polly.exiting
 	xorl	%eax, %eax
 	popq	%rbx
 	popq	%r14
 	popq	%rbp
-	ret
-.Ltmp25:
-	.size	main, .Ltmp25-main
+	.cfi_def_cfa %rsp, 8
+	retq
+.Lfunc_end2:
+	.size	main, .Lfunc_end2-main
 	.cfi_endproc
-
+                                        # -- End function
 	.type	A,@object               # @A
 	.comm	A,9437184,16
 	.type	B,@object               # @B
@ -277,10 +250,11 @@ main:                                   # @main
 	.type	.L.str,@object          # @.str
 	.section	.rodata.str1.1,"aMS",@progbits,1
 .L.str:
-	.asciz	 "%lf "
+	.asciz	"%lf "
 	.size	.L.str, 5

 	.type	C,@object               # @C
 	.comm	C,9437184,16

+	.ident	"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
 	.section	".note.GNU-stack","",@progbits
--- a/polly/docs/experiments/matmul/matmul.preopt.ll
+++ b/polly/docs/experiments/matmul/matmul.preopt.ll
@ -1,4 +1,4 @@
-; ModuleID = 'matmul.s'
+; ModuleID = 'matmul.ll'
 source_filename = "matmul.c"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@ -6,15 +6,15 @@ target triple = "x86_64-unknown-linux-gnu"
 %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
 %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }

-@A = common global [1536 x [1536 x float]] zeroinitializer, align 16
-@B = common global [1536 x [1536 x float]] zeroinitializer, align 16
-@stdout = external global %struct._IO_FILE*, align 8
+@A = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16
+@B = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16
+@stdout = external dso_local global %struct._IO_FILE*, align 8
@.str = private unnamed_addr constant [5 x i8] c"%lf \00", align 1
-@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@C = common dso_local global [1536 x [1536 x float]] zeroinitializer, align 16
@.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1

-; Function Attrs: nounwind uwtable
-define void @init_array() #0 {
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init_array() #0 {
 entry:
  br label %entry.split

@ -22,44 +22,37 @@ entry.split:                                      ; preds = %entry
  br label %for.cond1.preheader

 for.cond1.preheader:                              ; preds = %entry.split, %for.inc17
-  %indvars.iv5 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next6, %for.inc17 ]
+  %indvars.iv4 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next5, %for.inc17 ]
  br label %for.body3

 for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
-  %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv5
+  %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv4
  %1 = trunc i64 %0 to i32
-  %rem = srem i32 %1, 1024
-  %add = add nsw i32 %rem, 1
+  %rem = and i32 %1, 1023
+  %add = add nuw nsw i32 %rem, 1
  %conv = sitofp i32 %add to double
  %div = fmul double %conv, 5.000000e-01
  %conv4 = fptrunc double %div to float
-  %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvars.iv5, i64 %indvars.iv
+  %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvars.iv4, i64 %indvars.iv
  store float %conv4, float* %arrayidx6, align 4
-  %2 = mul nuw nsw i64 %indvars.iv, %indvars.iv5
-  %3 = trunc i64 %2 to i32
-  %rem8 = srem i32 %3, 1024
-  %add9 = add nsw i32 %rem8, 1
-  %conv10 = sitofp i32 %add9 to double
-  %div11 = fmul double %conv10, 5.000000e-01
-  %conv12 = fptrunc double %div11 to float
-  %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvars.iv5, i64 %indvars.iv
-  store float %conv12, float* %arrayidx16, align 4
+  %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvars.iv4, i64 %indvars.iv
+  store float %conv4, float* %arrayidx16, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 1536
  br i1 %exitcond, label %for.body3, label %for.inc17

 for.inc17:                                        ; preds = %for.body3
-  %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1
-  %exitcond7 = icmp ne i64 %indvars.iv.next6, 1536
-  br i1 %exitcond7, label %for.cond1.preheader, label %for.end19
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536
+  br i1 %exitcond6, label %for.cond1.preheader, label %for.end19

 for.end19:                                        ; preds = %for.inc17
  ret void
 }

-; Function Attrs: nounwind uwtable
-define void @print_array() #0 {
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @print_array() #0 {
 entry:
  br label %entry.split

@ -79,7 +72,7 @@ for.body3:                                        ; preds = %for.cond1.preheader
  %conv = fpext float %2 to double
  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2
  %3 = trunc i64 %indvars.iv to i32
-  %rem = srem i32 %3, 80
+  %rem = urem i32 %3, 80
  %cmp6 = icmp eq i32 %rem, 79
  br i1 %cmp6, label %if.then, label %for.inc

@ -105,10 +98,10 @@ for.end12:                                        ; preds = %for.end
  ret void
 }

-declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1
+declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1

-; Function Attrs: nounwind uwtable
-define i32 @main() #0 {
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
 entry:
  br label %entry.split

@ -128,16 +121,14 @@ for.body3:                                        ; preds = %for.cond1.preheader

 for.body8:                                        ; preds = %for.body3, %for.body8
  %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ]
-  %arrayidx12 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4
-  %0 = load float, float* %arrayidx12, align 4
+  %0 = load float, float* %arrayidx5, align 4
  %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvars.iv7, i64 %indvars.iv
  %1 = load float, float* %arrayidx16, align 4
  %arrayidx20 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv4
  %2 = load float, float* %arrayidx20, align 4
  %mul = fmul float %1, %2
  %add = fadd float %0, %mul
-  %arrayidx24 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4
-  store float %add, float* %arrayidx24, align 4
+  store float %add, float* %arrayidx5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 1536
  br i1 %exitcond, label %for.body8, label %for.inc25
@ -162,10 +153,12 @@ declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #2
 ; Function Attrs: nounwind
 declare i32 @fputc(i32, %struct._IO_FILE* nocapture) #2

-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind }

-!llvm.ident = !{!0}
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}

-!0 = !{!"clang version 4.0.0 (http://llvm.org/git/clang.git 081569d9a29c7bc827b2d41f8e62891bbc895e2f) (http://llvm.org/git/llvm.git e117e506536626352e8e47f6c72cd6e2a276622c)"}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"}
--- a/polly/docs/experiments/matmul/runall.sh
+++ b/polly/docs/experiments/matmul/runall.sh
@ -1,85 +1,83 @@
 #!/bin/sh -a

 echo "--> 1. Create LLVM-IR from C"
-clang -S -emit-llvm matmul.c -o matmul.s
+clang -S -emit-llvm matmul.c -Xclang -disable-O0-optnone -o matmul.ll

 echo "--> 2. Prepare the LLVM-IR for Polly"
-opt -S -polly-canonicalize matmul.s > matmul.preopt.ll
+opt -S -polly-canonicalize matmul.ll -o matmul.preopt.ll

 echo "--> 3. Show the SCoPs detected by Polly"
-opt -basicaa -polly-ast -analyze -q matmul.preopt.ll \
-    -polly-process-unprofitable
+opt -basicaa -polly-ast -analyze matmul.preopt.ll \
+    -polly-process-unprofitable -polly-use-llvm-names

 echo "--> 4.1 Highlight the detected SCoPs in the CFGs of the program"
 # We only create .dot files, as directly -view-scops directly calls graphviz
 # which would require user interaction to continue the script.
 # opt -basicaa -view-scops -disable-output matmul.preopt.ll
-opt -basicaa -dot-scops -disable-output matmul.preopt.ll
+opt -basicaa -dot-scops -disable-output matmul.preopt.ll -polly-use-llvm-names

 echo "--> 4.2 Highlight the detected SCoPs in the CFGs of the program (print \
 no instructions)"
 # We only create .dot files, as directly -view-scops-only directly calls
 # graphviz which would require user interaction to continue the script.
 # opt -basicaa -view-scops-only -disable-output matmul.preopt.ll
-opt -basicaa -dot-scops-only -disable-output matmul.preopt.ll
+opt -basicaa -dot-scops-only -disable-output matmul.preopt.ll -polly-use-llvm-names

 echo "--> 4.3 Create .png files from the .dot files"
 for i in `ls *.dot`; do dot -Tpng $i > $i.png; done

 echo "--> 5. View the polyhedral representation of the SCoPs"
-opt -basicaa -polly-scops -analyze matmul.preopt.ll -polly-process-unprofitable
+opt -basicaa -polly-scops -analyze matmul.preopt.ll \
+    -polly-process-unprofitable -polly-use-llvm-names

 echo "--> 6. Show the dependences for the SCoPs"
 opt -basicaa -polly-dependences -analyze matmul.preopt.ll \
-    -polly-process-unprofitable
+    -polly-process-unprofitable -polly-use-llvm-names

 echo "--> 7. Export jscop files"
-opt -basicaa -polly-export-jscop matmul.preopt.ll -polly-process-unprofitable
+opt -basicaa -polly-export-jscop matmul.preopt.ll \
+    -polly-process-unprofitable -disable-output -polly-use-llvm-names

 echo "--> 8. Import the updated jscop files and print the new SCoPs. (optional)"
 opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \
-    -polly-process-unprofitable
+    -polly-process-unprofitable -polly-use-llvm-names
 opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \
-    -polly-import-jscop-postfix=interchanged -polly-process-unprofitable
+    -polly-import-jscop-postfix=interchanged -polly-process-unprofitable -polly-use-llvm-names
 opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \
-    -polly-import-jscop-postfix=interchanged+tiled -polly-process-unprofitable
+    -polly-import-jscop-postfix=interchanged+tiled -polly-process-unprofitable -polly-use-llvm-names
 opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \
    -polly-import-jscop-postfix=interchanged+tiled+vector \
-    -polly-process-unprofitable
+    -polly-process-unprofitable -polly-use-llvm-names

 echo "--> 9. Codegenerate the SCoPs"
-opt -basicaa -polly-import-jscop -polly-import-jscop-postfix=interchanged \
-    -polly-codegen -polly-process-unprofitable\
-    matmul.preopt.ll | opt -O3 > matmul.polly.interchanged.ll
-opt -basicaa -polly-import-jscop \
+opt -S -basicaa -polly-import-jscop -polly-import-jscop-postfix=interchanged \
+    -polly-codegen -polly-process-unprofitable -polly-use-llvm-names \
+    matmul.preopt.ll | opt -O3 -S -o matmul.polly.interchanged.ll
+opt -S -basicaa -polly-import-jscop \
    -polly-import-jscop-postfix=interchanged+tiled -polly-codegen \
-    matmul.preopt.ll -polly-process-unprofitable \
-    | opt -O3 > matmul.polly.interchanged+tiled.ll
-opt -basicaa -polly-import-jscop -polly-process-unprofitable\
+    matmul.preopt.ll -polly-process-unprofitable -polly-use-llvm-names \
+    | opt -O3 -S -o matmul.polly.interchanged+tiled.ll
+opt -S -basicaa -polly-import-jscop -polly-process-unprofitable\
    -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen \
-    matmul.preopt.ll -polly-vectorizer=polly\
-    | opt -O3 > matmul.polly.interchanged+tiled+vector.ll
-opt -basicaa -polly-import-jscop -polly-process-unprofitable\
+    matmul.preopt.ll -polly-vectorizer=polly -polly-use-llvm-names \
+    | opt -O3 -S -o matmul.polly.interchanged+tiled+vector.ll
+opt -S -basicaa -polly-import-jscop -polly-process-unprofitable\
    -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen \
-    matmul.preopt.ll -polly-vectorizer=polly -polly-parallel\
-    | opt -O3 > matmul.polly.interchanged+tiled+vector+openmp.ll
-opt matmul.preopt.ll | opt -O3 > matmul.normalopt.ll
+    matmul.preopt.ll -polly-vectorizer=polly -polly-parallel -polly-use-llvm-names \
+    | opt -O3 -S -o matmul.polly.interchanged+tiled+vector+openmp.ll
+opt -S matmul.preopt.ll | opt -O3 -S -o matmul.normalopt.ll

 echo "--> 10. Create the executables"
-llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s && gcc matmul.polly.interchanged.s \
-    -o matmul.polly.interchanged.exe
-llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s && gcc matmul.polly.interchanged+tiled.s \
-    -o matmul.polly.interchanged+tiled.exe
-llc matmul.polly.interchanged+tiled+vector.ll \
-    -o matmul.polly.interchanged+tiled+vector.s \
-    && gcc matmul.polly.interchanged+tiled+vector.s \
-    -o matmul.polly.interchanged+tiled+vector.exe
-llc matmul.polly.interchanged+tiled+vector+openmp.ll \
-    -o matmul.polly.interchanged+tiled+vector+openmp.s \
-    && gcc -lgomp matmul.polly.interchanged+tiled+vector+openmp.s \
-    -o matmul.polly.interchanged+tiled+vector+openmp.exe
-llc matmul.normalopt.ll -o matmul.normalopt.s && gcc matmul.normalopt.s \
-    -o matmul.normalopt.exe
+llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s -relocation-model=pic
+gcc matmul.polly.interchanged.s -o matmul.polly.interchanged.exe
+llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s -relocation-model=pic
+gcc matmul.polly.interchanged+tiled.s -o matmul.polly.interchanged+tiled.exe
+llc matmul.polly.interchanged+tiled+vector.ll -o matmul.polly.interchanged+tiled+vector.s -relocation-model=pic
+gcc matmul.polly.interchanged+tiled+vector.s  -o matmul.polly.interchanged+tiled+vector.exe
+llc matmul.polly.interchanged+tiled+vector+openmp.ll -o matmul.polly.interchanged+tiled+vector+openmp.s -relocation-model=pic
+gcc matmul.polly.interchanged+tiled+vector+openmp.s -lgomp -o matmul.polly.interchanged+tiled+vector+openmp.exe
+llc matmul.normalopt.ll -o matmul.normalopt.s -relocation-model=pic
+gcc matmul.normalopt.s -lgomp -o matmul.normalopt.exe

 echo "--> 11. Compare the runtime of the executables"

--- a/polly/docs/experiments/matmul/scops.init_array.dot
+++ b/polly/docs/experiments/matmul/scops.init_array.dot
@ -1,39 +1,39 @@
 digraph "Scop Graph for 'init_array' function" {
 	label="Scop Graph for 'init_array' function";

-	Node0x5b5b5a0 [shape=record,label="{entry:\l  br label %entry.split\l}"];
-	Node0x5b5b5a0 -> Node0x5b5de30;
-	Node0x5b5de30 [shape=record,label="{entry.split:                                      \l  br label %for.cond1.preheader\l}"];
-	Node0x5b5de30 -> Node0x5b5de50;
-	Node0x5b5de50 [shape=record,label="{for.cond1.preheader:                              \l  %indvars.iv5 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next6, %for.inc17 ]\l  br label %for.body3\l}"];
-	Node0x5b5de50 -> Node0x5b5b570;
-	Node0x5b5b570 [shape=record,label="{for.body3:                                        \l  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.body3 ]\l  %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv5\l  %1 = trunc i64 %0 to i32\l  %rem = srem i32 %1, 1024\l  %add = add nsw i32 %rem, 1\l  %conv = sitofp i32 %add to double\l  %div = fmul double %conv, 5.000000e-01\l  %conv4 = fptrunc double %div to float\l  %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @A, i64 0, i64 %indvars.iv5, i64 %indvars.iv\l  store float %conv4, float* %arrayidx6, align 4\l  %2 = mul nuw nsw i64 %indvars.iv, %indvars.iv5\l  %3 = trunc i64 %2 to i32\l  %rem8 = srem i32 %3, 1024\l  %add9 = add nsw i32 %rem8, 1\l  %conv10 = sitofp i32 %add9 to double\l  %div11 = fmul double %conv10, 5.000000e-01\l  %conv12 = fptrunc double %div11 to float\l  %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv5, i64 %indvars.iv\l  store float %conv12, float* %arrayidx16, align 4\l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  %exitcond = icmp ne i64 %indvars.iv.next, 1536\l  br i1 %exitcond, label %for.body3, label %for.inc17\l}"];
-	Node0x5b5b570 -> Node0x5b5b570[constraint=false];
-	Node0x5b5b570 -> Node0x5b5df30;
-	Node0x5b5df30 [shape=record,label="{for.inc17:                                        \l  %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1\l  %exitcond7 = icmp ne i64 %indvars.iv.next6, 1536\l  br i1 %exitcond7, label %for.cond1.preheader, label %for.end19\l}"];
-	Node0x5b5df30 -> Node0x5b5de50[constraint=false];
-	Node0x5b5df30 -> Node0x5b5df90;
-	Node0x5b5df90 [shape=record,label="{for.end19:                                        \l  ret void\l}"];
+	Node0x7fffc6c46ea0 [shape=record,label="{entry:\l  br label %entry.split\l}"];
+	Node0x7fffc6c46ea0 -> Node0x7fffc6c46f20;
+	Node0x7fffc6c46f20 [shape=record,label="{entry.split:                                      \l  br label %for.cond1.preheader\l}"];
+	Node0x7fffc6c46f20 -> Node0x7fffc6c47000;
+	Node0x7fffc6c47000 [shape=record,label="{for.cond1.preheader:                              \l  %indvars.iv4 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next5, %for.inc17 ]\l  br label %for.body3\l}"];
+	Node0x7fffc6c47000 -> Node0x7fffc6c47290;
+	Node0x7fffc6c47290 [shape=record,label="{for.body3:                                        \l  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.body3 ]\l  %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv4\l  %1 = trunc i64 %0 to i32\l  %rem = and i32 %1, 1023\l  %add = add nuw nsw i32 %rem, 1\l  %conv = sitofp i32 %add to double\l  %div = fmul double %conv, 5.000000e-01\l  %conv4 = fptrunc double %div to float\l  %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @A, i64 0, i64 %indvars.iv4, i64 %indvars.iv\l  store float %conv4, float* %arrayidx6, align 4\l  %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv4, i64 %indvars.iv\l  store float %conv4, float* %arrayidx16, align 4\l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  %exitcond = icmp ne i64 %indvars.iv.next, 1536\l  br i1 %exitcond, label %for.body3, label %for.inc17\l}"];
+	Node0x7fffc6c47290 -> Node0x7fffc6c47290[constraint=false];
+	Node0x7fffc6c47290 -> Node0x7fffc6c47b10;
+	Node0x7fffc6c47b10 [shape=record,label="{for.inc17:                                        \l  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1\l  %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536\l  br i1 %exitcond6, label %for.cond1.preheader, label %for.end19\l}"];
+	Node0x7fffc6c47b10 -> Node0x7fffc6c47000[constraint=false];
+	Node0x7fffc6c47b10 -> Node0x7fffc6c48b10;
+	Node0x7fffc6c48b10 [shape=record,label="{for.end19:                                        \l  ret void\l}"];
 	colorscheme = "paired12"
-        subgraph cluster_0x5b4bdd0 {
+        subgraph cluster_0x7fffc6c32540 {
          label = "";
          style = solid;
          color = 1
-          subgraph cluster_0x5b4bf50 {
+          subgraph cluster_0x7fffc6c32f30 {
            label = "Region can not profitably be optimized!";
            style = solid;
            color = 6
-            subgraph cluster_0x5b4c0d0 {
+            subgraph cluster_0x7fffc6c32690 {
              label = "";
              style = solid;
              color = 5
-              Node0x5b5b570;
+              Node0x7fffc6c47290;
            }
-            Node0x5b5de50;
-            Node0x5b5df30;
+            Node0x7fffc6c47000;
+            Node0x7fffc6c47b10;
          }
-          Node0x5b5b5a0;
-          Node0x5b5de30;
-          Node0x5b5df90;
+          Node0x7fffc6c46ea0;
+          Node0x7fffc6c46f20;
+          Node0x7fffc6c48b10;
        }
 }
--- a/polly/docs/experiments/matmul/scops.init_array.dot.png
+++ b/polly/docs/experiments/matmul/scops.init_array.dot.png
--- a/polly/docs/experiments/matmul/scops.main.dot
+++ b/polly/docs/experiments/matmul/scops.main.dot
@ -1,50 +1,50 @@
 digraph "Scop Graph for 'main' function" {
 	label="Scop Graph for 'main' function";

-	Node0x5b5c850 [shape=record,label="{entry:\l  br label %entry.split\l}"];
-	Node0x5b5c850 -> Node0x5b5a440;
-	Node0x5b5a440 [shape=record,label="{entry.split:                                      \l  tail call void @init_array()\l  br label %for.cond1.preheader\l}"];
-	Node0x5b5a440 -> Node0x5b38cd0;
-	Node0x5b38cd0 [shape=record,label="{for.cond1.preheader:                              \l  %indvars.iv7 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next8, %for.inc28 ]\l  br label %for.body3\l}"];
-	Node0x5b38cd0 -> Node0x5b4bd30;
-	Node0x5b4bd30 [shape=record,label="{for.body3:                                        \l  %indvars.iv4 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next5,\l... %for.inc25 ]\l  %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l  store float 0.000000e+00, float* %arrayidx5, align 4\l  br label %for.body8\l}"];
-	Node0x5b4bd30 -> Node0x5b38c50;
-	Node0x5b38c50 [shape=record,label="{for.body8:                                        \l  %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ]\l  %arrayidx12 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l  %0 = load float, float* %arrayidx12, align 4\l  %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @A, i64 0, i64 %indvars.iv7, i64 %indvars.iv\l  %1 = load float, float* %arrayidx16, align 4\l  %arrayidx20 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv4\l  %2 = load float, float* %arrayidx20, align 4\l  %mul = fmul float %1, %2\l  %add = fadd float %0, %mul\l  %arrayidx24 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l  store float %add, float* %arrayidx24, align 4\l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  %exitcond = icmp ne i64 %indvars.iv.next, 1536\l  br i1 %exitcond, label %for.body8, label %for.inc25\l}"];
-	Node0x5b38c50 -> Node0x5b38c50[constraint=false];
-	Node0x5b38c50 -> Node0x5b5a290;
-	Node0x5b5a290 [shape=record,label="{for.inc25:                                        \l  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1\l  %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536\l  br i1 %exitcond6, label %for.body3, label %for.inc28\l}"];
-	Node0x5b5a290 -> Node0x5b4bd30[constraint=false];
-	Node0x5b5a290 -> Node0x5b5a340;
-	Node0x5b5a340 [shape=record,label="{for.inc28:                                        \l  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1\l  %exitcond9 = icmp ne i64 %indvars.iv.next8, 1536\l  br i1 %exitcond9, label %for.cond1.preheader, label %for.end30\l}"];
-	Node0x5b5a340 -> Node0x5b38cd0[constraint=false];
-	Node0x5b5a340 -> Node0x5b5a3a0;
-	Node0x5b5a3a0 [shape=record,label="{for.end30:                                        \l  ret i32 0\l}"];
+	Node0x7fffc6c4cb90 [shape=record,label="{entry:\l  br label %entry.split\l}"];
+	Node0x7fffc6c4cb90 -> Node0x7fffc6c47b10;
+	Node0x7fffc6c47b10 [shape=record,label="{entry.split:                                      \l  tail call void @init_array()\l  br label %for.cond1.preheader\l}"];
+	Node0x7fffc6c47b10 -> Node0x7fffc6c456e0;
+	Node0x7fffc6c456e0 [shape=record,label="{for.cond1.preheader:                              \l  %indvars.iv7 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next8, %for.inc28 ]\l  br label %for.body3\l}"];
+	Node0x7fffc6c456e0 -> Node0x7fffc6c3f080;
+	Node0x7fffc6c3f080 [shape=record,label="{for.body3:                                        \l  %indvars.iv4 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next5,\l... %for.inc25 ]\l  %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l  store float 0.000000e+00, float* %arrayidx5, align 4\l  br label %for.body8\l}"];
+	Node0x7fffc6c3f080 -> Node0x7fffc6c3f220;
+	Node0x7fffc6c3f220 [shape=record,label="{for.body8:                                        \l  %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ]\l  %0 = load float, float* %arrayidx5, align 4\l  %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @A, i64 0, i64 %indvars.iv7, i64 %indvars.iv\l  %1 = load float, float* %arrayidx16, align 4\l  %arrayidx20 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv4\l  %2 = load float, float* %arrayidx20, align 4\l  %mul = fmul float %1, %2\l  %add = fadd float %0, %mul\l  store float %add, float* %arrayidx5, align 4\l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  %exitcond = icmp ne i64 %indvars.iv.next, 1536\l  br i1 %exitcond, label %for.body8, label %for.inc25\l}"];
+	Node0x7fffc6c3f220 -> Node0x7fffc6c3f220[constraint=false];
+	Node0x7fffc6c3f220 -> Node0x7fffc6c40480;
+	Node0x7fffc6c40480 [shape=record,label="{for.inc25:                                        \l  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1\l  %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536\l  br i1 %exitcond6, label %for.body3, label %for.inc28\l}"];
+	Node0x7fffc6c40480 -> Node0x7fffc6c3f080[constraint=false];
+	Node0x7fffc6c40480 -> Node0x7fffc6c404e0;
+	Node0x7fffc6c404e0 [shape=record,label="{for.inc28:                                        \l  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1\l  %exitcond9 = icmp ne i64 %indvars.iv.next8, 1536\l  br i1 %exitcond9, label %for.cond1.preheader, label %for.end30\l}"];
+	Node0x7fffc6c404e0 -> Node0x7fffc6c456e0[constraint=false];
+	Node0x7fffc6c404e0 -> Node0x7fffc6c40540;
+	Node0x7fffc6c40540 [shape=record,label="{for.end30:                                        \l  ret i32 0\l}"];
 	colorscheme = "paired12"
-        subgraph cluster_0x5b5c970 {
+        subgraph cluster_0x7fffc6c32540 {
          label = "";
          style = solid;
          color = 1
-          subgraph cluster_0x5b5c5a0 {
+          subgraph cluster_0x7fffc6c32f30 {
            label = "";
            style = filled;
-            color = 3            subgraph cluster_0x5b5c9f0 {
+            color = 3            subgraph cluster_0x7fffc6c32690 {
              label = "";
              style = solid;
              color = 5
-              subgraph cluster_0x5b5c110 {
+              subgraph cluster_0x7fffc6c32dc0 {
                label = "";
                style = solid;
                color = 7
-                Node0x5b38c50;
+                Node0x7fffc6c3f220;
              }
-              Node0x5b4bd30;
-              Node0x5b5a290;
+              Node0x7fffc6c3f080;
+              Node0x7fffc6c40480;
            }
-            Node0x5b38cd0;
-            Node0x5b5a340;
+            Node0x7fffc6c456e0;
+            Node0x7fffc6c404e0;
          }
-          Node0x5b5c850;
-          Node0x5b5a440;
-          Node0x5b5a3a0;
+          Node0x7fffc6c4cb90;
+          Node0x7fffc6c47b10;
+          Node0x7fffc6c40540;
        }
 }
--- a/polly/docs/experiments/matmul/scops.main.dot.png
+++ b/polly/docs/experiments/matmul/scops.main.dot.png
--- a/polly/docs/experiments/matmul/scops.print_array.dot
+++ b/polly/docs/experiments/matmul/scops.print_array.dot
@ -1,51 +1,51 @@
 digraph "Scop Graph for 'print_array' function" {
 	label="Scop Graph for 'print_array' function";

-	Node0x5b5ee00 [shape=record,label="{entry:\l  br label %entry.split\l}"];
-	Node0x5b5ee00 -> Node0x5b5ee50;
-	Node0x5b5ee50 [shape=record,label="{entry.split:                                      \l  br label %for.cond1.preheader\l}"];
-	Node0x5b5ee50 -> Node0x5b5ee70;
-	Node0x5b5ee70 [shape=record,label="{for.cond1.preheader:                              \l  %indvars.iv6 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next7, %for.end ]\l  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l  br label %for.body3\l}"];
-	Node0x5b5ee70 -> Node0x5b5ee20;
-	Node0x5b5ee20 [shape=record,label="{for.body3:                                        \l  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.inc ]\l  %1 = phi %struct._IO_FILE* [ %0, %for.cond1.preheader ], [ %5, %for.inc ]\l  %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv\l  %2 = load float, float* %arrayidx5, align 4\l  %conv = fpext float %2 to double\l  %call = tail call i32 (%struct._IO_FILE*, i8*, ...)\l... @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x\l... i8]* @.str, i64 0, i64 0), double %conv) #2\l  %3 = trunc i64 %indvars.iv to i32\l  %rem = srem i32 %3, 80\l  %cmp6 = icmp eq i32 %rem, 79\l  br i1 %cmp6, label %if.then, label %for.inc\l}"];
-	Node0x5b5ee20 -> Node0x5b60d10;
-	Node0x5b5ee20 -> Node0x5b60d70;
-	Node0x5b60d10 [shape=record,label="{if.then:                                          \l  %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l  %fputc3 = tail call i32 @fputc(i32 10, %struct._IO_FILE* %4)\l  br label %for.inc\l}"];
-	Node0x5b60d10 -> Node0x5b60d70;
-	Node0x5b60d70 [shape=record,label="{for.inc:                                          \l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l  %exitcond = icmp ne i64 %indvars.iv.next, 1536\l  br i1 %exitcond, label %for.body3, label %for.end\l}"];
-	Node0x5b60d70 -> Node0x5b5ee20[constraint=false];
-	Node0x5b60d70 -> Node0x5b60e10;
-	Node0x5b60e10 [shape=record,label="{for.end:                                          \l  %.lcssa = phi %struct._IO_FILE* [ %5, %for.inc ]\l  %fputc = tail call i32 @fputc(i32 10, %struct._IO_FILE* %.lcssa)\l  %indvars.iv.next7 = add nuw nsw i64 %indvars.iv6, 1\l  %exitcond8 = icmp ne i64 %indvars.iv.next7, 1536\l  br i1 %exitcond8, label %for.cond1.preheader, label %for.end12\l}"];
-	Node0x5b60e10 -> Node0x5b5ee70[constraint=false];
-	Node0x5b60e10 -> Node0x5b60e70;
-	Node0x5b60e70 [shape=record,label="{for.end12:                                        \l  ret void\l}"];
+	Node0x7fffc6c42bf0 [shape=record,label="{entry:\l  br label %entry.split\l}"];
+	Node0x7fffc6c42bf0 -> Node0x7fffc6c42f10;
+	Node0x7fffc6c42f10 [shape=record,label="{entry.split:                                      \l  br label %for.cond1.preheader\l}"];
+	Node0x7fffc6c42f10 -> Node0x7fffc6c4abb0;
+	Node0x7fffc6c4abb0 [shape=record,label="{for.cond1.preheader:                              \l  %indvars.iv6 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next7, %for.end ]\l  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l  br label %for.body3\l}"];
+	Node0x7fffc6c4abb0 -> Node0x7fffc6c4ac10;
+	Node0x7fffc6c4ac10 [shape=record,label="{for.body3:                                        \l  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.inc ]\l  %1 = phi %struct._IO_FILE* [ %0, %for.cond1.preheader ], [ %5, %for.inc ]\l  %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv\l  %2 = load float, float* %arrayidx5, align 4\l  %conv = fpext float %2 to double\l  %call = tail call i32 (%struct._IO_FILE*, i8*, ...)\l... @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x\l... i8]* @.str, i64 0, i64 0), double %conv) #2\l  %3 = trunc i64 %indvars.iv to i32\l  %rem = urem i32 %3, 80\l  %cmp6 = icmp eq i32 %rem, 79\l  br i1 %cmp6, label %if.then, label %for.inc\l}"];
+	Node0x7fffc6c4ac10 -> Node0x7fffc6c4af80;
+	Node0x7fffc6c4ac10 -> Node0x7fffc6c4afe0;
+	Node0x7fffc6c4af80 [shape=record,label="{if.then:                                          \l  %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l  %fputc3 = tail call i32 @fputc(i32 10, %struct._IO_FILE* %4)\l  br label %for.inc\l}"];
+	Node0x7fffc6c4af80 -> Node0x7fffc6c4afe0;
+	Node0x7fffc6c4afe0 [shape=record,label="{for.inc:                                          \l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l  %exitcond = icmp ne i64 %indvars.iv.next, 1536\l  br i1 %exitcond, label %for.body3, label %for.end\l}"];
+	Node0x7fffc6c4afe0 -> Node0x7fffc6c4ac10[constraint=false];
+	Node0x7fffc6c4afe0 -> Node0x7fffc6c4b3b0;
+	Node0x7fffc6c4b3b0 [shape=record,label="{for.end:                                          \l  %.lcssa = phi %struct._IO_FILE* [ %5, %for.inc ]\l  %fputc = tail call i32 @fputc(i32 10, %struct._IO_FILE* %.lcssa)\l  %indvars.iv.next7 = add nuw nsw i64 %indvars.iv6, 1\l  %exitcond8 = icmp ne i64 %indvars.iv.next7, 1536\l  br i1 %exitcond8, label %for.cond1.preheader, label %for.end12\l}"];
+	Node0x7fffc6c4b3b0 -> Node0x7fffc6c4abb0[constraint=false];
+	Node0x7fffc6c4b3b0 -> Node0x7fffc6c4b580;
+	Node0x7fffc6c4b580 [shape=record,label="{for.end12:                                        \l  ret void\l}"];
 	colorscheme = "paired12"
-        subgraph cluster_0x5b349a0 {
+        subgraph cluster_0x7fffc6c32540 {
          label = "";
          style = solid;
          color = 1
-          subgraph cluster_0x5b5c2c0 {
+          subgraph cluster_0x7fffc6c32dc0 {
            label = "Call instruction:   %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2";
            style = solid;
            color = 6
-            subgraph cluster_0x5b5c240 {
+            subgraph cluster_0x7fffc6c32690 {
              label = "Call instruction:   %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2";
              style = solid;
              color = 5
-              subgraph cluster_0x5b34a20 {
+              subgraph cluster_0x7fffc6c32f30 {
                label = "Region can not profitably be optimized!";
                style = solid;
                color = 7
-                Node0x5b5ee20;
-                Node0x5b60d10;
+                Node0x7fffc6c4ac10;
+                Node0x7fffc6c4af80;
              }
-              Node0x5b60d70;
+              Node0x7fffc6c4afe0;
            }
-            Node0x5b5ee70;
-            Node0x5b60e10;
+            Node0x7fffc6c4abb0;
+            Node0x7fffc6c4b3b0;
          }
-          Node0x5b5ee00;
-          Node0x5b5ee50;
-          Node0x5b60e70;
+          Node0x7fffc6c42bf0;
+          Node0x7fffc6c42f10;
+          Node0x7fffc6c4b580;
        }
 }
--- a/polly/docs/experiments/matmul/scops.print_array.dot.png
+++ b/polly/docs/experiments/matmul/scops.print_array.dot.png
--- a/polly/docs/experiments/matmul/scopsonly.init_array.dot
+++ b/polly/docs/experiments/matmul/scopsonly.init_array.dot
@ -1,39 +1,39 @@
 digraph "Scop Graph for 'init_array' function" {
 	label="Scop Graph for 'init_array' function";

-	Node0x5ae2570 [shape=record,label="{entry}"];
-	Node0x5ae2570 -> Node0x5ae4e90;
-	Node0x5ae4e90 [shape=record,label="{entry.split}"];
-	Node0x5ae4e90 -> Node0x5ae4f50;
-	Node0x5ae4f50 [shape=record,label="{for.cond1.preheader}"];
-	Node0x5ae4f50 -> Node0x5ae50e0;
-	Node0x5ae50e0 [shape=record,label="{for.body3}"];
-	Node0x5ae50e0 -> Node0x5ae50e0[constraint=false];
-	Node0x5ae50e0 -> Node0x5ae5100;
-	Node0x5ae5100 [shape=record,label="{for.inc17}"];
-	Node0x5ae5100 -> Node0x5ae4f50[constraint=false];
-	Node0x5ae5100 -> Node0x5ae4ff0;
-	Node0x5ae4ff0 [shape=record,label="{for.end19}"];
+	Node0x7fffdb5cceb0 [shape=record,label="{entry}"];
+	Node0x7fffdb5cceb0 -> Node0x7fffdb5ccf00;
+	Node0x7fffdb5ccf00 [shape=record,label="{entry.split}"];
+	Node0x7fffdb5ccf00 -> Node0x7fffdb5ccf80;
+	Node0x7fffdb5ccf80 [shape=record,label="{for.cond1.preheader}"];
+	Node0x7fffdb5ccf80 -> Node0x7fffdb5cd090;
+	Node0x7fffdb5cd090 [shape=record,label="{for.body3}"];
+	Node0x7fffdb5cd090 -> Node0x7fffdb5cd090[constraint=false];
+	Node0x7fffdb5cd090 -> Node0x7fffdb5cd0b0;
+	Node0x7fffdb5cd0b0 [shape=record,label="{for.inc17}"];
+	Node0x7fffdb5cd0b0 -> Node0x7fffdb5ccf80[constraint=false];
+	Node0x7fffdb5cd0b0 -> Node0x7fffdb5cd2a0;
+	Node0x7fffdb5cd2a0 [shape=record,label="{for.end19}"];
 	colorscheme = "paired12"
-        subgraph cluster_0x5ad2dd0 {
+        subgraph cluster_0x7fffdb5b8530 {
          label = "";
          style = solid;
          color = 1
-          subgraph cluster_0x5ad2f50 {
+          subgraph cluster_0x7fffdb5b8f40 {
            label = "Region can not profitably be optimized!";
            style = solid;
            color = 6
-            subgraph cluster_0x5ad30d0 {
+            subgraph cluster_0x7fffdb5b86a0 {
              label = "";
              style = solid;
              color = 5
-              Node0x5ae50e0;
+              Node0x7fffdb5cd090;
            }
-            Node0x5ae4f50;
-            Node0x5ae5100;
+            Node0x7fffdb5ccf80;
+            Node0x7fffdb5cd0b0;
          }
-          Node0x5ae2570;
-          Node0x5ae4e90;
-          Node0x5ae4ff0;
+          Node0x7fffdb5cceb0;
+          Node0x7fffdb5ccf00;
+          Node0x7fffdb5cd2a0;
        }
 }
--- a/polly/docs/experiments/matmul/scopsonly.init_array.dot.png
+++ b/polly/docs/experiments/matmul/scopsonly.init_array.dot.png
--- a/polly/docs/experiments/matmul/scopsonly.main.dot
+++ b/polly/docs/experiments/matmul/scopsonly.main.dot
@ -1,50 +1,50 @@
 digraph "Scop Graph for 'main' function" {
 	label="Scop Graph for 'main' function";

-	Node0x5abfcf0 [shape=record,label="{entry}"];
-	Node0x5abfcf0 -> Node0x5ade060;
-	Node0x5ade060 [shape=record,label="{entry.split}"];
-	Node0x5ade060 -> Node0x5ade0e0;
-	Node0x5ade0e0 [shape=record,label="{for.cond1.preheader}"];
-	Node0x5ade0e0 -> Node0x5ade100;
-	Node0x5ade100 [shape=record,label="{for.body3}"];
-	Node0x5ade100 -> Node0x5ae0020;
-	Node0x5ae0020 [shape=record,label="{for.body8}"];
-	Node0x5ae0020 -> Node0x5ae0020[constraint=false];
-	Node0x5ae0020 -> Node0x5ae0080;
-	Node0x5ae0080 [shape=record,label="{for.inc25}"];
-	Node0x5ae0080 -> Node0x5ade100[constraint=false];
-	Node0x5ae0080 -> Node0x5adfef0;
-	Node0x5adfef0 [shape=record,label="{for.inc28}"];
-	Node0x5adfef0 -> Node0x5ade0e0[constraint=false];
-	Node0x5adfef0 -> Node0x5adff50;
-	Node0x5adff50 [shape=record,label="{for.end30}"];
+	Node0x7fffdb5cbd10 [shape=record,label="{entry}"];
+	Node0x7fffdb5cbd10 -> Node0x7fffdb5c7140;
+	Node0x7fffdb5c7140 [shape=record,label="{entry.split}"];
+	Node0x7fffdb5c7140 -> Node0x7fffdb5c7200;
+	Node0x7fffdb5c7200 [shape=record,label="{for.cond1.preheader}"];
+	Node0x7fffdb5c7200 -> Node0x7fffdb5ccd60;
+	Node0x7fffdb5ccd60 [shape=record,label="{for.body3}"];
+	Node0x7fffdb5ccd60 -> Node0x7fffdb5ccd80;
+	Node0x7fffdb5ccd80 [shape=record,label="{for.body8}"];
+	Node0x7fffdb5ccd80 -> Node0x7fffdb5ccd80[constraint=false];
+	Node0x7fffdb5ccd80 -> Node0x7fffdb5cce20;
+	Node0x7fffdb5cce20 [shape=record,label="{for.inc25}"];
+	Node0x7fffdb5cce20 -> Node0x7fffdb5ccd60[constraint=false];
+	Node0x7fffdb5cce20 -> Node0x7fffdb5cce80;
+	Node0x7fffdb5cce80 [shape=record,label="{for.inc28}"];
+	Node0x7fffdb5cce80 -> Node0x7fffdb5c7200[constraint=false];
+	Node0x7fffdb5cce80 -> Node0x7fffdb5ccee0;
+	Node0x7fffdb5ccee0 [shape=record,label="{for.end30}"];
 	colorscheme = "paired12"
-        subgraph cluster_0x5ad2c80 {
+        subgraph cluster_0x7fffdb5b8530 {
          label = "";
          style = solid;
          color = 1
-          subgraph cluster_0x5ad2e50 {
+          subgraph cluster_0x7fffdb5b8f40 {
            label = "";
            style = filled;
-            color = 3            subgraph cluster_0x5ad2d00 {
+            color = 3            subgraph cluster_0x7fffdb5b86a0 {
              label = "";
              style = solid;
              color = 5
-              subgraph cluster_0x5ad2dd0 {
+              subgraph cluster_0x7fffdb5cc3c0 {
                label = "";
                style = solid;
                color = 7
-                Node0x5ae0020;
+                Node0x7fffdb5ccd80;
              }
-              Node0x5ade100;
-              Node0x5ae0080;
+              Node0x7fffdb5ccd60;
+              Node0x7fffdb5cce20;
            }
-            Node0x5ade0e0;
-            Node0x5adfef0;
+            Node0x7fffdb5c7200;
+            Node0x7fffdb5cce80;
          }
-          Node0x5abfcf0;
-          Node0x5ade060;
-          Node0x5adff50;
+          Node0x7fffdb5cbd10;
+          Node0x7fffdb5c7140;
+          Node0x7fffdb5ccee0;
        }
 }
--- a/polly/docs/experiments/matmul/scopsonly.main.dot.png
+++ b/polly/docs/experiments/matmul/scopsonly.main.dot.png
--- a/polly/docs/experiments/matmul/scopsonly.print_array.dot
+++ b/polly/docs/experiments/matmul/scopsonly.print_array.dot
@ -1,51 +1,51 @@
 digraph "Scop Graph for 'print_array' function" {
 	label="Scop Graph for 'print_array' function";

-	Node0x5ae5e30 [shape=record,label="{entry}"];
-	Node0x5ae5e30 -> Node0x5ae5f50;
-	Node0x5ae5f50 [shape=record,label="{entry.split}"];
-	Node0x5ae5f50 -> Node0x5ae7d90;
-	Node0x5ae7d90 [shape=record,label="{for.cond1.preheader}"];
-	Node0x5ae7d90 -> Node0x5ae7f20;
-	Node0x5ae7f20 [shape=record,label="{for.body3}"];
-	Node0x5ae7f20 -> Node0x5ae7f40;
-	Node0x5ae7f20 -> Node0x5ae7f60;
-	Node0x5ae7f40 [shape=record,label="{if.then}"];
-	Node0x5ae7f40 -> Node0x5ae7f60;
-	Node0x5ae7f60 [shape=record,label="{for.inc}"];
-	Node0x5ae7f60 -> Node0x5ae7f20[constraint=false];
-	Node0x5ae7f60 -> Node0x5ae7e30;
-	Node0x5ae7e30 [shape=record,label="{for.end}"];
-	Node0x5ae7e30 -> Node0x5ae7d90[constraint=false];
-	Node0x5ae7e30 -> Node0x5ae8110;
-	Node0x5ae8110 [shape=record,label="{for.end12}"];
+	Node0x7fffdb5c9180 [shape=record,label="{entry}"];
+	Node0x7fffdb5c9180 -> Node0x7fffdb5b7940;
+	Node0x7fffdb5b7940 [shape=record,label="{entry.split}"];
+	Node0x7fffdb5b7940 -> Node0x7fffdb5b7960;
+	Node0x7fffdb5b7960 [shape=record,label="{for.cond1.preheader}"];
+	Node0x7fffdb5b7960 -> Node0x7fffdb5b79c0;
+	Node0x7fffdb5b79c0 [shape=record,label="{for.body3}"];
+	Node0x7fffdb5b79c0 -> Node0x7fffdb5b79e0;
+	Node0x7fffdb5b79c0 -> Node0x7fffdb5b7a80;
+	Node0x7fffdb5b79e0 [shape=record,label="{if.then}"];
+	Node0x7fffdb5b79e0 -> Node0x7fffdb5b7a80;
+	Node0x7fffdb5b7a80 [shape=record,label="{for.inc}"];
+	Node0x7fffdb5b7a80 -> Node0x7fffdb5b79c0[constraint=false];
+	Node0x7fffdb5b7a80 -> Node0x7fffdb5b7ae0;
+	Node0x7fffdb5b7ae0 [shape=record,label="{for.end}"];
+	Node0x7fffdb5b7ae0 -> Node0x7fffdb5b7960[constraint=false];
+	Node0x7fffdb5b7ae0 -> Node0x7fffdb5b7b40;
+	Node0x7fffdb5b7b40 [shape=record,label="{for.end12}"];
 	colorscheme = "paired12"
-        subgraph cluster_0x5abb9a0 {
+        subgraph cluster_0x7fffdb5b8530 {
          label = "";
          style = solid;
          color = 1
-          subgraph cluster_0x5ae32c0 {
+          subgraph cluster_0x7fffdb5cc3c0 {
            label = "Call instruction:   %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2";
            style = solid;
            color = 6
-            subgraph cluster_0x5ae3240 {
+            subgraph cluster_0x7fffdb5b86a0 {
              label = "Call instruction:   %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2";
              style = solid;
              color = 5
-              subgraph cluster_0x5abba20 {
+              subgraph cluster_0x7fffdb5b8f40 {
                label = "Region can not profitably be optimized!";
                style = solid;
                color = 7
-                Node0x5ae7f20;
-                Node0x5ae7f40;
+                Node0x7fffdb5b79c0;
+                Node0x7fffdb5b79e0;
              }
-              Node0x5ae7f60;
+              Node0x7fffdb5b7a80;
            }
-            Node0x5ae7d90;
-            Node0x5ae7e30;
+            Node0x7fffdb5b7960;
+            Node0x7fffdb5b7ae0;
          }
-          Node0x5ae5e30;
-          Node0x5ae5f50;
-          Node0x5ae8110;
+          Node0x7fffdb5c9180;
+          Node0x7fffdb5b7940;
+          Node0x7fffdb5b7b40;
        }
 }
--- a/polly/docs/experiments/matmul/scopsonly.print_array.dot.png
+++ b/polly/docs/experiments/matmul/scopsonly.print_array.dot.png