blob: f12c7f595c2a985c6263d2a8ab43abf612d579b7 [file] [log] [blame]
Sumesh Udayakumaran24b0df82021-08-03 16:41:551// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion -split-input-file | FileCheck %s
2// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal" -split-input-file | FileCheck %s --check-prefix=MAXIMAL
3
Krzysztof Drewniak121aab82021-09-16 21:25:204// Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
Sumesh Udayakumaran24b0df82021-08-03 16:41:555// Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir
6// Part IV of fusion tests in mlir/test/Transforms/loop-fusion-4.mlir
7
8// -----
9
10// CHECK-LABEL: func @should_fuse_at_depth_above_loop_carried_dependence(%{{.*}}: memref<64x4xf32>, %{{.*}}: memref<64x4xf32>) {
// Producer nest '%i0'/'%i1' stores the constant 0.0 into every element of
// '%out'. Consumer nest '%i2' loads and re-stores '%out' in its innermost
// loop '%i7', and also reads '%arg0' and '%arg1' through unregistered ops.
River Riddlecda6aa72022-04-20 23:22:2111func.func @should_fuse_at_depth_above_loop_carried_dependence(%arg0: memref<64x4xf32>, %arg1: memref<64x4xf32>) {
Sumesh Udayakumaran24b0df82021-08-03 16:41:5512 %out = memref.alloc() : memref<64x4xf32>
Mogballa54f4ea2021-10-12 23:14:5713 %0 = arith.constant 0.0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:5514 affine.for %i0 = 0 to 64 {
15 affine.for %i1 = 0 to 4 {
16 affine.store %0, %out[%i0, %i1] : memref<64x4xf32>
17 }
18 }
19 affine.for %i2 = 0 to 4 {
20 affine.for %i3 = 0 to 4 {
21 affine.for %i4 = 0 to 16 {
22 %v = affine.load %arg1[16 * %i3 - %i4 + 15, %i2] : memref<64x4xf32>
23 "op0"(%v) : (f32) -> ()
24 }
25 affine.for %i5 = 0 to 4 {
26 affine.for %i6 = 0 to 16 {
27 %v = affine.load %arg0[16 * %i5 - %i6 + 15, %i3] : memref<64x4xf32>
28 "op1"(%v) : (f32) -> ()
29 }
30 affine.for %i7 = 0 to 16 {
31 %r = "op2"() : () -> (f32)
32 %v = affine.load %out[16 * %i5 + %i7, %i2] : memref<64x4xf32>
Mogballa54f4ea2021-10-12 23:14:5733 %s = arith.addf %v, %r : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:5534 affine.store %s, %out[16 * %i5 + %i7, %i2] : memref<64x4xf32>
35 }
36 }
37 }
38 }
39
40 // We can fuse source loop nest '%i0' into dst loop nest '%i2', but the
41 // depth at which we can insert the src loop nest slice into the dst loop
42 // nest must be decreased because of a loop carried dependence on loop '%i3'.
43 // As a result, the source loop nest is inserted at dst loop nest depth 1,
44 // just above the loop with the carried dependence. In addition, the source
45 // loop nest iteration bounds on its loop '%i1' are reduced to 1, so the
46 // memref size can be reduced to 64x1xf32.
47
48 // CHECK: memref.alloc() : memref<64x1xf32>
49 // CHECK: affine.for %{{.*}} = 0 to 4 {
50 // CHECK-NEXT: affine.for %{{.*}} = 0 to 64 {
51 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, 0] : memref<64x1xf32>
52 // CHECK-NEXT: }
53 // CHECK-NEXT: affine.for %{{.*}} = 0 to 4 {
54 // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
55 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}} * 16 - %{{.*}} + 15, %{{.*}}] : memref<64x4xf32>
56 // CHECK-NEXT: "op0"(%{{.*}}) : (f32) -> ()
57 // CHECK-NEXT: }
58 // CHECK-NEXT: affine.for %{{.*}} = 0 to 4 {
59 // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
60 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}} * 16 - %{{.*}} + 15, %{{.*}}] : memref<64x4xf32>
61 // CHECK-NEXT: "op1"(%{{.*}}) : (f32) -> ()
62 // CHECK-NEXT: }
63 // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
64 // CHECK-NEXT: %{{.*}} = "op2"() : () -> f32
65 // CHECK: affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, 0] : memref<64x1xf32>
Mogballa54f4ea2021-10-12 23:14:5766 // CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:5567 // CHECK: affine.store %{{.*}}, %{{.*}}[%{{.*}} * 16 + %{{.*}}, 0] : memref<64x1xf32>
68 // CHECK-NEXT: }
69 // CHECK-NEXT: }
70 // CHECK-NEXT: }
71 // CHECK-NEXT: }
72 // CHECK-NEXT: return
73 return
74}
75
76// -----
77
78// CHECK-LABEL: func @should_fuse_only_two_loops_and_remove_producer() {
// Loop '%i0' stores 7.0 into every element of '%a'; loops '%i1' and '%i2'
// each load '%a' and store the loaded value into '%b'.
River Riddlecda6aa72022-04-20 23:22:2179func.func @should_fuse_only_two_loops_and_remove_producer() {
Sumesh Udayakumaran24b0df82021-08-03 16:41:5580 %a = memref.alloc() : memref<10xf32>
81 %b = memref.alloc() : memref<10xf32>
82
Mogballa54f4ea2021-10-12 23:14:5783 %cf7 = arith.constant 7.0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:5584
85 affine.for %i0 = 0 to 10 {
86 affine.store %cf7, %a[%i0] : memref<10xf32>
87 }
88 affine.for %i1 = 0 to 10 {
89 %v0 = affine.load %a[%i1] : memref<10xf32>
90 affine.store %v0, %b[%i1] : memref<10xf32>
91 }
92 affine.for %i2 = 0 to 10 {
93 %v1 = affine.load %a[%i2] : memref<10xf32>
94 affine.store %v1, %b[%i2] : memref<10xf32>
95 }
96
97 // On the first visit to '%i2', the fusion algorithm can not fuse loop nest
98 // '%i0' into '%i2' because of the dependences '%i0' and '%i2' each have on
99 // '%i1'. Then, '%i0' is fused into '%i1' and no private memref is created for
100 // memref '%a' to be able to remove '%i0' and still preserve the dependence on
101 // '%a' with '%i2'.
102 // TODO: Alternatively, we could fuse '%i0' into '%i1' with a private memref,
103 // the dependence between '%i0' and '%i1' on memref '%a' would no longer exist,
104 // and '%i0' could be fused into '%i2' as well. Note that this approach would
105 // duplicate the computation in loop nest '%i0' to loop nests '%i1' and '%i2',
106 // which would limit its profitability.
107 // CHECK: affine.for %{{.*}} = 0 to 10 {
108 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
109 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
110 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
111 // CHECK-NEXT: }
112 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
113 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
114 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
115 // CHECK-NEXT: }
116 // CHECK-NEXT: return
117 return
118}
119
120// -----
121
122// CHECK-LABEL: func @should_fuse_after_one_loop_interchange() {
// Loop '%i0' zero-initializes '%a'; the nest '%i1'/'%i2' loads and re-stores
// each element '%a[%i2]' once per '%i1' iteration.
River Riddlecda6aa72022-04-20 23:22:21123func.func @should_fuse_after_one_loop_interchange() {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55124 %a = memref.alloc() : memref<10xf32>
125
Mogballa54f4ea2021-10-12 23:14:57126 %cf0 = arith.constant 0.0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55127 affine.for %i0 = 0 to 10 {
128 affine.store %cf0, %a[%i0] : memref<10xf32>
129 }
130
131 affine.for %i1 = 0 to 5 {
132 affine.for %i2 = 0 to 10 {
133 %v0 = affine.load %a[%i2] : memref<10xf32>
134 affine.store %v0, %a[%i2] : memref<10xf32>
135 }
136 }
137
138 // The dependence between the load and affine.store is carried on loop '%i1', and
139 // cannot be fused with loop '%i0' without violating this dependence.
140 // Once loops '%i1' and '%i2' are interchanged, loop '%i0' can be fused
141 // at loop depth 1, because the loop carrying the dependence has been
142 // interchanged and is now at depth 2.
143
144 // CHECK: affine.for %{{.*}} = 0 to 10 {
145 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
146 // CHECK-NEXT: affine.for %{{.*}} = 0 to 5 {
147 // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
148 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
149 // CHECK-NEXT: }
150 // CHECK-NEXT: }
151 // CHECK-NEXT: return
152 return
153}
154
155// -----
156// CHECK-LABEL: func @should_fuse_after_two_loop_interchanges() {
// Nest '%i0'/'%i1' zero-initializes the 6x8 memref '%a'; the four-deep nest
// below it doubles '%a[%i3, %i5]' in place (addf of the loaded value with
// itself).
River Riddlecda6aa72022-04-20 23:22:21157func.func @should_fuse_after_two_loop_interchanges() {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55158 %a = memref.alloc() : memref<6x8xf32>
159
Mogballa54f4ea2021-10-12 23:14:57160 %cf0 = arith.constant 0.0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55161 affine.for %i0 = 0 to 6 {
162 affine.for %i1 = 0 to 8 {
163 affine.store %cf0, %a[%i0, %i1] : memref<6x8xf32>
164 }
165 }
166
167 affine.for %i2 = 0 to 4 {
168 affine.for %i3 = 0 to 6 {
169 affine.for %i4 = 0 to 2 {
170 affine.for %i5 = 0 to 8 {
171 %v0 = affine.load %a[%i3, %i5] : memref<6x8xf32>
Mogballa54f4ea2021-10-12 23:14:57172 %v1 = arith.addf %v0, %v0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55173 affine.store %v1, %a[%i3, %i5] : memref<6x8xf32>
174 }
175 }
176 }
177 }
178
179 // The dependence between the load and affine.store is carried on loops '%i2' and
180 // '%i4', and cannot be fused with loop '%i0' without violating this
181 // dependence.
182 // Once loop '%i2' is interchanged with loop '%i3', and again with loop
183 // '%i5', then loop '%i0' can be fused at loop depth 2, because the loops
184 // carrying the dependences have been interchanged with loops at depth > 2.
185
186 // CHECK: affine.for %{{.*}} = 0 to 6 {
187 // CHECK-NEXT: affine.for %{{.*}} = 0 to 8 {
188 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
189 // CHECK-NEXT: affine.for %{{.*}} = 0 to 4 {
190 // CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
191 // CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
Mogballa54f4ea2021-10-12 23:14:57192 // CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55193 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
194 // CHECK-NEXT: }
195 // CHECK-NEXT: }
196 // CHECK-NEXT: }
197 // CHECK-NEXT: }
198 // CHECK-NEXT: return
199 return
200}
201
202// -----
203
// Both loops access the function argument '%arg0', which is also the return
// value. The expected output is a single 0-to-10 loop holding the store
// followed by the load/store pair, all still typed memref<10xf32>.
River Riddlecda6aa72022-04-20 23:22:21204func.func @should_fuse_live_out_writer(%arg0 : memref<10xf32>) -> memref<10xf32> {
Mogballa54f4ea2021-10-12 23:14:57205 %cst = arith.constant 0.000000e+00 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55206 affine.for %i0 = 0 to 10 {
207 affine.store %cst, %arg0[%i0] : memref<10xf32>
208 }
209 affine.for %i1 = 0 to 10 {
210 %1 = affine.load %arg0[%i1] : memref<10xf32>
211 affine.store %1, %arg0[%i1] : memref<10xf32>
212 }
213 return %arg0 : memref<10xf32>
214
Mogballa54f4ea2021-10-12 23:14:57215 // CHECK: %{{.*}} = arith.constant 0.000000e+00 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55216 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
217 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
218 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
219 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
220 // CHECK-NEXT: }
221 // CHECK-NEXT: return %{{.*}} : memref<10xf32>
222}
223
224// -----
225
226// The fused slice has 16 iterations along %i0.
227
228// CHECK-DAG: [[$MAP_LB:#map[0-9]+]] = affine_map<(d0) -> (d0 * 16)>
229// CHECK-DAG: [[$MAP_UB:#map[0-9]+]] = affine_map<(d0) -> (d0 * 16 + 16)>
230
231// CHECK-LABEL: slice_tile
// Producer nest '%i0'/'%i1' stores the scalar argument '%0' into all of
// '%arg1'. The consumer updates '%arg1[16 * %i + %ii, %j]' in place inside
// loops '%i', '%j' and '%k', and separately reads '%arg0' for "foo".
River Riddlecda6aa72022-04-20 23:22:21232func.func @slice_tile(%arg0: memref<128x8xf32>, %arg1: memref<32x8xf32>, %0 : f32) -> memref<32x8xf32> {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55233 affine.for %i0 = 0 to 32 {
234 affine.for %i1 = 0 to 8 {
235 affine.store %0, %arg1[%i0, %i1] : memref<32x8xf32>
236 }
237 }
238 affine.for %i = 0 to 2 {
239 affine.for %j = 0 to 8 {
240 affine.for %k = 0 to 8 {
241 affine.for %kk = 0 to 16 {
242 %v = affine.load %arg0[16 * %k + %kk, %j] : memref<128x8xf32>
243 %r = "foo"(%v) : (f32) -> f32
244 }
245 affine.for %ii = 0 to 16 {
246 %v = affine.load %arg1[16 * %i + %ii, %j] : memref<32x8xf32>
Mogballa54f4ea2021-10-12 23:14:57247 %s = arith.addf %v, %v : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55248 affine.store %s, %arg1[16 * %i + %ii, %j] : memref<32x8xf32>
249 }
250 }
251 }
252 }
253 return %arg1 : memref<32x8xf32>
254}
255// CHECK: affine.for %{{.*}} = 0 to 2 {
256// CHECK-NEXT: affine.for %{{.*}} = 0 to 8 {
257// CHECK-NEXT: affine.for %{{.*}} = [[$MAP_LB]](%{{.*}}) to [[$MAP_UB]](%{{.*}}) {
258// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<32x8xf32>
259// CHECK-NEXT: }
260// CHECK-NEXT: affine.for %{{.*}} = 0 to 8 {
261// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
262// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<128x8xf32>
263// CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> f32
264// CHECK-NEXT: }
265// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
266// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<32x8xf32>
Mogballa54f4ea2021-10-12 23:14:57267// CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55268// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<32x8xf32>
269// CHECK-NEXT: }
270// CHECK-NEXT: }
271// CHECK-NEXT: }
272// CHECK-NEXT: }
273// CHECK-NEXT: return %{{.*}} : memref<32x8xf32>
274// CHECK-NEXT:}
275
276// -----
277
278// Test case which illustrates fix for b/126454413
// The store index '%a2' is computed as '%a0 - %a1', where both operands are
// identity maps of '%i0'; the load reads '%a[%c0]'. The expected output below
// keeps the two three-deep loop nests separate, in their original order.
River Riddlecda6aa72022-04-20 23:22:21279func.func @test_add_slice_bounds() {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55280 %a = memref.alloc() : memref<10xf32>
281 %b = memref.alloc() : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57282 %cf7 = arith.constant 7.0 : f32
283 %c0 = arith.constant 0 : index
Sumesh Udayakumaran24b0df82021-08-03 16:41:55284
285 affine.for %i0 = 0 to 10 {
286 affine.for %i1 = 0 to 10 {
287 affine.for %i2 = 0 to 10 {
288 %a0 = affine.apply affine_map<(d0) -> (d0)> (%i0)
289 %a1 = affine.apply affine_map<(d0) -> (d0)> (%i0)
290 %a2 = affine.apply affine_map<(d0, d1) -> (d0 - d1)> (%a0, %a1)
291 affine.store %cf7, %a[%a2] : memref<10xf32>
292 }
293 }
294 }
295 affine.for %i3 = 0 to 10 {
296 affine.for %i4 = 0 to 10 {
297 affine.for %i5 = 0 to 10 {
298 %v0 = affine.load %a[%c0] : memref<10xf32>
299 }
300 }
301 }
302
303// CHECK: affine.for %{{.*}} = 0 to 10 {
304// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
305// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
306// CHECK-NEXT: affine.apply #map0(%{{.*}})
307// CHECK-NEXT: affine.apply #map0(%{{.*}})
308// CHECK-NEXT: affine.apply #map1(%{{.*}}, %{{.*}})
309// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
310// CHECK-NEXT: }
311// CHECK-NEXT: }
312// CHECK-NEXT: }
313// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
314// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
315// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
316// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
317// CHECK-NEXT: }
318// CHECK-NEXT: }
319// CHECK-NEXT: }
320 return
321}
322
323// -----
324
// '%0' is filled with 7.0 by nest '%i0' and read by both nest '%i4' and nest
// '%i8'. '%arg0' is initialized by nest '%i2' and updated by nest '%i4';
// '%arg1' is initialized by nest '%i6' and updated by nest '%i8'.
River Riddlecda6aa72022-04-20 23:22:21325func.func @should_fuse_init_loops_siblings_then_shared_producer(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>) {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55326 %0 = memref.alloc() : memref<10x10xf32>
Mogballa54f4ea2021-10-12 23:14:57327 %cst = arith.constant 0.000000e+00 : f32
328 %cst_0 = arith.constant 1.000000e+00 : f32
329 %cst_1 = arith.constant 7.000000e+00 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55330 affine.for %i0 = 0 to 10 {
331 affine.for %i1 = 0 to 10 {
332 affine.store %cst_1, %0[%i0, %i1] : memref<10x10xf32>
333 }
334 }
335 affine.for %i2 = 0 to 3 {
336 affine.for %i3 = 0 to 3 {
337 affine.store %cst, %arg0[%i2, %i3] : memref<10x10xf32>
338 }
339 }
340 affine.for %i4 = 0 to 3 {
341 affine.for %i5 = 0 to 3 {
342 %1 = affine.load %0[%i4, %i5] : memref<10x10xf32>
343 %2 = affine.load %arg0[%i4, %i5] : memref<10x10xf32>
Mogballa54f4ea2021-10-12 23:14:57344 %3 = arith.mulf %1, %2 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55345 affine.store %3, %arg0[%i4, %i5] : memref<10x10xf32>
346 }
347 }
348 affine.for %i6 = 0 to 3 {
349 affine.for %i7 = 0 to 3 {
350 affine.store %cst_0, %arg1[%i6, %i7] : memref<10x10xf32>
351 }
352 }
353 affine.for %i8 = 0 to 3 {
354 affine.for %i9 = 0 to 3 {
355 %4 = affine.load %0[%i8, %i9] : memref<10x10xf32>
356 %5 = affine.load %arg1[%i8, %i9] : memref<10x10xf32>
Mogballa54f4ea2021-10-12 23:14:57357 %6 = arith.addf %4, %5 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55358 affine.store %6, %arg1[%i8, %i9] : memref<10x10xf32>
359 }
360 }
361
362 // Pass 1: should fuse single-use producer loop nests into their unique user,
363 // so '%i2' will fuse into '%i4' and '%i6' will fuse into '%i8'.
364 // Pass 2: should fuse sibling loop nests which share no dependence edges,
365 // so should fuse '%i4' into '%i8'.
366 // Pass 3: should fuse single-use producer loop nest '%i0' into '%i8'. (Note
367 // that loop nest '%i0' now has a single user after Pass 2 fused its
368 // two users together).
369
370// CHECK: affine.for %{{.*}} = 0 to 3 {
371// CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
372// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
373// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
374// CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
375// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
Mogballa54f4ea2021-10-12 23:14:57376// CHECK-NEXT: arith.mulf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55377// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
378// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
379// CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
380// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
Mogballa54f4ea2021-10-12 23:14:57381// CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55382// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
383// CHECK-NEXT: }
384// CHECK-NEXT: }
385// CHECK-NEXT: return
386
387 return
388}
389
390// -----
391
// Two consumer nests ('%i2' and '%i4') read the same '%in_matrix' written by
// nest '%i0'. The expected output below has one outer 0-to-10 loop containing
// the producer loop and both consumer loops, with the matrix buffer typed
// memref<10x1xf32>.
River Riddlecda6aa72022-04-20 23:22:21392func.func @two_matrix_vector_products() {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55393 %in_matrix = memref.alloc() : memref<10x10xf32>
394 %in_vec0 = memref.alloc() : memref<10xf32>
395 %in_vec1 = memref.alloc() : memref<10xf32>
396 %out_vec0 = memref.alloc() : memref<10xf32>
397 %out_vec1 = memref.alloc() : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57398 %cf7 = arith.constant 7.0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55399
400 // Populate input matrix.
401 affine.for %i0 = 0 to 10 {
402 affine.for %i1 = 0 to 10 {
403 affine.store %cf7, %in_matrix[%i0, %i1] : memref<10x10xf32>
404 }
405 }
406 // out_vec0 = in_matrix x in_vec0
407 affine.for %i2 = 0 to 10 {
408 affine.for %i3 = 0 to 10 {
409 %v0 = affine.load %in_matrix[%i2, %i3] : memref<10x10xf32>
410 %v1 = affine.load %in_vec0[%i3] : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57411 %v2 = arith.mulf %v0, %v1 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55412 %v3 = affine.load %out_vec0[%i3] : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57413 %v4 = arith.addf %v2, %v3 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55414 affine.store %v4, %out_vec0[%i3] : memref<10xf32>
415 }
416 }
417 // out_vec1 = in_matrix x in_vec1
418 affine.for %i4 = 0 to 10 {
419 affine.for %i5 = 0 to 10 {
420 %v5 = affine.load %in_matrix[%i4, %i5] : memref<10x10xf32>
421 %v6 = affine.load %in_vec1[%i5] : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57422 %v7 = arith.mulf %v5, %v6 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55423 %v8 = affine.load %out_vec1[%i5] : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57424 %v9 = arith.addf %v7, %v8 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55425 affine.store %v9, %out_vec1[%i5] : memref<10xf32>
426 }
427 }
428
429// CHECK: affine.for %{{.*}} = 0 to 10 {
430// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
431// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, 0] : memref<10x1xf32>
432// CHECK-NEXT: }
433// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
434// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, 0] : memref<10x1xf32>
435// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57436// CHECK-NEXT: arith.mulf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55437// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57438// CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55439// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
440// CHECK-NEXT: }
441// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
442// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, 0] : memref<10x1xf32>
443// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57444// CHECK-NEXT: arith.mulf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55445// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
Mogballa54f4ea2021-10-12 23:14:57446// CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55447// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
448// CHECK-NEXT: }
449// CHECK-NEXT: }
450// CHECK-NEXT: return
451 return
452}
453
454// -----
455
// Producer nest '%i0'/'%i1' stores the result of "op1" into '%0'; consumer
// nest '%i2'/'%i3' passes each loaded element to "op2". The expected output
// keeps both inner 0-to-16 loops whole under a single 0-to-100 loop and types
// the buffer memref<1x16xf32>.
River Riddlecda6aa72022-04-20 23:22:21456func.func @should_not_slice_past_slice_barrier() {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55457 %0 = memref.alloc() : memref<100x16xf32>
458 affine.for %i0 = 0 to 100 {
459 affine.for %i1 = 0 to 16 {
460 %1 = "op1"() : () -> f32
461 affine.store %1, %0[%i0, %i1] : memref<100x16xf32>
462 } {slice_fusion_barrier = true}
463 }
464 affine.for %i2 = 0 to 100 {
465 affine.for %i3 = 0 to 16 {
466 %2 = affine.load %0[%i2, %i3] : memref<100x16xf32>
467 "op2"(%2) : (f32) -> ()
468 }
469 }
470 // The 'slice_fusion_barrier' attribute on '%i1' prevents slicing the
471 // iteration space of '%i1' and any enclosing loop nests.
472// CHECK: affine.for %{{.*}} = 0 to 100 {
473// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
474// CHECK-NEXT: %{{.*}} = "op1"() : () -> f32
475// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
476// CHECK-NEXT: } {slice_fusion_barrier = true}
477// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
478// CHECK-NEXT: affine.load %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
479// CHECK-NEXT: "op2"(%{{.*}}) : (f32) -> ()
480// CHECK-NEXT: }
481// CHECK-NEXT: }
482 return
483}
484
485// -----
486
// '#map0' linearizes (d0, d1) as d0 * 16 + d1. The producer is a three-deep
// nest and the consumer a four-deep nest; both index the first dimension of
// '%1' through '#map0'. The expectations that follow this function use the
// MAXIMAL prefix, i.e. they apply to the second RUN line only.
487#map0 = affine_map<(d0, d1) -> (d0 * 16 + d1)>
River Riddlecda6aa72022-04-20 23:22:21488func.func @fuse_across_dim_mismatch(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>, %arg2: memref<9xf32>) {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55489 %1 = memref.alloc() : memref<144x4xf32>
Mogballa54f4ea2021-10-12 23:14:57490 %2 = arith.constant 0.0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55491 affine.for %i2 = 0 to 9 {
492 affine.for %i3 = 0 to 4 {
493 affine.for %i5 = 0 to 16 {
494 %7 = affine.apply #map0(%i2, %i5)
495 affine.store %2, %1[%7, %i3] : memref<144x4xf32>
496 }
497 }
498 }
499 affine.for %i6 = 0 to 9 {
500 affine.for %i7 = 0 to 9 {
501 affine.for %i8 = 0 to 4 {
502 affine.for %i10 = 0 to 16 {
503 %10 = affine.apply #map0(%i6, %i10)
504 %11 = affine.load %1[%10, %i8] : memref<144x4xf32>
505 }
506 }
507 }
508 }
509 return
510}
511// MAXIMAL: #map = affine_map<(d0, d1) -> (d0 * 16 + d1)>
512// MAXIMAL-LABEL: func @fuse_across_dim_mismatch
513// MAXIMAL: memref.alloc() : memref<1x1xf32>
514// MAXIMAL: affine.for %{{.*}} = 0 to 9 {
515// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 9 {
516// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 4 {
517// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 16 {
518// MAXIMAL-NEXT: affine.apply #map(%{{.*}}, %{{.*}})
519// MAXIMAL-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
520// MAXIMAL-NEXT: affine.apply #map(%{{.*}}, %{{.*}})
521// MAXIMAL-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
522// MAXIMAL-NEXT: }
523// MAXIMAL-NEXT: }
524// MAXIMAL-NEXT: }
525// MAXIMAL-NEXT: }
526
527// -----
528
// Nest '%i0'/'%i1' gathers from the six-dimensional memref '%0' into '%1',
// using '#map3' through '#map7' to compute the load indices. '%1' is then read
// by nest '%i2' (first index via '#map10') and by nest '%i6' (first index via
// '#map12'), while '%2' is written through '#map11'.
529#map3 = affine_map<(d0, d1) -> ((d0 * 72 + d1) floordiv 2304)>
530#map4 = affine_map<(d0, d1) -> (((d0 * 72 + d1) mod 2304) floordiv 1152)>
531#map5 = affine_map<(d0, d1) -> (((((d0 * 72 + d1) mod 2304) mod 1152) floordiv 9) floordiv 8)>
532#map6 = affine_map<(d0, d1) -> (((((d0 * 72 + d1) mod 2304) mod 1152) mod 9) floordiv 3)>
533#map7 = affine_map<(d0, d1) -> (((((d0 * 72 + d1) mod 2304) mod 1152) mod 9) mod 3)>
534#map10 = affine_map<(d0, d1) -> (d0 * 16 + d1)>
535#map11 = affine_map<(d0, d1) -> (d0 * 16 + d1)>
536#map12 = affine_map<(d0, d1) -> (d0 * 16 - d1 + 15)>
River Riddlecda6aa72022-04-20 23:22:21537func.func @fuse_across_varying_dims_complex(%arg0: f32) {
Mogballa54f4ea2021-10-12 23:14:57538 %c0 = arith.constant 0 : index
Sumesh Udayakumaran24b0df82021-08-03 16:41:55539 %0 = memref.alloc() : memref<2x2x3x3x16x1xf32>
540 %1 = memref.alloc() : memref<64x9xf32>
541 %2 = memref.alloc() : memref<144x4xf32>
542 affine.for %i0 = 0 to 64 {
543 affine.for %i1 = 0 to 9 {
544 %4 = affine.apply #map3(%i0, %i1)
545 %5 = affine.apply #map4(%i0, %i1)
546 %6 = affine.apply #map5(%i0, %i1)
547 %7 = affine.apply #map6(%i0, %i1)
548 %8 = affine.apply #map7(%i0, %i1)
549 %9 = affine.load %0[%4, %5, %7, %8, %6, %c0] : memref<2x2x3x3x16x1xf32>
550 affine.store %9, %1[%i0, %i1] : memref<64x9xf32>
551 }
552 }
553 affine.for %i2 = 0 to 9 {
554 affine.for %i3 = 0 to 4 {
555 affine.for %i4 = 0 to 16 {
556 %10 = affine.apply #map10(%i3, %i4)
557 %11 = affine.load %1[%10, %i2] : memref<64x9xf32>
558 }
559 affine.for %i5 = 0 to 16 {
560 %14 = affine.apply #map11(%i2, %i5)
561 affine.store %arg0, %2[%14, %i3] : memref<144x4xf32>
562 }
563 }
564 }
565 affine.for %i6 = 0 to 9 {
566 affine.for %i7 = 0 to 9 {
567 affine.for %i8 = 0 to 4 {
568 affine.for %i9 = 0 to 16 {
569 %15 = affine.apply #map12(%i8, %i9)
570 %16 = affine.load %1[%15, %i7] : memref<64x9xf32>
571 }
572 }
573 }
574 }
575 return
576}
577// MAXIMAL-DAG: [[$MAP0:#map[0-9]+]] = affine_map<(d0, d1) -> ((d0 * 72 + d1) floordiv 2304)>
578// MAXIMAL-DAG: [[$MAP1:#map[0-9]+]] = affine_map<(d0, d1) -> (((d0 * 72 + d1) mod 2304) floordiv 1152)>
Krzysztof Drewniak121aab82021-09-16 21:25:20579// MAXIMAL-DAG: [[$MAP2:#map[0-9]+]] = affine_map<(d0, d1) -> ((((d0 * 72 + d1) mod 1152) floordiv 9) floordiv 8)>
580// MAXIMAL-DAG: [[$MAP3:#map[0-9]+]] = affine_map<(d0, d1) -> ((d1 mod 9) floordiv 3)>
581// MAXIMAL-DAG: [[$MAP4:#map[0-9]+]] = affine_map<(d0, d1) -> (d1 mod 3)>
Sumesh Udayakumaran24b0df82021-08-03 16:41:55582// MAXIMAL-DAG: [[$MAP7:#map[0-9]+]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
583// MAXIMAL-DAG: [[$MAP8:#map[0-9]+]] = affine_map<(d0, d1) -> (d0 * 16 - d1 + 15)>
584// MAXIMAL-LABEL: func @fuse_across_varying_dims_complex
585// MAXIMAL-NEXT: memref.alloc() : memref<64x1xf32>
Mogballa54f4ea2021-10-12 23:14:57586// MAXIMAL-NEXT: arith.constant 0 : index
Sumesh Udayakumaran24b0df82021-08-03 16:41:55587// MAXIMAL-NEXT: memref.alloc() : memref<2x2x3x3x16x1xf32>
588// MAXIMAL-NEXT: memref.alloc() : memref<144x4xf32>
589// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 9 {
590// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 9 {
591// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 4 {
592// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 16 {
593// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 {
594// MAXIMAL-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
595// MAXIMAL-NEXT: affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
596// MAXIMAL-NEXT: affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
597// MAXIMAL-NEXT: affine.apply [[$MAP3]](%{{.*}}, %{{.*}})
598// MAXIMAL-NEXT: affine.apply [[$MAP4]](%{{.*}}, %{{.*}})
599// MAXIMAL-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : memref<2x2x3x3x16x1xf32>
600// MAXIMAL-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, 0] : memref<64x1xf32>
601// MAXIMAL-NEXT: }
602// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 4 {
603// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 16 {
604// MAXIMAL-NEXT: affine.apply [[$MAP7]](%{{.*}}, %{{.*}})
605// MAXIMAL-NEXT: affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, 0] : memref<64x1xf32>
606// MAXIMAL-NEXT: }
607// MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 16 {
608// MAXIMAL-NEXT: affine.apply [[$MAP7]](%{{.*}}, %{{.*}})
609// MAXIMAL-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<144x4xf32>
610// MAXIMAL-NEXT: }
611// MAXIMAL-NEXT: }
612// MAXIMAL-NEXT: affine.apply [[$MAP8]](%{{.*}}, %{{.*}})
613// MAXIMAL-NEXT: affine.load %{{.*}}[%{{.*}} * 16 - %{{.*}} + 15, 0] : memref<64x1xf32>
614// MAXIMAL-NEXT: }
615// MAXIMAL-NEXT: }
616// MAXIMAL-NEXT: }
617// MAXIMAL-NEXT: }
618
619// -----
620
// A single store loop over all 100 elements of '%a' feeds two loads with
// different ranges: '%i1' covers 10 to 20 and the nested '%i2' covers 15 to 25.
River Riddlecda6aa72022-04-20 23:22:21621func.func @should_fuse_with_slice_union() {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55622 %a = memref.alloc() : memref<100xf32>
Mogballa54f4ea2021-10-12 23:14:57623 %c0 = arith.constant 0 : index
624 %cf0 = arith.constant 0.0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55625
626 affine.for %i0 = 0 to 100 {
627 affine.store %cf0, %a[%i0]: memref<100xf32>
628 }
629
630 affine.for %i1 = 10 to 20 {
631 %v0 = affine.load %a[%i1]: memref<100xf32>
632 affine.for %i2 = 15 to 25 {
633 %v1 = affine.load %a[%i2]: memref<100xf32>
634 }
635 }
636 // The union of two slice bounds (calculated between the store and each of
637 // the loads) is computed and used in the fusion cost calculation, index
638 // remapping, and private memref size. The result is that the temporary
639 // memref is reduced from 100xf32 to 15xf32 and properly indexed by
640 // the fused loops based on the union calculation.
641// CHECK: affine.for %{{.*}} = 10 to 20 {
642// CHECK-NEXT: affine.for %{{.*}} = 10 to 25 {
643// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} - 10] : memref<15xf32>
644// CHECK-NEXT: }
645// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} - 10] : memref<15xf32>
646// CHECK-NEXT: affine.for %{{.*}} = 15 to 25 {
647// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} - 10] : memref<15xf32>
648// CHECK-NEXT: }
649// CHECK-NEXT: }
650// CHECK-NEXT: return
651 return
652}
653
654// -----
655
// The first nest adds '%arg3' into '%arg2' element-wise. The second nest
// accumulates the product of '%arg0[%i4, %i6]' and '%arg1[%i6, %i5]' into
// '%arg2[%i4, %i5]'.
River Riddlecda6aa72022-04-20 23:22:21656func.func @affine_add_mm_fused(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>, %arg3: memref<1024x1024xf32>) {
Sumesh Udayakumaran24b0df82021-08-03 16:41:55657 affine.for %i2 = 0 to 1024 {
658 affine.for %i3 = 0 to 1024 {
659 %0 = affine.load %arg3[%i2, %i3] : memref<1024x1024xf32>
660 %1 = affine.load %arg2[%i2, %i3] : memref<1024x1024xf32>
Mogballa54f4ea2021-10-12 23:14:57661 %2 = arith.addf %1, %0 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55662 affine.store %2, %arg2[%i2, %i3] : memref<1024x1024xf32>
663 }
664 }
665 affine.for %i4 = 0 to 1024 {
666 affine.for %i5 = 0 to 1024 {
667 affine.for %i6 = 0 to 1024 {
668 %3 = affine.load %arg1[%i6, %i5] : memref<1024x1024xf32>
669 %4 = affine.load %arg0[%i4, %i6] : memref<1024x1024xf32>
Mogballa54f4ea2021-10-12 23:14:57670 %5 = arith.mulf %4, %3 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55671 %6 = affine.load %arg2[%i4, %i5] : memref<1024x1024xf32>
Mogballa54f4ea2021-10-12 23:14:57672 %7 = arith.addf %6, %5 : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55673 affine.store %7, %arg2[%i4, %i5] : memref<1024x1024xf32>
674 }
675 }
676 }
677 // Should fuse elementwise add loop at loop depth 2, above loop-carried
678 // dependence between load/store on '%arg2', carried on reduction loop %i6.
679 // CHECK: affine.for %{{.*}} = 0 to 1024 {
680 // CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 {
681 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
682 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
Mogballa54f4ea2021-10-12 23:14:57683 // CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55684 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
685 // CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 {
686 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
687 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
Mogballa54f4ea2021-10-12 23:14:57688 // CHECK-NEXT: arith.mulf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55689 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
Mogballa54f4ea2021-10-12 23:14:57690 // CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
Sumesh Udayakumaran24b0df82021-08-03 16:41:55691 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
692 // CHECK-NEXT: }
693 // CHECK-NEXT: }
694 // CHECK-NEXT: }
695 return
696}
697
698// -----
699
// Two independent matmul accumulations over the same inputs
// ('%arg2 += %arg0 * %arg1' and '%arg4 += %arg0 * %arg1'), each preceded by a
// loop nest that zero-initializes its accumulator. '%arg3' is unused.
// CHECK-LABEL: func @affine_2mm_fused
func.func @affine_2mm_fused(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>, %arg3: memref<1024x1024xf32>, %arg4: memref<1024x1024xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  // Zero-initialize the first accumulator, '%arg2'.
  affine.for %i0 = 0 to 1024 {
    affine.for %i1 = 0 to 1024 {
      affine.store %cst, %arg2[%i0, %i1] : memref<1024x1024xf32>
    }
  }
  // Zero-initialize the second accumulator, '%arg4'.
  affine.for %i2 = 0 to 1024 {
    affine.for %i3 = 0 to 1024 {
      affine.store %cst, %arg4[%i2, %i3] : memref<1024x1024xf32>
    }
  }
  // First matmul: accumulates into '%arg2', reduction carried on '%i6'.
  affine.for %i4 = 0 to 1024 {
    affine.for %i5 = 0 to 1024 {
      affine.for %i6 = 0 to 1024 {
        %0 = affine.load %arg1[%i6, %i5] : memref<1024x1024xf32>
        %1 = affine.load %arg0[%i4, %i6] : memref<1024x1024xf32>
        %2 = arith.mulf %1, %0 : f32
        %3 = affine.load %arg2[%i4, %i5] : memref<1024x1024xf32>
        %4 = arith.addf %3, %2 : f32
        affine.store %4, %arg2[%i4, %i5] : memref<1024x1024xf32>
      }
    }
  }
  // Second matmul: accumulates into '%arg4', reduction carried on '%i9'.
  affine.for %i7 = 0 to 1024 {
    affine.for %i8 = 0 to 1024 {
      affine.for %i9 = 0 to 1024 {
        %5 = affine.load %arg1[%i9, %i8] : memref<1024x1024xf32>
        %6 = affine.load %arg0[%i7, %i9] : memref<1024x1024xf32>
        %7 = arith.mulf %6, %5 : f32
        %8 = affine.load %arg4[%i7, %i8] : memref<1024x1024xf32>
        %9 = arith.addf %8, %7 : f32
        affine.store %9, %arg4[%i7, %i8] : memref<1024x1024xf32>
      }
    }
  }

  // Should fuse MM initialization loops into their consumers, then fuse the
  // two matmul loops together for input reuse on '%arg0/%arg1'.
  // The expected result is a single outermost loop holding two sibling nests,
  // each of which stores the initial value and then runs its reduction loop.

  // CHECK:      affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT: }

  return
}
767
768// -----
769
// Two chained matmul accumulations: the first accumulates '%arg0 * %arg1' into
// '%arg2'; the second reads '%arg2' as its left operand and accumulates
// '%arg2 * %arg3' into '%arg4'.
// CHECK-LABEL: func @affine_2_dependent_mm_fused
func.func @affine_2_dependent_mm_fused(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>, %arg3: memref<1024x1024xf32>, %arg4: memref<1024x1024xf32>) {
  // Producer matmul: writes '%arg2[%i0, %i1]', reduction carried on '%i2'.
  affine.for %i0 = 0 to 1024 {
    affine.for %i1 = 0 to 1024 {
      affine.for %i2 = 0 to 1024 {
        %0 = affine.load %arg1[%i2, %i1] : memref<1024x1024xf32>
        %1 = affine.load %arg0[%i0, %i2] : memref<1024x1024xf32>
        %2 = arith.mulf %1, %0 : f32
        %3 = affine.load %arg2[%i0, %i1] : memref<1024x1024xf32>
        %4 = arith.addf %3, %2 : f32
        affine.store %4, %arg2[%i0, %i1] : memref<1024x1024xf32>
      }
    }
  }
  // Consumer matmul: reads '%arg2[%i3, %i5]', i.e. a whole row of the
  // producer's result per '%i3' iteration, and accumulates into '%arg4'.
  affine.for %i3 = 0 to 1024 {
    affine.for %i4 = 0 to 1024 {
      affine.for %i5 = 0 to 1024 {
        %5 = affine.load %arg3[%i5, %i4] : memref<1024x1024xf32>
        %6 = affine.load %arg2[%i3, %i5] : memref<1024x1024xf32>
        %7 = arith.mulf %6, %5 : f32
        %8 = affine.load %arg4[%i3, %i4] : memref<1024x1024xf32>
        %9 = arith.addf %8, %7 : f32
        affine.store %9, %arg4[%i3, %i4] : memref<1024x1024xf32>
      }
    }
  }

  // The expected result shares only the outermost loop: the producer's inner
  // 2-d nest runs to completion first, followed by the consumer's inner 2-d
  // nest.

  // CHECK:      affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  return
}
820
821// -----
822
// CHECK-LABEL: func @should_fuse_self_dependence_multi_store_producer() {
func.func @should_fuse_self_dependence_multi_store_producer() {
  %shared = memref.alloc() : memref<10xf32>
  %scratch = memref.alloc() : memref<10xf32>
  %seven = arith.constant 7.0 : f32

  // Producer with two stores: it writes '%scratch', reads the same element
  // back (a dependence within the producer itself), and forwards the value
  // into '%shared'.
  affine.for %p = 0 to 10 {
    affine.store %seven, %scratch[%p] : memref<10xf32>
    %reloaded = affine.load %scratch[%p] : memref<10xf32>
    affine.store %reloaded, %shared[%p] : memref<10xf32>
  }
  // Consumer: reads only '%shared'.
  affine.for %c = 0 to 10 {
    %unused = affine.load %shared[%c] : memref<10xf32>
  }
  // Expect one fused loop. The '%scratch' accesses keep their memref<10xf32>
  // type and loop-indexed subscript, while the store/load pair that connected
  // the two loops goes through a memref<1xf32> at constant index 0.
  // CHECK:      affine.for %[[IV:.*]] = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, [[SCRATCH:%.*]][%[[IV]]] : memref<10xf32>
  // CHECK-NEXT:   [[RELOADED:%.*]] = affine.load [[SCRATCH]][%[[IV]]] : memref<10xf32>
  // CHECK-NEXT:   affine.store [[RELOADED]], %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}
846
847// -----
848
// CHECK-LABEL: func @should_fuse_dead_multi_store_producer() {
func.func @should_fuse_dead_multi_store_producer() {
  %shared = memref.alloc() : memref<10xf32>
  %never_read = memref.alloc() : memref<10xf32>
  %seven = arith.constant 7.0 : f32

  // Producer with two stores: one into '%never_read', which nothing in this
  // function loads from, and one into '%shared'.
  affine.for %p = 0 to 10 {
    affine.store %seven, %never_read[%p] : memref<10xf32>
    affine.store %seven, %shared[%p] : memref<10xf32>
  }
  // Consumer: reads only '%shared'.
  affine.for %c = 0 to 10 {
    %unused = affine.load %shared[%c] : memref<10xf32>
  }
  // Expect one fused loop: the first store keeps its memref<10xf32> type and
  // loop-indexed subscript, while the store/load pair that connected the two
  // loops goes through a memref<1xf32> at constant index 0.
  // CHECK:      affine.for %[[IV:.*]] = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%[[IV]]] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}
870
871// -----
872
// CHECK-LABEL: func @should_fuse_function_live_out_multi_store_producer
func.func @should_fuse_function_live_out_multi_store_producer(%live_in_out_m : memref<10xf32>) {
  %shared = memref.alloc() : memref<10xf32>
  %seven = arith.constant 7.0 : f32

  // Producer with two stores: one into the function argument
  // '%live_in_out_m' and one into the locally allocated '%shared'.
  affine.for %p = 0 to 10 {
    affine.store %seven, %live_in_out_m[%p] : memref<10xf32>
    affine.store %seven, %shared[%p] : memref<10xf32>
  }
  // Consumer: reads only '%shared'.
  affine.for %c = 0 to 10 {
    %unused = affine.load %shared[%c] : memref<10xf32>
  }
  // Expect one fused loop: the first store keeps its memref<10xf32> type and
  // loop-indexed subscript, while the store/load pair that connected the two
  // loops goes through a memref<1xf32> at constant index 0.
  // CHECK:      affine.for %[[IV:.*]] = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%[[IV]]] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}
893
894// Add further tests in mlir/test/Transforms/loop-fusion-4.mlir