[Codegen][GPU] Change the location of barriers in forall fusion (iree…

…-org#18542) The way that barriers are currently inserted for forall fusion is fragile and trying to model "WaR" conflicts on tensors (kind of). We instead want to put the barrier around the body of the whole scf.forall. See this comment: iree-org#18490 (comment)
nod-ai · Sep 26, 2024 · 7db91ce · 7db91ce
1 parent c3fa4d0
commit 7db91ce
Show file tree

Hide file tree

Showing 10 changed files with 113 additions and 274 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp
@@ -200,8 +200,8 @@ transform_dialect::FuseForallOp::apply(transform::TransformRewriter &rewriter,
                                      "extracted slice from the consumer loop");
   }
 
-  if (failed(GPU::fuseForallIntoSlice(rewriter, producer, consumer,
-                                      consumerChain))) {
+  if (failed(GPU::fuseForallIntoConsumer(rewriter, producer, consumer,
+                                         consumerChain))) {
     return mlir::emitDefiniteFailure(state.getTopLevel(),
                                      "failed to fuse forall ops");
   }

diff --git a/...src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/transform_fuse_forall.mlir b/...src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/transform_fuse_forall.mlir
@@ -47,26 +47,26 @@ module attributes { transform.with_named_sequence } {
 //   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
 //   CHECK-DAG:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
-//   CHECK-DAG:     %[[OUTID0:.+]] = affine.apply #[[$MAP]](%[[IDX]])
-//   CHECK-DAG:     %[[OUTID1:.+]] = affine.apply #[[$MAP]](%[[IDY]])
 
-//       CHECK:     %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]]) -> (tensor<128x128xf32>)
-//       CHECK:       %[[LINEARID:.+]] = affine.apply #[[$MAP2]](%[[I]], %[[IDX]], %[[IDY]])
-//       CHECK:       %[[IDS:.+]]:2 = affine.delinearize_index %[[LINEARID]] into (%c1, %c64) : index, index
-//       CHECK:       %[[INID0:.+]] = affine.apply #[[$MAP3]](%[[IDS]]#1)
-//       CHECK:       %[[INSLICE0:.+]] = tensor.extract_slice %[[ARG0]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
-//       CHECK:       %[[INSLICE1:.+]] = tensor.extract_slice %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
-//       CHECK:       %[[COPY:.+]] = linalg.copy ins(%[[INSLICE0]] : tensor<2x128xf32>) outs(%[[INSLICE1]] : tensor<2x128xf32>) -> tensor<2x128xf32>
-//       CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1]
-//       CHECK:       scf.yield %[[INSERT]]
-
-//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
+//       CHECK:     %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
 //       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<128x128xf32>):
-//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
-//       CHECK:       iree_gpu.yield %[[SLICE]]
-//       CHECK:     } : tensor<16x16xf32>
+//       CHECK:       %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[INTERMEDIATE]]) -> (tensor<128x128xf32>)
+//       CHECK:         %[[LINEARID:.+]] = affine.apply #[[$MAP2]](%[[I]], %[[IDX]], %[[IDY]])
+//       CHECK:         %[[IDS:.+]]:2 = affine.delinearize_index %[[LINEARID]] into (%c1, %c64) : index, index
+//       CHECK:         %[[INID0:.+]] = affine.apply #[[$MAP3]](%[[IDS]]#1)
+//       CHECK:         %[[INSLICE0:.+]] = tensor.extract_slice %[[ARG0]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
+//       CHECK:         %[[INSLICE1:.+]] = tensor.extract_slice %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
+//       CHECK:         %[[COPY:.+]] = linalg.copy ins(%[[INSLICE0]] : tensor<2x128xf32>) outs(%[[INSLICE1]] : tensor<2x128xf32>) -> tensor<2x128xf32>
+//       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1]
+//       CHECK:         scf.yield %[[INSERT]]
+//       CHECK:       iree_gpu.yield %[[LOOP]]
+//       CHECK:     } : tensor<128x128xf32>
+
+//   CHECK-DAG:     %[[OUTID0:.+]] = affine.apply #[[$MAP]](%[[IDX]])
+//   CHECK-DAG:     %[[OUTID1:.+]] = affine.apply #[[$MAP]](%[[IDY]])
+//       CHECK:     %[[SLICE:.+]] = tensor.extract_slice %[[BARRIER]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
 //       CHECK:     %[[OUTSLICE:.+]] = tensor.extract_slice %[[INIT]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
-//       CHECK:     %[[MM:.+]] = linalg.matmul ins(%[[SHUFFLE]], %[[SHUFFLE]] : tensor<16x16xf32>, tensor<16x16xf32>)
+//       CHECK:     %[[MM:.+]] = linalg.matmul ins(%[[SLICE]], %[[SLICE]] : tensor<16x16xf32>, tensor<16x16xf32>)
 //  CHECK-SAME:       outs(%[[OUTSLICE]] : tensor<16x16xf32>) -> tensor<16x16xf32>
 //       CHECK:     scf.forall.in_parallel {
 //       CHECK:       tensor.parallel_insert_slice %[[MM]] into %[[INIT]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
@@ -122,10 +122,10 @@ module attributes { transform.with_named_sequence } {
 //   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
 //   CHECK-DAG:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
-//       CHECK:     %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[ALLOC]])
-//       CHECK:       %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
-//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
-//       CHECK:       } : tensor<16x16xf32>
+//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
+//       CHECK:       %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %{{.*}})
+//       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
+//       CHECK:     } : tensor<128x128xf32>
 //       CHECK:   } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}
 
 // -----
@@ -178,14 +178,14 @@ module attributes { transform.with_named_sequence } {
 //   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
 //   CHECK-DAG:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
-//       CHECK:     %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[ALLOC]])
-//       CHECK:       %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
-//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
+//       CHECK:     %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
 //       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<128x128xf32>):
-//       CHECK:       %[[EXPAND:.+]] = tensor.expand_shape %[[INTERMEDIATE]] {{\[}}[0, 1], [2]{{\]}} output_shape [2, 64, 128]
-//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[EXPAND]][0, %{{.*}}, %{{.*}}] [1, 16, 16] [1, 1, 1] : tensor<2x64x128xf32> to tensor<16x16xf32>
-//       CHECK:       iree_gpu.yield %[[SLICE]]
-//       CHECK:       } : tensor<16x16xf32>
+//       CHECK:       %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[INTERMEDIATE]])
+//       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
+//       CHECK:       iree_gpu.yield %[[LOOP]]
+//       CHECK:     } : tensor<128x128xf32>
+//       CHECK:     %[[EXPAND:.+]] = tensor.expand_shape %[[BARRIER]] {{\[}}[0, 1], [2]{{\]}} output_shape [2, 64, 128]
+//       CHECK:     %[[SLICE:.+]] = tensor.extract_slice %[[EXPAND]][0, %{{.*}}, %{{.*}}] [1, 16, 16] [1, 1, 1] : tensor<2x64x128xf32> to tensor<16x16xf32>
 //       CHECK:   } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}
 
 // -----
@@ -245,16 +245,16 @@ module attributes { transform.with_named_sequence } {
 //       CHECK:   scf.forall (%[[W_IDX:.+]], %[[W_IDY:.+]]) in (2, 2) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
 //       CHECK:     scf.forall (%[[L_IDX:.+]], %[[L_IDY:.+]]) in (4, 4) {{.*}} -> (tensor<64x64xf32>)
 
-//       CHECK:       %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]]) -> (tensor<128x128xf32>)
+//       CHECK:       %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
+//       CHECK:       %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %{{.*}}) -> (tensor<128x128xf32>)
 //       CHECK:         %[[FLAT_ID:.+]] = affine.apply #[[$MAP4]](%[[I]], %[[L_IDY]], %[[L_IDX]], %[[W_IDX]], %[[W_IDY]])
 //       CHECK:         %[[IDS:.+]]:2 = affine.delinearize_index %[[FLAT_ID]] into (%c1, %c64) : index, index
 //       CHECK:         %[[IDX:.+]] = affine.apply #[[$MAP5]](%[[IDS]]#1)
 //       CHECK:         %[[COPY:.+]] = linalg.copy
 //       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[IDX]], %[[IDS]]#0] [2, 128]
 //       CHECK:         scf.yield %[[INSERT]]
+//       CHECK:       } : tensor<128x128xf32>
 
-//       CHECK:       %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
-//       CHECK:       } : tensor<16x16xf32>
 //       CHECK:     } {mapping = [#iree_gpu.lane_id<1>, #iree_gpu.lane_id<0>]}
 //       CHECK:   } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}
 
@@ -304,11 +304,11 @@ module attributes { transform.with_named_sequence } {
 
 //       CHECK:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) {{.*}} -> (tensor<128x128xf32>) {
-//       CHECK:     %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
-//       CHECK:     %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %c32{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]])
-//       CHECK:       %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%c32) : index
-//       CHECK:       scf.yield
-//       CHECK:     iree_gpu.barrier_region ins(%[[LOOP]]
+//       CHECK:     iree_gpu.barrier_region ins(%[[ALLOC]]
+//       CHECK:       %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
+//       CHECK:       scf.for %[[I:.+]] = %[[LINEARID]] to %c32{{.*}} step %c64{{.*}}
+//       CHECK:         %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%c32) : index
+//       CHECK:         scf.yield
 //       CHECK:   } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
 
 // -----
@@ -358,10 +358,10 @@ module attributes { transform.with_named_sequence } {
 
 //       CHECK:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) {{.*}} -> (tensor<128x128xf32>) {
-//   CHECK-DAG:     %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
-//   CHECK-DAG:     %[[PRODCOUNT:.+]] = affine.apply #[[$MAP3]]()[%[[X]], %[[Y]], %[[Z]]]
-//       CHECK:     %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %[[PRODCOUNT]] step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]])
-//       CHECK:       %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%[[Z]], %[[Y]], %[[X]]) : index
-//       CHECK:       scf.yield
-//       CHECK:     iree_gpu.barrier_region ins(%[[LOOP]]
+//       CHECK:     iree_gpu.barrier_region ins(%[[ALLOC]]
+//   CHECK-DAG:       %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
+//   CHECK-DAG:       %[[PRODCOUNT:.+]] = affine.apply #[[$MAP3]]()[%[[X]], %[[Y]], %[[Z]]]
+//       CHECK:       %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %[[PRODCOUNT]] step %c64{{.*}}
+//       CHECK:         %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%[[Z]], %[[Y]], %[[X]]) : index
+//       CHECK:         scf.yield
 //       CHECK:   } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
diff --git a/...rc/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir b/...rc/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir
@@ -71,66 +71,3 @@ module attributes { transform.with_named_sequence } {
 //       CHECK:   %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
 //  CHECK-SAME:     : vector<4xf16>, vector<4xf16> into vector<4xf32>
 //       CHECK:   vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
-
-// -----
-
-func.func @barrier_region(%init: tensor<6x6xf32>) -> tensor<3x2xf32> {
-  %0 = iree_gpu.barrier_region ins(%init : tensor<6x6xf32>) {
-  ^bb0(%intermediate: tensor<6x6xf32>):
-    %slice = tensor.extract_slice %intermediate[0, 0] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
-    iree_gpu.yield %slice : tensor<3x2xf32>
-  } : tensor<3x2xf32>
-  return %0 : tensor<3x2xf32>
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
-    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.vectorize_iree_gpu
-    } : !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: func @barrier_region
-//       CHECK:   %[[SHUFFLE:.+]] = iree_gpu.barrier_region
-//       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<6x6xf32>):
-//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][0, 0] [3, 2] [1, 1]
-//       CHECK:       %[[READ:.+]] = vector.transfer_read {{.*}} : tensor<3x2xf32>, vector<3x2xf32>
-//       CHECK:       iree_gpu.yield %[[READ]] : vector<3x2xf32>
-//       CHECK:   } : vector<3x2xf32>
-//       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xf32>
-//       CHECK:   vector.transfer_write %[[SHUFFLE]], %[[EMPTY]]
-
-// -----
-
-func.func @multi_result_barrier_region(%init: tensor<6x6xf32>) -> (index, tensor<3x2xf32>) {
-  %0:2 = iree_gpu.barrier_region ins(%init : tensor<6x6xf32>) {
-  ^bb0(%intermediate: tensor<6x6xf32>):
-    %slice = tensor.extract_slice %intermediate[0, 0] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
-    %c0 = arith.constant 0 : index
-    iree_gpu.yield %c0, %slice : index, tensor<3x2xf32>
-  } : index, tensor<3x2xf32>
-  return %0#0, %0#1 : index, tensor<3x2xf32>
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
-    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.vectorize_iree_gpu
-    } : !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: func @multi_result_barrier_region
-//       CHECK:   %[[SHUFFLE:.+]]:2 = iree_gpu.barrier_region
-//       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<6x6xf32>):
-//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][0, 0] [3, 2] [1, 1]
-//       CHECK:       %[[READ:.+]] = vector.transfer_read {{.*}} : tensor<3x2xf32>, vector<3x2xf32>
-//       CHECK:       iree_gpu.yield %c0, %[[READ]] : index, vector<3x2xf32>
-//       CHECK:   } : index, vector<3x2xf32>
-//       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xf32>
-//       CHECK:   vector.transfer_write %[[SHUFFLE]]#1, %[[EMPTY]]
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp
@@ -234,7 +234,7 @@ struct YieldOpBufferizationInterface
           YieldOpBufferizationInterface, IREE::GPU::YieldOp> {
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
-    return true;
+    return false;
   }
 
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/FuseAndHoistParallelLoops.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/FuseAndHoistParallelLoops.cpp
@@ -75,8 +75,8 @@ struct FuseForalls final : OpRewritePattern<tensor::ExtractSliceOp> {
       return failure();
     }
 
-    return fuseForallIntoSlice(rewriter, producerForall, sliceParent,
-                               consumerChain);
+    return fuseForallIntoConsumer(rewriter, producerForall, sliceParent,
+                                  consumerChain);
   }
 };