Skip to content

Commit

Permalink
[Codegen][GPU] Change the location of barriers in forall fusion (iree…
Browse files Browse the repository at this point in the history
…-org#18542)

The way that barriers are currently inserted for forall fusion is
fragile: it tries to model write-after-read ("WaR") conflicts on tensors
(kind of). Instead, we want to put the barrier around the body of the
whole scf.forall.

See this comment:
iree-org#18490 (comment)
  • Loading branch information
qedawkins authored Sep 26, 2024
1 parent c3fa4d0 commit 7db91ce
Show file tree
Hide file tree
Showing 10 changed files with 113 additions and 274 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,8 @@ transform_dialect::FuseForallOp::apply(transform::TransformRewriter &rewriter,
"extracted slice from the consumer loop");
}

if (failed(GPU::fuseForallIntoSlice(rewriter, producer, consumer,
consumerChain))) {
if (failed(GPU::fuseForallIntoConsumer(rewriter, producer, consumer,
consumerChain))) {
return mlir::emitDefiniteFailure(state.getTopLevel(),
"failed to fuse forall ops");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,26 +47,26 @@ module attributes { transform.with_named_sequence } {
// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
// CHECK-DAG: %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
// CHECK: scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
// CHECK-DAG: %[[OUTID0:.+]] = affine.apply #[[$MAP]](%[[IDX]])
// CHECK-DAG: %[[OUTID1:.+]] = affine.apply #[[$MAP]](%[[IDY]])

// CHECK: %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]]) -> (tensor<128x128xf32>)
// CHECK: %[[LINEARID:.+]] = affine.apply #[[$MAP2]](%[[I]], %[[IDX]], %[[IDY]])
// CHECK: %[[IDS:.+]]:2 = affine.delinearize_index %[[LINEARID]] into (%c1, %c64) : index, index
// CHECK: %[[INID0:.+]] = affine.apply #[[$MAP3]](%[[IDS]]#1)
// CHECK: %[[INSLICE0:.+]] = tensor.extract_slice %[[ARG0]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
// CHECK: %[[INSLICE1:.+]] = tensor.extract_slice %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
// CHECK: %[[COPY:.+]] = linalg.copy ins(%[[INSLICE0]] : tensor<2x128xf32>) outs(%[[INSLICE1]] : tensor<2x128xf32>) -> tensor<2x128xf32>
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1]
// CHECK: scf.yield %[[INSERT]]

// CHECK: %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
// CHECK: %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
// CHECK: ^bb0(%[[INTERMEDIATE:.+]]: tensor<128x128xf32>):
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
// CHECK: iree_gpu.yield %[[SLICE]]
// CHECK: } : tensor<16x16xf32>
// CHECK: %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[INTERMEDIATE]]) -> (tensor<128x128xf32>)
// CHECK: %[[LINEARID:.+]] = affine.apply #[[$MAP2]](%[[I]], %[[IDX]], %[[IDY]])
// CHECK: %[[IDS:.+]]:2 = affine.delinearize_index %[[LINEARID]] into (%c1, %c64) : index, index
// CHECK: %[[INID0:.+]] = affine.apply #[[$MAP3]](%[[IDS]]#1)
// CHECK: %[[INSLICE0:.+]] = tensor.extract_slice %[[ARG0]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
// CHECK: %[[INSLICE1:.+]] = tensor.extract_slice %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
// CHECK: %[[COPY:.+]] = linalg.copy ins(%[[INSLICE0]] : tensor<2x128xf32>) outs(%[[INSLICE1]] : tensor<2x128xf32>) -> tensor<2x128xf32>
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1]
// CHECK: scf.yield %[[INSERT]]
// CHECK: iree_gpu.yield %[[LOOP]]
// CHECK: } : tensor<128x128xf32>

// CHECK-DAG: %[[OUTID0:.+]] = affine.apply #[[$MAP]](%[[IDX]])
// CHECK-DAG: %[[OUTID1:.+]] = affine.apply #[[$MAP]](%[[IDY]])
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[BARRIER]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
// CHECK: %[[OUTSLICE:.+]] = tensor.extract_slice %[[INIT]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
// CHECK: %[[MM:.+]] = linalg.matmul ins(%[[SHUFFLE]], %[[SHUFFLE]] : tensor<16x16xf32>, tensor<16x16xf32>)
// CHECK: %[[MM:.+]] = linalg.matmul ins(%[[SLICE]], %[[SLICE]] : tensor<16x16xf32>, tensor<16x16xf32>)
// CHECK-SAME: outs(%[[OUTSLICE]] : tensor<16x16xf32>) -> tensor<16x16xf32>
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[MM]] into %[[INIT]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
Expand Down Expand Up @@ -122,10 +122,10 @@ module attributes { transform.with_named_sequence } {
// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
// CHECK-DAG: %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
// CHECK: scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
// CHECK: %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[ALLOC]])
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
// CHECK: %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
// CHECK: } : tensor<16x16xf32>
// CHECK: %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
// CHECK: %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %{{.*}})
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
// CHECK: } : tensor<128x128xf32>
// CHECK: } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}

// -----
Expand Down Expand Up @@ -178,14 +178,14 @@ module attributes { transform.with_named_sequence } {
// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
// CHECK-DAG: %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
// CHECK: scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
// CHECK: %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[ALLOC]])
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
// CHECK: %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
// CHECK: %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
// CHECK: ^bb0(%[[INTERMEDIATE:.+]]: tensor<128x128xf32>):
// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[INTERMEDIATE]] {{\[}}[0, 1], [2]{{\]}} output_shape [2, 64, 128]
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[EXPAND]][0, %{{.*}}, %{{.*}}] [1, 16, 16] [1, 1, 1] : tensor<2x64x128xf32> to tensor<16x16xf32>
// CHECK: iree_gpu.yield %[[SLICE]]
// CHECK: } : tensor<16x16xf32>
// CHECK: %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[INTERMEDIATE]])
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
// CHECK: iree_gpu.yield %[[LOOP]]
// CHECK: } : tensor<128x128xf32>
// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[BARRIER]] {{\[}}[0, 1], [2]{{\]}} output_shape [2, 64, 128]
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[EXPAND]][0, %{{.*}}, %{{.*}}] [1, 16, 16] [1, 1, 1] : tensor<2x64x128xf32> to tensor<16x16xf32>
// CHECK: } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}

// -----
Expand Down Expand Up @@ -245,16 +245,16 @@ module attributes { transform.with_named_sequence } {
// CHECK: scf.forall (%[[W_IDX:.+]], %[[W_IDY:.+]]) in (2, 2) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
// CHECK: scf.forall (%[[L_IDX:.+]], %[[L_IDY:.+]]) in (4, 4) {{.*}} -> (tensor<64x64xf32>)

// CHECK: %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]]) -> (tensor<128x128xf32>)
// CHECK: %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
// CHECK: %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %{{.*}}) -> (tensor<128x128xf32>)
// CHECK: %[[FLAT_ID:.+]] = affine.apply #[[$MAP4]](%[[I]], %[[L_IDY]], %[[L_IDX]], %[[W_IDX]], %[[W_IDY]])
// CHECK: %[[IDS:.+]]:2 = affine.delinearize_index %[[FLAT_ID]] into (%c1, %c64) : index, index
// CHECK: %[[IDX:.+]] = affine.apply #[[$MAP5]](%[[IDS]]#1)
// CHECK: %[[COPY:.+]] = linalg.copy
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[IDX]], %[[IDS]]#0] [2, 128]
// CHECK: scf.yield %[[INSERT]]
// CHECK: } : tensor<128x128xf32>

// CHECK: %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
// CHECK: } : tensor<16x16xf32>
// CHECK: } {mapping = [#iree_gpu.lane_id<1>, #iree_gpu.lane_id<0>]}
// CHECK: } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}

Expand Down Expand Up @@ -304,11 +304,11 @@ module attributes { transform.with_named_sequence } {

// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
// CHECK: scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) {{.*}} -> (tensor<128x128xf32>) {
// CHECK: %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
// CHECK: %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %c32{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]])
// CHECK: %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%c32) : index
// CHECK: scf.yield
// CHECK: iree_gpu.barrier_region ins(%[[LOOP]]
// CHECK: iree_gpu.barrier_region ins(%[[ALLOC]]
// CHECK: %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
// CHECK: scf.for %[[I:.+]] = %[[LINEARID]] to %c32{{.*}} step %c64{{.*}}
// CHECK: %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%c32) : index
// CHECK: scf.yield
// CHECK: } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

// -----
Expand Down Expand Up @@ -358,10 +358,10 @@ module attributes { transform.with_named_sequence } {

// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
// CHECK: scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) {{.*}} -> (tensor<128x128xf32>) {
// CHECK-DAG: %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
// CHECK-DAG: %[[PRODCOUNT:.+]] = affine.apply #[[$MAP3]]()[%[[X]], %[[Y]], %[[Z]]]
// CHECK: %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %[[PRODCOUNT]] step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]])
// CHECK: %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%[[Z]], %[[Y]], %[[X]]) : index
// CHECK: scf.yield
// CHECK: iree_gpu.barrier_region ins(%[[LOOP]]
// CHECK: iree_gpu.barrier_region ins(%[[ALLOC]]
// CHECK-DAG: %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
// CHECK-DAG: %[[PRODCOUNT:.+]] = affine.apply #[[$MAP3]]()[%[[X]], %[[Y]], %[[Z]]]
// CHECK: %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %[[PRODCOUNT]] step %c64{{.*}}
// CHECK: %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%[[Z]], %[[Y]], %[[X]]) : index
// CHECK: scf.yield
// CHECK: } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
Original file line number Diff line number Diff line change
Expand Up @@ -71,66 +71,3 @@ module attributes { transform.with_named_sequence } {
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
// CHECK-SAME: : vector<4xf16>, vector<4xf16> into vector<4xf32>
// CHECK: vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>

// -----

// Test input: a single-result iree_gpu.barrier_region. The region receives the
// 6x6 tensor operand as its block argument, extracts the 3x2 slice at offset
// [0, 0] with unit strides, and yields that slice as the op's tensor result.
func.func @barrier_region(%init: tensor<6x6xf32>) -> tensor<3x2xf32> {
%0 = iree_gpu.barrier_region ins(%init : tensor<6x6xf32>) {
^bb0(%intermediate: tensor<6x6xf32>):
%slice = tensor.extract_slice %intermediate[0, 0] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
iree_gpu.yield %slice : tensor<3x2xf32>
} : tensor<3x2xf32>
return %0 : tensor<3x2xf32>
}

// Transform script driving this test case: matches the func.func ops nested
// under %root and applies the iree.vectorize_iree_gpu pattern set to them.
module attributes { transform.with_named_sequence } {
transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.iree.vectorize_iree_gpu
} : !transform.any_op
transform.yield
}
}

// CHECK-LABEL: func @barrier_region
// CHECK: %[[SHUFFLE:.+]] = iree_gpu.barrier_region
// CHECK: ^bb0(%[[INTERMEDIATE:.+]]: tensor<6x6xf32>):
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][0, 0] [3, 2] [1, 1]
// CHECK: %[[READ:.+]] = vector.transfer_read {{.*}} : tensor<3x2xf32>, vector<3x2xf32>
// CHECK: iree_gpu.yield %[[READ]] : vector<3x2xf32>
// CHECK: } : vector<3x2xf32>
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xf32>
// CHECK: vector.transfer_write %[[SHUFFLE]], %[[EMPTY]]

// -----

// Test input: an iree_gpu.barrier_region with two results of different kinds.
// The region yields an index constant (0) alongside the 3x2 slice extracted
// at offset [0, 0] of the 6x6 block argument; both results are returned.
func.func @multi_result_barrier_region(%init: tensor<6x6xf32>) -> (index, tensor<3x2xf32>) {
%0:2 = iree_gpu.barrier_region ins(%init : tensor<6x6xf32>) {
^bb0(%intermediate: tensor<6x6xf32>):
%slice = tensor.extract_slice %intermediate[0, 0] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
%c0 = arith.constant 0 : index
iree_gpu.yield %c0, %slice : index, tensor<3x2xf32>
} : index, tensor<3x2xf32>
return %0#0, %0#1 : index, tensor<3x2xf32>
}

// Transform script for the multi-result case: matches the func.func ops
// nested under %root and applies the iree.vectorize_iree_gpu pattern set.
module attributes { transform.with_named_sequence } {
transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.iree.vectorize_iree_gpu
} : !transform.any_op
transform.yield
}
}

// CHECK-LABEL: func @multi_result_barrier_region
// CHECK: %[[SHUFFLE:.+]]:2 = iree_gpu.barrier_region
// CHECK: ^bb0(%[[INTERMEDIATE:.+]]: tensor<6x6xf32>):
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][0, 0] [3, 2] [1, 1]
// CHECK: %[[READ:.+]] = vector.transfer_read {{.*}} : tensor<3x2xf32>, vector<3x2xf32>
// CHECK: iree_gpu.yield %c0, %[[READ]] : index, vector<3x2xf32>
// CHECK: } : index, vector<3x2xf32>
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xf32>
// CHECK: vector.transfer_write %[[SHUFFLE]]#1, %[[EMPTY]]
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ struct YieldOpBufferizationInterface
YieldOpBufferizationInterface, IREE::GPU::YieldOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return true;
return false;
}

bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ struct FuseForalls final : OpRewritePattern<tensor::ExtractSliceOp> {
return failure();
}

return fuseForallIntoSlice(rewriter, producerForall, sliceParent,
consumerChain);
return fuseForallIntoConsumer(rewriter, producerForall, sliceParent,
consumerChain);
}
};

Expand Down
Loading

0 comments on commit 7db91ce

Please sign in to comment.