From 7fb28e0be78f10feb12d8c35c899fa64de140715 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Tue, 8 Oct 2024 15:35:23 -0400 Subject: [PATCH] [Codegen][GPU] Fix delinearized index order in forall resolution (#18724) --- .../compiler/Codegen/Common/GPU/GPUDistributeForall.cpp | 9 ++++----- .../Codegen/Common/GPU/test/gpu_distribute_forall.mlir | 4 ++-- .../LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp index 4e2e1b1a8817..64623462a526 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp @@ -130,13 +130,12 @@ LogicalResult resolveGPUMappedForallOp(RewriterBase &rewriter, {forLoop.getInductionVar(), flatId}) : forLoop.getInductionVar(); - // We require a descending relative mapping, so delinearize in reverse order. + // We require a descending relative mapping, so we can reuse the upper bound + // sizes directly. auto delinearize = rewriter.create<affine::AffineDelinearizeIndexOp>( - loc, newFlatProducerId, llvm::to_vector(llvm::reverse(delinSizes))); + loc, newFlatProducerId, delinSizes); - SmallVector<Value> newBlockArgs = - llvm::map_to_vector(llvm::reverse(delinearize.getResults()), - [](OpResult r) -> Value { return r; }); + SmallVector<Value> newBlockArgs = delinearize.getResults(); // Step 4. Inline the region of the forall op. 
Operation *forallTerminator = forallOp.getBody()->getTerminator(); diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir index 26f18e6c4aca..fab5da8f4595 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir @@ -133,8 +133,8 @@ func.func @distribute_thread_forall_multi_dim(%out : memref) // CHECK: %[[LINID:.+]] = affine.apply // CHECK-SAME: affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 64 + s2 * 128)>(%[[I]]) // CHECK-SAME: [%[[TX]], %[[TY]], %[[TZ]]] -// CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (%c4, %c8, %c16) : index -// CHECK: memref.store {{.*}}[%[[DELIN]]#2, %[[DELIN]]#1, %[[DELIN]]#0] +// CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (%c16, %c8, %c4) : index +// CHECK: memref.store {{.*}}[%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2] // ----- diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index 08fb78ada09c..c2f002a9f599 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -627,8 +627,8 @@ hal.executable public @main { // CHECK-DAG: %[[IDY:.+]] = gpu.thread_id y // CHECK-DAG: %[[IDZ:.+]] = gpu.thread_id z // CHECK: %[[LINID0:.+]] = affine.apply #[[$MAP]]()[%[[IDX]], %[[IDY]], %[[IDZ]]] -// CHECK: %[[IDS:.+]]:2 = affine.delinearize_index %[[LINID0:.+]] into (%c8, %c4) : index, index -// CHECK: %[[LINID1:.+]] = affine.apply #[[$MAP1]]()[%[[IDS]]#1, %[[IDS]]#0] +// CHECK: %[[IDS:.+]]:2 = affine.delinearize_index %[[LINID0:.+]] into (%c4, %c8) : index, index +// CHECK: %[[LINID1:.+]] = affine.apply #[[$MAP1]]()[%[[IDS]]#0, 
%[[IDS]]#1] // CHECK: scf.for %{{.*}} = %c0 to %c256 step %c4 {{.*}} -> (vector<1x4xf32>) // CHECK: scf.for %{{.*}} = %[[LINID1]] to %c4 step %c32 // CHECK: %[[READ:.+]] = vector.transfer_read {{.*}} : memref<128x256xf32, {{.*}}storage_buffer>>, vector<4xf32>