From 7fb28e0be78f10feb12d8c35c899fa64de140715 Mon Sep 17 00:00:00 2001
From: Quinn Dawkins <quinn.dawkins@gmail.com>
Date: Tue, 8 Oct 2024 15:35:23 -0400
Subject: [PATCH] [Codegen][GPU] Fix delinearized index order in forall
 resolution (#18724)

---
 .../compiler/Codegen/Common/GPU/GPUDistributeForall.cpp  | 9 ++++-----
 .../Codegen/Common/GPU/test/gpu_distribute_forall.mlir   | 4 ++--
 .../LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir       | 4 ++--
 3 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
index 4e2e1b1a8817..64623462a526 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
@@ -130,13 +130,12 @@ LogicalResult resolveGPUMappedForallOp(RewriterBase &rewriter,
                                             {forLoop.getInductionVar(), flatId})
           : forLoop.getInductionVar();
 
-  // We require a descending relative mapping, so delinearize in reverse order.
+  // We require a descending relative mapping, so we can reuse the upper bound
+  // sizes directly.
   auto delinearize = rewriter.create<affine::AffineDelinearizeIndexOp>(
-      loc, newFlatProducerId, llvm::to_vector(llvm::reverse(delinSizes)));
+      loc, newFlatProducerId, delinSizes);
 
-  SmallVector<Value> newBlockArgs =
-      llvm::map_to_vector(llvm::reverse(delinearize.getResults()),
-                          [](OpResult r) -> Value { return r; });
+  SmallVector<Value> newBlockArgs = delinearize.getResults();
 
   // Step 4. Inline the region of the forall op.
   Operation *forallTerminator = forallOp.getBody()->getTerminator();
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
index 26f18e6c4aca..fab5da8f4595 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
@@ -133,8 +133,8 @@ func.func @distribute_thread_forall_multi_dim(%out : memref<?x?x?xi32>)
 //       CHECK:     %[[LINID:.+]] = affine.apply
 //  CHECK-SAME:       affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 64 + s2 * 128)>(%[[I]])
 //  CHECK-SAME:       [%[[TX]], %[[TY]], %[[TZ]]]
-//       CHECK:     %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (%c4, %c8, %c16) : index
-//       CHECK:     memref.store {{.*}}[%[[DELIN]]#2, %[[DELIN]]#1, %[[DELIN]]#0]
+//       CHECK:     %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (%c16, %c8, %c4) : index
+//       CHECK:     memref.store {{.*}}[%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2]
 
 
 // -----
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 08fb78ada09c..c2f002a9f599 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -627,8 +627,8 @@ hal.executable public @main {
 //   CHECK-DAG:   %[[IDY:.+]] = gpu.thread_id  y
 //   CHECK-DAG:   %[[IDZ:.+]] = gpu.thread_id  z
 //       CHECK:   %[[LINID0:.+]] = affine.apply #[[$MAP]]()[%[[IDX]], %[[IDY]], %[[IDZ]]]
-//       CHECK:   %[[IDS:.+]]:2 = affine.delinearize_index %[[LINID0:.+]] into (%c8, %c4) : index, index
-//       CHECK:   %[[LINID1:.+]] = affine.apply #[[$MAP1]]()[%[[IDS]]#1, %[[IDS]]#0]
+//       CHECK:   %[[IDS:.+]]:2 = affine.delinearize_index %[[LINID0]] into (%c4, %c8) : index, index
+//       CHECK:   %[[LINID1:.+]] = affine.apply #[[$MAP1]]()[%[[IDS]]#0, %[[IDS]]#1]
 //       CHECK:   scf.for %{{.*}} = %c0 to %c256 step %c4 {{.*}} -> (vector<1x4xf32>)
 //       CHECK:     scf.for %{{.*}} = %[[LINID1]] to %c4 step %c32
 //       CHECK:       %[[READ:.+]] = vector.transfer_read {{.*}} : memref<128x256xf32, {{.*}}storage_buffer>>, vector<4xf32>