GPU data tiling: Refine tile dimensions, more preparation for thread distribution. (iree-org#18556)

* The unrolling factors in `DataTiledMMAAttr` get split between plain
  unrolling and unroll-to-subgroups.
* The dimensions in `TileSwizzle` get an enum telling whether they are
  cross-thread / cross-intrinsic (see the sketch below).
* `getSwizzle` gets moved to GPUTileSwizzleUtils as it is going to be
  used in codegen outside of MaterializeEncoding.

---------

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
bjacob authored Sep 24, 2024
1 parent b2dd6db commit 9158a90
Showing 14 changed files with 309 additions and 147 deletions.
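Before the per-file diffs, here is a minimal sketch of the TileSwizzle structure that these changes revolve around, reconstructed from its usages in the hunks below. TileSwizzle.h itself is among the files not shown in this capture, so everything beyond the member names that appear in the diff (the SmallVector inline sizes, the enum's underlying type, comment wording) is an assumption, not the authoritative definition:

    #include <cstdint>

    #include "llvm/ADT/SmallVector.h"

    // Sketch only -- reconstructed from usages in this commit.
    struct TileSwizzle {
      struct Dim {
        // What this commit adds: each swizzle dimension is tagged with how it
        // relates to thread distribution.
        enum class Kind : int8_t {
          Internal,       // Internal to one thread's slice of one intrinsic.
          CrossThread,    // Crosses threads, e.g. unroll-to-subgroups dims.
          CrossIntrinsic, // Crosses unrolled intrinsic invocations.
        };
        Kind kind = Kind::Internal;
        int64_t size = 0;
      };
      using ExpandShapeDimVectorType = llvm::SmallVector<Dim, 4>;
      using ExpandShapeType = llvm::SmallVector<ExpandShapeDimVectorType, 4>;
      // Expansion of each source dimension into a group of swizzle dims.
      ExpandShapeType expandShape;
      // Permutation of the expanded dims; permutation[0] is outermost.
      llvm::SmallVector<int64_t, 8> permutation;
    };

Tagging each dimension with a Kind is what lets the relocated getSwizzle (below) mark plain unrolling as CrossIntrinsic while marking unroll-to-subgroups as CrossThread.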
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
@@ -143,6 +143,7 @@ iree_compiler_cc_library(
         "TileDispatchUsingForall.cpp",
         "TileDispatchUsingInterface.cpp",
         "TileSizeSelection.cpp",
+        "TileSwizzle.cpp",
         "TypePropagationPass.cpp",
         "UserConfig.cpp",
         "VectorizeMemrefCopy.cpp",
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -135,6 +135,7 @@ iree_cc_library(
     "TileDispatchUsingForall.cpp"
     "TileDispatchUsingInterface.cpp"
     "TileSizeSelection.cpp"
+    "TileSwizzle.cpp"
     "TypePropagationPass.cpp"
     "UserConfig.cpp"
     "VectorizeMemrefCopy.cpp"
8 changes: 5 additions & 3 deletions compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
@@ -210,10 +210,12 @@ bool isNarrowNResult(EncodingAttr encoding) {
 }
 
 SmallVector<int64_t>
-getExpandedTileShape(SmallVector<SmallVector<int64_t>> expandShape) {
+getExpandedTileShape(const TileSwizzle::ExpandShapeType &expandShape) {
   SmallVector<int64_t> result;
-  for (auto expandShapeDim : expandShape) {
-    result.append(expandShapeDim);
+  for (auto e : expandShape) {
+    for (auto d : e) {
+      result.push_back(d.size);
+    }
   }
   return result;
 }
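As a worked example of the updated getExpandedTileShape: an expandShape of [[16], [4, 4]] (writing each Dim by its size) flattens to [16, 4, 4]. The new Dim kinds are deliberately ignored here; only sizes contribute to the tile shape.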
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
@@ -143,7 +143,7 @@ bool isNarrowNResult(IREE::Encoding::EncodingAttr encoding);
 
 /// Concatenates the vectors.
 SmallVector<int64_t>
-getExpandedTileShape(SmallVector<SmallVector<int64_t>> expandShape);
+getExpandedTileShape(const TileSwizzle::ExpandShapeType &expandShape);
 
 } // namespace mlir::iree_compiler
 
(File header not captured in this view; the hunks below are from the GPU materialize-encoding pass implementation, per the pass definitions they contain.)
@@ -37,84 +37,6 @@ namespace mlir::iree_compiler {
 #define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS
 #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
 
-/// Returns the index of the dimension whose flattened size (flattening inner
-/// dimensions into it) matches the given `targetSize`. This is used to compute
-/// interleaving indices.
-///
-/// Example:
-///    Input shape = [16, 8, 4, 4]
-///    Input targetSize = 16
-/// -> Return 2, because the tail of the shape starting at index 2 is [4, 4],
-///    whose product equals targetSize.
-static int64_t getDimIdxForTargetSize(ArrayRef<int64_t> shape,
-                                      int64_t targetSize) {
-  int interleaveAt = 0;
-  int size = 1;
-  for (interleaveAt = shape.size() - 1; interleaveAt >= 0; --interleaveAt) {
-    assert(size <= targetSize);
-    assert((targetSize % size) == 0);
-    if (size == targetSize) {
-      break;
-    }
-    size *= shape[interleaveAt];
-  }
-  return interleaveAt;
-}
-
-/// Generates the swizzle for the full data-tiled-mma tile, including all the
-/// relevant unrolling factors.
-static TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
-                              IREE::GPU::MMAFragment fragment) {
-  auto [AType, BType, CType] = mma.getABCElementTypes();
-  int ABits = AType.getIntOrFloatBitWidth();
-  int BBits = BType.getIntOrFloatBitWidth();
-  // TODO(bjacob): Should be looked up from GPU target, instead of hard-coded.
-  const int targetPreferredLoadBitWidth = 128;
-  auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), fragment);
-  switch (fragment) {
-  case IREE::GPU::MMAFragment::Lhs:
-    // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1).
-    // Unroll on K with interleaving, then on M.
-    if (mma.getUnrollK() > 1) {
-      unroll(swizzle, 1, mma.getUnrollK());
-      int interleavingIdx = getDimIdxForTargetSize(
-          swizzle.expandShape[1],
-          targetPreferredLoadBitWidth / (mma.getUnrollK() * ABits));
-      interleave(swizzle, 1, interleavingIdx);
-    }
-    if (mma.getUnrollM() > 1) {
-      unroll(swizzle, 0, mma.getUnrollM());
-    }
-    break;
-  case IREE::GPU::MMAFragment::Rhs:
-    // B-matrix (RHS). Since the pack ops already took care of transposing B,
-    // source dimensions are N (index 0) and K (index 1).
-    // Unroll on K with interleaving, then on N.
-    if (mma.getUnrollK() > 1) {
-      unroll(swizzle, 1, mma.getUnrollK());
-      int interleavingIdx = getDimIdxForTargetSize(
-          swizzle.expandShape[1],
-          targetPreferredLoadBitWidth / (mma.getUnrollK() * BBits));
-      interleave(swizzle, 1, interleavingIdx);
-    }
-    if (mma.getUnrollN() > 1) {
-      unroll(swizzle, 0, mma.getUnrollN());
-    }
-    break;
-  case IREE::GPU::MMAFragment::Acc:
-    // C-matrix (accumulator). Source dimensions are M (index 0) and N (index
-    // 1). Unroll on N, then on M.
-    if (mma.getUnrollN() > 1) {
-      unroll(swizzle, 1, mma.getUnrollN());
-    }
-    if (mma.getUnrollM() > 1) {
-      unroll(swizzle, 0, mma.getUnrollM());
-    }
-    break;
-  }
-  return swizzle;
-}
-
 static bool hasIntrinsic(IREE::GPU::TargetAttr target,
                          IREE::GPU::MMAIntrinsic intrinsic) {
   for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) {
@@ -133,27 +55,30 @@ chooseDataTiledMMAAttr(TypeRange elementTypes, IREE::GPU::TargetAttr target) {
   Type lhs = elementTypes[0];
   Type rhs = elementTypes[1];
   Type out = elementTypes[2];
-  auto match = [=](MMAIntrinsic intrinsic, int unrollM, int unrollN,
+  auto match = [=](MMAIntrinsic intrinsic, int unrollM, int unrollMToThreads,
+                   int unrollN, int unrollNToThreads,
                    int unrollK) -> std::optional<DataTiledMMAAttr> {
     if (!hasIntrinsic(target, intrinsic)) {
       return std::nullopt;
     }
     auto candidate = DataTiledMMAAttr::get(
-        ctx, MMAIntrinsicAttr::get(ctx, intrinsic), unrollM, unrollN, unrollK);
+        ctx, MMAIntrinsicAttr::get(ctx, intrinsic), /*unroll_m=*/unrollM,
+        /*unroll_m_to_subgroups=*/unrollMToThreads, /*unroll_n=*/unrollN,
+        /*unroll_n_to_subgroups=*/unrollNToThreads, /*unroll_k=*/unrollK);
     auto [candidateLhs, candidateRhs, candidateOut] =
         candidate.getABCElementTypes();
     if (candidateLhs != lhs || candidateRhs != rhs || candidateOut != out) {
      return std::nullopt;
     }
     return candidate;
   };
-  if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x4_F32, 8, 8, 4)) {
+  if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x4_F32, 8, 1, 2, 4, 4)) {
     return m;
   }
-  if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x16_F16, 8, 8, 2)) {
+  if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x16_F16, 8, 1, 2, 4, 2)) {
     return m;
   }
-  if (auto m = match(MMAIntrinsic::MFMA_I32_16x16x32_I8, 8, 8, 2)) {
+  if (auto m = match(MMAIntrinsic::MFMA_I32_16x16x32_I8, 8, 1, 2, 4, 2)) {
     return m;
   }
   // Fallback - no architecture-optimized tile size for this case.
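Reading the updated f32 case: (unroll_m, unroll_m_to_subgroups, unroll_n, unroll_n_to_subgroups, unroll_k) = (8, 1, 2, 4, 4) on the 16x16x4 intrinsic gives a data tile of M = 16*8 = 128, N = 16*2*4 = 128, K = 4*4 = 16. That is the same 128x128x16 tile as the previous (8, 8, 4) configuration, except that the N-unrolling is now split into 2 plain copies times 4 subgroups, in preparation for distributing them across threads.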
@@ -220,7 +145,7 @@ struct GPUMaterializeDeviceEncodingPass final
 
 SmallVector<ReassociationIndices>
 getReassociationIndices(int outerDims,
-                        SmallVector<SmallVector<int64_t>> expandShape) {
+                        const TileSwizzle::ExpandShapeType &expandShape) {
   SmallVector<ReassociationIndices> result;
   int expandedIdx = 0;
   for (int i = 0; i < outerDims; ++i) {
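The rest of getReassociationIndices is truncated in this view, but from the visible prefix each of the outerDims leading dimensions gets its own singleton reassociation group, and the remainder presumably emits one group per expandShape entry covering its expanded dims; e.g. with outerDims = 2 and expandShape = [[16], [4, 4]], the result would be [[0], [1], [2], [3, 4]].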
124 changes: 112 additions & 12 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.cpp
@@ -5,15 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h"
-#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 
 namespace mlir::iree_compiler {
 
 // Given an `expandShape` vector-of-vectors describing the mapping from source
 // dimensions to expanded dimensions, returns the index of the first expanded
 // dimension corresponding to the given source dimension index.
 static int64_t
-getExpandedDimFirstIdx(const SmallVector<SmallVector<int64_t>> &expandShape,
+getExpandedDimFirstIdx(const TileSwizzle::ExpandShapeType &expandShape,
                        int64_t srcIndex) {
   int dstIndexFirst = 0;
   for (int i = 0; i < srcIndex; ++i) {
@@ -22,14 +21,17 @@ getExpandedDimFirstIdx(const SmallVector<SmallVector<int64_t>> &expandShape,
   return dstIndexFirst;
 }
 
-void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor) {
+void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor,
+            TileSwizzle::Dim::Kind kind) {
   assert(unrollFactor > 1);
   int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);
 
+  TileSwizzle::Dim unrollDim;
+  unrollDim.size = unrollFactor;
+  unrollDim.kind = kind;
   // The new unrolling dimension is inserted at the start of the expandShape
   // dimensions group corresponding to srcIndex.
   swizzle.expandShape[srcIndex].insert(swizzle.expandShape[srcIndex].begin(),
-                                       unrollFactor);
+                                       unrollDim);
   // Since we are not interleaving here, generating side-by-side copies of the
   // original layout, the new unrolling dimension is the new outermost
   // dimension. Existing entries get shifted to make room for it.
Expand Down Expand Up @@ -97,7 +99,10 @@ TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
// shape expansion for now.
TileSwizzle swizzle;
for (auto t : layout.thread) {
swizzle.expandShape.push_back({t});
TileSwizzle::Dim dim;
dim.size = t;
dim.kind = TileSwizzle::Dim::Kind::CrossThread; // Because `layout.thread`.
swizzle.expandShape.push_back({dim});
}
// The layout strides decide the initial swizzle.permutation.
// Some WMMA intrinsics have tstrides=0 values, assert on that as that
@@ -112,9 +117,12 @@ TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
   // Deal with any element size greater than 1 by inserting it innermost.
   // Notice that this is similar to the unroll() function, just creating an
   // inner dimension instead of an outer dimension.
-  for (int i = 0; i < layout.element.size(); ++i) {
-    if (layout.element[i] != 1) {
-      swizzle.expandShape[i].push_back(layout.element[i]);
+  for (auto [i, e] : llvm::enumerate(layout.element)) {
+    if (e != 1) {
+      TileSwizzle::Dim dim;
+      dim.size = e;
+      dim.kind = TileSwizzle::Dim::Kind::Internal; // Because `layout.element`.
+      swizzle.expandShape[i].push_back(dim);
       int newIndex = getExpandedDimFirstIdx(swizzle.expandShape, i + 1) - 1;
       for (auto &p : swizzle.permutation) {
         p += (p >= newIndex);
Expand All @@ -125,13 +133,105 @@ TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
// Deal with any outer size greater than 1 as just a call to unroll.
// Iterate over dims in reverse order because we are creating a new outermost
// dimension each time.
for (int i = layout.outer.size() - 1; i >= 0; --i) {
if (layout.outer[i] != 1) {
unroll(swizzle, i, layout.outer[i]);
for (auto [i, o] : llvm::enumerate(layout.outer)) {
if (o != 1) {
// `layout.outer` means additional Internal dimensions, just like
// `layout.element`, just swizzled outermost.
unroll(swizzle, i, o, TileSwizzle::Dim::Kind::Internal);
}
}

return swizzle;
}

+// Returns the index of the dimension whose flattened size (flattening inner
+// dimensions into it) matches the given `targetSize`. This is used to compute
+// interleaving indices.
+//
+// Example:
+//    Input shape = [16, 8, 4, 4]
+//    Input targetSize = 16
+// -> Return 2, because the tail of the shape starting at index 2 is [4, 4],
+//    whose product equals targetSize.
+static int64_t
+getDimIdxForTargetSize(const TileSwizzle::ExpandShapeDimVectorType &shape,
+                       int64_t targetSize) {
+  int interleaveAt = 0;
+  int size = 1;
+  for (interleaveAt = shape.size() - 1; interleaveAt >= 0; --interleaveAt) {
+    assert(size <= targetSize);
+    assert((targetSize % size) == 0);
+    if (size == targetSize) {
+      break;
+    }
+    size *= shape[interleaveAt].size;
+  }
+  return interleaveAt;
+}
+
+TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
+                       IREE::GPU::MMAFragment fragment) {
+  auto [AType, BType, CType] = mma.getABCElementTypes();
+  int ABits = AType.getIntOrFloatBitWidth();
+  int BBits = BType.getIntOrFloatBitWidth();
+  // TODO(bjacob): Should be looked up from GPU target, instead of hard-coded.
+  const int targetPreferredLoadBitWidth = 128;
+  auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), fragment);
+  using Kind = TileSwizzle::Dim::Kind;
+  switch (fragment) {
+  case IREE::GPU::MMAFragment::Lhs:
+    // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1).
+    // Unroll on K with interleaving, then on M.
+    if (mma.getUnrollK() > 1) {
+      unroll(swizzle, 1, mma.getUnrollK(), Kind::CrossIntrinsic);
+      int interleavingIdx = getDimIdxForTargetSize(
+          swizzle.expandShape[1],
+          targetPreferredLoadBitWidth / (mma.getUnrollK() * ABits));
+      interleave(swizzle, 1, interleavingIdx);
+    }
+    if (mma.getUnrollM() > 1) {
+      unroll(swizzle, 0, mma.getUnrollM(), Kind::CrossIntrinsic);
+    }
+    if (mma.getUnrollMToSubgroups() > 1) {
+      unroll(swizzle, 0, mma.getUnrollMToSubgroups(), Kind::CrossThread);
+    }
+    break;
+  case IREE::GPU::MMAFragment::Rhs:
+    // B-matrix (RHS). Since the pack ops already took care of transposing B,
+    // source dimensions are N (index 0) and K (index 1).
+    // Unroll on K with interleaving, then on N.
+    if (mma.getUnrollK() > 1) {
+      unroll(swizzle, 1, mma.getUnrollK(), Kind::CrossIntrinsic);
+      int interleavingIdx = getDimIdxForTargetSize(
+          swizzle.expandShape[1],
+          targetPreferredLoadBitWidth / (mma.getUnrollK() * BBits));
+      interleave(swizzle, 1, interleavingIdx);
+    }
+    if (mma.getUnrollN() > 1) {
+      unroll(swizzle, 0, mma.getUnrollN(), Kind::CrossIntrinsic);
+    }
+    if (mma.getUnrollNToSubgroups() > 1) {
+      unroll(swizzle, 0, mma.getUnrollNToSubgroups(), Kind::CrossThread);
+    }
+    break;
+  case IREE::GPU::MMAFragment::Acc:
+    // C-matrix (accumulator). Source dimensions are M (index 0) and N (index
+    // 1). Unroll on N, then on M.
+    if (mma.getUnrollN() > 1) {
+      unroll(swizzle, 1, mma.getUnrollN(), Kind::CrossIntrinsic);
+    }
+    if (mma.getUnrollNToSubgroups() > 1) {
+      unroll(swizzle, 1, mma.getUnrollNToSubgroups(), Kind::CrossThread);
+    }
+    if (mma.getUnrollM() > 1) {
+      unroll(swizzle, 0, mma.getUnrollM(), Kind::CrossIntrinsic);
+    }
+    if (mma.getUnrollMToSubgroups() > 1) {
+      unroll(swizzle, 0, mma.getUnrollMToSubgroups(), Kind::CrossThread);
+    }
+    break;
+  }
+  return swizzle;
+}
+
 } // namespace mlir::iree_compiler
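To make the interleaving computation concrete: for MFMA_F32_16x16x16_F16 on the LHS with unroll_k = 2 (the configuration chosen by chooseDataTiledMMAAttr above), the target flattened size passed to getDimIdxForTargetSize is 128 / (2 * 16) = 4, so the freshly unrolled K dimension is interleaved at the point where the inner dimensions of expandShape[1] multiply out to 4, keeping per-thread loads at the preferred 128-bit width.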
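And a minimal usage sketch for the relocated getSwizzle, of the kind the commit message anticipates for codegen outside of MaterializeEncoding. The getSwizzle and getExpandedTileShape calls are the functions shown in this diff; the `mma` value and the MLIR helper applyPermutationToVector are assumed surrounding context rather than part of this commit:

    // Sketch: materialize the full LHS tile shape for a data-tiled MMA attr.
    // `mma` is assumed to come from e.g. chooseDataTiledMMAAttr above.
    TileSwizzle swizzle = getSwizzle(mma, IREE::GPU::MMAFragment::Lhs);
    // Flatten each expandShape group into the expanded tile shape...
    SmallVector<int64_t> shape = getExpandedTileShape(swizzle.expandShape);
    // ...then reorder it into the physical (swizzled) dimension order.
    applyPermutationToVector(shape, swizzle.permutation);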
(File header not captured in this view; per the include guard below, this is GPUTileSwizzleUtils.h.)
@@ -8,6 +8,7 @@
 #define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_GPU_GPUTILESWIZZLEUTILS_H_
 
 #include "iree/compiler/Codegen/Common/TileSwizzle.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
 
 namespace mlir::iree_compiler {
@@ -17,17 +18,26 @@ namespace mlir::iree_compiler {
 TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
                                 IREE::GPU::MMAFragment fragment);
 
+// Returns the swizzle for the full data-tiled-mma tile, including all the
+// relevant unrolling factors.
+TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
+                       IREE::GPU::MMAFragment fragment);
+
 // Unrolls the dimension given by `srcIndex` by the given `unrollFactor`.
 // This is not interleaving layouts. The layout will consist of multiple copies
 // of the input tile, side by side.
 //
+// The enum parameter `kind` initializes the corresponding member on the newly
+// created TileSwizzle::Dim.
+//
 // Example:
 //    Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] }
 //    Input srcIndex = 1
 //    Input unrollFactor = 4
 // -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
 //
-void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor);
+void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor,
+            TileSwizzle::Dim::Kind kind);
 
 // Interleaves the layout in `swizzle` by mutating `swizzle.permutation` to
 // move permutation[0], the outer-most dimension (which the unroll() function
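To spell out the arithmetic in the unroll() example above: the new dim of size 4 is inserted at expanded index 1 (the start of the srcIndex = 1 group); the existing permutation entries >= 1 then shift up, turning [1, 0] into [2, 0]; and the new dimension is prepended as the outermost entry, giving [1, 2, 0].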
(Diffs for the remaining changed files were not loaded in this page capture.)