GPU data tiling: Refine tile dimensions, more preparation for thread distribution. (iree-org#18556)

* The unrolling factors in `DataTiledMMAAttr` get split between plain
  unrolling and unroll-to-subgroups.
* The dimensions in `TileSwizzle` get an enum telling whether they are
  cross-thread / cross-intrinsic (see the sketch below).
* `getSwizzle` gets moved to GPUTileSwizzleUtils as it is going to be
  used in codegen outside of MaterializeEncoding.

---------

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
bjacob authored Sep 24, 2024
1 parent b2dd6db commit 9158a90
Showing 14 changed files with 309 additions and 147 deletions.
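Before the per-file diffs, here is a minimal sketch of the TileSwizzle structure that these changes revolve around, reconstructed from its usages in the hunks below. TileSwizzle.h itself is among the files not shown in this capture, so everything beyond the member names that appear in the diff (the SmallVector inline sizes, the enum's underlying type, comment wording) is an assumption, not the authoritative definition:

    #include <cstdint>

    #include "llvm/ADT/SmallVector.h"

    // Sketch only -- reconstructed from usages in this commit.
    struct TileSwizzle {
      struct Dim {
        // What this commit adds: each swizzle dimension is tagged with how it
        // relates to thread distribution.
        enum class Kind : int8_t {
          Internal,       // Internal to one thread's slice of one intrinsic.
          CrossThread,    // Crosses threads, e.g. unroll-to-subgroups dims.
          CrossIntrinsic, // Crosses unrolled intrinsic invocations.
        };
        Kind kind = Kind::Internal;
        int64_t size = 0;
      };
      using ExpandShapeDimVectorType = llvm::SmallVector<Dim, 4>;
      using ExpandShapeType = llvm::SmallVector<ExpandShapeDimVectorType, 4>;
      // Expansion of each source dimension into a group of swizzle dims.
      ExpandShapeType expandShape;
      // Permutation of the expanded dims; permutation[0] is outermost.
      llvm::SmallVector<int64_t, 8> permutation;
    };

Tagging each dimension with a Kind is what lets the relocated getSwizzle (below) mark plain unrolling as CrossIntrinsic while marking unroll-to-subgroups as CrossThread.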
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
@@ -143,6 +143,7 @@ iree_compiler_cc_library(
         "TileDispatchUsingForall.cpp",
         "TileDispatchUsingInterface.cpp",
         "TileSizeSelection.cpp",
+        "TileSwizzle.cpp",
         "TypePropagationPass.cpp",
         "UserConfig.cpp",
         "VectorizeMemrefCopy.cpp",
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -135,6 +135,7 @@ iree_cc_library(
     "TileDispatchUsingForall.cpp"
     "TileDispatchUsingInterface.cpp"
     "TileSizeSelection.cpp"
+    "TileSwizzle.cpp"
     "TypePropagationPass.cpp"
     "UserConfig.cpp"
     "VectorizeMemrefCopy.cpp"
8 changes: 5 additions & 3 deletions compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
@@ -210,10 +210,12 @@ bool isNarrowNResult(EncodingAttr encoding) {
 }
 
 SmallVector<int64_t>
-getExpandedTileShape(SmallVector<SmallVector<int64_t>> expandShape) {
+getExpandedTileShape(const TileSwizzle::ExpandShapeType &expandShape) {
   SmallVector<int64_t> result;
-  for (auto expandShapeDim : expandShape) {
-    result.append(expandShapeDim);
+  for (auto e : expandShape) {
+    for (auto d : e) {
+      result.push_back(d.size);
+    }
   }
   return result;
 }
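As a worked example of the updated getExpandedTileShape: an expandShape of [[16], [4, 4]] (writing each Dim by its size) flattens to [16, 4, 4]. The new Dim kinds are deliberately ignored here; only sizes contribute to the tile shape.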
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
@@ -143,7 +143,7 @@ bool isNarrowNResult(IREE::Encoding::EncodingAttr encoding);
 
 /// Concatenates the vectors.
 SmallVector<int64_t>
-getExpandedTileShape(SmallVector<SmallVector<int64_t>> expandShape);
+getExpandedTileShape(const TileSwizzle::ExpandShapeType &expandShape);
 
 } // namespace mlir::iree_compiler
 
(File header not captured in this view; the hunks below are from the GPU materialize-encoding pass implementation, per the pass definitions they contain.)
@@ -37,84 +37,6 @@ namespace mlir::iree_compiler {
 #define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS
 #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
 
-/// Returns the index of the dimension whose flattened size (flattening inner
-/// dimensions into it) matches the given `targetSize`. This is used to compute
-/// interleaving indices.
-///
-/// Example:
-///    Input shape = [16, 8, 4, 4]
-///    Input targetSize = 16
-/// -> Return 2, because the tail of the shape starting at index 2 is [4, 4],
-///    whose product equals targetSize.
-static int64_t getDimIdxForTargetSize(ArrayRef<int64_t> shape,
-                                      int64_t targetSize) {
-  int interleaveAt = 0;
-  int size = 1;
-  for (interleaveAt = shape.size() - 1; interleaveAt >= 0; --interleaveAt) {
-    assert(size <= targetSize);
-    assert((targetSize % size) == 0);
-    if (size == targetSize) {
-      break;
-    }
-    size *= shape[interleaveAt];
-  }
-  return interleaveAt;
-}
-
-/// Generates the swizzle for the full data-tiled-mma tile, including all the
-/// relevant unrolling factors.
-static TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
-                              IREE::GPU::MMAFragment fragment) {
-  auto [AType, BType, CType] = mma.getABCElementTypes();
-  int ABits = AType.getIntOrFloatBitWidth();
-  int BBits = BType.getIntOrFloatBitWidth();
-  // TODO(bjacob): Should be looked up from GPU target, instead of hard-coded.
-  const int targetPreferredLoadBitWidth = 128;
-  auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), fragment);
-  switch (fragment) {
-  case IREE::GPU::MMAFragment::Lhs:
-    // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1).
-    // Unroll on K with interleaving, then on M.
-    if (mma.getUnrollK() > 1) {
-      unroll(swizzle, 1, mma.getUnrollK());
-      int interleavingIdx = getDimIdxForTargetSize(
-          swizzle.expandShape[1],
-          targetPreferredLoadBitWidth / (mma.getUnrollK() * ABits));
-      interleave(swizzle, 1, interleavingIdx);
-    }
-    if (mma.getUnrollM() > 1) {
-      unroll(swizzle, 0, mma.getUnrollM());
-    }
-    break;
-  case IREE::GPU::MMAFragment::Rhs:
-    // B-matrix (RHS). Since the pack ops already took care of transposing B,
-    // source dimensions are N (index 0) and K (index 1).
-    // Unroll on K with interleaving, then on N.
-    if (mma.getUnrollK() > 1) {
-      unroll(swizzle, 1, mma.getUnrollK());
-      int interleavingIdx = getDimIdxForTargetSize(
-          swizzle.expandShape[1],
-          targetPreferredLoadBitWidth / (mma.getUnrollK() * BBits));
-      interleave(swizzle, 1, interleavingIdx);
-    }
-    if (mma.getUnrollN() > 1) {
-      unroll(swizzle, 0, mma.getUnrollN());
-    }
-    break;
-  case IREE::GPU::MMAFragment::Acc:
-    // C-matrix (accumulator). Source dimensions are M (index 0) and N (index
-    // 1). Unroll on N, then on M.
-    if (mma.getUnrollN() > 1) {
-      unroll(swizzle, 1, mma.getUnrollN());
-    }
-    if (mma.getUnrollM() > 1) {
-      unroll(swizzle, 0, mma.getUnrollM());
-    }
-    break;
-  }
-  return swizzle;
-}
-
 static bool hasIntrinsic(IREE::GPU::TargetAttr target,
                          IREE::GPU::MMAIntrinsic intrinsic) {
   for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) {
@@ -133,27 +55,30 @@ chooseDataTiledMMAAttr(TypeRange elementTypes, IREE::GPU::TargetAttr target) {
   Type lhs = elementTypes[0];
   Type rhs = elementTypes[1];
   Type out = elementTypes[2];
-  auto match = [=](MMAIntrinsic intrinsic, int unrollM, int unrollN,
+  auto match = [=](MMAIntrinsic intrinsic, int unrollM, int unrollMToThreads,
+                   int unrollN, int unrollNToThreads,
                    int unrollK) -> std::optional<DataTiledMMAAttr> {
     if (!hasIntrinsic(target, intrinsic)) {
       return std::nullopt;
     }
     auto candidate = DataTiledMMAAttr::get(
-        ctx, MMAIntrinsicAttr::get(ctx, intrinsic), unrollM, unrollN, unrollK);
+        ctx, MMAIntrinsicAttr::get(ctx, intrinsic), /*unroll_m=*/unrollM,
+        /*unroll_m_to_subgroups=*/unrollMToThreads, /*unroll_n=*/unrollN,
+        /*unroll_n_to_subgroups=*/unrollNToThreads, /*unroll_k=*/unrollK);
     auto [candidateLhs, candidateRhs, candidateOut] =
         candidate.getABCElementTypes();
     if (candidateLhs != lhs || candidateRhs != rhs || candidateOut != out) {
      return std::nullopt;
     }
     return candidate;
   };
-  if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x4_F32, 8, 8, 4)) {
+  if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x4_F32, 8, 1, 2, 4, 4)) {
     return m;
   }
-  if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x16_F16, 8, 8, 2)) {
+  if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x16_F16, 8, 1, 2, 4, 2)) {
     return m;
   }
-  if (auto m = match(MMAIntrinsic::MFMA_I32_16x16x32_I8, 8, 8, 2)) {
+  if (auto m = match(MMAIntrinsic::MFMA_I32_16x16x32_I8, 8, 1, 2, 4, 2)) {
     return m;
   }
   // Fallback - no architecture-optimized tile size for this case.
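Reading the updated f32 case: (unroll_m, unroll_m_to_subgroups, unroll_n, unroll_n_to_subgroups, unroll_k) = (8, 1, 2, 4, 4) on the 16x16x4 intrinsic gives a data tile of M = 16*8 = 128, N = 16*2*4 = 128, K = 4*4 = 16. That is the same 128x128x16 tile as the previous (8, 8, 4) configuration, except that the N-unrolling is now split into 2 plain copies times 4 subgroups, in preparation for distributing them across threads.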
@@ -220,7 +145,7 @@ struct GPUMaterializeDeviceEncodingPass final
 
 SmallVector<ReassociationIndices>
 getReassociationIndices(int outerDims,
-                        SmallVector<SmallVector<int64_t>> expandShape) {
+                        const TileSwizzle::ExpandShapeType &expandShape) {
   SmallVector<ReassociationIndices> result;
   int expandedIdx = 0;
   for (int i = 0; i < outerDims; ++i) {
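The rest of getReassociationIndices is truncated in this view, but from the visible prefix each of the outerDims leading dimensions gets its own singleton reassociation group, and the remainder presumably emits one group per expandShape entry covering its expanded dims; e.g. with outerDims = 2 and expandShape = [[16], [4, 4]], the result would be [[0], [1], [2], [3, 4]].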
124 changes: 112 additions & 12 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.cpp
@@ -5,15 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/GPU/GPUTileSwizzleUtils.h"
-#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 
 namespace mlir::iree_compiler {
 
 // Given an `expandShape` vector-of-vectors describing the mapping from source
 // dimensions to expanded dimensions, returns the index of the first expanded
 // dimension corresponding to the given source dimension index.
 static int64_t
-getExpandedDimFirstIdx(const SmallVector<SmallVector<int64_t>> &expandShape,
+getExpandedDimFirstIdx(const TileSwizzle::ExpandShapeType &expandShape,
                        int64_t srcIndex) {
   int dstIndexFirst = 0;
   for (int i = 0; i < srcIndex; ++i) {
@@ -22,14 +21,17 @@ getExpandedDimFirstIdx(const SmallVector<SmallVector<int64_t>> &expandShape,
   return dstIndexFirst;
 }
 
-void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor) {
+void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor,
+            TileSwizzle::Dim::Kind kind) {
   assert(unrollFactor > 1);
   int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex);
 
+  TileSwizzle::Dim unrollDim;
+  unrollDim.size = unrollFactor;
+  unrollDim.kind = kind;
   // The new unrolling dimension is inserted at the start of the expandShape
   // dimensions group corresponding to srcIndex.
   swizzle.expandShape[srcIndex].insert(swizzle.expandShape[srcIndex].begin(),
-                                       unrollFactor);
+                                       unrollDim);
   // Since we are not interleaving here, generating side-by-side copies of the
   // original layout, the new unrolling dimension is the new outermost
   // dimension. Existing entries get shifted to make room for it.
Expand Down Expand Up @@ -97,7 +99,10 @@ TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
// shape expansion for now.
TileSwizzle swizzle;
for (auto t : layout.thread) {
swizzle.expandShape.push_back({t});
TileSwizzle::Dim dim;
dim.size = t;
dim.kind = TileSwizzle::Dim::Kind::CrossThread; // Because `layout.thread`.
swizzle.expandShape.push_back({dim});
}
// The layout strides decide the initial swizzle.permutation.
// Some WMMA intrinsics have tstrides=0 values, assert on that as that
@@ -112,9 +117,12 @@ TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
   // Deal with any element size greater than 1 by inserting it innermost.
   // Notice that this is similar to the unroll() function, just creating an
   // inner dimension instead of an outer dimension.
-  for (int i = 0; i < layout.element.size(); ++i) {
-    if (layout.element[i] != 1) {
-      swizzle.expandShape[i].push_back(layout.element[i]);
+  for (auto [i, e] : llvm::enumerate(layout.element)) {
+    if (e != 1) {
+      TileSwizzle::Dim dim;
+      dim.size = e;
+      dim.kind = TileSwizzle::Dim::Kind::Internal; // Because `layout.element`.
+      swizzle.expandShape[i].push_back(dim);
       int newIndex = getExpandedDimFirstIdx(swizzle.expandShape, i + 1) - 1;
       for (auto &p : swizzle.permutation) {
         p += (p >= newIndex);
Expand All @@ -125,13 +133,105 @@ TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
// Deal with any outer size greater than 1 as just a call to unroll.
// Iterate over dims in reverse order because we are creating a new outermost
// dimension each time.
for (int i = layout.outer.size() - 1; i >= 0; --i) {
if (layout.outer[i] != 1) {
unroll(swizzle, i, layout.outer[i]);
for (auto [i, o] : llvm::enumerate(layout.outer)) {
if (o != 1) {
// `layout.outer` means additional Internal dimensions, just like
// `layout.element`, just swizzled outermost.
unroll(swizzle, i, o, TileSwizzle::Dim::Kind::Internal);
}
}

return swizzle;
}

+// Returns the index of the dimension whose flattened size (flattening inner
+// dimensions into it) matches the given `targetSize`. This is used to compute
+// interleaving indices.
+//
+// Example:
+//    Input shape = [16, 8, 4, 4]
+//    Input targetSize = 16
+// -> Return 2, because the tail of the shape starting at index 2 is [4, 4],
+//    whose product equals targetSize.
+static int64_t
+getDimIdxForTargetSize(const TileSwizzle::ExpandShapeDimVectorType &shape,
+                       int64_t targetSize) {
+  int interleaveAt = 0;
+  int size = 1;
+  for (interleaveAt = shape.size() - 1; interleaveAt >= 0; --interleaveAt) {
+    assert(size <= targetSize);
+    assert((targetSize % size) == 0);
+    if (size == targetSize) {
+      break;
+    }
+    size *= shape[interleaveAt].size;
+  }
+  return interleaveAt;
+}
+
+TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
+                       IREE::GPU::MMAFragment fragment) {
+  auto [AType, BType, CType] = mma.getABCElementTypes();
+  int ABits = AType.getIntOrFloatBitWidth();
+  int BBits = BType.getIntOrFloatBitWidth();
+  // TODO(bjacob): Should be looked up from GPU target, instead of hard-coded.
+  const int targetPreferredLoadBitWidth = 128;
+  auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), fragment);
+  using Kind = TileSwizzle::Dim::Kind;
+  switch (fragment) {
+  case IREE::GPU::MMAFragment::Lhs:
+    // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1).
+    // Unroll on K with interleaving, then on M.
+    if (mma.getUnrollK() > 1) {
+      unroll(swizzle, 1, mma.getUnrollK(), Kind::CrossIntrinsic);
+      int interleavingIdx = getDimIdxForTargetSize(
+          swizzle.expandShape[1],
+          targetPreferredLoadBitWidth / (mma.getUnrollK() * ABits));
+      interleave(swizzle, 1, interleavingIdx);
+    }
+    if (mma.getUnrollM() > 1) {
+      unroll(swizzle, 0, mma.getUnrollM(), Kind::CrossIntrinsic);
+    }
+    if (mma.getUnrollMToSubgroups() > 1) {
+      unroll(swizzle, 0, mma.getUnrollMToSubgroups(), Kind::CrossThread);
+    }
+    break;
+  case IREE::GPU::MMAFragment::Rhs:
+    // B-matrix (RHS). Since the pack ops already took care of transposing B,
+    // source dimensions are N (index 0) and K (index 1).
+    // Unroll on K with interleaving, then on N.
+    if (mma.getUnrollK() > 1) {
+      unroll(swizzle, 1, mma.getUnrollK(), Kind::CrossIntrinsic);
+      int interleavingIdx = getDimIdxForTargetSize(
+          swizzle.expandShape[1],
+          targetPreferredLoadBitWidth / (mma.getUnrollK() * BBits));
+      interleave(swizzle, 1, interleavingIdx);
+    }
+    if (mma.getUnrollN() > 1) {
+      unroll(swizzle, 0, mma.getUnrollN(), Kind::CrossIntrinsic);
+    }
+    if (mma.getUnrollNToSubgroups() > 1) {
+      unroll(swizzle, 0, mma.getUnrollNToSubgroups(), Kind::CrossThread);
+    }
+    break;
+  case IREE::GPU::MMAFragment::Acc:
+    // C-matrix (accumulator). Source dimensions are M (index 0) and N (index
+    // 1). Unroll on N, then on M.
+    if (mma.getUnrollN() > 1) {
+      unroll(swizzle, 1, mma.getUnrollN(), Kind::CrossIntrinsic);
+    }
+    if (mma.getUnrollNToSubgroups() > 1) {
+      unroll(swizzle, 1, mma.getUnrollNToSubgroups(), Kind::CrossThread);
+    }
+    if (mma.getUnrollM() > 1) {
+      unroll(swizzle, 0, mma.getUnrollM(), Kind::CrossIntrinsic);
+    }
+    if (mma.getUnrollMToSubgroups() > 1) {
+      unroll(swizzle, 0, mma.getUnrollMToSubgroups(), Kind::CrossThread);
+    }
+    break;
+  }
+  return swizzle;
+}
+
 } // namespace mlir::iree_compiler
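To make the interleaving computation concrete: for MFMA_F32_16x16x16_F16 on the LHS with unroll_k = 2 (the configuration chosen by chooseDataTiledMMAAttr above), the target flattened size passed to getDimIdxForTargetSize is 128 / (2 * 16) = 4, so the freshly unrolled K dimension is interleaved at the point where the inner dimensions of expandShape[1] multiply out to 4, keeping per-thread loads at the preferred 128-bit width.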
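And a minimal usage sketch for the relocated getSwizzle, of the kind the commit message anticipates for codegen outside of MaterializeEncoding. The getSwizzle and getExpandedTileShape calls are the functions shown in this diff; the `mma` value and the MLIR helper applyPermutationToVector are assumed surrounding context rather than part of this commit:

    // Sketch: materialize the full LHS tile shape for a data-tiled MMA attr.
    // `mma` is assumed to come from e.g. chooseDataTiledMMAAttr above.
    TileSwizzle swizzle = getSwizzle(mma, IREE::GPU::MMAFragment::Lhs);
    // Flatten each expandShape group into the expanded tile shape...
    SmallVector<int64_t> shape = getExpandedTileShape(swizzle.expandShape);
    // ...then reorder it into the physical (swizzled) dimension order.
    applyPermutationToVector(shape, swizzle.permutation);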
(File header not captured in this view; per the include guard below, this is GPUTileSwizzleUtils.h.)
@@ -8,6 +8,7 @@
 #define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_GPU_GPUTILESWIZZLEUTILS_H_
 
 #include "iree/compiler/Codegen/Common/TileSwizzle.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
 
 namespace mlir::iree_compiler {
@@ -17,17 +18,26 @@ namespace mlir::iree_compiler {
 TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
                                 IREE::GPU::MMAFragment fragment);
 
+// Returns the swizzle for the full data-tiled-mma tile, including all the
+// relevant unrolling factors.
+TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
+                       IREE::GPU::MMAFragment fragment);
+
 // Unrolls the dimension given by `srcIndex` by the given `unrollFactor`.
 // This is not interleaving layouts. The layout will consist of multiple copies
 // of the input tile, side by side.
 //
+// The enum parameter `kind` initializes the corresponding member on the newly
+// created TileSwizzle::Dim.
+//
 // Example:
 //    Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] }
 //    Input srcIndex = 1
 //    Input unrollFactor = 4
 // -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] }
 //
-void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor);
+void unroll(TileSwizzle &swizzle, int srcIndex, int unrollFactor,
+            TileSwizzle::Dim::Kind kind);
 
 // Interleaves the layout in `swizzle` by mutating `swizzle.permutation` to
 // move permutation[0], the outer-most dimension (which the unroll() function
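To spell out the arithmetic in the unroll() example above: the new dim of size 4 is inserted at expanded index 1 (the start of the srcIndex = 1 group); the existing permutation entries >= 1 then shift up, turning [1, 0] into [2, 0]; and the new dimension is prepended as the outermost entry, giving [1, 2, 0].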
(Diffs for the remaining changed files were not loaded in this page capture.)