diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 611a87454ecf..ca23b0ca6e06 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -245,10 +245,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
   }
 
   // Compute the M/N dimension tile size by multiplying subgroup information.
-  workgroupTileSizes[mDim] =
-      schedule->mWarpCount * schedule->mTileCount * schedule->mSize;
-  workgroupTileSizes[nDim] =
-      schedule->nWarpCount * schedule->nTileCount * schedule->nSize;
+  workgroupTileSizes[mDim] = schedule->mWarpCount * schedule->mTileCount;
+  workgroupTileSizes[nDim] = schedule->nWarpCount * schedule->nTileCount;
 
   // Specify the subgroup tile sizes from the mma schedule. This is applied
   subgroupTileSizes[mDim] = schedule->mTileCount;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index e8c3de89f80e..76b1af3204be 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -19,6 +19,7 @@
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/MarkerUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
+#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
 #include "iree/compiler/Dialect/Util/Transforms/Passes.h"
 #include "iree/compiler/Utils/PassUtils.h"
@@ -190,18 +191,23 @@ static void addBufferizePasses(OpPassManager &funcPassManager) {
 }
 
 static void tileAndDistributeToWorkgroup(
-    OpPassManager &funcPassManager,
+    OpPassManager &funcPassManager, bool useForall,
     std::optional<ConvertToDestinationPassingStylePassOptions>
         convertToDpsOptions = ConvertToDestinationPassingStylePassOptions{}) {
-  funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass(
-      kNumMaxParallelDims,
-      linalg::DistributionMethod::CyclicNumProcsEqNumIters));
-  funcPassManager.addPass(createCSEPass());
-
-  if (convertToDpsOptions) {
+  if (useForall) {
     funcPassManager.addPass(
-        createConvertToDestinationPassingStylePass(*convertToDpsOptions));
+        createTileAndDistributeToWorkgroupsUsingForallOpPass());
+  } else {
+    funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass(
+        kNumMaxParallelDims,
+        linalg::DistributionMethod::CyclicNumProcsEqNumIters));
+    funcPassManager.addPass(createCSEPass());
+    if (convertToDpsOptions) {
+      funcPassManager.addPass(
+          createConvertToDestinationPassingStylePass(*convertToDpsOptions));
+    }
   }
+
   // TODO(#16421): Disable decomposition due to failure in bufferization.
   // funcPassManager.addPass(
   //     IREE::LinalgExt::createTileAndDecomposeAttentionPass());
@@ -212,7 +218,8 @@ static void tileAndDistributeToWorkgroup(
 static void tileAndBufferize(OpPassManager &funcPassManager) {
   ConvertToDestinationPassingStylePassOptions options;
   options.useWARForCooperativeMatrixCodegen = true;
-  tileAndDistributeToWorkgroup(funcPassManager, options);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false,
+                               /*convertToDpsOptions=*/options);
 
   addBufferizePasses(funcPassManager);
 }
@@ -243,7 +250,7 @@ static void addGPUVectorizationPasses(OpPassManager &funcPassManager,
 //===---------------------------------------------------------------------===//
 
 void addGPUVectorizationPassPipeline(OpPassManager &funcPassManager) {
-  tileAndDistributeToWorkgroup(funcPassManager);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCanonicalizerPass());
@@ -323,22 +330,45 @@ static void addGPUBufferizePasses(OpPassManager &funcPassManager) {
   funcPassManager.addPass(createCSEPass());
 }
 
+/// Control function for decomposing pack and unpack ops. Returns true if the
+/// op is a PackOp with a DispatchTensorLoadOp producer, or an UnPackOp with
+/// only DispatchTensorStoreOp consumers.
+LogicalResult isAtBoundary(Operation *op) {
+  if (isa<tensor::PackOp>(op)) {
+    if (isa_and_nonnull<IREE::Flow::DispatchTensorLoadOp>(
+            op->getOperand(0).getDefiningOp())) {
+      return success();
+    }
+  } else if (isa<tensor::UnPackOp>(op)) {
+    if (llvm::all_of(op->getUsers(), [](Operation *user) {
+          return isa<IREE::Flow::DispatchTensorStoreOp>(user);
+        })) {
+      return success();
+    }
+  }
+  return failure();
+}
+
 void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
                                    const GPUPipelineOptions &pipelineOptions) {
-  tileAndDistributeToWorkgroup(funcPassManager,
-                               /*convertToDpsOptions=*/std::nullopt);
-
   // Step 1. Promote matmul operands and pack to intrinsic shapes.
   funcPassManager.addPass(createGPUPromoteMatmulOperandsPass());
   funcPassManager.addPass(IREE::GPU::createPackToIntrinsicsPass());
+  // Decompose packs and unpacks that are at the function boundary.
+  funcPassManager.addPass(createDecomposeBoundaryPackUnPackOpsPass());
 
-  // Step 1.5. Expand result shapes of MultiMmaOps before reduction tiling.
+  // Step 1.5. Expand result shapes of MultiMmaOps before tiling, and
+  // propagate reshapes to the function boundary.
   {
     IREE::GPU::ConcretizeMmaShapesPassOptions options;
     options.concretizeInputs = false;
     options.concretizeResult = true;
     funcPassManager.addPass(IREE::GPU::createConcretizeMmaShapesPass());
   }
+  funcPassManager.addPass(createPropagateReshapesByExpansionPass());
+
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
+                               /*convertToDpsOptions=*/std::nullopt);
 
   // Step 2. Tile and fuse tileable ops to reduction loops.
   {
@@ -468,7 +498,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
 //===---------------------------------------------------------------------===//
 
 void addGPUWinogradVectorizePassPipeline(OpPassManager &funcPassManager) {
-  tileAndDistributeToWorkgroup(funcPassManager);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCanonicalizerPass());
@@ -505,7 +535,7 @@ void addGPUWinogradVectorizePassPipeline(OpPassManager &funcPassManager) {
 
 void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
                                   const GPUPipelineOptions &options) {
-  tileAndDistributeToWorkgroup(funcPassManager);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCanonicalizerPass());
@@ -709,7 +739,7 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline(
 
 void addGPUTransposePassPipeline(OpPassManager &funcPassManager,
                                  const GPUPipelineOptions &options) {
-  tileAndDistributeToWorkgroup(funcPassManager);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCanonicalizerPass());
@@ -814,7 +844,7 @@ static void addVectorBufferizePasses(OpPassManager &funcPassManager) {
 void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
                                         const GPUPipelineOptions &options,
                                         bool usePadToModelSharedMemcpy) {
-  tileAndDistributeToWorkgroup(funcPassManager);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
 
   ReorderWorkgroupsStrategy reorderStrategy =
       getReorderWorkgroupsStrategy(options.reorderStrategy);
@@ -914,7 +944,7 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
 }
 
 void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager) {
-  tileAndDistributeToWorkgroup(funcPassManager);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
   funcPassManager.addPass(createRematerializeParallelOpsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createGPUTileReductionPass());
@@ -958,7 +988,7 @@ void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager) {
 }
 
 void addGPUPackUnPackPasses(OpPassManager &funcPassManager) {
-  tileAndDistributeToWorkgroup(funcPassManager);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
@@ -994,7 +1024,8 @@ void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
                                const GPUPipelineOptions &options) {
   ConvertToDestinationPassingStylePassOptions dpsOptions;
   dpsOptions.useWARForCooperativeMatrixCodegen = true;
-  tileAndDistributeToWorkgroup(funcPassManager, dpsOptions);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false,
+                               /*convertToDpsOptions=*/dpsOptions);
   if (options.enableUkernels) {
     funcPassManager.addPass(createGPULowerToUKernelsPass());
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 53952e953549..b98e85a79713 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -38,7 +38,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 4]
 // CHECK-SAME: subgroup = [0, 0, 4, 1, 0]
-// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]
+// CHECK-SAME: workgroup = [1, 1, 4, 4, 0]
 
 // -----
 
@@ -63,7 +63,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 2]
 // CHECK-SAME: subgroup = [4, 4, 0]
-// CHECK-SAME: workgroup = [128, 128, 0]
+// CHECK-SAME: workgroup = [8, 8, 0]
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 0dc8b0f245a5..912acf310b26 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -50,18 +50,20 @@ hal.executable public @main {
 // CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
 // CHECK-DAG: memref.alloc() : memref<64x8xf16, #gpu.address_space>
 // CHECK-DAG: memref.alloc() : memref<64x8xf16, #gpu.address_space>
-// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1280 step %c4 {{.*}} -> (vector<8x4xf32>)
-// CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2xf16>
-// CHECK-DAG: vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC:[A-Za-z0-9]+]]
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2xf16>
-// CHECK-DAG: vector.transfer_write %[[RHS_RD]], %[[RHS_ALLOC:[A-Za-z0-9]+]]
-// CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<8x4xf16>
-// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf16>
-// CHECK: %[[MM:.+]] = vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]
-// CHECK: scf.yield %[[MM]]
-// CHECK: vector.transfer_write %[[LOOP]], %[[B2]]
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1280 step %c4 {{.*}} -> (vector<8x4xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC:[A-Za-z0-9]+]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]], %[[RHS_ALLOC:[A-Za-z0-9]+]]
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<8x4xf16>
+// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf16>
+// CHECK: %[[MM:.+]] = vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]
+// CHECK: scf.yield %[[MM]]
+// CHECK: vector.transfer_write %[[LOOP]], %[[B2]]
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -71,7 +73,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [64, 64, 0],
+  workgroup = [4, 4, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -112,21 +114,23 @@ hal.executable public @main {
 // CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
 // CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space>
 // CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space>
-// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
-// CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
-// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
-// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
-// CHECK: gpu.barrier
-// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space>, vector<2x1x2x4xf16>
-// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space>, vector<2x1x2x4xf16>
-// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
-// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
-// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
-// CHECK: scf.yield
-// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 1, 3] : vector<2x2x4x1xf32> to vector<2x4x2x1xf32>
-// CHECK: vector.transfer_write %[[LOOP_T]], %[[B2]]
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
+// CHECK: gpu.barrier
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space>, vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space>, vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
+// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
+// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: scf.yield
+// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 1, 3] : vector<2x2x4x1xf32> to vector<2x4x2x1xf32>
+// CHECK: vector.transfer_write %[[LOOP_T]], %[[B2]]
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -136,7 +140,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [1, 64, 64, 0],
+  workgroup = [1, 4, 4, 0],
   reduction = [0, 0, 0, 2],
   subgroup = [1, 2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -154,11 +158,11 @@ hal.executable private @main {
       %cst = arith.constant 0.000000e+00 : f32
       %c0 = arith.constant 0 : index
       %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-      %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 1280], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x34x34x1280xf16>
-      %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1280, 1280], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x1280x1280xf16>
-      %5 = tensor.empty() : tensor<2x16x16x1280xf32>
+      %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11520, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<11520x1280xf16>
+      %5 = tensor.empty() : tensor<2x256x1280xf32>
       %6 = tensor.empty() : tensor<2x256x11520xf16>
       %7 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3]
@@ -166,15 +170,13 @@ hal.executable private @main {
         batch_pos = [0] m_pos = [1, 2] k_pos = [3]
        ins(%3 : tensor<2x34x34x1280xf16>) outs(%6 : tensor<2x256x11520xf16>) -> tensor<2x256x11520xf16>
-      %collapsed = tensor.collapse_shape %4 [[0, 1, 2], [3]] : tensor<3x3x1280x1280xf16> into tensor<11520x1280xf16>
-      %collapsed_0 = tensor.collapse_shape %5 [[0], [1, 2], [3]] : tensor<2x16x16x1280xf32> into tensor<2x256x1280xf32>
-      %8 = linalg.fill ins(%cst : f32) outs(%collapsed_0 : tensor<2x256x1280xf32>) -> tensor<2x256x1280xf32>
+      %8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x256x1280xf32>) -> tensor<2x256x1280xf32>
       %9 = linalg.generic {
         indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
                          affine_map<(d0, d1, d2, d3) -> (d3, d2)>,
                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
         iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-        ins(%7, %collapsed : tensor<2x256x11520xf16>, tensor<11520x1280xf16>)
+        ins(%7, %4 : tensor<2x256x11520xf16>, tensor<11520x1280xf16>)
         outs(%8 : tensor<2x256x1280xf32>) attrs = {lowering_config = #config} {
       ^bb0(%in: f16, %in_1: f16, %out: f32):
         %10 = arith.extf %in : f16 to f32
@@ -183,8 +185,7 @@ hal.executable private @main {
         %13 = arith.addf %12, %out : f32
         linalg.yield %13 : f32
       } -> tensor<2x256x1280xf32>
-      %expanded = tensor.expand_shape %9 [[0], [1, 2], [3]] output_shape [2, 16, 16, 1280] : tensor<2x256x1280xf32> into tensor<2x16x16x1280xf32>
-      flow.dispatch.tensor.store %expanded, %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 16, 1280], strides = [1, 1, 1, 1] : tensor<2x16x16x1280xf32> -> !flow.dispatch.tensor>
+      flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0], sizes = [2, 256, 1280], strides = [1, 1, 1] : tensor<2x256x1280xf32> -> !flow.dispatch.tensor>
       return
     }
   }
@@ -200,22 +201,24 @@ hal.executable private @main {
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C720:.+]] = arith.constant 720 : index
 // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
-// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x2x2x4x1xf32>)
-// CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
-// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
-// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
-// CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
-// CHECK-DAG: %[[LHS_MM1:.+]] = vector.broadcast {{.*}} vector<2x1x2x4xf16> to vector<1x2x1x2x4xf16>
-// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x4x2x1xf16>
-// CHECK-DAG: vector.transpose %[[LHS_MM1]], [0, 1, 3, 2, 4] : vector<1x2x1x2x4xf16> to vector<1x2x2x1x4xf16>
-// CHECK-DAG: vector.transpose %[[RHS_MM]], [0, 2, 3, 1] : vector<2x4x2x1xf16> to vector<2x2x1x4xf16>
-// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
-// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 3, 2, 4] : vector<1x2x2x4x1xf32> to vector<1x2x4x2x1xf32>
-// CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<2x4x2x1xf32> from vector<1x2x4x2x1xf32>
-// CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]]
+// CHECK: scf.forall ({{.*}}) in (2, 4, 20) {
+// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x2x2x4x1xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
+// CHECK-DAG: %[[LHS_MM1:.+]] = vector.broadcast {{.*}} vector<2x1x2x4xf16> to vector<1x2x1x2x4xf16>
+// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x4x2x1xf16>
+// CHECK-DAG: vector.transpose %[[LHS_MM1]], [0, 1, 3, 2, 4] : vector<1x2x1x2x4xf16> to vector<1x2x2x1x4xf16>
+// CHECK-DAG: vector.transpose %[[RHS_MM]], [0, 2, 3, 1] : vector<2x4x2x1xf16> to vector<2x2x1x4xf16>
+// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 3, 2, 4] : vector<1x2x2x4x1xf32> to vector<1x2x4x2x1xf32>
+// CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<2x4x2x1xf32> from vector<1x2x4x2x1xf32>
+// CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]]
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -225,7 +228,7 @@ hal.executable private @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [1, 4, 16, 256, 0],
+  workgroup = [1, 4, 16, 16, 0],
   reduction = [0, 0, 0, 0, 2],
   subgroup = [1, 4, 1, 4, 0],
   mma_kind = #iree_gpu.mma_layout,
@@ -287,6 +290,7 @@ hal.executable private @main {
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C720:.+]] = arith.constant 720 : index
 // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: scf.forall ({{.*}}) in (2, 4, 1, 5) {
 // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x4x1x4x4x1xf32>)
 // CHECK: gpu.barrier
 // CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
@@ -303,6 +307,7 @@ hal.executable private @main {
 // CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
 // CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
 // CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]]
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -312,7 +317,7 @@ hal.executable private @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [64, 64, 0],
+  workgroup = [4, 4, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -353,21 +358,23 @@ hal.executable public @main {
 // CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
 // CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space>
 // CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space>
-// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x8x1x1xf32>)
-// CHECK: gpu.barrier
-// CHECK-DAG: vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
-// CHECK-DAG: vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
-// CHECK-DAG: vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
-// CHECK-DAG: vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
-// CHECK: gpu.barrier
-// CHECK-DAG: vector.transfer_read {{.*}} vector<2x1x2x16xf16>
-// CHECK-DAG: vector.transfer_read {{.*}} vector<2x1x2x16xf16>
-// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x16xf16>
-// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x16xf16>
-// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
-// CHECK: scf.yield
-// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 3, 1, 4] : vector<2x2x8x1x1xf32> to vector<2x8x1x2x1xf32>
-// CHECK: vector.transfer_write %[[LOOP_T]], %[[B2]]
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x8x1x1xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+// CHECK-DAG: vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+// CHECK: gpu.barrier
+// CHECK-DAG: vector.transfer_read {{.*}} vector<2x1x2x16xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} vector<2x1x2x16xf16>
+// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x16xf16>
+// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x16xf16>
+// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
+// CHECK: scf.yield
+// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 3, 1, 4] : vector<2x2x8x1x1xf32> to vector<2x8x1x2x1xf32>
+// CHECK: vector.transfer_write %[[LOOP_T]], %[[B2]]
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -377,7 +384,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [64, 64, 0],
+  workgroup = [4, 4, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -419,9 +426,11 @@ hal.executable public @main {
 // CHECK-LABEL: func @matmul_transpose_b_mfma_16x16x4
 // CHECK-DAG: memref.alloc() : memref<64x10xf32, #gpu.address_space>
 // CHECK-DAG: memref.alloc() : memref<64x10xf32, #gpu.address_space>
-// CHECK: scf.for %{{.*}} = %c0 to %c320 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
-// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 4 : i32, m = 16 : i32, n = 16 : i32
-// CHECK: scf.yield
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: scf.for %{{.*}} = %c0 to %c320 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
+// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 4 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: scf.yield
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -431,7 +440,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [64, 64, 0],
+  workgroup = [4, 4, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -473,9 +482,11 @@ hal.executable public @main {
 // CHECK-LABEL: func @matmul_transpose_b_mfma_16x16x32_f8
 // CHECK-DAG: memref.alloc() : memref<64x72xf8E4M3FNUZ, #gpu.address_space>
 // CHECK-DAG: memref.alloc() : memref<64x72xf8E4M3FNUZ, #gpu.address_space>
-// CHECK: scf.for %{{.*}} = %c0 to %c40 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
-// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32
-// CHECK: scf.yield
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: scf.for %{{.*}} = %c0 to %c40 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
+// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32
+// CHECK: scf.yield
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -485,7 +496,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [64, 64, 0],
+  workgroup = [2, 2, 0],
   reduction = [0, 0, 2],
   subgroup = [1, 1],
   mma_kind = #iree_gpu.mma_layout,
@@ -527,9 +538,11 @@ hal.executable public @main {
 // CHECK-LABEL: func @matmul_transpose_b_mfma_32x32x16_i8
 // CHECK-DAG: memref.alloc() : memref<64x40xi8, #gpu.address_space>
 // CHECK-DAG: memref.alloc() : memref<64x40xi8, #gpu.address_space>
-// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<1x1x4x4x1xi32>)
-// CHECK-COUNT-2: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32
-// CHECK: scf.yield
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<1x1x4x4x1xi32>)
+// CHECK-COUNT-2: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32
+// CHECK: scf.yield
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -539,7 +552,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [64, 64, 0],
+  workgroup = [4, 4, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -581,9 +594,11 @@ hal.executable public @main {
 // CHECK-LABEL: func @matmul_transpose_b_wmma_f16_16x16x16_f16
 // CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space>
 // CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space>
-// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x16x1x1xf16>)
-// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<16xf16>
-// CHECK: scf.yield
+// CHECK: scf.forall ({{.*}}) in (32, 160) {
+// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x16x1x1xf16>)
+// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<16xf16>
+// CHECK: scf.yield
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -639,12 +654,14 @@ hal.executable public @main {
 // the producer's (convolution's) distributed scf.forall loop.
 // CHECK-LABEL: func @conv_nchw_fused
 // CHECK: %[[ALLOCA:.+]] = memref.alloca() : memref<1x1x1x1xf32, #gpu.address_space>
-// CHECK: scf.for %{{.*}} = %c0 to %c64 step %c1
-// CHECK: linalg.conv_2d_nchw_fchw
-// CHECK-SAME: outs(%[[ALLOCA]] : memref<1x1x1x1xf32, #gpu.address_space>)
-// CHECK: arith.addf
-// CHECK: arith.cmpf
-// CHECK: arith.select
+// CHECK: scf.forall ({{.*}}) in (64, 14, 7) {
+// CHECK: scf.for %{{.*}} = %c0 to %c64 step %c1
+// CHECK: linalg.conv_2d_nchw_fchw
+// CHECK-SAME: outs(%[[ALLOCA]] : memref<1x1x1x1xf32, #gpu.address_space>)
+// CHECK: arith.addf
+// CHECK: arith.cmpf
+// CHECK: arith.select
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -715,11 +732,13 @@ hal.executable public @main {
 // CHECK: %[[LINID0:.+]] = affine.apply #[[$MAP]]()[%[[IDX]], %[[IDY]], %[[IDZ]]]
 // CHECK: %[[IDS:.+]]:2 = affine.delinearize_index %[[LINID0:.+]] into (%c4, %c8) : index, index
 // CHECK: %[[LINID1:.+]] = affine.apply #[[$MAP1]]()[%[[IDS]]#0, %[[IDS]]#1]
-// CHECK: scf.for %{{.*}} = %c0 to %c256 step %c4 {{.*}} -> (vector<1x4xf32>)
-// CHECK: scf.for %{{.*}} = %[[LINID1]] to %c4 step %c32
-// CHECK: %[[READ:.+]] = vector.transfer_read {{.*}} : memref<128x256xf32, {{.*}}storage_buffer>>, vector<4xf32>
-// CHECK: vector.transfer_write %[[READ]], %{{.*}} : vector<4xf32>, memref<4x6xf32, #gpu.address_space>
-// CHECK: vector.contract
+// CHECK: scf.forall ({{.*}}) in (32, 98) {
+// CHECK: scf.for %{{.*}} = %c0 to %c256 step %c4 {{.*}} -> (vector<1x4xf32>)
+// CHECK: scf.for %{{.*}} = %[[LINID1]] to %c4 step %c32
+// CHECK: %[[READ:.+]] = vector.transfer_read {{.*}} : memref<128x256xf32, {{.*}}storage_buffer>>, vector<4xf32>
+// CHECK: vector.transfer_write %[[READ]], %{{.*}} : vector<4xf32>, memref<4x6xf32, #gpu.address_space>
+// CHECK: vector.contract
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]}
 
 // -----
 
@@ -736,7 +755,7 @@ hal.executable public @main {
   mma_kind = #iree_gpu.mma_layout,
   reduction = [0, 0, 4],
   subgroup = [2, 4, 0],
-  workgroup = [64, 128, 0],
+  workgroup = [4, 8, 0],
   promote_operands = [0, 1]
 }>
@@ -1012,7 +1031,6 @@ hal.executable public @main {
 // CHECK-DAG: %[[RHS_ALLOC:.+]] = memref.alloc() : memref<4x130xf32, #gpu.address_space>
 // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1000 step %c4 {{.*}} -> (vector<1x4xf32>)
 // CHECK: gpu.barrier
-
 // CHECK: scf.for %{{.*}} = %{{.*}} to %c1 step %c32
 // CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<4xf32>
 // CHECK-NEXT: vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC]]
@@ -1069,6 +1087,7 @@ hal.executable public @main {
 
 // Verify that the write does not get hoisted out of the single threaded
 // for loop.
-// CHECK: vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type>
-// CHECK-NEXT: }
+// CHECK: vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type>
+// CHECK-NEXT: }
+// CHECK-NEXT: } {mapping = [#iree_codegen.workgroup_mapping]}
 // CHECK-NEXT: return
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
index 5cc0b7054198..d57d1631bd77 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
@@ -248,11 +248,11 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla
 // CHECK: %[[LHS_VIEW:.+]] = memref.subview %[[LHS_ALLOC]][%[[IV_Z]], %[[IV_Y]], 0] [1, 16, 32]
 // CHECK: scf.for %[[IV_X:.+]] = %[[OFFSET_X]] to %[[C32]] step %[[C32]] {
 // CHECK: %[[RHS_VIEW:.+]] = memref.subview %[[RHS_ALLOC]][%[[IV_Z]], 0, %[[IV_X]]] [1, 32, 16]
-// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]], %[[C0]]]
-// CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]], %[[C16]]]
-// CHECK-DAG: %[[READ2:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C0]], %[[C0]], %[[C0]]]
-// CHECK-DAG: %[[READ3:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C0]], %[[C16]], %[[C0]]]
-// CHECK-DAG: %[[READ4:.+]] = vector.transfer_read %{{.+}}[%[[C0]], %[[C0]], %[[C0]]]
+// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]], %[[C0]]]
+// CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]], %[[C16]]]
+// CHECK-DAG: %[[READ2:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C0]], %[[C0]], %[[C0]]]
+// CHECK-DAG: %[[READ3:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C0]], %[[C16]], %[[C0]]]
+// CHECK-DAG: %[[READ4:.+]] = vector.transfer_read %{{.+}}[%[[C0]], %[[C0]], %[[C0]]]
 // CHECK: %[[CT0:.+]] = vector.contract
 // CHECK-SAME: %[[READ0]], %[[READ2]], %[[READ4]] : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16>
 // CHECK: %[[CT1:.+]] = vector.contract
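For context, a minimal standalone C++ sketch (not part of the patch) of the arithmetic behind the first hunk in ConfigUtils.cpp: dropping the multiplication by schedule->mSize / schedule->nSize changes the M/N workgroup tile sizes from element counts to counts of MMA intrinsic tiles, which is why the expected configs above shrink (e.g. workgroup = [128, 128, 0] becomes [8, 8, 0]). The field names mirror the schedule fields used in the hunk; the concrete values (mWarpCount = 2, a 16x16 intrinsic) are inferred from the mfma_matmul_1024x1024x1024 test expectations and are assumptions, not compiler output.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the MMA schedule fields referenced in the
// ConfigUtils.cpp hunk; only the fields used there are modeled.
struct Schedule {
  int64_t mWarpCount, mTileCount, mSize;
  int64_t nWarpCount, nTileCount, nSize;
};

int main() {
  // Values inferred from the mfma_matmul_1024x1024x1024 test
  // (subgroup = [4, 4, 0], old workgroup = [128, 128, 0]); the 16x16
  // intrinsic size is an assumption.
  Schedule s{/*mWarpCount=*/2, /*mTileCount=*/4, /*mSize=*/16,
             /*nWarpCount=*/2, /*nTileCount=*/4, /*nSize=*/16};
  // Before the patch: workgroup tile size expressed in elements.
  int64_t oldM = s.mWarpCount * s.mTileCount * s.mSize; // 2 * 4 * 16 = 128
  // After the patch: workgroup tile size expressed in intrinsic tiles.
  int64_t newM = s.mWarpCount * s.mTileCount;           // 2 * 4 = 8
  std::cout << "old M workgroup tile (elements): " << oldM << "\n"
            << "new M workgroup tile (intrinsic tiles): " << newM << "\n";
  return 0;
}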